612 files changed, 187233 insertions, 0 deletions
diff --git a/src/gallium/drivers/Makefile b/src/gallium/drivers/Makefile
new file mode 100644
index 0000000000..9fe9b2c11d
--- /dev/null
+++ b/src/gallium/drivers/Makefile
@@ -0,0 +1,12 @@
+# src/gallium/drivers/Makefile
+TOP = ../../..
+include $(TOP)/configs/current
+
+SUBDIRS = $(GALLIUM_DRIVERS_DIRS)
+# Run the requested goal in each listed driver directory that exists.
+.PHONY: default install clean
+default install clean:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) $@) || exit 1; \
+		fi ; done
diff --git a/src/gallium/drivers/cell/Makefile b/src/gallium/drivers/cell/Makefile
new file mode 100644
index 0000000000..47aef7b05f
--- /dev/null
+++ b/src/gallium/drivers/cell/Makefile
@@ -0,0 +1,12 @@
+# Cell Gallium driver Makefile
+
+
+# Build (or clean) the SPU code, then the PPU code.  '&&' keeps a failed cd
+# from re-running make in this directory.
+.PHONY: default clean
+default:
+	cd spu && $(MAKE)
+	cd ppu && $(MAKE)
+clean:
+	cd spu && $(MAKE) clean
+	cd ppu && $(MAKE) clean
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
new file mode 100644
index 0000000000..bbb112fd33
--- /dev/null
+++ b/src/gallium/drivers/cell/common.h
@@ -0,0 +1,377 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Types and tokens which are common to the SPU and PPU code.
+ */
+
+
+#ifndef CELL_COMMON_H
+#define CELL_COMMON_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+#include <stdio.h>
+
+/** The standard assert macro doesn't seem to work reliably */
+#define ASSERT(x) \
+   if (!(x)) { \
+      ubyte *p = NULL; \
+      fprintf(stderr, "%s:%d: %s(): assertion %s failed.\n", \
+              __FILE__, __LINE__, __FUNCTION__, #x);             \
+      *p = 0; \
+      exit(1); \
+   }
+
+
+#define JOIN(x, y) JOIN_AGAIN(x, y)  /* extra level so x and y expand first */
+#define JOIN_AGAIN(x, y) x ## y
+/** Compile-time assert (a braced block): false 'e' => negative array size. */
+#define STATIC_ASSERT(e) \
+{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];}
+
+
+
+/** Sanity check: ASSERT that ptr is 16-byte aligned (expansion ends in ';') */
+#define ASSERT_ALIGN16(ptr) \
+  ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
+
+
+/** Round k up to the nearest multiple of 4 (aligned values are unchanged) */
+#define ROUNDUP4(k)  (((k) + 3) & ~3)
+
+/** Round k up to the nearest multiple of 8 (aligned values are unchanged) */
+#define ROUNDUP8(k)  (((k) + 7) & ~7)
+
+/** Round k up to the nearest multiple of 16 (aligned values are unchanged) */
+#define ROUNDUP16(k)  (((k) + 15) & ~15)
+
+
+#define CELL_MAX_SPUS 8  /**< sizes per-SPU arrays, e.g. cell_fence::status */
+
+#define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+#define CELL_MAX_WIDTH 1024    /**< max framebuffer width */
+#define CELL_MAX_HEIGHT 1024   /**< max framebuffer height */
+
+#define TILE_SIZE 32  /* NOTE(review): presumably tile edge in pixels -- confirm */
+
+
+/**
+ * The low byte of a mailbox word contains the command opcode.
+ * Remaining higher bytes are command specific.
+ */
+#define CELL_CMD_OPCODE_MASK 0xff  /**< selects the low (opcode) byte */
+
+#define CELL_CMD_EXIT                 1
+#define CELL_CMD_CLEAR_SURFACE        2
+#define CELL_CMD_FINISH               3
+#define CELL_CMD_RENDER               4
+#define CELL_CMD_BATCH                5
+#define CELL_CMD_RELEASE_VERTS        6   /* values 7..9 are not assigned here */
+#define CELL_CMD_STATE_FRAMEBUFFER   10
+#define CELL_CMD_STATE_FRAGMENT_OPS  11
+#define CELL_CMD_STATE_SAMPLER       12
+#define CELL_CMD_STATE_TEXTURE       13
+#define CELL_CMD_STATE_VERTEX_INFO   14
+#define CELL_CMD_STATE_VIEWPORT      15
+#define CELL_CMD_STATE_UNIFORMS      16
+#define CELL_CMD_STATE_VS_ARRAY_INFO 17
+#define CELL_CMD_STATE_BIND_VS       18
+#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
+#define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
+#define CELL_CMD_STATE_RASTERIZER    22
+#define CELL_CMD_VS_EXECUTE          23
+#define CELL_CMD_FLUSH_BUFFER_RANGE  24
+#define CELL_CMD_FENCE               25
+
+
+/** Command/batch buffers */
+#define CELL_NUM_BUFFERS 4
+#define CELL_BUFFER_SIZE (4*1024)  /**< bytes per buffer; 16KB would be the max */
+/** Per-SPU, per-buffer status values */
+#define CELL_BUFFER_STATUS_FREE 10
+#define CELL_BUFFER_STATUS_USED 20
+
+/** Debug flags (bit mask, see cell_init_info::debug_flags) */
+#define CELL_DEBUG_CHECKER              (1 << 0)
+#define CELL_DEBUG_ASM                  (1 << 1)
+#define CELL_DEBUG_SYNC                 (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD                  (1 << 5)
+#define CELL_DEBUG_CACHE                (1 << 6)
+/** Values stored in cell_fence::status */
+#define CELL_FENCE_IDLE      0
+#define CELL_FENCE_EMITTED   1
+#define CELL_FENCE_SIGNALLED 2
+
+#define CELL_FACING_FRONT    0
+#define CELL_FACING_BACK     1
+
+struct cell_fence
+{
+   /** One 16-byte status qword per SPU; word [0] holds a CELL_FENCE_x value */
+   volatile uint status[CELL_MAX_SPUS][4];
+};
+
+#ifdef __SPU__
+typedef vector unsigned int opcode_t;  /* SPU build: a vector type */
+#else
+typedef unsigned int opcode_t[4];      /* otherwise: four words, [0] = opcode */
+#endif
+
+/**
+ * Fence command sent to SPUs.  In response, the SPUs will write
+ * CELL_FENCE_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+   opcode_t opcode;      /**< CELL_CMD_FENCE */
+   struct cell_fence *fence;
+   uint32_t pad_[3];     /**< padding; emit_fence() asserts sizeof % 16 == 0 */
+};
+
+
+/**
+ * Command to specify per-fragment operations state and generated code.
+ * Note that this is a variable-length structure, allocated with as
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variable-length code field) must
+ * be a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code.  They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
+ */
+struct cell_command_fragment_ops
+{
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variable length (declared as a zero-length array),
+    * and must be the last field in the structure
+    */
+   unsigned code[0];
+};
+
+
+/** Max instructions for fragment programs */
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
+
+/**
+ * Command to send a fragment program to SPUs.
+ */
+struct cell_command_fragment_program
+{
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   uint num_inst;        /**< Number of instructions */
+   uint32_t pad[3];      /**< explicit padding after num_inst */
+   unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];  /**< fixed-size storage */
+};
+
+
+/**
+ * Tell SPUs about the framebuffer size, location
+ */
+struct cell_command_framebuffer
+{
+   opcode_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
+   int width, height;   /**< framebuffer size */
+   void *color_start, *depth_start;  /**< color / depth buffer locations */
+   enum pipe_format color_format, depth_format;
+   uint32_t pad_[2];    /**< explicit padding */
+};
+
+
+/**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+   opcode_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   struct pipe_rasterizer_state rasterizer;  /**< copy of the pipe state */
+   uint32_t pad[1];    /**< explicit padding */
+};
+
+
+/**
+ * Clear framebuffer to the given value/color.
+ */
+struct cell_command_clear_surface
+{
+   opcode_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
+   uint surface; /**< Temporary: 0=color, 1=Z */
+   uint value;   /**< packed clear value, see cell_clear() */
+   uint32_t pad[2];  /**< padding; cell_clear() asserts sizeof % 16 == 0 */
+};
+
+
+/**
+ * Array info used by the vertex shader's vertex puller.
+ */
+struct cell_array_info
+{
+   uint64_t base;      /**< Base address of the 0th element. */
+   uint attr;          /**< Attribute that this state is for. */
+   uint pitch;         /**< Byte pitch from one entry to the next. */
+   uint size;          /**< NOTE(review): units not stated here -- confirm */
+   uint function_offset;  /**< NOTE(review): presumably a fetch-code offset */
+};
+
+
+struct cell_attribute_fetch_code
+{
+   uint64_t base;  /**< NOTE(review): presumably address of the code -- confirm */
+   uint size;      /**< NOTE(review): presumably its size -- confirm units */
+};
+
+
+struct cell_buffer_range
+{
+   uint64_t base;  /**< NOTE(review): presumably start address of the range */
+   unsigned size;  /**< NOTE(review): presumably length of the range */
+};
+
+
+struct cell_shader_info
+{
+   uint64_t declarations;  /* NOTE(review): the three 64-bit fields look */
+   uint64_t instructions;  /* like addresses of the arrays counted by the */
+   uint64_t  immediates;   /* num_* fields below -- confirm */
+
+   unsigned num_outputs;
+   unsigned num_declarations;
+   unsigned num_instructions;
+   unsigned num_immediates;
+};
+
+
+#define SPU_VERTS_PER_BATCH 64  /**< capacity of vOut[] and elts[] below */
+struct cell_command_vs
+{
+   opcode_t opcode;       /**< CELL_CMD_VS_EXECUTE */
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
+   unsigned num_elts;     /**< NOTE(review): presumably entries used in elts[] */
+   unsigned elts[SPU_VERTS_PER_BATCH];
+   float plane[12][4];    /**< room for 12 four-component planes */
+   unsigned nr_planes;    /**< NOTE(review): presumably planes used -- confirm */
+   unsigned nr_attrs;
+};
+
+
+struct cell_command_render
+{
+   opcode_t opcode;   /**< CELL_CMD_RENDER */
+   uint prim_type;    /**< PIPE_PRIM_x */
+   uint num_verts;
+   uint vertex_size;  /**< bytes per vertex */
+   uint num_indexes;
+   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
+   float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
+   uint min_index;
+   boolean inline_verts;  /* NOTE(review): presumably verts follow the command */
+   uint32_t pad_[1];  /**< explicit padding */
+};
+
+
+struct cell_command_release_verts
+{
+   opcode_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+   uint32_t pad_[3];   /**< explicit padding after vertex_buf */
+};
+
+
+struct cell_command_sampler
+{
+   opcode_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   uint unit;               /**< NOTE(review): presumably the sampler unit index */
+   struct pipe_sampler_state state;
+   uint32_t pad_[3];        /**< explicit padding */
+};
+
+
+struct cell_command_texture
+{
+   opcode_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
+   uint unit;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];  /**< one entry per mipmap level */
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
+};
+
+
+#define MAX_SPU_FUNCTIONS 12  /**< capacity of names[] and addrs[] below */
+/**
+ * Used to tell the PPU about the address of particular functions in the
+ * SPU's address space.
+ */
+struct cell_spu_function_info
+{
+   uint num;
+   char names[MAX_SPU_FUNCTIONS][16];  /**< 16 bytes of storage per name */
+   uint addrs[MAX_SPU_FUNCTIONS];
+   char pad[12];   /**< Pad struct to multiple of 16 bytes (256 currently) */
+};
+
+
+/** This is the object passed to spe_create_thread(); 16-byte aligned type */
+PIPE_ALIGN_TYPE(16,
+struct cell_init_info
+{
+   unsigned id;
+   unsigned num_spus;
+   unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
+   float inv_timebase;    /**< 1.0/timebase, for perf measurement */
+
+   /** Buffers for command batches, vertex/index data */
+   ubyte *buffers[CELL_NUM_BUFFERS];
+   uint *buffer_status;  /**< points at cell_context->buffer_status */
+
+   struct cell_spu_function_info *spu_functions;
+});
+
+
+#endif /* CELL_COMMON_H */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
new file mode 100644
index 0000000000..c92f8e5cba
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -0,0 +1,86 @@
+# Gallium3D Cell driver: PPU code
+
+# This makefile builds the libcell.a library which gets pulled into
+# the main libGL.so library
+
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+# NOTE(review): CC, CFLAGS, MKDEP* are not set here; presumably from configs/current.
+
+# This is the "top-level" cell PPU driver code, will get pulled into libGL.so
+# by the winsys Makefile.
+CELL_LIB = ../libcell.a
+
+
+# This is the SPU code.  We'd like to be able to put this into the libcell.a
+# archive with the PPU code, but nesting .a libs doesn't seem to work.
+# So, it's pulled into libGL.so in gallium/winsys/xlib/Makefile
+SPU_CODE_MODULE = ../spu/g3d_spu.a
+
+# PPU-side C sources; their objects are archived into $(CELL_LIB).
+SOURCES = \
+	cell_batch.c \
+	cell_clear.c \
+	cell_context.c \
+	cell_draw_arrays.c \
+	cell_fence.c \
+	cell_flush.c \
+	cell_gen_fragment.c \
+	cell_gen_fp.c \
+	cell_state_derived.c \
+	cell_state_emit.c \
+	cell_state_shader.c \
+	cell_pipe_state.c \
+	cell_screen.c \
+	cell_state_vertex.c \
+	cell_spu.c \
+	cell_surface.c \
+	cell_texture.c \
+	cell_vbuf.c \
+	cell_vertex_fetch.c \
+	cell_vertex_shader.c
+
+
+OBJECTS = $(SOURCES:.c=.o)
+# Header search paths handed to every compile and to $(MKDEP).
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/drivers
+
+# Compile one C source file into an object file.
+%.o: %.c
+	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
+# Compile one C source file into assembly only.
+%.s: %.c
+	$(CC) -S $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
+
+
+# 'default' and 'clean' are commands, not files.
+.PHONY: default clean
+
+default: $(CELL_LIB)
+
+
+# Archive only the PPU objects.  $(SPU_CODE_MODULE) is a prerequisite but is
+# deliberately left out of the archive: nesting .a libs doesn't work.
+$(CELL_LIB): $(OBJECTS) $(SPU_CODE_MODULE)
+	$(AR) -ru $@ $(OBJECTS)
+
+
+clean:
+	rm -f *.o *~ $(CELL_LIB)
+
+
+# Recreate 'depend' (included below) and run $(MKDEP) over the sources.
+depend: $(SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null
+
+include depend
+
+
+
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
new file mode 100644
index 0000000000..fe144f8b84
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -0,0 +1,260 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_spu.h"
+
+
+
+/**
+ * Search the buffer pool for an empty/free buffer and return its index.
+ * Buffers are used for storing vertex data, state and commands which
+ * will be sent to the SPUs.  A buffer is free only when every SPU has
+ * marked it free; if none is, this spins until one becomes available.
+ * \return buffer index in [0, CELL_NUM_BUFFERS-1]
+ */
+uint
+cell_get_empty_buffer(struct cell_context *cell)
+{
+   static uint prev_buffer = 0;  /* function-static: last index handed out */
+   uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS;
+   uint tries = 0;  /* only consumed by the disabled debug printfs below */
+
+   /* Find a buffer that's marked as free by all SPUs */
+   while (1) {
+      uint spu, num_free = 0;
+
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) {
+            num_free++;
+
+            if (num_free == cell->num_spus) {
+               /* found a free buffer, now mark status as used
+                * (reusing 'spu' is safe: we return below) */
+               for (spu = 0; spu < cell->num_spus; spu++) {
+                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+               }
+               /*
+               printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
+               */
+               prev_buffer = buf;
+               /* release tex buffer associated w/ prev use of this batch buf */
+               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
+               return buf;
+            }
+         }
+         else {
+            break;  /* some SPU has not freed this buffer yet */
+         }
+      }
+
+      /* try next buf */
+      buf = (buf + 1) % CELL_NUM_BUFFERS;
+
+      tries++;
+      if (tries == 100) {
+         /*
+         printf("PPU WAITING for buffer...\n");
+         */
+      }
+   }
+}
+
+
+/**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+   const uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   struct cell_command_fence *fence_cmd;
+   struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+   uint i;
+
+   /* set fence status to emitted, not yet signalled */
+   for (i = 0; i < cell->num_spus; i++) {
+      fence->status[i][0] = CELL_FENCE_EMITTED;
+   }
+
+   STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0);
+   ASSERT(size % 16 == 0);
+   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+   /* the command goes right after the current contents; pad_ is not written */
+   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+   fence_cmd->opcode[0] = CELL_CMD_FENCE;
+   fence_cmd->fence = fence;
+   /* update batch buffer size */
+   cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+}
+
+
+/**
+ * Flush the current batch buffer to the SPUs (no-op if it is empty).
+ * An empty buffer will be found and set as the new current batch buffer
+ * for subsequent commands/data.
+ */
+void
+cell_batch_flush(struct cell_context *cell)
+{
+   static boolean flushing = FALSE;  /* asserted below: no nested flush */
+   uint batch = cell->cur_batch;
+   uint size = cell->buffer_size[batch];
+   uint spu, cmd_word;
+
+   assert(!flushing);
+
+   if (size == 0)
+      return;
+
+   /* If this batch has fenced buffers attached (non-NULL list head),
+    * append a fence command, which grows the batch; re-read the size.
+    */
+   if (cell->fenced_buffers[batch].head) {
+      emit_fence(cell);
+      size = cell->buffer_size[batch];
+   }
+
+   flushing = TRUE;
+
+   assert(batch < CELL_NUM_BUFFERS);
+
+   /*
+   printf("cell_batch_dispatch: buf %u at %p, size %u\n",
+          batch, &cell->buffer[batch][0], size);
+   */
+     
+   /*
+    * Build "BATCH" command and send to all SPUs.  Layout of the word:
+    * opcode in bits 0-7, buffer index from bit 8, byte size from bit 16.
+    */
+   cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16);
+
+   for (spu = 0; spu < cell->num_spus; spu++) {
+      assert(cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_USED);
+      send_mbox_message(cell_global.spe_contexts[spu], cmd_word);
+   }
+
+   /* When the SPUs are done copying the buffer into their local stores
+    * they'll write a BUFFER_STATUS_FREE message into the buffer_status[]
+    * array indicating that the PPU can re-use the buffer.
+    */
+
+   batch = cell_get_empty_buffer(cell);
+
+   cell->buffer_size[batch] = 0;  /* empty */
+   cell->cur_batch = batch;
+
+   flushing = FALSE;
+}
+
+
+/** Return the number of bytes free in the current batch buffer, after
+ * reserving room for the trailing fence command.  Never wraps: 0 if full.
+ */
+uint
+cell_batch_free_space(const struct cell_context *cell)
+{
+   uint used = cell->buffer_size[cell->cur_batch];
+   used += sizeof(struct cell_command_fence);
+   return used < CELL_BUFFER_SIZE ? CELL_BUFFER_SIZE - used : 0;
+}
+
+
+/**
+ * Allocate 'bytes' (a multiple of 16) in the current batch buffer, flushing
+ * the batch first if the request doesn't fit in the remaining free space.
+ * \return address in batch buffer to put data
+ */
+void *
+cell_batch_alloc16(struct cell_context *cell, uint bytes)
+{
+   void *pos;
+   uint size;
+
+   ASSERT(bytes % 16 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      /* the current batch must be marked as in use on every SPU */
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
+
+   size = cell->buffer_size[cell->cur_batch];
+
+   if (bytes > cell_batch_free_space(cell)) {
+      cell_batch_flush(cell);
+      size = 0;  /* continue at offset 0 of the (now current) batch */
+   }
+
+   ASSERT(size % 16 == 0);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
+   /* hand out the next 'bytes' bytes and record the new fill level */
+   pos = (void *) (cell->buffer[cell->cur_batch] + size);
+
+   cell->buffer_size[cell->cur_batch] = size + bytes;
+
+   return pos;
+}
+
+
+/**
+ * One-time setup of the batch buffer bookkeeping.
+ */
+void
+cell_init_batch_buffers(struct cell_context *cell)
+{
+   uint b;
+
+   /* Every buffer starts out empty.  Buffer 0 is flagged as in use on all
+    * SPUs; the remaining buffers are flagged as free.
+    */
+   for (b = 0; b < CELL_NUM_BUFFERS; b++) {
+      const uint initial = (b == 0) ? CELL_BUFFER_STATUS_USED
+                                    : CELL_BUFFER_STATUS_FREE;
+      uint s;
+
+      cell->buffer_size[b] = 0;
+
+      for (s = 0; s < cell->num_spus; s++) {
+         cell->buffer_status[s][b][0] = initial;
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
new file mode 100644
index 0000000000..290136031a
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_BATCH_H
+#define CELL_BATCH_H
+
+#include "pipe/p_compiler.h"
+
+
+struct cell_context;
+
+
+/** Find (waiting if necessary) a batch buffer all SPUs report free. */
+uint cell_get_empty_buffer(struct cell_context *cell);
+
+/** Send the current batch buffer to the SPUs and start a new one. */
+void cell_batch_flush(struct cell_context *cell);
+
+/** Bytes still available in the current batch buffer. */
+uint cell_batch_free_space(const struct cell_context *cell);
+
+/** Reserve 'bytes' (a multiple of 16) in the current batch buffer. */
+void *cell_batch_alloc16(struct cell_context *cell, uint bytes);
+
+/** One-time init of batch buffers. */
+void cell_init_batch_buffers(struct cell_context *cell);
+
+
+#endif /* CELL_BATCH_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
new file mode 100644
index 0000000000..246fe21054
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -0,0 +1,92 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Authors
+ *  Brian Paul
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdint.h>
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "cell/common.h"
+#include "cell_clear.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_state.h"
+
+
+/**
+ * Append a CELL_CMD_CLEAR_SURFACE command for the given surface to the
+ * current batch buffer.
+ */
+static void
+emit_clear_surface(struct cell_context *cell, uint surface, uint value)
+{
+   struct cell_command_clear_surface *cmd;
+
+   STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0);
+
+   cmd = (struct cell_command_clear_surface *)
+      cell_batch_alloc16(cell, sizeof(*cmd));
+   cmd->opcode[0] = CELL_CMD_CLEAR_SURFACE;
+   cmd->surface = surface;
+   cmd->value = value;
+}
+
+
+/**
+ * Called via pipe->clear()
+ * Surface 0 is the color buffer, surface 1 is the depth/stencil buffer.
+ */
+void
+cell_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+           double depth, unsigned stencil)
+{
+   struct cell_context *cell = cell_context(pipe);
+
+   if (cell->dirty)
+      cell_update_derived(cell);
+
+   if (buffers & PIPE_CLEAR_COLOR) {
+      union util_color packed;
+
+      util_pack_color(rgba, cell->framebuffer.cbufs[0]->format, &packed);
+      emit_clear_surface(cell, 0, packed.ui);
+   }
+
+   if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+      const uint zs = util_pack_z_stencil(cell->framebuffer.zsbuf->format,
+                                          depth, stencil);
+      emit_clear_surface(cell, 1, zs);
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.h b/src/gallium/drivers/cell/ppu/cell_clear.h
new file mode 100644
index 0000000000..08e091adfd
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_clear.h
@@ -0,0 +1,41 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_CLEAR_H
+#define CELL_CLEAR_H
+
+
+struct pipe_context;
+
+/** Clear the buffers selected by 'buffers' to the given color/depth/stencil. */
+void
+cell_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+           double depth, unsigned stencil);
+
+
+#endif /* CELL_CLEAR_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
new file mode 100644
index 0000000000..143eca848f
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -0,0 +1,198 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include <stdio.h>
+
+#include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+#include "util/u_memory.h"
+#include "pipe/p_screen.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+#include "cell/common.h"
+#include "cell_batch.h"
+#include "cell_clear.h"
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_fence.h"
+#include "cell_flush.h"
+#include "cell_state.h"
+#include "cell_surface.h"
+#include "cell_spu.h"
+#include "cell_pipe_state.h"
+#include "cell_texture.h"
+#include "cell_vbuf.h"
+
+
+/** pipe_context::destroy hook: tear down what cell_create_context() set up. */
+static void
+cell_destroy_context( struct pipe_context *pipe )
+{
+   struct cell_context *cell = cell_context(pipe);
+   /* Release the draw module context too; it used to be leaked here. */
+   if (cell->draw)
+      draw_destroy(cell->draw);
+   util_delete_keymap(cell->fragment_ops_cache, NULL);
+   cell_spu_exit(cell);
+   align_free(cell);   /* pairs with align_malloc() in cell_create_context() */
+}
+
+/** Create the draw module context for \p cell; the SPU vertex hook is disabled. */
+static struct draw_context *
+cell_draw_create(struct cell_context *cell)
+{
+   struct draw_context *draw = draw_create(&cell->pipe);
+
+#if 0 /* broken */
+   if (getenv("GALLIUM_CELL_VS")) {
+      /* plug in SPU-based vertex transformation code */
+      draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+      draw->driver_private = cell;
+   }
+#endif
+
+   return draw;
+}
+
+/** Flag names recognised in the CELL_DEBUG option and the bits they set. */
+static const struct debug_named_value cell_debug_flags[] = {
+   {"checker", CELL_DEBUG_CHECKER, NULL},/**< modulate tile clear color by SPU ID */
+   {"asm", CELL_DEBUG_ASM, NULL},        /**< dump SPU asm code */
+   {"sync", CELL_DEBUG_SYNC, NULL},      /**< SPUs do synchronous DMA */
+   {"fragops", CELL_DEBUG_FRAGMENT_OPS, NULL}, /**< SPUs emit fragment ops debug messages*/
+   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK, NULL}, /**< SPUs use reference implementation for fragment ops*/
+   {"cmd", CELL_DEBUG_CMD, NULL},       /**< SPUs dump command buffer info */
+   {"cache", CELL_DEBUG_CACHE, NULL},   /**< report texture cache stats on exit */
+   DEBUG_NAMED_VALUE_END
+};
+
+static unsigned int
+cell_is_resource_referenced( struct pipe_context *pipe,
+			    struct pipe_resource *texture,
+			    unsigned face, unsigned level)
+{
+   /**
+    * Conservative answer: every resource is reported as referenced for both
+    * read and write, and none of the arguments is inspected.  FIXME: Optimize.
+    */
+   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+}
+
+/** Create a Cell pipe_context for \p screen; returns NULL if its allocation fails. */
+struct pipe_context *
+cell_create_context(struct pipe_screen *screen,
+                    void *priv )
+{
+   struct cell_context *cell;
+   uint i;
+
+   /* some fields need to be 16-byte aligned, so align the whole object */
+   cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
+   if (!cell)
+      return NULL;
+   memset(cell, 0, sizeof(*cell));
+
+   cell->winsys = NULL;		/* XXX: fixme - get this from screen? */
+   cell->pipe.winsys = NULL;
+   cell->pipe.screen = screen;
+   cell->pipe.priv = priv;
+   cell->pipe.destroy = cell_destroy_context;
+
+   cell->pipe.clear = cell_clear;
+   cell->pipe.flush = cell_flush;
+   cell->pipe.is_resource_referenced = cell_is_resource_referenced;
+
+#if 0
+   cell->pipe.begin_query = cell_begin_query;
+   cell->pipe.end_query = cell_end_query;
+   cell->pipe.wait_query = cell_wait_query;
+#endif
+
+   cell_init_draw_functions(cell);
+   cell_init_state_functions(cell);
+   cell_init_shader_functions(cell);
+   cell_init_surface_functions(cell);
+   cell_init_vertex_functions(cell);
+   cell_init_texture_transfer_funcs(cell);
+
+   cell->draw = cell_draw_create(cell);
+
+   /* Create cache of fragment ops generated code */
+   cell->fragment_ops_cache =
+      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
+   cell_init_vbuf(cell);
+   draw_set_rasterize_stage(cell->draw, cell->vbuf);
+
+   /* convert all points/lines to tris for the time being */
+   draw_wide_point_threshold(cell->draw, 0.0);
+   draw_wide_line_threshold(cell->draw, 0.0);
+
+   /* get env vars or read config file to get debug flags */
+   cell->debug_flags =
+      debug_get_flags_option("CELL_DEBUG", cell_debug_flags, 0);
+
+   for (i = 0; i < CELL_NUM_BUFFERS; i++)
+      cell_fence_init(&cell->fenced_buffers[i].fence);
+
+   /* SPU stuff */
+   /* This call only works with SDK 3.0.  Anyone still using 2.1??? */
+   cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
+   cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
+   if (cell->debug_flags) {
+      printf("Cell: found %d Cell(s) with %u SPUs\n",
+             cell->num_cells, cell->num_spus);
+   }
+   /* Optional CELL_NUM_SPUS override.  num_spus is unsigned and is used to
+    * index CELL_MAX_SPUS-sized arrays, so only 1..CELL_MAX_SPUS is accepted
+    * instead of trusting atoi() plus a debug-only assert.
+    */
+   {
+      const char *env = getenv("CELL_NUM_SPUS");
+      const int n = env ? atoi(env) : 0;
+      if (n > 0 && n <= CELL_MAX_SPUS)
+         cell->num_spus = n;
+      else if (env)
+         fprintf(stderr, "Cell: ignoring invalid CELL_NUM_SPUS=%s\n", env);
+   }
+
+   cell_start_spus(cell);
+   cell_init_batch_buffers(cell);
+
+   /* make sure SPU initializations are done before proceeding */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+
+   return &cell->pipe;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
new file mode 100644
index 0000000000..07b6eebc69
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -0,0 +1,209 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_CONTEXT_H
+#define CELL_CONTEXT_H
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_vbuf.h"
+/*#include "cell_winsys.h"*/
+#include "cell/common.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
+
+
+struct cell_vbuf_render;
+
+
+/**
+ * Cell vertex shader state, subclass of pipe_shader_state.
+ */
+struct cell_vertex_shader_state
+{
+   struct pipe_shader_state shader;
+   struct tgsi_shader_info info;
+   void *draw_data;
+};
+
+
+/**
+ * Cell fragment shader state, subclass of pipe_shader_state.
+ */
+struct cell_fragment_shader_state
+{
+   struct pipe_shader_state shader;
+   struct tgsi_shader_info info;
+   struct spe_function code;
+   void *data;
+};
+
+
+/**
+ * Key for mapping per-fragment state to cached SPU machine code.
+ *  keymap(cell_fragment_ops_key) => cell_command_fragment_ops
+ */
+struct cell_fragment_ops_key
+{
+   struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
+   struct pipe_depth_stencil_alpha_state dsa;
+   enum pipe_format color_format;
+   enum pipe_format zs_format;
+};
+
+
+struct cell_buffer_node;
+
+/**
+ * Fenced buffer list.  List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
+ */
+struct cell_buffer_list
+{
+   PIPE_ALIGN_VAR(16) struct cell_fence fence;
+   struct cell_buffer_node *head;
+};
+
+struct cell_velems_state
+{
+   unsigned count;   /* NOTE(review): presumably the number of velem[] entries in use -- confirm */
+   struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
+};
+
+/**
+ * Per-context state, subclass of pipe_context.
+ */
+struct cell_context
+{
+   struct pipe_context pipe;   /**< base class; must stay first, see cell_context() */
+
+   struct cell_winsys *winsys;
+
+   const struct pipe_blend_state *blend;
+   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   uint num_samplers;
+   const struct pipe_depth_stencil_alpha_state *depth_stencil;
+   const struct pipe_rasterizer_state *rasterizer;
+   const struct cell_vertex_shader_state *vs;
+   const struct cell_fragment_shader_state *fs;
+   const struct cell_velems_state *velems;
+
+   struct spe_function logic_op;
+
+   struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_clip_state clip;
+   struct pipe_resource *constants[2];
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct cell_resource *texture[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+   uint num_textures;
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   uint num_vertex_buffers;
+
+   ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
+   ubyte *zsbuf_map;
+
+   uint dirty;           /* nonzero makes the draw path call cell_update_derived() */
+   uint dirty_textures;  /* bitmask of texture units */
+   uint dirty_samplers;  /* bitmask of sampler units */
+
+   /** Cache of code generated for per-fragment ops */
+   struct keymap *fragment_ops_cache;
+
+   /** The primitive drawing context */
+   struct draw_context *draw;
+   struct draw_stage *render_stage;
+
+   /** For post-transformed vertex buffering: */
+   struct cell_vbuf_render *vbuf_render;
+   struct draw_stage *vbuf;
+
+   struct vertex_info vertex_info;
+
+   /** Mapped constant buffers */
+   void *mapped_constants[PIPE_SHADER_TYPES];
+
+   PIPE_ALIGN_VAR(16) struct cell_spu_function_info spu_functions;
+
+   uint num_cells, num_spus;   /**< from spe_cpu_info_get(); num_spus may be overridden by CELL_NUM_SPUS */
+
+   /** Buffers for command batches, vertex/index data */
+   uint buffer_size[CELL_NUM_BUFFERS];
+   PIPE_ALIGN_VAR(16) ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE];
+
+   int cur_batch;  /**< which buffer is being filled w/ commands */
+
+   /** [4] to ensure 16-byte alignment for each status word */
+   PIPE_ALIGN_VAR(16) uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4];
+
+
+   /** Associated with each command/batch buffer is a list of pipe_buffers
+    * that are fenced.  When the last command in a buffer is executed, the
+    * fence will be signalled, indicating that any pipe_buffers preceding
+    * that fence can be unreferenced (and probably freed).
+    */
+   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
+   struct spe_function attrib_fetch;
+   unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
+
+   unsigned debug_flags;   /**< CELL_DEBUG_* bits parsed from the CELL_DEBUG option */
+};
+
+
+
+/** Downcast a pipe_context; valid because struct cell_context starts with it. */
+static INLINE struct cell_context *
+cell_context(struct pipe_context *pipe)
+{
+   return (struct cell_context *) pipe;
+}
+
+
+struct pipe_context *
+cell_create_context(struct pipe_screen *screen,
+                    void *priv );
+
+extern void
+cell_vertex_shader_queue_flush(struct draw_context *draw);
+
+
+/* XXX find a better home for this */
+extern void cell_update_vertex_fetch(struct draw_context *draw);
+
+
+#endif /* CELL_CONTEXT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
new file mode 100644
index 0000000000..6a1e4d8a64
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -0,0 +1,146 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_state.h"
+#include "cell_flush.h"
+#include "cell_texture.h"
+
+#include "draw/draw_context.h"
+
+
+
+
+
+
+/**
+ * Draw vertex arrays, with optional indexing.
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off
+ * the drawing to the 'draw' module.
+ *
+ * XXX should the element buffer be specified/bound with a separate function?
+ */
+static void
+cell_draw_range_elements(struct pipe_context *pipe,
+                         struct pipe_resource *indexBuffer,
+                         unsigned indexSize,
+                         int indexBias,
+                         unsigned min_index,
+                         unsigned max_index,
+                         unsigned mode, unsigned start, unsigned count)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct draw_context *draw = cell->draw;
+   unsigned vb;
+
+   /* bring derived state up to date before handing work to the draw module */
+   if (cell->dirty)
+      cell_update_derived( cell );
+
+#if 0
+   cell_map_surfaces(cell);
+#endif
+
+   /* give the draw module the ->data pointer of each bound vertex buffer */
+   for (vb = 0; vb < cell->num_vertex_buffers; vb++) {
+      draw_set_mapped_vertex_buffer(draw, vb,
+                                    cell_resource(cell->vertex_buffer[vb].buffer)->data);
+   }
+
+   /* same for the index buffer; without one, clear the element mapping */
+   if (!indexBuffer) {
+      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+   }
+   else {
+      draw_set_mapped_element_buffer(draw, indexSize, indexBias,
+                                     cell_resource(indexBuffer)->data);
+   }
+
+
+   /* min_index/max_index are accepted but not used in this function */
+   draw_arrays(draw, mode, start, count);
+
+   /*
+    * Drop the mappings again, vertex buffers first; per the original
+    * note, unmapping will cause the draw module to flush.
+    */
+   for (vb = 0; vb < cell->num_vertex_buffers; vb++) {
+      draw_set_mapped_vertex_buffer(draw, vb, NULL);
+   }
+   if (indexBuffer != NULL) {
+      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+   }
+
+   /*
+    * TODO: Flush only when a user vertex/index buffer is present
+    * (or even better, modify draw module to do this
+    * internally when this condition is seen?)
+    */
+   draw_flush(draw);
+}
+
+/** Indexed draw without a caller-supplied range: forwards with 0..0xffffffff. */
+static void
+cell_draw_elements(struct pipe_context *pipe,
+                   struct pipe_resource *indexBuffer,
+                   unsigned indexSize, int indexBias,
+                   unsigned mode, unsigned start, unsigned count)
+{
+   cell_draw_range_elements( pipe, indexBuffer,
+                             indexSize, indexBias,
+                             0, 0xffffffff,
+                             mode, start, count );
+}
+
+/** Non-indexed draw: forwards to cell_draw_elements() with no index buffer. */
+static void
+cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   cell_draw_elements(pipe, NULL, 0, 0, mode, start, count);
+}
+
+/** Install this file's three draw entry points into cell->pipe. */
+void
+cell_init_draw_functions(struct cell_context *cell)
+{
+   cell->pipe.draw_arrays = cell_draw_arrays;
+   cell->pipe.draw_elements = cell_draw_elements;
+   cell->pipe.draw_range_elements = cell_draw_range_elements;
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
new file mode 100644
index 0000000000..148873aa67
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
@@ -0,0 +1,36 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_DRAW_ARRAYS_H
+#define CELL_DRAW_ARRAYS_H
+
+
+extern void
+cell_init_draw_functions(struct cell_context *cell);
+
+
+#endif /* CELL_DRAW_ARRAYS_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 0000000000..34ca864155
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,168 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+/** Mark the fence idle in the first status word of all CELL_MAX_SPUS slots. */
+void
+cell_fence_init(struct cell_fence *fence)
+{
+   uint spu = 0;
+   ASSERT_ALIGN16(fence->status);
+   while (spu < CELL_MAX_SPUS) {
+      fence->status[spu++][0] = CELL_FENCE_IDLE;
+   }
+}
+
+/** Return TRUE only if each of the cell->num_spus SPUs reports SIGNALLED. */
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence)
+{
+   uint spu = 0;
+   /* advance until the first SPU whose status word is not yet signalled */
+   while (spu < cell->num_spus &&
+          fence->status[spu][0] == CELL_FENCE_SIGNALLED) {
+      spu++;
+   }
+   return spu == cell->num_spus ? TRUE : FALSE;
+}
+
+/** Block until cell_fence_signalled() reports the fence as signalled. */
+void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence)
+{
+   /* poll the status words, sleeping 10 microseconds between checks */
+   while (cell_fence_signalled(cell, fence) == FALSE) {
+      usleep(10);
+   }
+
+#ifdef DEBUG
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++)
+         assert(fence->status[spu][0] == CELL_FENCE_SIGNALLED);
+   }
+#endif
+}
+
+
+
+/** Singly linked list node; each node holds one reference to a pipe_resource. */
+struct cell_buffer_node
+{
+   struct pipe_resource *buffer;
+   struct cell_buffer_node *next;
+};
+
+/** Push a node referencing \p buffer onto the head of \p list (\p cell unused). */
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+                        struct cell_buffer_list *list,
+                        struct pipe_resource *buffer)
+{
+   struct cell_buffer_node *entry = CALLOC_STRUCT(cell_buffer_node);
+   if (!entry)
+      return;   /* allocation failed: the buffer is simply not tracked */
+   /* take a reference on the buffer, then link the node in at the head */
+   pipe_resource_reference(&entry->buffer, buffer);
+   entry->next = list->head;
+   list->head = entry;
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list)
+{
+   struct cell_buffer_node *node = list->head;
+
+   /* nothing queued on this list: return without waiting on the fence */
+   if (!node)
+      return;
+
+   cell_fence_finish(cell, &list->fence);
+
+   /* traverse the list, unreferencing buffers, freeing nodes */
+   while (node) {
+      struct cell_buffer_node *next = node->next;
+      assert(node->buffer);
+      /* XXX need this? pipe_buffer_unmap(cell->pipe.screen, node->buffer);*/
+#if 0
+      printf("Unref buffer %p\n", node->buffer);
+      if (node->buffer->reference.count == 1)
+         printf("   Delete!\n");
+#endif
+      pipe_resource_reference(&node->buffer, NULL);
+      FREE(node);
+      node = next;
+   }
+   list->head = NULL;
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are currently bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+   uint i;
+   /* NOTE: both bodies below are compiled out, so nothing is added to the list */
+   for (i = 0; i < cell->num_textures; i++) {
+      struct cell_resource *ct = cell->texture[i];
+      if (ct) {
+#if 0
+         printf("Adding texture %p buffer %p to list\n",
+                ct, ct->tiled_buffer[level]);
+#endif
+#if 00
+         /* XXX this needs to be fixed/restored!
+          * Maybe keep pointers to textures, not buffers.
+          */
+         if (ct->base.buffer)
+            cell_add_buffer_to_list(cell, list, ct->buffer);
+#endif
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 0000000000..536b4ba411
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
new file mode 100644
index 0000000000..8275c9dc9c
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -0,0 +1,112 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_render.h"
+#include "draw/draw_context.h"
+
+
+/**
+ * Called via pipe->flush()
+ */
+void
+cell_flush(struct pipe_context *pipe, unsigned flags,
+           struct pipe_fence_handle **fence)
+{
+   struct cell_context *cell = cell_context(pipe);
+   const unsigned wait_mask = PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE;
+
+   /* XXX: Implement real fencing.  Until then no fence object is returned
+    * and a caller that asked for one gets a waiting flush instead.
+    */
+   if (fence)
+      *fence = NULL;
+   if (fence || (flags & wait_mask))
+      flags |= CELL_FLUSH_WAIT;
+
+   draw_flush( cell->draw );
+   cell_flush_int(cell, flags);
+}
+
+
+/**
+ * Cell internal flush function.  Send the current batch buffer to all SPUs.
+ * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle.
+ * \param flags  bitmask of flags CELL_FLUSH_WAIT, or zero
+ */
+void
+cell_flush_int(struct cell_context *cell, unsigned flags)
+{
+   static boolean flushing = FALSE;  /* recursion catcher */
+   const boolean do_wait = (flags & CELL_FLUSH_WAIT) ? TRUE : FALSE;
+   uint spu;
+
+   ASSERT(!flushing);
+   flushing = TRUE;
+
+   /* when waiting, append a FINISH command that each SPU answers below */
+   if (do_wait) {
+      STATIC_ASSERT(sizeof(opcode_t) % 16 == 0);
+      opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t));
+      (*cmd)[0] = CELL_CMD_FINISH;
+   }
+
+   cell_batch_flush(cell);
+
+#if 0
+   /* Send CMD_FINISH to all SPUs */
+   for (spu = 0; spu < cell->num_spus; spu++) {
+      send_mbox_message(cell_global.spe_contexts[spu], CELL_CMD_FINISH);
+   }
+#endif
+
+   /* Wait for ack: one CELL_CMD_FINISH mailbox message from every SPU */
+   for (spu = 0; do_wait && spu < cell->num_spus; spu++) {
+      uint k = wait_mbox_message(cell_global.spe_contexts[spu]);
+      assert(k == CELL_CMD_FINISH);
+   }
+
+   flushing = FALSE;
+}
+
+/** Queue a CELL_CMD_FLUSH_BUFFER_RANGE batch command carrying \p ptr and \p size. */
+void
+cell_flush_buffer_range(struct cell_context *cell, void *ptr,
+			unsigned size)
+{
+   STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0);
+   uint32_t *words = (uint32_t*)cell_batch_alloc16(cell,
+      sizeof(opcode_t) + sizeof(struct cell_buffer_range));
+   struct cell_buffer_range *range = (struct cell_buffer_range *) &words[4];
+   range->base = (uintptr_t) ptr;
+   range->size = size;
+   words[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h
new file mode 100644
index 0000000000..509ae6239a
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_flush.h
@@ -0,0 +1,45 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FLUSH
+#define CELL_FLUSH
+
+/**
+ * Driver-private flush flag (bit 31 of the flags word).
+ * cell_flush() adds it for PIPE_FLUSH_SWAPBUFFERS / PIPE_FLUSH_RENDER_CACHE
+ * requests; cell_flush_int() tests it and, when set, does not return until
+ * every SPU has acknowledged the flush.
+ */
+#define CELL_FLUSH_WAIT 0x80000000
+
+/**
+ * Flush entry point taking a pipe_context.  Flushes the draw module and
+ * then calls cell_flush_int().
+ * NOTE(review): how \p fence is filled in is not visible from this header.
+ */
+extern void
+cell_flush(struct pipe_context *pipe, unsigned flags,
+           struct pipe_fence_handle **fence);
+
+/**
+ * Internal flush: sends the current batch buffer to the SPUs.
+ * \param flags  CELL_FLUSH_WAIT, or zero
+ */
+extern void
+cell_flush_int(struct cell_context *cell, unsigned flags);
+
+/**
+ * Queue a CELL_CMD_FLUSH_BUFFER_RANGE command for the range described by
+ * \p ptr and \p size.
+ */
+extern void
+cell_flush_buffer_range(struct cell_context *cell, void *ptr,
+			unsigned size);
+
+#endif
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
new file mode 100644
index 0000000000..1d8a11a4ac
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -0,0 +1,2036 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU fragment program/shader code.
+ *
+ * Note that we generate SOA-style code here.  So each TGSI instruction
+ * operates on four pixels (and is translated into four SPU instructions,
+ * generally speaking).
+ *
+ * \author Brian Paul
+ */
+
+#include <math.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "util/u_memory.h"
+#include "cell_context.h"
+#include "cell_gen_fp.h"
+
+
+#define MAX_TEMPS 16
+#define MAX_IMMED  8
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+/**
+ * Context needed during code generation.
+ *
+ * Register fields hold SPE register indexes.  The lazily-allocated ones
+ * (one_reg, addr_reg, the mask registers) are treated as "not allocated
+ * yet" by their get_*() accessors while their value is <= 0.
+ */
+struct codegen
+{
+   struct cell_context *cell;
+   int inputs_reg;      /**< 1st function parameter */
+   int outputs_reg;     /**< 2nd function parameter */
+   int constants_reg;   /**< 3rd function parameter */
+   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */
+
+   int num_imm;  /**< number of immediates */
+
+   int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
+
+   int addr_reg;        /**< address register, integer values */
+
+   /**
+    * Per-instruction temps / intermediate temps.
+    * Filled by get_itemp() and emptied by free_itemps(); num_itemps is the
+    * number of valid entries in itemps[].
+    */
+   int num_itemps;
+   int itemps[12];
+
+   /** Current IF/ELSE/ENDIF nesting level */
+   int if_nesting;
+   /** Current BGNLOOP/ENDLOOP nesting level */
+   int loop_nesting;
+   /** Location of start of current loop */
+   int loop_start;
+
+   /** Index of if/conditional mask register */
+   int cond_mask_reg;
+   /** Index of loop mask register */
+   int loop_mask_reg;
+
+   /** Index of master execution mask register */
+   int exec_mask_reg;
+
+   /** KIL mask: indicates which fragments have been killed */
+   int kill_mask_reg;
+
+   /* NOTE(review): emit_prologue()/emit_epilogue() add this value directly
+    * to $sp, which suggests the unit is bytes rather than words -- confirm.
+    */
+   int frame_size;  /**< Stack frame size, in words */
+
+   struct spe_function *f;  /**< SPE function that instructions are emitted into */
+   boolean error;           /**< error flag for the code generation run */
+};
+
+
+/**
+ * Allocate an intermediate temporary register.
+ *
+ * The register is recorded in gen->itemps[] so that free_itemps() can
+ * release it once the current instruction has been emitted.
+ *
+ * If the itemps[] table is already full the register is still returned,
+ * but it cannot be tracked: gen->error is set and the table is left
+ * untouched rather than being written out of bounds.  (Previously only
+ * an assert() guarded this, so non-debug builds corrupted the fields
+ * that follow itemps[] in struct codegen.)
+ *
+ * \param gen  code generation context
+ * \return index of the allocated SPE register
+ */
+static int
+get_itemp(struct codegen *gen)
+{
+   int t = spe_allocate_available_register(gen->f);
+
+   assert(gen->num_itemps < (int) Elements(gen->itemps));
+
+   if (gen->num_itemps < (int) Elements(gen->itemps)) {
+      gen->itemps[gen->num_itemps++] = t;
+   }
+   else {
+      /* Table overflow: flag the failure instead of overrunning itemps[].
+       * The register stays allocated because free_itemps() won't see it.
+       */
+      gen->error = TRUE;
+   }
+
+   return t;
+}
+
+/**
+ * Release every intermediate temporary handed out by get_itemp() since
+ * the last call, and reset the table.  Meant to be called once each
+ * instruction has been emitted.
+ */
+static void
+free_itemps(struct codegen *gen)
+{
+   const int count = gen->num_itemps;
+   int slot = 0;
+
+   while (slot < count) {
+      spe_release_register(gen->f, gen->itemps[slot]);
+      slot++;
+   }
+
+   gen->num_itemps = 0;
+}
+
+
+/**
+ * Return the index of the SPE register holding {1.0, 1.0, 1.0, 1.0}.
+ * On the first call the register is allocated and the code that loads
+ * the constant is emitted; later calls just return the cached index.
+ */
+static int
+get_const_one_reg(struct codegen *gen)
+{
+   if (gen->one_reg > 0) {
+      /* already allocated and initialized */
+      return gen->one_reg;
+   }
+
+   gen->one_reg = spe_allocate_available_register(gen->f);
+
+   spe_indent(gen->f, 4);
+   spe_comment(gen->f, -4, "init constant reg = 1.0:");
+
+   /* one_reg = {1.0, 1.0, 1.0, 1.0} */
+   spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+   spe_indent(gen->f, -4);
+
+   return gen->one_reg;
+}
+
+
+/**
+ * Return the index of the address register, which is used for indirect
+ * register loads/stores.  The register is allocated and zeroed the first
+ * time this is called.
+ */
+static int
+get_address_reg(struct codegen *gen)
+{
+   if (gen->addr_reg > 0) {
+      /* already allocated and initialized */
+      return gen->addr_reg;
+   }
+
+   gen->addr_reg = spe_allocate_available_register(gen->f);
+
+   spe_indent(gen->f, 4);
+   spe_comment(gen->f, -4, "init address reg = 0:");
+
+   /* addr_reg = {0, 0, 0, 0} */
+   spe_zero(gen->f, gen->addr_reg);
+
+   spe_indent(gen->f, -4);
+
+   return gen->addr_reg;
+}
+
+
+/**
+ * Return the index of the master execution mask register.
+ * The register is allocated and initialized to ~0 upon the first call.
+ *
+ * The master execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+   if (gen->exec_mask_reg > 0) {
+      return gen->exec_mask_reg;
+   }
+
+   gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+   /* XXX this may not be needed */
+   spe_comment(gen->f, 0, "initialize master execution mask = ~0");
+   spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+   return gen->exec_mask_reg;
+}
+
+
+/**
+ * Return the index of the conditional (if/else) execution mask register,
+ * allocating it on first use.  No initialization code is emitted here.
+ */
+static int
+get_cond_mask_reg(struct codegen *gen)
+{
+   if (gen->cond_mask_reg > 0)
+      return gen->cond_mask_reg;
+
+   gen->cond_mask_reg = spe_allocate_available_register(gen->f);
+   return gen->cond_mask_reg;
+}
+
+
+/**
+ * Return the index of the loop execution mask register, allocating it
+ * on first use.  No initialization code is emitted here.
+ */
+static int
+get_loop_mask_reg(struct codegen *gen)
+{
+   if (gen->loop_mask_reg > 0)
+      return gen->loop_mask_reg;
+
+   gen->loop_mask_reg = spe_allocate_available_register(gen->f);
+   return gen->loop_mask_reg;
+}
+
+
+
+/**
+ * Test whether one channel of a source operand can be read straight
+ * from an SPE register: the swizzle must select X..W, there must be no
+ * sign modifier, and the register file must be TEMPORARY or IMMEDIATE.
+ */
+static boolean
+is_register_src(struct codegen *gen, int channel,
+                const struct tgsi_full_src_register *src)
+{
+   const int swz = tgsi_util_get_full_src_register_swizzle(src, channel);
+   const int sign_mode = tgsi_util_get_full_src_register_sign_mode(src, channel);
+
+   if (swz <= TGSI_SWIZZLE_W && sign_mode == TGSI_UTIL_SIGN_KEEP) {
+      switch (src->Register.File) {
+      case TGSI_FILE_TEMPORARY:
+      case TGSI_FILE_IMMEDIATE:
+         return TRUE;
+      default:
+         break;
+      }
+   }
+
+   return FALSE;
+}
+
+  
+/**
+ * Test whether a destination operand is written to memory, which is the
+ * case for the TGSI_FILE_OUTPUT register file only.
+ */
+static boolean
+is_memory_dst(struct codegen *gen, int channel,
+              const struct tgsi_full_dst_register *dst)
+{
+   return (dst->Register.File == TGSI_FILE_OUTPUT) ? TRUE : FALSE;
+}
+
+  
+/**
+ * Return the index of the SPE register holding one channel of a TGSI
+ * source operand, emitting whatever code is needed to get it there.
+ *
+ * TGSI_FILE_TEMPORARY and TGSI_FILE_IMMEDIATE operands already live in
+ * SPE registers, so the mapped register is returned directly.
+ * TGSI_FILE_INPUT and TGSI_FILE_CONSTANT operands are loaded from memory
+ * into a newly allocated intermediate temp.
+ *
+ * If the operand has a sign modifier (absolute value, negate or
+ * set-negative) the modified value is produced in an intermediate temp
+ * and that register is returned instead.
+ *
+ * \param gen      code generation context
+ * \param channel  channel (0..3) being computed; the operand's swizzle
+ *                 for this channel selects the component that is read
+ * \param src      the TGSI source operand
+ * \return SPE register index (-1 if the register file is not handled
+ *         and assertions are compiled out)
+ */
+static int
+get_src_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_src_register *src)
+{
+   int reg = -1;
+   int swizzle = tgsi_util_get_full_src_register_swizzle(src, channel);
+   boolean reg_is_itemp = FALSE;
+   uint sign_op;
+
+   assert(swizzle >= TGSI_SWIZZLE_X);
+   assert(swizzle <= TGSI_SWIZZLE_W);
+
+   {
+      int index = src->Register.Index;
+
+      assert(swizzle < 4);
+
+      if (src->Register.Indirect) {
+         /* XXX unfinished */
+      }
+
+      switch (src->Register.File) {
+      case TGSI_FILE_TEMPORARY:
+         assert(index >= 0 && index < MAX_TEMPS);
+         reg = gen->temp_regs[index][swizzle];
+         break;
+      case TGSI_FILE_INPUT:
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         assert(index >= 0 && index < MAX_IMMED);
+         reg = gen->imm_regs[index][swizzle];
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
+         }
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   /*
+    * Handle absolute value, negate or set-negative of src register.
+    */
+   sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+   if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+      /*
+       * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+       */
+      const int bit31mask_reg = get_itemp(gen);
+      int result_reg;
+
+      if (reg_is_itemp) {
+         /* re-use 'reg' for the result */
+         result_reg = reg;
+      }
+      else {
+         /* alloc a new reg for the result; never modify a TGSI temp or
+          * immediate register in place
+          */
+         result_reg = get_itemp(gen);
+      }
+
+      /* mask with bit 31 set, the rest cleared.
+       * Use an unsigned constant: (1 << 31) overflows a 32-bit int.
+       */
+      spe_load_uint(gen->f, bit31mask_reg, (1u << 31));
+
+      if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+         /* |x|: clear the sign bit */
+         spe_andc(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else if (sign_op == TGSI_UTIL_SIGN_SET) {
+         /* -|x|: force the sign bit on.  This must be an OR; AND-ing with
+          * the mask would discard the exponent and mantissa.
+          */
+         spe_or(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else {
+         /* -x: flip the sign bit */
+         assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+         spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+      }
+
+      reg = result_reg;
+   }
+
+   return reg;
+}
+
+
+/**
+ * Return the index of the SPE register an instruction should compute
+ * its result into for one channel of a TGSI destination.
+ *
+ * For TGSI_FILE_TEMPORARY outside of any IF or loop this is the SPE
+ * register mapped to that temporary.  For TGSI_FILE_TEMPORARY inside an
+ * IF or loop, and for TGSI_FILE_OUTPUT, an intermediate temp is
+ * allocated instead; store_dest_reg() moves the value to its final
+ * location.
+ */
+static int
+get_dst_reg(struct codegen *gen,
+            int channel,
+            const struct tgsi_full_dst_register *dest)
+{
+   if (dest->Register.File == TGSI_FILE_OUTPUT) {
+      return get_itemp(gen);
+   }
+
+   if (dest->Register.File == TGSI_FILE_TEMPORARY) {
+      const boolean nested = (gen->if_nesting > 0 || gen->loop_nesting > 0);
+      if (nested)
+         return get_itemp(gen);
+      return gen->temp_regs[dest->Register.Index][channel];
+   }
+
+   /* other register files are not handled */
+   assert(0);
+   return -1;
+}
+
+
+/**
+ * Emit the code that commits a computed value to one channel of a TGSI
+ * destination register.
+ *
+ * TGSI_FILE_TEMPORARY: outside of IF/loop nothing is emitted (the value
+ * is expected to be in the temporary's register already); inside, the
+ * value is merged into the temporary's register under the execution mask.
+ * TGSI_FILE_OUTPUT: the value is stored to the outputs area, merged with
+ * the value currently in memory when inside an IF/loop.
+ *
+ * \param value_reg  the SPE register containing the value to store.
+ *                   This would have been returned by get_dst_reg().
+ * \param channel    destination channel (0..3)
+ * \param dest       the TGSI destination register
+ */
+static void
+store_dest_reg(struct codegen *gen,
+               int value_reg, int channel,
+               const struct tgsi_full_dst_register *dest)
+{
+   const boolean masked = (gen->if_nesting > 0 || gen->loop_nesting > 0);
+
+   /*
+    * XXX need to implement dst reg clamping/saturation
+    */
+#if 0
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      break;
+   default:
+      assert( 0 );
+   }
+#endif
+
+   if (dest->Register.File == TGSI_FILE_TEMPORARY) {
+      if (masked) {
+         const int temp_reg = gen->temp_regs[dest->Register.Index][channel];
+         const int mask_reg = get_exec_mask_reg(gen);
+         /* temp[i] = mask[i] ? value[i] : temp[i] */
+         spe_selb(gen->f, temp_reg, temp_reg, value_reg, mask_reg);
+      }
+      /* else: not inside a condition or loop, nothing to emit */
+      return;
+   }
+
+   if (dest->Register.File == TGSI_FILE_OUTPUT) {
+      /* each channel of each output occupies one 16-byte quadword */
+      const int byte_offset = (dest->Register.Index * 4 + channel) * 16;
+
+      if (!masked) {
+         /* Store: memory[(outputs_reg) + offset] = value */
+         spe_stqd(gen->f, value_reg, gen->outputs_reg, byte_offset);
+         return;
+      }
+
+      {
+         const int mask_reg = get_exec_mask_reg(gen);
+         const int merged_reg = get_itemp(gen);
+         /* Load:  merged = memory[(outputs_reg) + offset] */
+         spe_lqd(gen->f, merged_reg, gen->outputs_reg, byte_offset);
+         /* merged[i] = mask[i] ? value[i] : merged[i] */
+         spe_selb(gen->f, merged_reg, merged_reg, value_reg, mask_reg);
+         /* Store: memory[(outputs_reg) + offset] = merged */
+         spe_stqd(gen->f, merged_reg, gen->outputs_reg, byte_offset);
+      }
+      return;
+   }
+
+   /* other register files are not handled */
+   assert(0);
+}
+
+
+
+/**
+ * Emit the function prologue: save the link register, save the caller's
+ * stack pointer in the new frame and move $sp down by the frame size.
+ * Also sets gen->frame_size.
+ */
+static void
+emit_prologue(struct codegen *gen)
+{
+   gen->frame_size = 1024; /* XXX temporary, should be dynamic */
+
+   spe_comment(gen->f, 0, "Function prologue:");
+
+   /* save $lr on stack     # stqd $lr,16($sp) */
+   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   if (gen->frame_size < 512) {
+      /* small frame: the size can be used as an immediate operand */
+
+      /* save stack pointer    # stqd $sp,-frameSize($sp) */
+      spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+      /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+   }
+   else {
+      /* offset is too large for the ai instruction: go through registers */
+      const int delta_reg = spe_allocate_available_register(gen->f);
+      const int old_sp_reg = spe_allocate_available_register(gen->f);
+
+      /* delta = -framesize */
+      spe_load_int(gen->f, delta_reg, -gen->frame_size);
+      /* old_sp = $sp */
+      spe_move(gen->f, old_sp_reg, SPE_REG_SP);
+      /* $sp = $sp + delta */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, delta_reg);
+      /* save the old $sp at the base of the new frame */
+      spe_stqd(gen->f, old_sp_reg, SPE_REG_SP, 0);
+
+      spe_release_register(gen->f, delta_reg);
+      spe_release_register(gen->f, old_sp_reg);
+   }
+}
+
+
+/**
+ * Emit the function epilogue: load the return value (the kill mask, or
+ * zero if no kill mask register was allocated) into register 3, pop
+ * the stack frame, reload the link register and branch to it.
+ */
+static void
+emit_epilogue(struct codegen *gen)
+{
+   const int result_reg = 3;
+
+   spe_comment(gen->f, 0, "Function epilogue:");
+
+   spe_comment(gen->f, 0, "return the killed mask");
+   if (gen->kill_mask_reg <= 0) {
+      /* no kill mask was ever allocated: return {0,0,0,0} */
+      spe_load_uint(gen->f, result_reg, 0);
+   }
+   else {
+      /* shader called KIL: return the accumulated mask */
+      spe_move(gen->f, result_reg, gen->kill_mask_reg);
+   }
+
+   spe_comment(gen->f, 0, "restore stack and return");
+   if (gen->frame_size < 512) {
+      /* restore stack pointer    # ai $sp,$sp,frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+   }
+   else {
+      /* offset is too large for the ai instruction: go through a register */
+      const int delta_reg = spe_allocate_available_register(gen->f);
+      /* delta = framesize */
+      spe_load_int(gen->f, delta_reg, gen->frame_size);
+      /* $sp = $sp + delta */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, delta_reg);
+      spe_release_register(gen->f, delta_reg);
+   }
+
+   /* restore $lr              # lqd $lr,16($sp) */
+   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
+/**
+ * Loop header: run the following statement once for every channel
+ * (0..3) whose bit is set in the write mask of the instruction's first
+ * destination register.
+ * \param inst  pointer to the tgsi_full_instruction
+ * \param ch    integer lvalue used as the loop counter
+ * The macro arguments are parenthesized so that non-trivial expressions
+ * can be passed safely.  Note that the expansion ends in an 'if', so an
+ * 'else' must not directly follow the controlled statement.
+ */
+#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \
+   for ((ch) = 0; (ch) < 4; (ch)++) \
+      if ((inst)->Dst[0].Register.WriteMask & (1 << (ch)))
+
+
+/**
+ * Emit ARL (address register load): convert the X channel of the source
+ * from float to signed integer and place it in the address register.
+ */
+static boolean
+emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int value_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+   const int address_reg = get_address_reg(gen);
+
+   /* address = (int) value */
+   spe_cflts(gen->f, address_reg, value_reg, 0);
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+/**
+ * Emit MOV: copy each enabled channel of the source to the destination.
+ * When the source is already in a register and the destination is in
+ * memory, the source register is stored directly without an extra move.
+ */
+static boolean
+emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, from_reg[4], to_reg[4];
+
+   /* fetch all operands first */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      from_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      to_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      int value_reg = from_reg[ch];
+
+      if (!(is_register_src(gen, ch, &inst->Src[0]) &&
+            is_memory_dst(gen, ch, &inst->Dst[0]))) {
+         /* general case: go through the destination register */
+         spe_move(gen->f, to_reg[ch], from_reg[ch]);
+         value_reg = to_reg[ch];
+      }
+      /* else: register to memory store, use the source register as is */
+
+      store_dest_reg(gen, value_reg, ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+/**
+ * Emit a per-channel binary arithmetic operation.
+ * Handles TGSI_OPCODE_ADD, TGSI_OPCODE_SUB and TGSI_OPCODE_MUL; any other
+ * opcode is a caller error and trips an assertion, consistent with
+ * emit_inequality().
+ *
+ * The work is done in three passes (fetch operands, compute, store) over
+ * the enabled channels.
+ */
+static boolean
+emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
+   /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+      d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+   }
+
+   /* Loop over Red/Green/Blue/Alpha channels, do the op */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      /* Emit actual SPE instruction: d = s1 <op> s2 */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         break;
+      case TGSI_OPCODE_SUB:
+         spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         break;
+      case TGSI_OPCODE_MUL:
+         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         break;
+      default:
+         /* Not a binop we know: nothing would be emitted and an unwritten
+          * d_reg would be stored below.
+          */
+         assert(0);
+      }
+   }
+
+   /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests outside of
+    * conditionals and loops)
+    */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   /* Free any intermediate temps we allocated */
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+/**
+ * Emit multiply-add: d = a * b + c for every enabled channel.
+ * Operands are fetched for all channels first, then all multiply-adds
+ * are emitted, then all results are stored.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, a_reg[4], b_reg[4], c_reg[4], out_reg[4];
+
+   /* fetch operands */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      a_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      b_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+      c_reg[ch] = get_src_reg(gen, ch, &inst->Src[2]);
+      out_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+   }
+
+   /* out = a * b + c */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_fma(gen->f, out_reg[ch], a_reg[ch], b_reg[ch], c_reg[ch]);
+   }
+
+   /* commit results */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, out_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit linear interpolate: d = c + a * (b - c) for every enabled
+ * channel, where a, b, c are the three source operands.
+ */
+static boolean
+emit_LRP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, a_reg[4], b_reg[4], c_reg[4], out_reg[4], diff_reg[4];
+
+   /* fetch operands and allocate one scratch register per channel */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      a_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      b_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+      c_reg[ch] = get_src_reg(gen, ch, &inst->Src[2]);
+      out_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+      diff_reg[ch] = get_itemp(gen);
+   }
+
+   /* Emit all subtracts, then all fmas, then all stores, so that the
+    * instructions pipeline better.
+    */
+
+   /* diff = b - c */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_fs(gen->f, diff_reg[ch], b_reg[ch], c_reg[ch]);
+   }
+
+   /* out = diff * a + c */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_fma(gen->f, out_reg[ch], diff_reg[ch], a_reg[ch], c_reg[ch]);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, out_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+
+/**
+ * Emit reciprocal (TGSI_OPCODE_RCP) or reciprocal square root (any
+ * other opcode routed here).  For each enabled channel an estimate
+ * instruction is emitted, followed by an interpolate instruction that
+ * combines the source with the estimate.
+ */
+static boolean
+emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const boolean is_rcp = (inst->Instruction.Opcode == TGSI_OPCODE_RCP);
+   int ch, in_reg[4], out_reg[4], est_reg[4];
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      in_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      out_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+      est_reg[ch] = get_itemp(gen);
+   }
+
+   /* first pass: estimates */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      if (is_rcp) {
+         /* est = 1/in */
+         spe_frest(gen->f, est_reg[ch], in_reg[ch]);
+      }
+      else {
+         /* est = 1/sqrt(in) */
+         spe_frsqest(gen->f, est_reg[ch], in_reg[ch]);
+      }
+   }
+
+   /* second pass: out = float_interp(in, est) */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_fi(gen->f, out_reg[ch], in_reg[ch], est_reg[ch]);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, out_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit absolute value: for every enabled channel, d = s1 with the IEEE
+ * sign bit (bit 31) cleared.
+ * Operands are fetched for all channels first, then the AND-complement
+ * instructions are emitted, then the results are stored.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], d_reg[4];
+   const int bit31mask_reg = get_itemp(gen);
+
+   /* mask with bit 31 set, the rest cleared.
+    * Use an unsigned constant: (1 << 31) overflows a 32-bit int.
+    */
+   spe_load_uint(gen->f, bit31mask_reg, (1u << 31));
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+   }
+
+   /* d = sign bit cleared in s1 */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+/**
+ * Emit 3-component dot product: the sum a.x*b.x + a.y*b.y + a.z*b.z is
+ * computed once and copied to every enabled destination channel.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, c;
+   int a_reg[3], b_reg[3];
+   const int sum_reg = get_itemp(gen);
+   const int prod_reg = get_itemp(gen);
+
+   /* fetch the X, Y, Z components of both operands */
+   for (c = CHAN_X; c <= CHAN_Z; c++) {
+      a_reg[c] = get_src_reg(gen, c, &inst->Src[0]);
+      b_reg[c] = get_src_reg(gen, c, &inst->Src[1]);
+   }
+
+   /* sum = ax * bx */
+   spe_fm(gen->f, sum_reg, a_reg[CHAN_X], b_reg[CHAN_X]);
+
+   /* prod = ay * by */
+   spe_fm(gen->f, prod_reg, a_reg[CHAN_Y], b_reg[CHAN_Y]);
+
+   /* sum = az * bz + sum */
+   spe_fma(gen->f, sum_reg, a_reg[CHAN_Z], b_reg[CHAN_Z], sum_reg);
+
+   /* sum = sum + prod */
+   spe_fa(gen->f, sum_reg, sum_reg, prod_reg);
+
+   /* replicate the scalar result */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      const int out_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+      spe_move(gen->f, out_reg, sum_reg);
+      store_dest_reg(gen, out_reg, ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+/**
+ * Emit 4-component dot product: the sum of the four per-component
+ * products is computed once and copied to every enabled destination
+ * channel.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, c;
+   int a_reg[4], b_reg[4];
+   const int even_reg = get_itemp(gen);   /* accumulates the x and z terms */
+   const int odd_reg = get_itemp(gen);    /* accumulates the y and w terms */
+
+   /* fetch all four components of both operands */
+   for (c = CHAN_X; c <= CHAN_W; c++) {
+      a_reg[c] = get_src_reg(gen, c, &inst->Src[0]);
+      b_reg[c] = get_src_reg(gen, c, &inst->Src[1]);
+   }
+
+   /* even = ax * bx */
+   spe_fm(gen->f, even_reg, a_reg[CHAN_X], b_reg[CHAN_X]);
+
+   /* odd = ay * by */
+   spe_fm(gen->f, odd_reg, a_reg[CHAN_Y], b_reg[CHAN_Y]);
+
+   /* even = az * bz + even */
+   spe_fma(gen->f, even_reg, a_reg[CHAN_Z], b_reg[CHAN_Z], even_reg);
+
+   /* odd = aw * bw + odd */
+   spe_fma(gen->f, odd_reg, a_reg[CHAN_W], b_reg[CHAN_W], odd_reg);
+
+   /* even = even + odd */
+   spe_fa(gen->f, even_reg, even_reg, odd_reg);
+
+   /* replicate the scalar result */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      const int out_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+      spe_move(gen->f, out_reg, even_reg);
+      store_dest_reg(gen, out_reg, ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+/**
+ * Emit homogeneous dot product:
+ * a.x*b.x + a.y*b.y + a.z*b.z + b.w, computed once and copied to every
+ * enabled destination channel.
+ */
+static boolean
+emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   /* XXX rewrite this function to look more like DP3/DP4 */
+   int ch, c;
+   int a_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+   int b_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
+   const int acc_reg = get_itemp(gen);
+
+   /* acc = ax * bx */
+   spe_fm(gen->f, acc_reg, a_reg, b_reg);
+
+   /* acc = ay * by + acc, then acc = az * bz + acc */
+   for (c = CHAN_Y; c <= CHAN_Z; c++) {
+      a_reg = get_src_reg(gen, c, &inst->Src[0]);
+      b_reg = get_src_reg(gen, c, &inst->Src[1]);
+      spe_fma(gen->f, acc_reg, a_reg, b_reg, acc_reg);
+   }
+
+   /* acc = bw + acc */
+   b_reg = get_src_reg(gen, CHAN_W, &inst->Src[1]);
+   spe_fa(gen->f, acc_reg, b_reg, acc_reg);
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      const int out_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+      spe_move(gen->f, out_reg, acc_reg);
+      /* acc_reg (not out_reg) is passed on; both hold the same value */
+      store_dest_reg(gen, acc_reg, ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+/**
+ * Emit 3-component vector normalize.
+ *
+ * The X, Y and Z destination channels receive the corresponding source
+ * component multiplied by 1/sqrt(x*x + y*y + z*z).  If the W channel is
+ * enabled in the write mask it is set to 1.0; only three source
+ * components are fetched, so W must never index src_reg[].
+ * NOTE(review): writing 1.0 to W follows the TGSI NRM definition --
+ * confirm against the TGSI documentation in use.
+ */
+static boolean
+emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int src_reg[3];
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
+   src_reg[0] = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+   src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+   src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
+
+   /* t0 = x * x */
+   spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]);
+
+   /* t1 = y * y */
+   spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]);
+
+   /* t0 = z * z + t0 */
+   spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
+
+   /* t1 = 1.0 / sqrt(t0) */
+   spe_frsqest(gen->f, t1_reg, t0_reg);
+   spe_fi(gen->f, t1_reg, t0_reg, t1_reg);
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+      if (ch == CHAN_W) {
+         /* dst.w = 1.0 (there is no src_reg[3]) */
+         spe_move(gen->f, d_reg, get_const_one_reg(gen));
+      }
+      else {
+         /* dst = src[ch] * t1 */
+         spe_fm(gen->f, d_reg, src_reg[ch], t1_reg);
+      }
+      store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit cross product of the first two source operands:
+ *    dst.x = y0 * z1 - z0 * y1
+ *    dst.y = z0 * x1 - x0 * z1
+ *    dst.z = x0 * y1 - y0 * x1
+ * Only the X, Y and Z channels enabled in the write mask are written;
+ * the W channel is left untouched.
+ *
+ * All three results are computed into intermediate temps before any
+ * destination channel is written, so the destination may alias either
+ * source.  Each result is then moved into the register returned by
+ * get_dst_reg() before store_dest_reg() is called; this matters because
+ * store_dest_reg() emits nothing for a TGSI temporary outside of
+ * conditionals and loops, and the value would otherwise be lost.
+ */
+static boolean
+emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s0_reg[3], s1_reg[3], t_reg[3];
+
+   /* fetch x, y, z of both operands and allocate one result temp each */
+   for (ch = CHAN_X; ch <= CHAN_Z; ch++) {
+      s0_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+      t_reg[ch] = get_itemp(gen);
+   }
+
+   /* tx = z0 * y1;  tx = y0 * z1 - tx */
+   spe_fm(gen->f, t_reg[CHAN_X], s0_reg[CHAN_Z], s1_reg[CHAN_Y]);
+   spe_fms(gen->f, t_reg[CHAN_X], s0_reg[CHAN_Y], s1_reg[CHAN_Z],
+           t_reg[CHAN_X]);
+
+   /* ty = x0 * z1;  ty = z0 * x1 - ty */
+   spe_fm(gen->f, t_reg[CHAN_Y], s0_reg[CHAN_X], s1_reg[CHAN_Z]);
+   spe_fms(gen->f, t_reg[CHAN_Y], s0_reg[CHAN_Z], s1_reg[CHAN_X],
+           t_reg[CHAN_Y]);
+
+   /* tz = y0 * x1;  tz = x0 * y1 - tz */
+   spe_fm(gen->f, t_reg[CHAN_Z], s0_reg[CHAN_Y], s1_reg[CHAN_X]);
+   spe_fms(gen->f, t_reg[CHAN_Z], s0_reg[CHAN_X], s1_reg[CHAN_Y],
+           t_reg[CHAN_Z]);
+
+   /* now that every source has been read, write the destination */
+   for (ch = CHAN_X; ch <= CHAN_Z; ch++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+         spe_move(gen->f, d_reg, t_reg[ch]);
+         store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit a set-on-comparison instruction (SGT, SLT, SGE, SLE, SEQ, SNE).
+ *
+ * The SPE compare instructions produce 0x0 / 0xffffffff per element,
+ * whereas OpenGL/TGSI wants 0.0 / 1.0.  The conversion is a bitwise AND
+ * of the compare result with the constant 1.0.  SGE, SLE and SNE are
+ * implemented by emitting the opposite test (less-than, greater-than,
+ * equal) and AND-ing 1.0 with the complement of its result instead.
+ */
+static boolean
+emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, a_reg[4], b_reg[4], out_reg[4];
+   boolean invert = FALSE;
+   const int unit_reg = get_const_one_reg(gen);
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      a_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      b_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+      out_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+   }
+
+   /* emit the raw comparisons (all-zeros / all-ones results) */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SLE:
+         invert = TRUE;
+         /* fall through: a <= b is !(a > b) */
+      case TGSI_OPCODE_SGT:
+         spe_fcgt(gen->f, out_reg[ch], a_reg[ch], b_reg[ch]);
+         break;
+      case TGSI_OPCODE_SGE:
+         invert = TRUE;
+         /* fall through: a >= b is !(b > a) */
+      case TGSI_OPCODE_SLT:
+         spe_fcgt(gen->f, out_reg[ch], b_reg[ch], a_reg[ch]);
+         break;
+      case TGSI_OPCODE_SNE:
+         invert = TRUE;
+         /* fall through: a != b is !(a == b) */
+      case TGSI_OPCODE_SEQ:
+         spe_fceq(gen->f, out_reg[ch], a_reg[ch], b_reg[ch]);
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   /* convert out from 0x0/0xffffffff to 0.0/1.0 */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      if (!invert) {
+         /* out = 1.0 & out */
+         spe_and(gen->f, out_reg[ch], unit_reg, out_reg[ch]);
+      }
+      else {
+         /* out = 1.0 & ~out */
+         spe_andc(gen->f, out_reg[ch], unit_reg, out_reg[ch]);
+      }
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, out_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit CMP: for each enabled channel, dst = (src0 < 0) ? src1 : src2.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int chan;
+
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      const int test_reg = get_src_reg(gen, chan, &inst->Src[0]);
+      const int if_neg_reg = get_src_reg(gen, chan, &inst->Src[1]);
+      const int otherwise_reg = get_src_reg(gen, chan, &inst->Src[2]);
+      const int dst_reg = get_dst_reg(gen, chan, &inst->Dst[0]);
+      const int zeros_reg = get_itemp(gen);
+
+      spe_zero(gen->f, zeros_reg);
+
+      /* dst = selection mask: all ones where 0 > src0 */
+      spe_fcgt(gen->f, dst_reg, zeros_reg, test_reg);
+      /* dst = mask ? src1 : src2 */
+      spe_selb(gen->f, dst_reg, otherwise_reg, if_neg_reg, dst_reg);
+
+      store_dest_reg(gen, dst_reg, chan, &inst->Dst[0]);
+
+      /* temporaries are released per channel */
+      free_itemps(gen);
+   }
+
+   return TRUE;
+}
+
+/**
+ * Emit trunc.
+ * Each enabled channel is converted from float to signed int and
+ * then from signed int back to float.
+ */
+static boolean
+emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int chan, src[4], dst[4];
+
+   /* look up the registers of every enabled channel first */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      src[chan] = get_src_reg(gen, chan, &inst->Src[0]);
+      dst[chan] = get_dst_reg(gen, chan, &inst->Dst[0]);
+   }
+
+   /* pass 1: dst = (int) src */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      spe_cflts(gen->f, dst[chan], src[chan], 0);
+   }
+
+   /* pass 2: dst = (float) dst */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      spe_csflt(gen->f, dst[chan], dst[chan], 0);
+   }
+
+   /* pass 3: write back */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      store_dest_reg(gen, dst[chan], chan, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit floor.
+ * For each enabled channel:
+ *   t = (float) (int) src       truncate (float -> signed int -> float)
+ *   if (t > src) t = t - 1.0    only true for negative non-integers
+ *   dst = t
+ *
+ * Truncating first and correcting afterwards gives the right answer for
+ * negative integral inputs.  Subtracting 1.0 from every negative input
+ * before truncating (the previous approach) produced FLR(-2.0) = -3.0.
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], d_reg[4], tmp_reg[4], adj_reg, one_reg;
+
+   /* scratch register holding the per-channel correction (0.0 or 1.0) */
+   adj_reg = get_itemp(gen);
+   one_reg = get_const_one_reg(gen);
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+      tmp_reg[ch] = get_itemp(gen);
+   }
+
+   /* Convert float to int */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_cflts(gen->f, tmp_reg[ch], s1_reg[ch], 0);
+   }
+
+   /* Convert int to float: tmp = trunc(s1) */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
+   }
+
+   /* Where trunc(s1) > s1, subtract 1.0 */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      /* adj = (tmp > s1) ? ~0 : 0 */
+      spe_fcgt(gen->f, adj_reg, tmp_reg[ch], s1_reg[ch]);
+      /* adj = adj & 1.0, i.e. 1.0 or 0.0 */
+      spe_and(gen->f, adj_reg, one_reg, adj_reg);
+      /* tmp = tmp - adj */
+      spe_fs(gen->f, tmp_reg[ch], tmp_reg[ch], adj_reg);
+   }
+
+   /* As before, the destination registers are only written once no more
+    * source registers will be read.
+    */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_move(gen->f, d_reg[ch], tmp_reg[ch]);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Compute frac = Input - FLR(Input)
+ *
+ * FLR(Input) is computed as trunc(Input), minus 1.0 where
+ * trunc(Input) > Input.  That keeps negative integral inputs exact:
+ * FRC(-2.0) is 0.0.  Subtracting 1.0 from every negative input before
+ * truncating (the previous approach) produced FRC(-2.0) = 1.0.
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], d_reg[4], tmp_reg[4], adj_reg, one_reg;
+
+   /* scratch register holding the per-channel correction (0.0 or 1.0) */
+   adj_reg = get_itemp(gen);
+   one_reg = get_const_one_reg(gen);
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+      tmp_reg[ch] = get_itemp(gen);
+   }
+
+   /* Convert float to int */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_cflts(gen->f, tmp_reg[ch], s1_reg[ch], 0);
+   }
+
+   /* Convert int to float: tmp = trunc(s1) */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
+   }
+
+   /* Where trunc(s1) > s1, subtract 1.0 so that tmp = FLR(s1) */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      /* adj = (tmp > s1) ? ~0 : 0 */
+      spe_fcgt(gen->f, adj_reg, tmp_reg[ch], s1_reg[ch]);
+      /* adj = adj & 1.0, i.e. 1.0 or 0.0 */
+      spe_and(gen->f, adj_reg, one_reg, adj_reg);
+      /* tmp = tmp - adj */
+      spe_fs(gen->f, tmp_reg[ch], tmp_reg[ch], adj_reg);
+   }
+
+   /* d = s1 - FLR(s1) */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]);
+   }
+
+   /* store result */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+#if 0
+/**
+ * Debug helper (currently compiled out): print the index, name and
+ * address of every entry in the context's SPU function table.
+ */
+static void
+print_functions(struct cell_context *cell)
+{
+   struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i;
+   for (i = 0; i < funcs->num; i++) {
+      printf("SPU func %u: %s at %u\n",
+             i, funcs->names[i], funcs->addrs[i]);
+   }
+}
+#endif
+
+
+/**
+ * Search the context's SPU function table for the named function.
+ * The whole table is scanned; if a name occurs more than once the last
+ * entry wins.  A missing function trips an assertion.
+ * \return the function's address divided by four
+ */
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *table = &cell->spu_functions;
+   uint byte_addr = 0;
+   uint idx;
+
+   for (idx = 0; idx < table->num; idx++) {
+      if (strcmp(table->names[idx], funcname) != 0)
+         continue;
+      byte_addr = table->addrs[idx];
+   }
+
+   assert(byte_addr && "spu function not found");
+
+   /* drop the two least significant bits */
+   return byte_addr >> 2;
+}
+
+
+/**
+ * Emit code to call a SPU function.
+ * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
+ *
+ * Calling sequence emitted here: all registers currently in use are
+ * saved to the stack frame (starting at slot 2, 16 bytes per slot), the
+ * arguments are copied into $3, $4, ..., the function is called with
+ * brasl, the result is taken from $3 and the saved registers are
+ * reloaded.
+ *
+ * \param gen       code generator state
+ * \param inst      the TGSI instruction being translated
+ * \param funcname  name of the SPU function, resolved by lookup_function()
+ * \param num_args  number of source operands to pass (at most 3)
+ * \param scalar    if TRUE, call once with the X channels and replicate
+ * \return TRUE always
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+                   const struct tgsi_full_instruction *inst,
+                   char *funcname, uint num_args, boolean scalar)
+{
+   const uint addr = lookup_function(gen->cell, funcname);
+   char comment[100];
+   int s_regs[3];
+   int func_called = FALSE;
+   uint a, ch;
+   int retval_reg = -1;   /* only allocated in the scalar case */
+
+   assert(num_args <= 3);
+
+   snprintf(comment, sizeof(comment), "CALL %s:", funcname);
+   spe_comment(gen->f, -4, comment);
+
+   if (scalar) {
+      /* scalar functions only ever read the X channel of each source */
+      for (a = 0; a < num_args; a++) {
+         s_regs[a] = get_src_reg(gen, CHAN_X, &inst->Src[a]);
+      }
+      /* we'll call the function, put the return value in this register,
+       * then replicate it across all write-enabled components in d_reg.
+       */
+      retval_reg = spe_allocate_available_register(gen->f);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      int d_reg;
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      if (!scalar) {
+         /* per-channel call: fetch this channel's source operands */
+         for (a = 0; a < num_args; a++) {
+            s_regs[a] = get_src_reg(gen, ch, &inst->Src[a]);
+         }
+      }
+
+      d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+
+      if (!scalar || !func_called) {
+         /* for a scalar function, we'll really only call the function once */
+
+         numUsed = spe_get_registers_used(gen->f, usedRegs);
+         /* the save area starts at slot 2 of the frame, so everything
+          * must fit in the remaining slots
+          */
+         assert(numUsed < gen->frame_size / 16 - 2);
+
+         /* save registers to stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            int offset = 2 + i;
+            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+
+         /* setup function arguments */
+         for (a = 0; a < num_args; a++) {
+            spe_move(gen->f, 3 + a, s_regs[a]);
+         }
+
+         /* branch to function, save return addr */
+         spe_brasl(gen->f, SPE_REG_RA, addr);
+
+         /* save function's return value */
+         if (scalar)
+            spe_move(gen->f, retval_reg, 3);
+         else
+            spe_move(gen->f, d_reg, 3);
+
+         /* restore registers from stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            /* Skip the registers that are about to receive (or already
+             * hold) the result so it is not overwritten by the stale saved
+             * value.  In the non-scalar case retval_reg is -1, which never
+             * equals an unsigned register number.
+             */
+            if (reg != d_reg && reg != retval_reg) {
+               int offset = 2 + i;
+               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+         }
+
+         func_called = TRUE;
+      }
+
+      if (scalar) {
+         /* replicate the single result into this channel */
+         spe_move(gen->f, d_reg, retval_reg);
+      }
+
+      store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
+      free_itemps(gen);
+   }
+
+   if (scalar) {
+      spe_release_register(gen->f, retval_reg);
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Emit code for a texture sampling instruction by calling an SPU
+ * texture function.
+ * The function is chosen from the instruction's texture target: 1D and
+ * 2D both use "spu_tex_2d", 3D uses "spu_tex_3d" and CUBE uses
+ * "spu_tex_cube".  The four coordinate channels are passed in $3..$6 and
+ * the sampler unit number in $7; the four result channels are read back
+ * from $3..$6.
+ * \return FALSE for an unsupported texture target, TRUE otherwise
+ */
+static boolean
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint target = inst->Texture.Texture;
+   const uint unit = inst->Src[1].Register.Index;
+   uint addr;
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   /* pick the SPU function that handles this texture target */
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_2D:
+      addr = lookup_function(gen->cell, "spu_tex_2d");
+      break;
+   case TGSI_TEXTURE_3D:
+      addr = lookup_function(gen->cell, "spu_tex_3d");
+      break;
+   case TGSI_TEXTURE_CUBE:
+      addr = lookup_function(gen->cell, "spu_tex_cube");
+      break;
+   default:
+      ASSERT(0 && "unsupported texture target");
+      return FALSE;
+   }
+
+   assert(inst->Src[1].Register.File == TGSI_FILE_SAMPLER);
+
+   spe_comment(gen->f, -4, "CALL tex:");
+
+   /* get src/dst reg info (all four channels, regardless of writemask) */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      /* the save area starts at slot 2 of the frame, so everything
+       * must fit in the remaining slots
+       */
+      assert(numUsed < gen->frame_size / 16 - 2);
+
+      /* save registers to stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments (XXX depends on target) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+      spe_load_uint(gen->f, 7, unit); /* sampler unit */
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixel's colors) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);
+      }
+
+      /* restore registers from stack, except the destination registers
+       * which now hold the texture result
+       */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   /* only the write-enabled channels are stored to the destination */
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      store_dest_reg(gen, d_regs[ch], ch, &inst->Dst[0]);
+      free_itemps(gen);
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Emit KIL: flag a pixel as killed if any write-enabled channel of the
+ * source register is less than zero.  The per-instruction result is
+ * merged into the shader-wide kill mask register, which is allocated on
+ * first use.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int chan, src[4];
+   int zeros_reg, scratch_reg;
+   int killed_reg = -1;   /* allocated when the first enabled channel is seen */
+
+   spe_comment(gen->f, -4, "CALL kil:");
+
+   /* zeros = {0,0,0,0} */
+   zeros_reg = get_itemp(gen);
+   spe_zero(gen->f, zeros_reg);
+
+   scratch_reg = get_itemp(gen);
+
+   /* fetch the source registers */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      src[chan] = get_src_reg(gen, chan, &inst->Src[0]);
+   }
+
+   /* accumulate "src < 0" over all enabled channels */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      if (killed_reg < 0) {
+         /* first channel: killed = (0 > src) ? ~0 : 0 */
+         killed_reg = get_itemp(gen);
+         spe_fcgt(gen->f, killed_reg, zeros_reg, src[chan]);
+      }
+      else {
+         /* scratch = (0 > src) ? ~0 : 0 */
+         spe_fcgt(gen->f, scratch_reg, zeros_reg, src[chan]);
+         /* killed = killed | scratch */
+         spe_or(gen->f, killed_reg, killed_reg, scratch_reg);
+      }
+   }
+
+   /* inside a conditional or loop only the active pixels may be killed */
+   if (gen->if_nesting || gen->loop_nesting) {
+      spe_and(gen->f, killed_reg, killed_reg, gen->exec_mask_reg);
+   }
+
+   if (gen->kill_mask_reg > 0) {
+      /* merge into the existing kill mask */
+      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, killed_reg);
+   }
+   else {
+      /* first KIL in this shader: allocate the kill mask register */
+      gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+      spe_move(gen->f, gen->kill_mask_reg, killed_reg);
+   }
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+
+/**
+ * Emit MIN or MAX.
+ * A greater-than compare builds a per-channel selection mask which then
+ * picks src0 or src1.  For MAX the mask is (src0 > src1); for any other
+ * opcode (i.e. MIN) it is (src1 > src0).
+ */
+static boolean
+emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const boolean is_max = (inst->Instruction.Opcode == TGSI_OPCODE_MAX);
+   int chan, a_reg[4], b_reg[4], dst[4], pick[4];
+
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      a_reg[chan] = get_src_reg(gen, chan, &inst->Src[0]);
+      b_reg[chan] = get_src_reg(gen, chan, &inst->Src[1]);
+      dst[chan] = get_dst_reg(gen, chan, &inst->Dst[0]);
+      pick[chan] = get_itemp(gen);
+   }
+
+   /* pick = all ones where src0 should be chosen */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      if (is_max) {
+         spe_fcgt(gen->f, pick[chan], a_reg[chan], b_reg[chan]);
+      }
+      else {
+         spe_fcgt(gen->f, pick[chan], b_reg[chan], a_reg[chan]);
+      }
+   }
+
+   /* dst = pick ? src0 : src1 */
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      spe_selb(gen->f, dst[chan], b_reg[chan], a_reg[chan], pick[chan]);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL(inst, chan) {
+      store_dest_reg(gen, dst[chan], chan, &inst->Dst[0]);
+   }
+
+   free_itemps(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit code to recompute the master execution mask from the current
+ * conditional and loop masks.  Must be emitted whenever the execution
+ * status of a conditional or a loop changes.
+ *   inside IF and loop:  exec = cond & loop
+ *   inside IF only:      exec = cond
+ *   inside loop only:    exec = loop
+ *   otherwise:           exec = ~0
+ */
+static void
+emit_update_exec_mask(struct codegen *gen)
+{
+   const int master = get_exec_mask_reg(gen);
+   const int cond = gen->cond_mask_reg;
+   const int loop = gen->loop_mask_reg;
+   const boolean in_cond = gen->if_nesting > 0;
+   const boolean in_loop = gen->loop_nesting > 0;
+
+   spe_comment(gen->f, 0, "Update master execution mask");
+
+   if (in_cond) {
+      assert(cond > 0);
+      if (in_loop) {
+         /* exec_mask = cond_mask & loop_mask */
+         assert(loop > 0);
+         spe_and(gen->f, master, cond, loop);
+      }
+      else {
+         spe_move(gen->f, master, cond);
+      }
+   }
+   else if (in_loop) {
+      assert(loop > 0);
+      spe_move(gen->f, master, loop);
+   }
+   else {
+      /* unconditional execution */
+      spe_load_int(gen->f, master, ~0x0);
+   }
+}
+
+
+/**
+ * Emit code for IF.
+ * The conditional execution mask is set to all ones and then ANDed with
+ * "src.x != 0", after which the master execution mask is recomputed.
+ */
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int mask_reg, test_reg, pred_reg;
+
+   mask_reg = get_cond_mask_reg(gen);
+
+   /* XXX push cond exec mask */
+
+   spe_comment(gen->f,  0, "init conditional exec mask = ~0:");
+   spe_load_int(gen->f, mask_reg, ~0);
+
+   /* fold the predicate (channel 0 of the source) into the mask */
+   test_reg = get_itemp(gen);
+   pred_reg = get_src_reg(gen, 0, &inst->Src[0]);
+
+   /* test = (pred == 0) */
+   spe_ceqi(gen->f, test_reg, pred_reg, 0);
+   /* test = ~test, i.e. (pred != 0) */
+   spe_complement(gen->f, test_reg, test_reg);
+   /* cond_mask &= test */
+   spe_and(gen->f, mask_reg, mask_reg, test_reg);
+
+   gen->if_nesting += 1;
+
+   /* the master execution mask depends on the conditional mask */
+   emit_update_exec_mask(gen);
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+/**
+ * Emit code for ELSE: complement the conditional execution mask and
+ * recompute the master execution mask.
+ */
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int mask_reg = get_cond_mask_reg(gen);
+
+   /* cond_mask = ~cond_mask */
+   spe_comment(gen->f, 0, "cond exec mask = !cond exec mask");
+   spe_complement(gen->f, mask_reg, mask_reg);
+
+   emit_update_exec_mask(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit code for ENDIF: leave one level of conditional nesting and
+ * recompute the master execution mask.
+ */
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   /* XXX todo: pop cond exec mask */
+
+   gen->if_nesting -= 1;
+   emit_update_exec_mask(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit code for BGNLOOP: set the loop execution mask to all ones and
+ * remember the current code position as the loop's branch target.
+ */
+static boolean
+emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int loop_mask;
+
+   /* The result is not needed here, but the call is kept.
+    * NOTE(review): presumably it makes sure the exec mask register
+    * exists before the loop body is generated -- confirm.
+    */
+   (void) get_exec_mask_reg(gen);
+   loop_mask = get_loop_mask_reg(gen);
+
+   /* XXX push loop_exec mask */
+
+   spe_comment(gen->f, 0, "initialize loop exec mask = ~0");
+   spe_load_int(gen->f, loop_mask, ~0x0);
+
+   gen->loop_nesting += 1;
+
+   /* byte offset of the first instruction of the loop body */
+   gen->loop_start = spe_code_size(gen->f);
+
+   return TRUE;
+}
+
+
+/**
+ * Emit code for ENDLOOP.
+ * The four words of the loop execution mask are ORed together; if the
+ * result is non-zero a backward branch to the position recorded by
+ * BGNLOOP is taken.  Afterwards the loop nesting level is decremented
+ * and the master execution mask is recomputed.
+ */
+static boolean
+emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int loop_reg = get_loop_mask_reg(gen);
+   const int tmp_reg = get_itemp(gen);
+   int offset;
+
+   /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */
+   spe_orx(gen->f, tmp_reg, loop_reg);
+
+   /* negative distance from the branch back to the loop start */
+   offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */
+
+   /* branch back to top of loop if tmp_reg != 0 */
+   spe_brnz(gen->f, tmp_reg, offset / 4);
+
+   /* XXX pop loop_exec mask */
+
+   gen->loop_nesting--;
+
+   emit_update_exec_mask(gen);
+
+   /* release tmp_reg, as emit_IF and the other emitters do for their
+    * temporaries
+    */
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+/**
+ * Emit code for BRK: clear from the loop execution mask every bit that
+ * is set in the master execution mask, then recompute the master mask.
+ * Only valid inside a loop.
+ */
+static boolean
+emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int master = get_exec_mask_reg(gen);
+   const int loop_mask = get_loop_mask_reg(gen);
+
+   assert(gen->loop_nesting > 0);
+
+   /* loop_mask = loop_mask & ~master */
+   spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask");
+   spe_andc(gen->f, loop_mask, loop_mask, master);
+
+   emit_update_exec_mask(gen);
+   return TRUE;
+}
+
+
+/**
+ * Handle CONT.
+ * No SPE code is emitted; the function only asserts that a loop is open.
+ * NOTE(review): since no execution mask is changed here, the rest of the
+ * loop body presumably still runs for the affected pixels -- confirm
+ * whether that is intended.
+ */
+static boolean
+emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   assert(gen->loop_nesting > 0);
+
+   return TRUE;
+}
+
+
+/**
+ * Emit DDX or DDY.
+ * For each enabled channel the value of the upper-left pixel (word 0) is
+ * subtracted from the value of the upper-right pixel (word 1, for DDX)
+ * or the lower-left pixel (word 2, for DDY).  Both operands are splatted
+ * across the register first, so all four words receive the same
+ * difference.
+ *
+ * \param ddx  TRUE to emit DDX, FALSE to emit DDY
+ */
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+             boolean ddx)
+{
+   int ch;
+
+   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
+      int s_reg = get_src_reg(gen, ch, &inst->Src[0]);
+      int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
+
+      int t1_reg = get_itemp(gen);
+      int t2_reg = get_itemp(gen);
+
+      spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+      if (ddx) {
+         spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+      }
+      else {
+         spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+      }
+      /* d = neighbor - upper-left */
+      spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+      /* Commit the result to the destination, as every other emitter in
+       * this file does after get_dst_reg().  This call was missing.
+       */
+      store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
+
+      free_itemps(gen);
+   }
+
+   return TRUE;
+}
+
+
+
+
+/**
+ * Emit END instruction.
+ * We just return from the shader function at this point, by emitting
+ * the function epilogue.
+ *
+ * Note that there may be more code after this that would be
+ * called by TGSI_OPCODE_CALL.
+ *
+ * \return TRUE always
+ */
+static boolean
+emit_END(struct codegen *gen)
+{
+   emit_epilogue(gen);
+   return TRUE;
+}
+
+
+/**
+ * Emit code for the given instruction.  Just a big switch stmt that
+ * dispatches on the TGSI opcode to the matching emit_*() function.
+ *
+ * \return the emitter's result, or FALSE (after printing a message to
+ *         stderr) if the opcode is not implemented
+ */
+static boolean
+emit_instruction(struct codegen *gen,
+                 const struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   /* moves and arithmetic */
+   case TGSI_OPCODE_ARL:
+      return emit_ARL(gen, inst);
+   case TGSI_OPCODE_MOV:
+      return emit_MOV(gen, inst);
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+      return emit_binop(gen, inst);
+   case TGSI_OPCODE_MAD:
+      return emit_MAD(gen, inst);
+   case TGSI_OPCODE_LRP:
+      return emit_LRP(gen, inst);
+   case TGSI_OPCODE_DP3:
+      return emit_DP3(gen, inst);
+   case TGSI_OPCODE_DP4:
+      return emit_DP4(gen, inst);
+   case TGSI_OPCODE_DPH:
+      return emit_DPH(gen, inst);
+   case TGSI_OPCODE_NRM:
+      return emit_NRM3(gen, inst);
+   case TGSI_OPCODE_XPD:
+      return emit_XPD(gen, inst);
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+      return emit_RCP_RSQ(gen, inst);
+   case TGSI_OPCODE_ABS:
+      return emit_ABS(gen, inst);
+   /* comparisons producing 0.0/1.0 */
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+      return emit_inequality(gen, inst);
+   case TGSI_OPCODE_CMP:
+      return emit_CMP(gen, inst);
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      return emit_MIN_MAX(gen, inst);
+   /* rounding */
+   case TGSI_OPCODE_TRUNC:
+      return emit_TRUNC(gen, inst);
+   case TGSI_OPCODE_FLR:
+      return emit_FLR(gen, inst);
+   case TGSI_OPCODE_FRC:
+      return emit_FRC(gen, inst);
+   case TGSI_OPCODE_END:
+      return emit_END(gen);
+
+   /* opcodes implemented by calling a scalar SPU helper function */
+   case TGSI_OPCODE_COS:
+      return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
+   case TGSI_OPCODE_SIN:
+      return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
+   case TGSI_OPCODE_POW:
+      return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
+   case TGSI_OPCODE_EX2:
+      return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
+   case TGSI_OPCODE_LG2:
+      return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
+   /* all texture opcodes currently share the plain TEX code path */
+   case TGSI_OPCODE_TEX:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXD:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXB:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXL:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXP:
+      return emit_TEX(gen, inst);
+   case TGSI_OPCODE_KIL:
+      return emit_KIL(gen, inst);
+
+   /* conditionals */
+   case TGSI_OPCODE_IF:
+      return emit_IF(gen, inst);
+   case TGSI_OPCODE_ELSE:
+      return emit_ELSE(gen, inst);
+   case TGSI_OPCODE_ENDIF:
+      return emit_ENDIF(gen, inst);
+
+   /* loops */
+   case TGSI_OPCODE_BGNLOOP:
+      return emit_BGNLOOP(gen, inst);
+   case TGSI_OPCODE_ENDLOOP:
+      return emit_ENDLOOP(gen, inst);
+   case TGSI_OPCODE_BRK:
+      return emit_BRK(gen, inst);
+   case TGSI_OPCODE_CONT:
+      return emit_CONT(gen, inst);
+
+   /* screen-space derivatives */
+   case TGSI_OPCODE_DDX:
+      return emit_DDX_DDY(gen, inst, TRUE);
+   case TGSI_OPCODE_DDY:
+      return emit_DDX_DDY(gen, inst, FALSE);
+
+   /* XXX lots more cases to do... */
+
+   default:
+      fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+              inst->Instruction.Opcode);
+      return FALSE;
+   }
+
+   /* not reached: every case above returns */
+   return TRUE;
+}
+
+
+
+/**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * A channel whose value equals that of the preceding channel shares the
+ * preceding channel's register instead of getting a new one.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ *
+ * \return FALSE if the immediate table is full or no SPE register is
+ *         available, TRUE otherwise
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+   int ch;
+
+   assert(gen->num_imm < MAX_TEMPS);
+   if (gen->num_imm >= MAX_TEMPS) {
+      /* With asserts compiled out, fail cleanly instead of writing past
+       * the end of the immediate map.
+       */
+      return FALSE;
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      float val = immed->u[ch].Float;
+
+      if (ch > 0 && val == immed->u[ch - 1].Float) {
+         /* re-use previous register */
+         gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+      }
+      else {
+         char str[100];
+         int reg = spe_allocate_available_register(gen->f);
+
+         if (reg < 0)
+            return FALSE;
+
+         /* bounded formatting, like emit_function_call() */
+         snprintf(str, sizeof(str), "init $%d = %f", reg, val);
+         spe_comment(gen->f, 0, str);
+
+         /* update immediate map */
+         gen->imm_regs[gen->num_imm][ch] = reg;
+
+         /* emit initializer instruction */
+         spe_load_float(gen->f, reg, val);
+      }
+   }
+
+   gen->num_imm++;
+
+   return TRUE;
+}
+
+
+
+/**
+ * Emit "code" for a TGSI declaration.
+ * We only care about TGSI TEMPORARY register declarations at this time.
+ * For each TGSI TEMPORARY we allocate four SPE registers.
+ * Declarations of any other register file are ignored.
+ *
+ * \return FALSE if a temporary index is out of range or no SPE register
+ *         is available, TRUE otherwise
+ */
+static boolean
+emit_declaration(struct cell_context *cell,
+                 struct codegen *gen, const struct tgsi_full_declaration *decl)
+{
+   int i, ch;
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+      for (i = decl->Range.First;
+           i <= decl->Range.Last;
+           i++) {
+         assert(i < MAX_TEMPS);
+         if (i >= MAX_TEMPS) {
+            /* With asserts compiled out, fail cleanly instead of writing
+             * past the end of the temp register map.
+             */
+            return FALSE;
+         }
+
+         for (ch = 0; ch < 4; ch++) {
+            gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+            if (gen->temp_regs[i][ch] < 0)
+               return FALSE; /* out of regs */
+         }
+
+         /* XXX if we run out of SPE registers, we need to spill
+          * to SPU memory.  someday...
+          */
+
+         {
+            char buf[100];
+            /* bounded formatting, like emit_function_call() */
+            snprintf(buf, sizeof(buf),
+                     "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+                     gen->temp_regs[i][0], gen->temp_regs[i][1],
+                     gen->temp_regs[i][2], gen->temp_regs[i][3]);
+            spe_comment(gen->f, 0, buf);
+         }
+      }
+      break;
+   default:
+      ; /* ignore */
+   }
+
+   return TRUE;
+}
+
+
+
+/**
+ * Translate TGSI shader code to SPE instructions.  This is done when
+ * the state tracker gives us a new shader (via pipe->create_fs_state()).
+ *
+ * The token stream is walked once; immediates, declarations and
+ * instructions are handed to emit_immediate(), emit_declaration() and
+ * emit_instruction().  Translation stops at the first token that fails,
+ * in which case the generated code is terminated with emit_END().
+ *
+ * \param cell    the rendering context (in)
+ * \param tokens  the TGSI shader (in)
+ * \param f       the generated function (out)
+ * \return TRUE on success.  On a translation error the result of
+ *         emit_END() is returned, as before.
+ *         NOTE(review): emit_END() always returns TRUE, so callers cannot
+ *         tell that translation failed -- confirm whether that is
+ *         intended.
+ */
+boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f)
+{
+   struct tgsi_parse_context parse;
+   struct codegen gen;
+   uint ic = 0;
+   boolean result;
+
+   memset(&gen, 0, sizeof(gen));
+   gen.cell = cell;
+   gen.f = f;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   gen.inputs_reg = 3;     /* pointer to inputs array */
+   gen.outputs_reg = 4;    /* pointer to outputs array */
+   gen.constants_reg = 5;  /* pointer to constants array */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, gen.inputs_reg);
+   spe_allocate_register(f, gen.outputs_reg);
+   spe_allocate_register(f, gen.constants_reg);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, TRUE);
+      spe_indent(f, 2*8);
+      printf("Begin %s\n", __FUNCTION__);
+      tgsi_dump(tokens, 0);
+   }
+
+   tgsi_parse_init(&parse, tokens);
+
+   emit_prologue(&gen);
+
+   while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         if (f->print) {
+            _debug_printf("    # ");
+            tgsi_dump_immediate(&parse.FullToken.FullImmediate);
+         }
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+            gen.error = TRUE;
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (f->print) {
+            _debug_printf("    # ");
+            tgsi_dump_declaration(&parse.FullToken.FullDeclaration);
+         }
+         if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+            gen.error = TRUE;
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (f->print) {
+            _debug_printf("    # ");
+            ic++;
+            tgsi_dump_instruction(&parse.FullToken.FullInstruction, ic);
+         }
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
+            gen.error = TRUE;
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   if (gen.error) {
+      /* terminate the SPE code */
+      result = emit_END(&gen);
+   }
+   else {
+      if (cell->debug_flags & CELL_DEBUG_ASM) {
+         printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+         printf("End %s\n", __FUNCTION__);
+      }
+      result = TRUE;
+   }
+
+   /* Release the parser on every path.  The error path used to return
+    * before reaching this call.
+    */
+   tgsi_parse_free( &parse );
+
+   return result;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
new file mode 100644
index 0000000000..99faea7046
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef CELL_GEN_FP_H
+#define CELL_GEN_FP_H
+
+
+
+extern boolean
+cell_gen_fragment_program(struct cell_context *cell,
+                          const struct tgsi_token *tokens,
+                          struct spe_function *f);
+
+
+#endif /* CELL_GEN_FP_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 0000000000..628bc1c694
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,2189 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2009 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ * \author Bob Ellison
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Emit SPE instructions that apply the depth (Z) test to a quad.
+ *
+ * \param f           SPE function the instructions are appended to.
+ * \param dsa         Gallium depth/stencil/alpha state; depth.func selects
+ *                    the comparison and depth.writemask enables Z writes.
+ * \param mask_reg    register holding the quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register holding integer fragment Z values (in)
+ * \param ifbZ_reg    register holding integer framebuffer Z values (in/out)
+ * \param zmask_reg   register receiving the emitted comparison result (out)
+ *
+ * For NOTEQUAL, LEQUAL and GEQUAL the value left in zmask_reg is the
+ * complementary comparison (==, > and > respectively); only mask_reg is
+ * combined with its inverse.
+ *
+ * \return TRUE if depth writes are enabled, in which case code merging the
+ *         fragment Z into ifbZ_reg has been emitted; FALSE otherwise.
+ */
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   /* TRUE when zmask holds the lanes that FAIL the test (fold with andc) */
+   boolean negate = FALSE;
+   /* FALSE for the functions that never fold zmask into mask */
+   boolean fold = TRUE;
+
+   ASSERT(dsa->depth.enabled);
+
+   /*
+    * Phase 1: emit the comparison into zmask.
+    * Ordered compares use clgt rather than cgt so that the Z values are
+    * treated as _unsigned_; this only matters for 32-bit Z values.
+    */
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_NOTEQUAL:
+      negate = TRUE;
+      /* fall through */
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ifbZ) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      negate = TRUE;
+      /* fall through */
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ifbZ) */
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      negate = TRUE;
+      /* fall through */
+   case PIPE_FUNC_LESS:
+      /* zmask = (ifbZ > ifragZ) */
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      /* mask = {0,0,0,0}, then zmask = mask */
+      spe_il(f, mask_reg, 0);
+      spe_move(f, zmask_reg, mask_reg);
+      fold = FALSE;
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask is left alone; zmask = {~0,~0,~0,~0} */
+      spe_il(f, zmask_reg, ~0);
+      fold = FALSE;
+      break;
+
+   default:
+      ASSERT(0);
+      fold = FALSE;
+      break;
+   }
+
+   /*
+    * Phase 2: fold the comparison into the alive mask.
+    *   negate: mask = mask & ~zmask
+    *   else:   mask = mask &  zmask
+    */
+   if (fold) {
+      if (negate)
+         spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      else
+         spe_and(f, mask_reg, mask_reg, zmask_reg);
+   }
+
+   if (!dsa->depth.writemask)
+      return FALSE;
+
+   /*
+    * Z writes are enabled:
+    *    framebufferZ = (ztest_passed ? fragmentZ : framebufferZ);
+    * with mask_reg acting as the per-lane selector.
+    */
+   spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+   return TRUE;
+}
+
+
+/**
+ * Emit SPE instructions that apply the alpha test to a quad.
+ *
+ * \param dsa        Gallium depth/stencil/alpha state; alpha.func selects
+ *                   the comparison and alpha.ref_value is the reference.
+ * \param f          SPE function the instructions are appended to.
+ * \param mask_reg   register holding the quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register holding the four fragment alpha values (in)
+ *
+ * Two temporary registers are allocated for the duration of the call and
+ * released again before returning.
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   /* keep this allocation order: reference value first, then scratch */
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+   /* TRUE when amask holds the lanes that FAIL the test (fold with andc) */
+   boolean negate = FALSE;
+   /* FALSE for the functions that never fold amask into mask */
+   boolean fold = TRUE;
+
+   ASSERT(dsa->alpha.enabled);
+
+   /* NEVER and ALWAYS don't look at the reference value; everything else
+    * gets it loaded/splatted into ref_reg up front.
+    */
+   if (!(dsa->alpha.func == PIPE_FUNC_NEVER ||
+         dsa->alpha.func == PIPE_FUNC_ALWAYS)) {
+      spe_load_float(f, ref_reg, dsa->alpha.ref_value);
+   }
+
+   /* Phase 1: emit the float comparison into amask */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_NOTEQUAL:
+      negate = TRUE;
+      /* fall through */
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      negate = TRUE;
+      /* fall through */
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      negate = TRUE;
+      /* fall through */
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      /* mask = [0,0,0,0] */
+      spe_il(f, mask_reg, 0);
+      fold = FALSE;
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* nothing to emit, mask stays as it is */
+      fold = FALSE;
+      break;
+
+   default:
+      ASSERT(0);
+      fold = FALSE;
+      break;
+   }
+
+   /*
+    * Phase 2: fold the comparison into the alive mask.
+    *   negate: mask = mask & ~amask
+    *   else:   mask = mask &  amask
+    */
+   if (fold) {
+      if (negate)
+         spe_andc(f, mask_reg, mask_reg, amask_reg);
+      else
+         spe_and(f, mask_reg, mask_reg, amask_reg);
+   }
+
+#if OPTIMIZATIONS
+   /* Early out in the generated code when mask == {0,0,0,0}.
+    * amask_reg is no longer needed at this point, so it doubles as scratch:
+    *    amask[0] = (mask[0] | mask[1] | mask[2] | mask[3])
+    *    if amask[0] == 0 then return from function call
+    */
+   spe_orx(f, amask_reg, mask_reg);
+   spe_biz(f, amask_reg, SPE_REG_RA, 0, 0);
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+
+/**
+ * Lazily allocate an optional register.
+ *
+ * Optional registers are tracked through an int that is negative while no
+ * register has been assigned.  Once a register turns out to be needed it
+ * is likely to be needed again, so it is allocated on first request and
+ * then kept instead of being released and re-allocated over and over.
+ *
+ * \param f  SPE function whose register pool is used.
+ * \param r  in/out register number; left untouched if already >= 0.
+ */
+static INLINE void
+setup_optional_register(struct spe_function *f,
+                        int *r)
+{
+   /* already assigned, nothing to do */
+   if (*r >= 0)
+      return;
+
+   *r = spe_allocate_available_register(f);
+}
+
+/**
+ * Release an optional register, if one was ever assigned.
+ *
+ * \param f  SPE function whose register pool is used.
+ * \param r  register number; a negative value means "never allocated"
+ *           and makes this call a no-op.
+ */
+static INLINE void
+release_optional_register(struct spe_function *f,
+                          int r)
+{
+   /* never allocated, nothing to give back */
+   if (r < 0)
+      return;
+
+   spe_release_register(f, r);
+}
+
+/**
+ * Lazily allocate an optional register and emit code loading a float
+ * constant into it.
+ *
+ * \param f      SPE function to allocate from and append the load to.
+ * \param r      in/out register number; if it is already >= 0 nothing is
+ *               allocated and no load is emitted.
+ * \param value  constant passed to spe_load_float() on first use.
+ */
+static INLINE void
+setup_const_register(struct spe_function *f,
+                     int *r,
+                     float value)
+{
+   if (*r < 0) {
+      /* first use: grab a register, then emit the constant load */
+      setup_optional_register(f, r);
+      spe_load_float(f, *r, value);
+   }
+}
+
+/**
+ * Release a constant register, if one was ever assigned.
+ *
+ * \param f  SPE function whose register pool is used.
+ * \param r  register number; negative means it was never allocated, in
+ *           which case nothing happens.
+ */
+static INLINE void
+release_const_register(struct spe_function *f,
+                       int r)
+{
+   /* same rule as for any other optional register */
+   if (r >= 0)
+      spe_release_register(f, r);
+}
+
+
+
+/**
+ * Emit code that unpacks four 32-bit packed framebuffer colors (fbRGBA)
+ * into four float vectors (fbR, fbG, fbB, fbA), expanding each 8-bit
+ * component into a float in [0.0, 1.0].
+ *
+ * \param f             SPE function the instructions are appended to.
+ * \param color_format  PIPE_FORMAT_B8G8R8A8_UNORM or
+ *                      PIPE_FORMAT_A8R8G8B8_UNORM; anything else asserts.
+ * \param fbRGBA_reg    register holding the packed colors (in)
+ * \param fbR_reg       register receiving the red values (out)
+ * \param fbG_reg       register receiving the green values (out)
+ * \param fbB_reg       register receiving the blue values (out)
+ * \param fbA_reg       register receiving the alpha values (out)
+ */
+static void
+unpack_colors(struct spe_function *f,
+              enum pipe_format color_format,
+              int fbRGBA_reg,
+              int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg)
+{
+   /* bit mask isolating byte i (0 = lowest) of a packed 32-bit pixel */
+   static const unsigned byte_mask[4] = {
+      0xff, 0xff00, 0xff0000, 0xff000000
+   };
+   int mask_reg[4];
+   /* chan_reg[i] = output register fed from byte i of the packed pixel */
+   int chan_reg[4] = { -1, -1, -1, -1 };
+   boolean known_format = TRUE;
+   int i;
+
+   /* allocate all four mask registers first, then load them */
+   for (i = 0; i < 4; i++)
+      mask_reg[i] = spe_allocate_available_register(f);
+   for (i = 0; i < 4; i++)
+      spe_load_int(f, mask_reg[i], byte_mask[i]);
+
+   spe_comment(f, 0, "Unpack framebuffer colors, convert to floats");
+
+   /* work out which channel lives in which byte */
+   switch (color_format) {
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      chan_reg[0] = fbB_reg;
+      chan_reg[1] = fbG_reg;
+      chan_reg[2] = fbR_reg;
+      chan_reg[3] = fbA_reg;
+      break;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      chan_reg[0] = fbA_reg;
+      chan_reg[1] = fbR_reg;
+      chan_reg[2] = fbG_reg;
+      chan_reg[3] = fbB_reg;
+      break;
+
+   default:
+      ASSERT(0);
+      known_format = FALSE;
+   }
+
+   if (known_format) {
+      /* chan = fbRGBA & mask, for every byte lane */
+      for (i = 0; i < 4; i++)
+         spe_and(f, chan_reg[i], fbRGBA_reg, mask_reg[i]);
+
+      /* bring bytes 1..3 down to the low 8 bits: chan = chan >> (8 * i) */
+      for (i = 1; i < 4; i++)
+         spe_roti(f, chan_reg[i], chan_reg[i], -8 * i);
+   }
+
+   /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+   spe_cuflt(f, fbR_reg, fbR_reg, 8);
+   spe_cuflt(f, fbG_reg, fbG_reg, 8);
+   spe_cuflt(f, fbB_reg, fbB_reg, 8);
+   spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+   for (i = 0; i < 4; i++)
+      spe_release_register(f, mask_reg[i]);
+}
+
+
+/**
+ * Generate SPE code to implement the given blend mode for a quad of pixels.
+ * \param f          SPE function to append instruction onto.
+ * \param fragR_reg  register with fragment red values (float) (in/out)
+ * \param fragG_reg  register with fragment green values (float) (in/out)
+ * \param fragB_reg  register with fragment blue values (float) (in/out)
+ * \param fragA_reg  register with fragment alpha values (float) (in/out)
+ * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
+ */
+static void
+gen_blend(const struct pipe_blend_state *blend,
+          const struct pipe_blend_color *blend_color,
+          struct spe_function *f,
+          enum pipe_format color_format,
+          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
+          int fbRGBA_reg)
+{
+   int term1R_reg = spe_allocate_available_register(f);
+   int term1G_reg = spe_allocate_available_register(f);
+   int term1B_reg = spe_allocate_available_register(f);
+   int term1A_reg = spe_allocate_available_register(f);
+
+   int term2R_reg = spe_allocate_available_register(f);
+   int term2G_reg = spe_allocate_available_register(f);
+   int term2B_reg = spe_allocate_available_register(f);
+   int term2A_reg = spe_allocate_available_register(f);
+
+   int fbR_reg = spe_allocate_available_register(f);
+   int fbG_reg = spe_allocate_available_register(f);
+   int fbB_reg = spe_allocate_available_register(f);
+   int fbA_reg = spe_allocate_available_register(f);
+
+   int tmp_reg = spe_allocate_available_register(f);
+
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
+    */
+   int one_reg = -1;
+   int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1;
+
+   ASSERT(blend->rt[0].blend_enable);
+
+   /* packed RGBA -> float colors */
+   unpack_colors(f, color_format, fbRGBA_reg,
+                 fbR_reg, fbG_reg, fbB_reg, fbA_reg);
+
+   /*
+    * Compute Src RGB terms.  We're actually looking for the value
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
+    */
+   switch (blend->rt[0].rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (R,G,B) */
+      spe_move(f, term1R_reg, fragR_reg);
+      spe_move(f, term1G_reg, fragG_reg);
+      spe_move(f, term1B_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term1R_reg, 0.0f);
+      spe_load_float(f, term1G_reg, 0.0f);
+      spe_load_float(f, term1B_reg, 0.0f);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
+      spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
+      spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      setup_const_register(f, &one_reg, 1.0f);
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+    */
+   switch (blend->rt[0].alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
+   case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
+      spe_move(f, term1A_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+    */
+   switch (blend->rt[0].rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
+      spe_move(f, term2R_reg, fbR_reg);
+      spe_move(f, term2G_reg, fbG_reg);
+      spe_move(f, term2B_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor s= (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
+      spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+    */
+   switch (blend->rt[0].alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
+      spe_move(f, term2A_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
+      spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest RGB terms as per the blend equation.
+    */
+   switch (blend->rt[0].rgb_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+      spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+      spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest A term
+    */
+   switch (blend->rt[0].alpha_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   spe_release_register(f, term1R_reg);
+   spe_release_register(f, term1G_reg);
+   spe_release_register(f, term1B_reg);
+   spe_release_register(f, term1A_reg);
+
+   spe_release_register(f, term2R_reg);
+   spe_release_register(f, term2G_reg);
+   spe_release_register(f, term2B_reg);
+   spe_release_register(f, term2A_reg);
+
+   spe_release_register(f, fbR_reg);
+   spe_release_register(f, fbG_reg);
+   spe_release_register(f, fbB_reg);
+   spe_release_register(f, fbA_reg);
+
+   spe_release_register(f, tmp_reg);
+
+   /* Free any optional registers that actually got used */
+   release_const_register(f, one_reg);
+   release_const_register(f, constR_reg);
+   release_const_register(f, constG_reg);
+   release_const_register(f, constB_reg);
+   release_const_register(f, constA_reg);
+}
+
+
+static void
+gen_logicop(const struct pipe_blend_state *blend,
+            struct spe_function *f,
+            int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* Emit the SPE instruction(s) for blend->logicop_func.  Each register
+    * holds four packed 32-bit pixels, not per-channel floats.  The result
+    * is left in fragRGBA_reg; fbRGBA_reg is only read.  In the notes below
+    * s is the fragment (source) pixel and d the framebuffer (dest) pixel. */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B, so d goes first here */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B, so s goes first here */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d): xor first, then invert the result */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B, so d goes first here */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s: fragRGBA_reg already holds it, emit nothing */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B, so s goes first here */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1, i.e. every bit of the pixel set */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default: /* unrecognized logicop_func: no code is emitted */
+         ASSERT(0);
+   }
+}
+
+
+/**
+ * Generate code to pack a quad of float colors into four 32-bit integers.
+ * Only B8G8R8A8_UNORM and A8R8G8B8_UNORM are handled; others hit ASSERT(0).
+ * \param f             SPE function to append instructions onto.
+ * \param color_format  the dest color packing format (selects the shifts)
+ * \param r_reg         register containing four red values (in/clobbered)
+ * \param g_reg         register containing four green values (in/clobbered)
+ * \param b_reg         register containing four blue values (in/clobbered)
+ * \param a_reg         register containing four alpha values (in/clobbered)
+ * \param rgba_reg      register to store the packed RGBA colors (out)
+ */
+static void
+gen_pack_colors(struct spe_function *f,
+                enum pipe_format color_format,
+                int r_reg, int g_reg, int b_reg, int a_reg,
+                int rgba_reg)
+{
+   int rg_reg = spe_allocate_available_register(f); /* scratch: r | g */
+   int ba_reg = spe_allocate_available_register(f); /* scratch: a | b */
+
+   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+   spe_cfltu(f, r_reg, r_reg, 32);
+   spe_cfltu(f, g_reg, g_reg, 32);
+   spe_cfltu(f, b_reg, b_reg, 32);
+   spe_cfltu(f, a_reg, a_reg, 32);
+
+   /* Shift the most significant bytes to the least significant positions.
+    * I.e.: reg = reg >> 24, leaving one 8-bit channel value per word.
+    */
+   spe_rotmi(f, r_reg, r_reg, -24);
+   spe_rotmi(f, g_reg, g_reg, -24);
+   spe_rotmi(f, b_reg, b_reg, -24);
+   spe_rotmi(f, a_reg, a_reg, -24);
+
+   /* Move each channel to its byte for the surface format; one stays put */
+   if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { /* blue stays in byte 0 */
+      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */
+      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */
+      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */
+   }
+   else if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { /* alpha stays in byte 0 */
+      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */
+      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */
+      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */
+   }
+   else {
+      ASSERT(0); /* unsupported format: channels are OR-ed together unshifted */
+   }
+
+   /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+    * Eg: after the B8G8R8A8 shifts above we might have:
+    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+    * OR-ing all those together gives us four packed colors:
+    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+    */
+   spe_or(f, rg_reg, r_reg, g_reg);
+   spe_or(f, ba_reg, a_reg, b_reg);
+   spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+   spe_release_register(f, rg_reg);
+   spe_release_register(f, ba_reg);
+}
+
+
+/**
+ * Generate code to apply the color write mask to a quad of packed pixels.
+ *
+ * \param f             SPE function to append instructions onto
+ * \param colormask     PIPE_MASK_R/G/B/A bits of the channels that may be written
+ * \param color_format  pixel format; fixes where each channel sits in a pixel
+ * \param fragRGBA_reg  four packed fragment pixels (in/out); on return the
+ *                      masked-off channels hold the framebuffer's bits
+ * \param fbRGBA_reg    four packed framebuffer pixels (in)
+ */
+static void
+gen_colormask(struct spe_function *f,
+              uint colormask,
+              enum pipe_format color_format,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* Bit positions of each channel within a packed pixel.  They start out
+    * as zero so that an unsupported format, should ASSERT() not abort (e.g.
+    * if it is compiled out), produces an all-zero write mask (framebuffer
+    * pixel kept unchanged) instead of reading uninitialized variables.
+    */
+   uint r_mask = 0, g_mask = 0, b_mask = 0, a_mask = 0;
+   int colormask_reg;
+
+   /* Calculate exactly where the bits for any particular color
+    * end up, so we can mask them correctly.
+    */
+   switch(color_format) {
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* ARGB */
+         a_mask = 0xff000000;
+         r_mask = 0x00ff0000;
+         g_mask = 0x0000ff00;
+         b_mask = 0x000000ff;
+         break;
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* BGRA */
+         b_mask = 0xff000000;
+         g_mask = 0x00ff0000;
+         r_mask = 0x0000ff00;
+         a_mask = 0x000000ff;
+         break;
+      default:
+         ASSERT(0);
+   }
+
+   /* For each R, G, B, and A component we're supposed to mask out,
+    * clear its bits.  Then our mask operation later will work as expected.
+    */
+   if (!(colormask & PIPE_MASK_R)) {
+      r_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_G)) {
+      g_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_B)) {
+      b_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_A)) {
+      a_mask = 0;
+   }
+
+   /* Get a temporary register and load it with the effective mask: the OR
+    * of whatever remains of the R, G, B, and A masks.
+    */
+   colormask_reg = spe_allocate_available_register(f);
+   spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever
+    * the mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+   spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+   /* Release the temporary register and we're done */
+   spe_release_register(f, colormask_reg);
+}
+
+
+/**
+ * Generate code for the stencil comparison.  Unlike gen_depth_test(), which
+ * compares two varying values (fragment and buffer), this compares a varying
+ * value against a static reference, so the Compare Immediate forms can be
+ * used.  When state->valuemask differs from stencil_max_value, both the
+ * stencil values and the reference are ANDed with valuemask first.
+ *
+ * \param f                  SPE function to append instructions onto
+ * \param state              stencil state; func and valuemask are read
+ * \param ref_value          stencil reference value
+ * \param stencil_max_value  valuemask value for which masking is skipped
+ * \param fragment_mask_reg  mask of live fragments (in)
+ * \param fbS_reg            framebuffer stencil values (in)
+ * \param stencil_pass_reg   live fragments that passed the test (out)
+ */
+static void
+gen_stencil_test(struct spe_function *f,
+                 const struct pipe_stencil_state *state,
+                 const unsigned ref_value,
+                 uint stencil_max_value,
+                 int fragment_mask_reg,
+                 int fbS_reg,
+                 int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the
+    * stencil_pass_reg register, taking into account whether each fragment
+    * was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      if (state->valuemask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (s == reference) */
+         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+         int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                state->valuemask & ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      if (state->valuemask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & ~(s == reference) */
+         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+         int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                state->valuemask & ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_LESS:
+      if (state->valuemask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference < s)  */
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
+         int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                  state->valuemask & ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_GREATER:
+      if (state->valuemask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference > s) */
+         /* There's no convenient Compare Less Than Immediate instruction, so
+          * we'll have to do this one the harder way, by loading a register and
+          * comparing directly.  Compare Logical Greater Than Word (clgt)
+          * treats its operands as unsigned - no sign extension.
+          */
+         int tmp_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, ref_value);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+         int tmp_reg = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->valuemask & ref_value);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      if (state->valuemask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference >= s)
+          *              = fragment_mask & ~(s > reference) */
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg,
+                                  ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+         int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                  state->valuemask & ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      if (state->valuemask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference <= s)
+          *              = fragment_mask & ~(reference > s) */
+         /* As for GREATER, we have to do this by loading a register */
+         int tmp_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, ref_value);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+         int tmp_reg = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, ref_value & state->valuemask);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = fragment_mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = fragment_mask & 1 = fragment_mask */
+      spe_move(f, stencil_pass_reg, fragment_mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
+    */
+}
+
+
+/**
+ * Generate code that computes new stencil values for one stencil operation,
+ * without applying any test.  Meant to be called up to 3 times (for the
+ * fail, pass/z-fail and pass/z-pass ops); the caller merges the results.
+ *
+ * \param f                  SPE function to append instructions onto
+ * \param stencil_op         PIPE_STENCIL_OP_x operation to apply
+ * \param stencil_ref_value  reference value, loaded by the REPLACE op
+ * \param stencil_max_value  2^n - 1 for an n-bit stencil: limit and bit mask
+ * \param fbS_reg            current stencil values (in, not modified)
+ * \param newS_reg           computed stencil values (out); must differ from fbS_reg
+ */
+static void
+gen_stencil_values(struct spe_function *f,
+                   uint stencil_op,
+                   uint stencil_ref_value,
+                   uint stencil_max_value,
+                   int fbS_reg,
+                   int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1), i.e. a saturating increment */
+      int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Keep s+1 where equals_reg is clear; take s (== max) where it is set */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1), i.e. a saturating decrement */
+      int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* Keep s-1 where equals_reg is clear; take s (== 0) where it is set */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default: /* unrecognized stencil_op: newS_reg is left unwritten */
+      ASSERT(0);
+   }
+}
+
+
+/**
+ * Generate code producing the three candidate sets of stencil values.
+ * Each output is fbS_reg itself (op is KEEP), a register already filled
+ * in here for an identical op, or a newly allocated register computed by
+ * gen_stencil_values().  A varying number of registers gets allocated, so
+ * call this between spe_allocate_register_set() and the matching
+ * spe_release_register_set() instead of releasing them one by one.
+ *
+ * \param f              SPE function to append instructions onto
+ * \param stencil        stencil state supplying fail_op/zfail_op/zpass_op
+ * \param ref_value      stencil reference value
+ * \param depth_enabled  if zero, zfail_op is treated as KEEP
+ * \param fbS_reg        current framebuffer stencil values (in)
+ * \param fail_reg, zfail_reg, zpass_reg  resulting register numbers (out)
+ */
+static void
+gen_get_stencil_values(struct spe_function *f,
+                       const struct pipe_stencil_state *stencil,
+                       const unsigned ref_value,
+                       const uint depth_enabled,
+                       int fbS_reg,
+                       int *fail_reg,
+                       int *zfail_reg,
+                       int *zpass_reg)
+{
+   uint zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(stencil->enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes, which means that the zfail_op is not considered - a
+    * failing stencil test triggers the fail_op, and a passing one
+    * triggers the zpass_op
+    *
+    * As an optimization, override calculation of the zfail_op values
+    * if they aren't going to be used.  By setting the value of
+    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+    * to match the incoming stencil values, and no calculation will
+    * be done.
+    */
+   if (depth_enabled) {
+      zfail_op = stencil->zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, stencil->fail_op, ref_value,
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   /* Use the possibly overridden value throughout, not the structure value */
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == stencil->fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, zfail_op, ref_value,
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (stencil->zpass_op == stencil->fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (stencil->zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, stencil->zpass_op, ref_value,
+         0xff, fbS_reg, *zpass_reg);
+   }
+}
+
+/**
+ * Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled.  This function must not use
+ * the register if depth is not enabled.
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f, 
+                       const struct pipe_depth_stencil_alpha_state *dsa,
+                       const struct pipe_stencil_ref *stencil_ref,
+                       const uint facing,
+                       const int mask_reg, const int fragZ_reg, 
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = FALSE;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   struct pipe_stencil_state *stencil;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   int stencil_pass_reg, stencil_fail_reg;
+   int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   int stencil_writemask_reg;
+   int zmask_reg;
+   int newS_reg;
+   unsigned ref_value;
+
+   /* Stenciling is quite complex: up to six different configurable stencil 
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely 
+    * be identical, so there's good reason to try to avoid calculating 
+    * the same values more than once (which unfortunately makes the code less 
+    * straightforward).
+    *
+    * To make register management easier, we start a new 
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as 
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_comment(f, 0, "Allocating stencil register set");
+   spe_allocate_register_set(f);
+
+   /* The facing we're given is the fragment facing; it doesn't
+    * exactly match the stencil facing.  If stencil is enabled,
+    * but two-sided stencil is *not* enabled, we use the same
+    * stencil settings for both front- and back-facing fragments.
+    * We only use the "back-facing" stencil for backfacing fragments
+    * if two-sided stenciling is enabled.
+    */
+   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+      stencil = &dsa->stencil[1];
+      ref_value = stencil_ref->ref_value[1];
+   }
+   else {
+      stencil = &dsa->stencil[0];
+      ref_value = stencil_ref->ref_value[0];
+   }
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    *
+    * Note that if the backface stencil is *not* enabled, the backface
+    * stencil will have the same values as the frontface stencil.
+    */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+       need_to_calculate_stencil_values = FALSE;
+       need_to_writemask_stencil_values = FALSE;
+    }
+    else if (stencil->writemask == 0x0) {
+      /* All changes are writemasked out, so no need to calculate
+       * what those changes might be, and no need to write anything back.
+       */
+      need_to_calculate_stencil_values = FALSE;
+      need_to_writemask_stencil_values = FALSE;
+   }
+   else if (stencil->writemask == 0xff) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = TRUE;
+      need_to_writemask_stencil_values = FALSE;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = TRUE;
+      need_to_writemask_stencil_values = TRUE;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      spe_comment(f, 0, "Computing stencil writemask");
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].writemask);
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   spe_comment(f, 0, "Running basic stencil test");
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, stencil, ref_value, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the 
+    * mask of valid fragments based on the results of the depth test).
+    */
+   spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      spe_comment(f, 0, facing == CELL_FACING_FRONT
+                  ? "Computing front-facing stencil values"
+                  : "Computing back-facing stencil values");
+      gen_get_stencil_values(f, stencil, ref_value, dsa->depth.enabled, fbS_reg, 
+         &stencil_fail_values, &stencil_pass_depth_fail_values, 
+         &stencil_pass_depth_pass_values);
+   }  
+
+   /* We now have all the stencil values we need.  We also need 
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      spe_comment(f, 0, "Running stencil depth test");
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg,
+                                         fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Saving current stencil values for writemasking");
+         spe_move(f, newS_reg, fbS_reg);
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_comment(f, 0, "Loading stencil fail values");
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+         modified_buffers = TRUE;
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         uint stencil_pass_depth_fail_mask =
+            spe_allocate_available_register(f);
+
+         spe_comment(f, 0, "Loading stencil pass/depth fail values");
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values,
+                  stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+         modified_buffers = TRUE;
+      }
+
+      /* Same for the stencil pass/depth pass mask.  Note that we
+       * *can* get here with zmask_reg being unset (if the depth
+       * test is off but the stencil test is on).  In this case,
+       * we assume the depth test passes, and don't need to mask
+       * the stencil pass mask with the Z mask.
+       */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         if (dsa->depth.enabled) {
+            uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            /* We'll need a separate register */
+            spe_comment(f, 0, "Loading stencil pass/depth pass values");
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            /* We can use the same stencil-pass register */
+            spe_comment(f, 0, "Loading stencil pass values");
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+         modified_buffers = TRUE;
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values && modified_buffers) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_comment(f, 0, "Writemasking new stencil values");
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_comment(f, 0, "Releasing stencil register set");
+   spe_release_register_set(f);
+
+   /* Return TRUE if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
+/** Generate depth and/or stencil test code.
+ * \param cell  context
+ * \param dsa  depth/stencil/alpha state
+ * \param stencil_ref  stencil reference state, forwarded to the stencil test
+ * \param f  spe function to emit
+ * \param facing  either CELL_FACING_FRONT or CELL_FACING_BACK
+ * \param mask_reg  register containing the pixel alive/dead mask
+ * \param depth_tile_reg  register containing address of z/stencil tile
+ * \param quad_offset_reg  offset to quad from start of tile
+ * \param fragZ_reg  register containing fragment Z values
+ */
+static void
+gen_depth_stencil(struct cell_context *cell,
+                  const struct pipe_depth_stencil_alpha_state *dsa,
+                  const struct pipe_stencil_ref *stencil_ref,
+                  struct spe_function *f,
+                  uint facing,
+                  int mask_reg,
+                  int depth_tile_reg,
+                  int quad_offset_reg,
+                  int fragZ_reg)
+
+{
+   const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+   boolean write_depth_stencil;
+
+   /* framebuffer's combined z/stencil values register */
+   int fbZS_reg = spe_allocate_available_register(f);
+
+   /* Framebuffer Z values register */
+   int fbZ_reg = spe_allocate_available_register(f);
+
+   /* Framebuffer stencil values register (may not be used) */
+   int fbS_reg = spe_allocate_available_register(f);
+
+   /* 24-bit mask register (only loaded by the Z24S8/Z24X8 case below) */
+   int zmask_reg = spe_allocate_available_register(f);
+
+   /**
+    * The following code:
+    * 1. fetch quad of packed Z/S values from the framebuffer tile.
+    * 2. extract the separate Z and S values from the packed values
+    * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
+    *
+    * The instructions for doing this are interleaved for better performance.
+    */
+   spe_comment(f, 0, "Fetch Z/stencil quad from tile");
+
+   switch(zs_format) {
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED: /* fall through */
+   case PIPE_FORMAT_Z24X8_UNORM:
+      /* prepare mask to extract Z vals from ZS vals */
+      spe_load_uint(f, zmask_reg, 0x00ffffff);
+
+      /* convert fragment Z from [0,1] to 32-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* right shift 32-bit fragment Z to 24 bits */
+      spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+      /* extract 24-bit Z values from ZS values by masking */
+      spe_and(f, fbZ_reg, fbZS_reg, zmask_reg);
+
+      /* extract 8-bit stencil values by shifting */
+      spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+      break;
+
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM: /* fall through */
+   case PIPE_FORMAT_X8Z24_UNORM:
+      /* convert fragment Z from [0,1] to 32-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* right shift 32-bit fragment Z to 24 bits */
+      spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+      /* extract 24-bit Z values from ZS values by shifting */
+      spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+      /* extract 8-bit stencil values by masking */
+      spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+      break;
+
+   case PIPE_FORMAT_Z32_UNORM:
+      /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg);
+
+      /* convert fragment Z from [0,1] to 32-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+      /* No stencil, so fbS_reg is left unloaded in this case */
+      break;
+
+   case PIPE_FORMAT_Z16_UNORM:
+      /* XXX This code for 16bpp Z is broken! */
+
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* Copy over 4 32-bit values */
+      spe_move(f, fbZ_reg, fbZS_reg);
+
+      /* convert Z from [0,1] to 16-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+      spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+      /* No stencil, so fbS_reg is left unloaded in this case */
+      break;
+
+   default:
+      ASSERT(0); /* invalid format */
+   }
+
+   /* If stencil is enabled, use the stencil-specific code
+    * generator to generate both the stencil and depth (if needed)
+    * tests.  Otherwise, if only depth is enabled, generate
+    * a quick depth test.  The test generators themselves will
+    * report back whether the depth/stencil buffer has to be
+    * written back.
+    */
+   if (dsa->stencil[0].enabled) {
+      /* This will perform the stencil and depth tests, and update
+       * the mask_reg, fbZ_reg, and fbS_reg as required by the
+       * tests.
+       */
+      ASSERT(fbS_reg >= 0);
+      spe_comment(f, 0, "Perform stencil test");
+
+      /* Note that fbZ_reg may not be set on entry, if stenciling
+       * is enabled but there's no Z-buffer.  The
+       * gen_stencil_depth_test() function must ignore the
+       * fbZ_reg register if depth is not enabled.
+       */
+      write_depth_stencil = gen_stencil_depth_test(f, dsa, stencil_ref, facing,
+                                                   mask_reg, fragZ_reg,
+                                                   fbZ_reg, fbS_reg);
+   }
+   else if (dsa->depth.enabled) {
+      int zmask_reg = spe_allocate_available_register(f); /* NB: shadows the outer zmask_reg */
+      ASSERT(fbZ_reg >= 0);
+      spe_comment(f, 0, "Perform depth test");
+      write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg,
+                                           fbZ_reg, zmask_reg);
+      spe_release_register(f, zmask_reg);
+   }
+   else {
+      write_depth_stencil = FALSE;
+   }
+
+   if (write_depth_stencil) {
+      /* Merge latest Z and Stencil values into fbZS_reg.
+       * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+       * fbS_reg has four 8-bit stencil values in bits [7..0].
+       */
+      spe_comment(f, 0, "Store quad's depth/stencil values in tile");
+      if (zs_format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
+          zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+         spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+         spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_S8_USCALED_Z24_UNORM ||
+               zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+         spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+         spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+         spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+         spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_S8_USCALED) {
+         ASSERT(0);   /* XXX to do */
+      }
+      else {
+         ASSERT(0); /* bad zs_format */
+      }
+
+      /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+      spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+   }
+
+   /* Don't need these any more */
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, fbZ_reg);
+   spe_release_register(f, fbS_reg);
+   spe_release_register(f, zmask_reg);
+}
+
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX we only support PIPE_FORMAT_S8_USCALED_Z24_UNORM z/stencil buffer right now.
+ *
+ * See the spu_default_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param facing whether the generated code is for front-facing or 
+ *              back-facing fragments
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, whatever
+ *              instructions within the generated function have had
+ *              the fragment ops appended.
+ */
+void
+cell_gen_fragment_function(struct cell_context *cell,
+                           const uint facing,
+                           struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+   const struct pipe_stencil_ref *stencil_ref = &cell->stencil_ref;
+   const struct pipe_blend_state *blend = cell->blend;
+   const struct pipe_blend_color *blend_color = &cell->blend_color;
+   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+
+   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, TRUE);
+      spe_indent(f, 8);
+      spe_comment(f, -4, facing == CELL_FACING_FRONT
+                  ? "Begin front-facing per-fragment ops"
+                  : "Begin back-facing per-fragment ops");
+   }
+
+   /* Reserve the incoming parameter registers before allocating temporaries */
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+      /* a 32-pixel-wide tile holds 16 quads per row; a quad is 16 bytes */
+      spe_comment(f, 0, "Compute quad offset within tile");
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+   /* Generate the alpha test, if needed. */
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   /* generate depth and/or stencil test code */
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      gen_depth_stencil(cell, dsa, stencil_ref, f,
+                        facing,
+                        mask_reg,
+                        depth_tile_reg,
+                        quad_offset_reg,
+                        fragZ_reg);
+   }
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_comment(f, 0, "Fetch quad colors from tile");
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+   if (blend->rt[0].blend_enable) {
+      spe_comment(f, 0, "Perform blending");
+      gen_blend(blend, blend_color, f, color_format,
+                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
+   }
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
+      gen_pack_colors(f, color_format,
+                      fragR_reg, fragG_reg, fragB_reg, fragA_reg,
+                      rgba_reg);
+
+      if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
+         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
+      }
+
+      if (blend->rt[0].colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
+         gen_colormask(f, blend->rt[0].colormask, color_format, rgba_reg, fbRGBA_reg);
+      }
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_comment(f, 0, "Store quad colors into color tile");
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   /* printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); */
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+   /* The releases below run at code-generation time; they emit no SPE code */
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, quad_offset_reg);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      char buffer[1024];
+      sprintf(buffer, "End %s-facing per-fragment ops: %d instructions",
+         facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
+      spe_comment(f, -4, buffer);
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
new file mode 100644
index 0000000000..21b35d1faf
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
+
+/* Append SPE code for the per-fragment operations of one facing to f. */
+extern void
+cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f);
+
+
+#endif /* CELL_GEN_FRAGMENT_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
new file mode 100644
index 0000000000..03f84d295b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -0,0 +1,473 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* Authors:
+ *  Keith Whitwell <keith@tungstengraphics.com>
+ *  Brian Paul
+ */
+
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "draw/draw_context.h"
+#include "cell_context.h"
+#include "cell_flush.h"
+#include "cell_pipe_state.h"
+#include "cell_state.h"
+#include "cell_texture.h"
+
+
+/** Create a blend state object: a mem_dup() copy of the caller's template. */
+static void *
+cell_create_blend_state(struct pipe_context *pipe,
+                        const struct pipe_blend_state *blend)
+{
+   return mem_dup(blend, sizeof(struct pipe_blend_state));
+}
+
+/** Make blend the current blend state, flushing the draw module first. */
+static void
+cell_bind_blend_state(struct pipe_context *pipe, void *blend)
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   /* flush the draw module before the bound state changes */
+   draw_flush(ctx->draw);
+   ctx->dirty |= CELL_NEW_BLEND;
+   ctx->blend = (struct pipe_blend_state *) blend;
+}
+
+/** Destroy a blend state object by passing it to FREE(). */
+static void
+cell_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend);
+}
+
+/** Store a copy of the blend color and mark blend state dirty. */
+static void
+cell_set_blend_color(struct pipe_context *pipe,
+                     const struct pipe_blend_color *blend_color)
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   /* flush the draw module before the stored color changes */
+   draw_flush(ctx->draw);
+
+   ctx->dirty |= CELL_NEW_BLEND;
+   ctx->blend_color = *blend_color;   /* struct copy */
+}
+
+
+
+/** Create a depth/stencil/alpha state object: a mem_dup() copy of dsa. */
+static void *
+cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
+                 const struct pipe_depth_stencil_alpha_state *dsa)
+{
+   return mem_dup(dsa, sizeof(struct pipe_depth_stencil_alpha_state));
+}
+
+/** Make dsa the current depth/stencil/alpha state, flushing draw first. */
+static void
+cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
+                                    void *dsa)
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   /* flush the draw module before the bound state changes */
+   draw_flush(ctx->draw);
+   ctx->dirty |= CELL_NEW_DEPTH_STENCIL;
+   ctx->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa;
+}
+
+/** Destroy a depth/stencil/alpha state object by passing it to FREE(). */
+static void
+cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa)
+{
+   FREE(dsa);
+}
+
+/** Store a copy of the stencil reference state; mark depth/stencil dirty. */
+static void
+cell_set_stencil_ref(struct pipe_context *pipe,
+                     const struct pipe_stencil_ref *stencil_ref)
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   /* flush the draw module before the stored values change */
+   draw_flush(ctx->draw);
+
+   ctx->dirty |= CELL_NEW_DEPTH_STENCIL;
+   ctx->stencil_ref = *stencil_ref;   /* struct copy */
+}
+
+/** Hand the clip state to the draw module; no copy is kept in the context. */
+static void
+cell_set_clip_state(struct pipe_context *pipe,
+                    const struct pipe_clip_state *clip)
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   /* the draw module is the only consumer of the clip state here */
+   draw_set_clip_state(ctx->draw, clip);
+}
+
+/** No-op: the body is empty, so sample_mask is ignored. */
+static void
+cell_set_sample_mask(struct pipe_context *pipe,
+                     unsigned sample_mask)
+{
+}
+
+
+/* Record a new viewport transform.
+ * A copy is kept in the cell context and the draw module is told as well.
+ */
+static void
+cell_set_viewport_state( struct pipe_context *pipe,
+                         const struct pipe_viewport_state *viewport )
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   /* hand the viewport to the draw module */
+   draw_set_viewport_state(ctx->draw, viewport);
+
+   /* keep our own copy and flag it as changed */
+   ctx->dirty |= CELL_NEW_VIEWPORT;
+   ctx->viewport = *viewport;
+
+   /* Historical note: using the tnl/ and vf/ modules is temporary; the
+    * full pipe is meant to have its own vertex shader and vertex fetch. */
+}
+
+/** Copy the scissor state into the cell context and mark it dirty. */
+static void
+cell_set_scissor_state( struct pipe_context *pipe,
+                        const struct pipe_scissor_state *scissor )
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   ctx->dirty |= CELL_NEW_SCISSOR;
+   memcpy( &ctx->scissor, scissor, sizeof(struct pipe_scissor_state) );
+}
+
+/** Copy the polygon stipple into the cell context and mark it dirty. */
+static void
+cell_set_polygon_stipple( struct pipe_context *pipe,
+                          const struct pipe_poly_stipple *stipple )
+{
+   struct cell_context *ctx = cell_context(pipe);
+
+   ctx->dirty |= CELL_NEW_STIPPLE;
+   memcpy( &ctx->poly_stipple, stipple, sizeof(struct pipe_poly_stipple) );
+}
+
+
+/** Create a rasterizer state object: a mem_dup() copy of the template. */
+static void *
+cell_create_rasterizer_state(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *rasterizer)
+{
+   return mem_dup(rasterizer, sizeof(struct pipe_rasterizer_state));
+}
+
+/** Make rast the current rasterizer state and pass it to the draw module. */
+static void
+cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast)
+{
+   struct cell_context *ctx = cell_context(pipe);
+   struct pipe_rasterizer_state *rs =
+      (struct pipe_rasterizer_state *) rast;
+
+   /* the draw module receives both the typed state and the raw handle */
+   draw_set_rasterizer_state(ctx->draw, rs, rast);
+
+   /* remember the bound state and mark it dirty */
+   ctx->dirty |= CELL_NEW_RASTERIZER;
+   ctx->rasterizer = rs;
+}
+
+/** Destroy a rasterizer state object by passing it to FREE(). */
+static void
+cell_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer)
+{
+   FREE(rasterizer);
+}
+
+
+/** Create a sampler state object: a mem_dup() copy of the template. */
+static void *
+cell_create_sampler_state(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   return mem_dup(sampler, sizeof(struct pipe_sampler_state));
+}
+
+/** Bind sampler state objects; slots at or beyond num are set to NULL. */
+static void
+cell_bind_sampler_states(struct pipe_context *pipe,
+                         unsigned num, void **samplers)
+{
+   struct cell_context *ctx = cell_context(pipe);
+   uint unit, dirty_mask = 0x0;
+
+   assert(num <= CELL_MAX_SAMPLERS);
+   draw_flush(ctx->draw);
+
+   /* visit every slot so that bindings past num are cleared as well */
+   for (unit = 0; unit < CELL_MAX_SAMPLERS; unit++) {
+      struct pipe_sampler_state *samp = (unit < num) ? samplers[unit] : NULL;
+      if (ctx->sampler[unit] == samp)
+         continue;
+      ctx->sampler[unit] = samp;
+      dirty_mask |= (1 << unit);
+   }
+
+   if (dirty_mask != 0) {
+      ctx->dirty_samplers |= dirty_mask;
+      ctx->dirty |= CELL_NEW_SAMPLER;
+   }
+}
+
+/** Destroy a sampler state object by passing it to FREE(). */
+static void
+cell_delete_sampler_state(struct pipe_context *pipe,
+                              void *sampler)
+{
+   FREE( sampler );
+}
+
+
+/** Bind fragment sampler views; slots at or beyond num are unbound (NULL). */
+static void
+cell_set_fragment_sampler_views(struct pipe_context *pipe,
+                                unsigned num,
+                                struct pipe_sampler_view **views)
+{
+   struct cell_context *cell = cell_context(pipe);
+   uint i, changed = 0x0;
+
+   assert(num <= CELL_MAX_SAMPLERS);
+
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      struct pipe_sampler_view *new_view = i < num ? views[i] : NULL;
+      struct pipe_sampler_view *old_view = cell->fragment_sampler_views[i];
+
+      if (old_view != new_view) {
+         struct pipe_resource *new_tex = new_view ? new_view->texture : NULL;
+         /* new_view, not views[i]: views[] only has num valid entries */
+         pipe_sampler_view_reference(&cell->fragment_sampler_views[i],
+                                     new_view);
+         pipe_resource_reference((struct pipe_resource **) &cell->texture[i],
+                                (struct pipe_resource *) new_tex);
+
+         changed |= (1 << i);   /* one dirty bit per sampler slot */
+      }
+   }
+
+   cell->num_textures = num;
+
+   if (changed) {
+      cell->dirty |= CELL_NEW_TEXTURE;
+      cell->dirty_textures |= changed;
+   }
+}
+
+/** Allocate a sampler view copied from templ and referencing texture. */
+static struct pipe_sampler_view *
+cell_create_sampler_view(struct pipe_context *pipe,
+                         struct pipe_resource *texture,
+                         const struct pipe_sampler_view *templ)
+{
+   struct pipe_sampler_view *sv = CALLOC_STRUCT(pipe_sampler_view);
+   if (!sv)
+      return NULL;   /* allocation failed */
+
+   *sv = *templ;
+   sv->context = pipe;
+   sv->reference.count = 1;
+   /* NOTE(review): cleared first, presumably so templ's texture isn't unref'd */
+   sv->texture = NULL;
+   pipe_resource_reference(&sv->texture, texture);
+   return sv;
+}
+
+
+/**
+ * Destroy a sampler view: drop its texture reference, then free the
+ * view structure itself.
+ */
+static void
+cell_sampler_view_destroy(struct pipe_context *pipe,
+                          struct pipe_sampler_view *view)
+{
+   (void) pipe;
+
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+
+/**
+ * Record the addresses of the bound color and z/stencil surfaces.
+ *
+ * Only color buffer 0 is handled (the loop bound is 1).  The map
+ * pointers are taken directly from the 'data' field of the backing
+ * cell_resource; the older buffer_map() path is compiled out.
+ */
+static void
+cell_map_surfaces(struct cell_context *cell)
+{
+#if 0
+   struct pipe_screen *screen = cell->pipe.screen;
+#endif
+   struct pipe_surface *zs;
+   uint buf;
+
+   /* color buffer(s) */
+   for (buf = 0; buf < 1; buf++) {
+      struct pipe_surface *surf = cell->framebuffer.cbufs[buf];
+      struct cell_resource *res;
+
+      if (!surf)
+         continue;
+
+      res = cell_resource(surf->texture);
+#if 0
+      cell->cbuf_map[buf] = screen->buffer_map(screen,
+                                               res->buffer,
+                                               (PIPE_BUFFER_USAGE_GPU_READ |
+                                                PIPE_BUFFER_USAGE_GPU_WRITE));
+#else
+      cell->cbuf_map[buf] = res->data;
+#endif
+   }
+
+   /* z/stencil buffer */
+   zs = cell->framebuffer.zsbuf;
+   if (zs) {
+      struct cell_resource *res = cell_resource(zs->texture);
+#if 0
+      cell->zsbuf_map = screen->buffer_map(screen,
+                                           res->buffer,
+                                           (PIPE_BUFFER_USAGE_GPU_READ |
+                                            PIPE_BUFFER_USAGE_GPU_WRITE));
+#else
+      cell->zsbuf_map = res->data;
+#endif
+   }
+}
+
+
+/**
+ * Forget the mapped addresses of the color and z/stencil surfaces.
+ *
+ * No actual unmap call is issued (that code is disabled); the cached
+ * cbuf_map[] / zsbuf_map pointers are simply reset to NULL for every
+ * bound surface that currently has one.
+ */
+static void
+cell_unmap_surfaces(struct cell_context *cell)
+{
+   struct pipe_surface *surf;
+   uint buf;
+
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      surf = cell->framebuffer.cbufs[buf];
+      if (!surf || !cell->cbuf_map[buf])
+         continue;
+
+      assert(surf->texture);
+      cell->cbuf_map[buf] = NULL;
+   }
+
+   surf = cell->framebuffer.zsbuf;
+   if (surf && cell->zsbuf_map)
+      cell->zsbuf_map = NULL;
+}
+
+
+/**
+ * Install a new framebuffer (color buffers plus z/stencil buffer).
+ *
+ * The update is unconditional: the memcmp() against the current state
+ * that could skip redundant changes is not enabled.
+ */
+static void
+cell_set_framebuffer_state(struct pipe_context *pipe,
+                           const struct pipe_framebuffer_state *fb)
+{
+   struct cell_context *cell = cell_context(pipe);
+   uint buf;
+
+   /* Drop the cached map pointers of the outgoing surfaces. */
+   cell_unmap_surfaces(cell);
+
+   /* Any pending rendering to the current surfaces has to finish
+    * before new surfaces are installed.
+    */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+
+   /* Copy the new state.  Re-pointing the surface references is also
+    * where the old surfaces finally get freed.
+    */
+   cell->framebuffer.width = fb->width;
+   cell->framebuffer.height = fb->height;
+   cell->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++)
+      pipe_surface_reference(&cell->framebuffer.cbufs[buf], fb->cbufs[buf]);
+   pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf);
+
+   /* Cache map pointers for the incoming surfaces. */
+   cell_map_surfaces(cell);
+
+   cell->dirty |= CELL_NEW_FRAMEBUFFER;
+}
+
+
+/**
+ * Plug this file's state callbacks into the function table of the
+ * context's embedded pipe_context.
+ */
+void
+cell_init_state_functions(struct cell_context *cell)
+{
+   struct pipe_context *pipe = &cell->pipe;
+
+   /* blend state objects */
+   pipe->create_blend_state = cell_create_blend_state;
+   pipe->bind_blend_state = cell_bind_blend_state;
+   pipe->delete_blend_state = cell_delete_blend_state;
+
+   /* sampler state objects */
+   pipe->create_sampler_state = cell_create_sampler_state;
+   pipe->bind_fragment_sampler_states = cell_bind_sampler_states;
+   pipe->delete_sampler_state = cell_delete_sampler_state;
+
+   /* sampler views */
+   pipe->set_fragment_sampler_views = cell_set_fragment_sampler_views;
+   pipe->create_sampler_view = cell_create_sampler_view;
+   pipe->sampler_view_destroy = cell_sampler_view_destroy;
+
+   /* depth/stencil/alpha state objects */
+   pipe->create_depth_stencil_alpha_state =
+      cell_create_depth_stencil_alpha_state;
+   pipe->bind_depth_stencil_alpha_state =
+      cell_bind_depth_stencil_alpha_state;
+   pipe->delete_depth_stencil_alpha_state =
+      cell_delete_depth_stencil_alpha_state;
+
+   /* rasterizer state objects */
+   pipe->create_rasterizer_state = cell_create_rasterizer_state;
+   pipe->bind_rasterizer_state = cell_bind_rasterizer_state;
+   pipe->delete_rasterizer_state = cell_delete_rasterizer_state;
+
+   /* immediate (non-object) state setters */
+   pipe->set_blend_color = cell_set_blend_color;
+   pipe->set_stencil_ref = cell_set_stencil_ref;
+   pipe->set_clip_state = cell_set_clip_state;
+   pipe->set_sample_mask = cell_set_sample_mask;
+
+   pipe->set_framebuffer_state = cell_set_framebuffer_state;
+
+   pipe->set_polygon_stipple = cell_set_polygon_stipple;
+   pipe->set_scissor_state = cell_set_scissor_state;
+   pipe->set_viewport_state = cell_set_viewport_state;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.h b/src/gallium/drivers/cell/ppu/cell_pipe_state.h
new file mode 100644
index 0000000000..1889bd52ff
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.h
@@ -0,0 +1,39 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_PIPE_STATE_H
+#define CELL_PIPE_STATE_H
+
+
+struct cell_context;
+
+/**
+ * Install the state-related callbacks (blend, sampler, depth/stencil,
+ * rasterizer, framebuffer, ...) on cell->pipe.
+ */
+extern void
+cell_init_state_functions(struct cell_context *cell);
+
+
+#endif /* CELL_PIPE_STATE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_public.h b/src/gallium/drivers/cell/ppu/cell_public.h
new file mode 100644
index 0000000000..7e2e093565
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_public.h
@@ -0,0 +1,10 @@
+#ifndef CELL_PUBLIC_H
+#define CELL_PUBLIC_H
+
+struct pipe_screen;
+struct sw_winsys;
+
+/**
+ * Create the Cell driver's pipe_screen on top of the given software
+ * winsys.  Returns NULL if the screen cannot be allocated.
+ */
+struct pipe_screen *
+cell_create_screen(struct sw_winsys *winsys);
+
+#endif
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
new file mode 100644
index 0000000000..f648482c55
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -0,0 +1,211 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Last stage of 'draw' pipeline: send tris to SPUs.
+ * \author  Brian Paul
+ */
+
+#include "cell_context.h"
+#include "cell_render.h"
+#include "cell_spu.h"
+#include "util/u_memory.h"
+#include "draw/draw_private.h"
+
+
+/**
+ * The Cell driver's final 'draw' pipeline stage.
+ * Extends draw_stage with a back-pointer to the owning context.
+ */
+struct render_stage {
+   struct draw_stage stage; /**< This must be first (base class) */
+
+   struct cell_context *cell; /**< set by cell_draw_render_stage() */
+};
+
+
+/**
+ * Downcast a generic draw_stage pointer to our render_stage subclass.
+ * Valid because 'stage' is the first member of struct render_stage.
+ */
+static INLINE struct render_stage *
+render_stage(struct draw_stage *stage)
+{
+   struct render_stage *rs = (struct render_stage *) stage;
+
+   return rs;
+}
+
+
+/**
+ * 'begin' hook of the render stage (installed by
+ * cell_draw_render_stage()).
+ * Currently a no-op: the entire body is compiled out with #if 0.
+ */
+static void render_begin( struct draw_stage *stage )
+{
+#if 0
+   struct render_stage *render = render_stage(stage);
+   struct cell_context *sp = render->cell;
+   const struct pipe_shader_state *fs = &render->cell->fs->shader;
+   render->quad.nr_attrs = render->cell->nr_frag_attrs;
+
+   render->firstFpInput = fs->input_semantic_name[0];
+
+   sp->quad.first->begin(sp->quad.first);
+#endif
+}
+
+
+/** 'end' hook of the render stage; intentionally empty. */
+static void render_end( struct draw_stage *stage )
+{
+}
+
+
+/**
+ * 'reset_stipple_counter' hook of the render stage.
+ *
+ * Does nothing at present: the statement that would reset the
+ * context's line stipple counter is commented out, so no render_stage
+ * lookup is needed (and none is made, which avoids an unused-variable
+ * warning).
+ */
+static void reset_stipple_counter( struct draw_stage *stage )
+{
+   (void) stage;
+   /*render_stage(stage)->cell->line_stipple_counter = 0;*/
+}
+
+
+/** 'point' hook of the render stage; points are not handled here (empty stub). */
+static void
+render_point(struct draw_stage *stage, struct prim_header *prim)
+{
+}
+
+
+/** 'line' hook of the render stage; lines are not handled here (empty stub). */
+static void
+render_line(struct draw_stage *stage, struct prim_header *prim)
+{
+}
+
+
+/**
+ * Store one vertex in slot \p pos of the prim buffer.
+ *
+ * The first two attributes (four components each) are copied, and the
+ * buffer's bounding box is grown to include components 0 and 1 of
+ * attribute 0.
+ */
+static void
+save_vertex(struct cell_prim_buffer *buf, uint pos,
+            const struct vertex_header *vert)
+{
+   uint slot, comp;
+
+   for (slot = 0; slot != 2; ++slot)
+      for (comp = 0; comp != 4; ++comp)
+         buf->vertex[pos][slot][comp] = vert->data[slot][comp];
+
+   /* extend the bounding box horizontally ... */
+   if (vert->data[0][0] < buf->xmin)
+      buf->xmin = vert->data[0][0];
+   if (vert->data[0][0] > buf->xmax)
+      buf->xmax = vert->data[0][0];
+
+   /* ... and vertically */
+   if (vert->data[0][1] < buf->ymin)
+      buf->ymin = vert->data[0][1];
+   if (vert->data[0][1] > buf->ymax)
+      buf->ymax = vert->data[0][1];
+}
+
+
+/**
+ * 'tri' hook of the render stage: append one triangle (three vertices)
+ * to the context's prim buffer, flushing the buffer first when the
+ * three new vertices would not fit.
+ */
+static void
+render_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+   struct render_stage *rs = render_stage(stage);
+   struct cell_context *cell = rs->cell;
+   struct cell_prim_buffer *buf = &cell->prim_buffer;
+   uint i;
+
+   if (buf->num_verts + 3 > CELL_MAX_VERTS) {
+      cell_flush_prim_buffer(cell);
+   }
+
+   i = buf->num_verts;
+   /* Slots i, i+1 and i+2 are written below, so all three must lie in
+    * [0, CELL_MAX_VERTS).  This mirrors the flush condition above; the
+    * previous "i+2 <= CELL_MAX_VERTS" accepted an out-of-range slot.
+    * (Assumes buf->vertex holds CELL_MAX_VERTS entries -- TODO confirm.)
+    */
+   assert(i + 3 <= CELL_MAX_VERTS);
+   save_vertex(buf, i+0, prim->v[0]);
+   save_vertex(buf, i+1, prim->v[1]);
+   save_vertex(buf, i+2, prim->v[2]);
+   buf->num_verts += 3;
+}
+
+
+/**
+ * Send a RENDER command to all SPUs to have them render the prims
+ * in the current prim_buffer.
+ *
+ * Does nothing when the prim buffer holds no vertices.  Otherwise a
+ * render command is filled in for each of the cell->num_spus SPUs and
+ * CELL_CMD_RENDER is posted to its mailbox; afterwards the vertex count
+ * and bounding box of the prim buffer are reset and cell_flush() is
+ * called.
+ *
+ * NOTE(review): struct cell_global_info as declared in cell_spu.h has no
+ * 'command' member, so this function looks stale relative to that
+ * header -- confirm whether this file is still part of the build.
+ */
+void
+cell_flush_prim_buffer(struct cell_context *cell)
+{
+   uint i;
+
+   if (cell->prim_buffer.num_verts == 0)
+      return;
+
+   for (i = 0; i < cell->num_spus; i++) {
+      struct cell_command_render *render = &cell_global.command[i].render;
+      render->prim_type = PIPE_PRIM_TRIANGLES;
+      render->num_verts = cell->prim_buffer.num_verts;
+      render->front_ccw = cell->rasterizer->front_ccw;
+      render->vertex_size = cell->vertex_info->size * 4;
+      render->xmin = cell->prim_buffer.xmin;
+      render->ymin = cell->prim_buffer.ymin;
+      render->xmax = cell->prim_buffer.xmax;
+      render->ymax = cell->prim_buffer.ymax;
+      /* every SPU is pointed at the same (shared) vertex array */
+      render->vertex_data = &cell->prim_buffer.vertex;
+      ASSERT_ALIGN16(render->vertex_data);
+      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_RENDER);
+   }
+
+   cell->prim_buffer.num_verts = 0;
+
+   /* Reset the bounding box to "empty" so the next vertex saved always
+    * updates it.
+    * NOTE(review): 1e100 is a double constant; if these fields are
+    * single-precision floats the value is out of range -- confirm the
+    * field type.
+    */
+   cell->prim_buffer.xmin = 1e100;
+   cell->prim_buffer.ymin = 1e100;
+   cell->prim_buffer.xmax = -1e100;
+   cell->prim_buffer.ymax = -1e100;
+
+   /* XXX temporary, need to double-buffer the prim buffer until we get
+    * a real command buffer/list system.
+    */
+   cell_flush(&cell->pipe, 0x0);
+}
+
+
+
+/**
+ * 'destroy' hook of the render stage: frees the stage object that
+ * cell_draw_render_stage() allocated.
+ */
+static void
+render_destroy(struct draw_stage *stage)
+{
+   FREE(stage);
+}
+
+
+/**
+ * Create a new draw/render stage.  This will be plugged into the
+ * draw module as the last pipeline stage.
+ *
+ * \param cell  the context the stage will feed; also supplies the draw
+ *              module pointer stored in the stage
+ * \return the embedded draw_stage, or NULL if allocation failed
+ */
+struct draw_stage *cell_draw_render_stage( struct cell_context *cell )
+{
+   struct render_stage *render = CALLOC_STRUCT(render_stage);
+
+   /* Allocation can fail; report it instead of writing through NULL. */
+   if (!render)
+      return NULL;
+
+   render->cell = cell;
+   render->stage.draw = cell->draw;
+   render->stage.begin = render_begin;
+   render->stage.point = render_point;
+   render->stage.line = render_line;
+   render->stage.tri = render_tri;
+   render->stage.end = render_end;
+   render->stage.reset_stipple_counter = reset_stipple_counter;
+   render->stage.destroy = render_destroy;
+
+   /*
+   render->quad.coef = render->coef;
+   render->quad.posCoef = &render->posCoef;
+   */
+
+   return &render->stage;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_render.h b/src/gallium/drivers/cell/ppu/cell_render.h
new file mode 100644
index 0000000000..826dcbafeb
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_render.h
@@ -0,0 +1,39 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_RENDER_H
+#define CELL_RENDER_H
+
+struct cell_context;
+struct draw_stage;
+
+/** Have the SPUs render the contents of cell->prim_buffer, then reset the buffer. */
+extern void
+cell_flush_prim_buffer(struct cell_context *cell);
+
+/** Create the final 'draw' pipeline stage for the given context. */
+extern struct draw_stage *cell_draw_render_stage( struct cell_context *cell );
+
+#endif /* CELL_RENDER_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
new file mode 100644
index 0000000000..0f12e0667e
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -0,0 +1,210 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "util/u_simple_screen.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+
+#include "cell/common.h"
+#include "cell_context.h"
+#include "cell_screen.h"
+#include "cell_texture.h"
+#include "cell_public.h"
+
+#include "state_tracker/sw_winsys.h"
+
+
+/** pipe_screen::get_vendor hook: returns a constant vendor string. */
+static const char *
+cell_get_vendor(struct pipe_screen *screen)
+{
+   (void) screen;
+
+   return "VMware, Inc.";
+}
+
+
+/** pipe_screen::get_name hook: returns a constant driver name. */
+static const char *
+cell_get_name(struct pipe_screen *screen)
+{
+   (void) screen;
+
+   return "Cell";
+}
+
+
+/**
+ * pipe_screen::get_param hook: report integer/boolean capabilities.
+ *
+ * Feature flags are reported as 0 or 1; limits are reported as counts.
+ * Any capability not listed here is reported as 0.
+ */
+static int
+cell_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return CELL_MAX_SAMPLERS;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return CELL_MAX_SAMPLERS;
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_GLSL:
+      return 1;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 0;
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 1;
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 1;
+   case PIPE_CAP_TIMER_QUERY:
+      return 0;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      /* feature flag like the ones above: report 1, not the stray 10 */
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return CELL_MAX_TEXTURE_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 8;  /* max 128x128x128 */
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return CELL_MAX_TEXTURE_LEVELS;
+   case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+      return 1; /* XXX not really true */
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+      return 0; /* XXX to do */
+   case PIPE_CAP_TGSI_CONT_SUPPORTED:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 0;
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
+
+/**
+ * pipe_screen::get_paramf hook: report floating-point limits.
+ * Capabilities not listed here are reported as 0.
+ */
+static float
+cell_get_paramf(struct pipe_screen *screen, enum pipe_cap param)
+{
+   float value;
+
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+   case PIPE_CAP_MAX_POINT_WIDTH:
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      value = 255.0; /* arbitrary */
+      break;
+
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      value = 16.0; /* arbitrary */
+      break;
+
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+   default:
+      value = 0.0;
+      break;
+   }
+
+   return value;
+}
+
+
+/**
+ * pipe_screen::is_format_supported hook.
+ *
+ * A format is rejected when more than one sample is requested, or when
+ * it is to be used as a display target / scanout / shared resource and
+ * the winsys does not support it for that purpose.  Beyond that, only a
+ * short list of formats known to work is accepted.
+ */
+static boolean
+cell_is_format_supported( struct pipe_screen *screen,
+                          enum pipe_format format,
+                          enum pipe_texture_target target,
+                          unsigned sample_count,
+                          unsigned tex_usage,
+                          unsigned geom_flags )
+{
+   struct sw_winsys *ws = cell_screen(screen)->winsys;
+   const unsigned display_binds = (PIPE_BIND_DISPLAY_TARGET |
+                                   PIPE_BIND_SCANOUT |
+                                   PIPE_BIND_SHARED);
+   boolean known;
+
+   if (sample_count > 1)
+      return FALSE;
+
+   if ((tex_usage & display_binds) &&
+       !ws->is_displaytarget_format_supported(ws, tex_usage, format))
+      return FALSE;
+
+   /* only a few formats are known to work at this time */
+   switch (format) {
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_I8_UNORM:
+      known = TRUE;
+      break;
+   default:
+      known = FALSE;
+      break;
+   }
+
+   return known;
+}
+
+
+/**
+ * pipe_screen::destroy hook: destroy the winsys (when it provides a
+ * destroy callback) and then free the screen object.
+ */
+static void
+cell_destroy_screen( struct pipe_screen *screen )
+{
+   struct sw_winsys *ws = cell_screen(screen)->winsys;
+
+   if (ws->destroy) {
+      ws->destroy(ws);
+   }
+
+   FREE(screen);
+}
+
+
+
+/**
+ * Create a new pipe_screen object.
+ *
+ * The object is a cell_screen (a pipe_screen subclass) that remembers
+ * the software winsys it was created for; the base class's own winsys
+ * pointer is left NULL.
+ *
+ * \return pointer to the embedded pipe_screen, or NULL if allocation
+ *         failed
+ */
+struct pipe_screen *
+cell_create_screen(struct sw_winsys *winsys)
+{
+   struct cell_screen *cscreen = CALLOC_STRUCT(cell_screen);
+   struct pipe_screen *base;
+
+   if (!cscreen)
+      return NULL;
+
+   base = &cscreen->base;
+
+   cscreen->winsys = winsys;
+   base->winsys = NULL;
+
+   /* screen-level callbacks */
+   base->destroy = cell_destroy_screen;
+   base->get_name = cell_get_name;
+   base->get_vendor = cell_get_vendor;
+   base->get_param = cell_get_param;
+   base->get_paramf = cell_get_paramf;
+   base->is_format_supported = cell_is_format_supported;
+   base->context_create = cell_create_context;
+
+   cell_init_screen_texture_funcs(base);
+
+   return base;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.h b/src/gallium/drivers/cell/ppu/cell_screen.h
new file mode 100644
index 0000000000..baff9d3b7d
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_screen.h
@@ -0,0 +1,55 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_SCREEN_H
+#define CELL_SCREEN_H
+
+
+#include "pipe/p_screen.h"
+
+struct sw_winsys;
+
+/**
+ * Cell driver subclass of pipe_screen.
+ * 'base' must stay the first member: cell_screen() converts a
+ * pipe_screen pointer to a cell_screen pointer with a plain cast.
+ */
+struct cell_screen {
+   struct pipe_screen base;
+
+   /* the software winsys passed to cell_create_screen() */
+   struct sw_winsys *winsys;
+
+   /* Increments whenever textures are modified.  Contexts can track
+    * this.
+    */
+   unsigned timestamp;          
+};
+
+/** Downcast a pipe_screen pointer to the enclosing cell_screen. */
+static INLINE struct cell_screen *
+cell_screen( struct pipe_screen *pipe )
+{
+   struct cell_screen *cs = (struct cell_screen *) pipe;
+
+   return cs;
+}
+
+
+#endif /* CELL_SCREEN_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
new file mode 100644
index 0000000000..39284f3a5d
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -0,0 +1,219 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Utility/wrappers for communicating with the SPUs.
+ */
+
+
+#include <pthread.h>
+
+#include "cell_spu.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+#include "cell/common.h"
+
+
+/*
+helpful headers:
+/opt/ibm/cell-sdk/prototype/src/include/ppu/cbe_mfc.h
+*/
+
+
+/**
+ * Cell/SPU info that's not per-context.
+ */
+struct cell_global_info cell_global;
+
+
+/**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ *
+ * \return the number that follows the ':' on the first line beginning
+ *         with "timebase", or 0 if the file cannot be opened or holds
+ *         no such line
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase = 0;   /* defined result when nothing is found */
+   char line[80];
+
+   assert(f);
+   if (!f) {
+      /* assert() vanishes in NDEBUG builds; don't read through NULL */
+      return 0;
+   }
+
+   /* Let fgets() drive the loop: testing feof() first would process a
+    * stale or uninitialized buffer after the final read fails.
+    */
+   while (fgets(line, sizeof(line), f)) {
+      if (strncmp(line, "timebase", 8) == 0) {
+         const char *colon = strchr(line, ':');
+         if (colon) {
+            /* strtoul() skips the blank after the colon itself, so
+             * there is no need to step a fixed two characters (which
+             * could run past the end of a short line).
+             */
+            timebase = (unsigned) strtoul(colon + 1, NULL, 10);
+            break;
+         }
+      }
+   }
+   fclose(f);
+
+   return timebase;
+}
+
+
+/**
+ * Post a single 32-bit word to the inbound mailbox of the given SPE
+ * context, using the SPE_MBOX_ALL_BLOCKING behavior.
+ */
+void
+send_mbox_message(spe_context_ptr_t ctx, unsigned int msg)
+{
+   unsigned int word = msg;
+
+   spe_in_mbox_write(ctx, &word, 1, SPE_MBOX_ALL_BLOCKING);
+}
+
+
+/**
+ * Poll the outbound mailbox of the given SPE context until exactly one
+ * word has been read, and return that word.
+ */
+uint
+wait_mbox_message(spe_context_ptr_t ctx)
+{
+   for (;;) {
+      unsigned word;
+      const int nread = spe_out_mbox_read(ctx, &word, 1);
+
+      if (nread == 1)
+         return word;
+
+      /* A negative count signals an error; like an empty mailbox it is
+       * not acted upon and the read is simply retried.
+       */
+   }
+}
+
+
+/**
+ * Thread entry point handed to pthread_create() by cell_start_spus().
+ *
+ * \param arg  the cell_init_info of the SPU this thread drives; it is
+ *             passed on to the SPE program and its 'id' selects the SPE
+ *             context to run
+ *
+ * Terminates the whole process if spe_context_run() reports failure.
+ */
+static void *
+cell_thread_function(void *arg)
+{
+   struct cell_init_info *info = (struct cell_init_info *) arg;
+   unsigned entry_point = SPE_DEFAULT_ENTRY;
+   int status;
+
+   ASSERT_ALIGN16(info);
+
+   status = spe_context_run(cell_global.spe_contexts[info->id],
+                            &entry_point, 0, info, NULL, NULL);
+   if (status < 0) {
+      fprintf(stderr, "spe_context_run() failed\n");
+      exit(1);
+   }
+
+   pthread_exit(NULL);
+}
+
+
+/**
+ * Create the SPU threads.  This is done once during driver initialization.
+ * This involves setting the "init" message which is sent to each SPU.
+ * The init message specifies an SPU id, total number of SPUs, location
+ * and number of batch buffers, etc.
+ *
+ * A second call aborts the process (multiple contexts are unsupported),
+ * and any failure to create an SPE context, load the SPU program or
+ * start a thread terminates the process with exit(1).
+ */
+void
+cell_start_spus(struct cell_context *cell)
+{
+   static boolean one_time_init = FALSE;
+   uint i, j;
+   uint timebase = get_timebase();
+
+   if (one_time_init) {
+      fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
+	      "on Cell.\n");
+      abort();
+   }
+
+   one_time_init = TRUE;
+
+   assert(cell->num_spus <= CELL_MAX_SPUS);
+
+   /* Checking two consecutive elements verifies both the array's base
+    * alignment and that the element size keeps later entries aligned.
+    */
+   ASSERT_ALIGN16(&cell_global.inits[0]);
+   ASSERT_ALIGN16(&cell_global.inits[1]);
+
+   /*
+    * Initialize the global 'inits' structure for each SPU.
+    * A pointer to the init struct will be passed to each SPU.
+    * The SPUs will then each grab their init info with mfc_get().
+    */
+   for (i = 0; i < cell->num_spus; i++) {
+      cell_global.inits[i].id = i;
+      cell_global.inits[i].num_spus = cell->num_spus;
+      cell_global.inits[i].debug_flags = cell->debug_flags;
+      cell_global.inits[i].inv_timebase = 1000.0f / timebase;
+
+      for (j = 0; j < CELL_NUM_BUFFERS; j++) {
+         cell_global.inits[i].buffers[j] = cell->buffer[j];
+      }
+      cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
+
+      cell_global.inits[i].spu_functions = &cell->spu_functions;
+
+      cell_global.spe_contexts[i] = spe_context_create(0, NULL);
+      if (!cell_global.spe_contexts[i]) {
+         fprintf(stderr, "spe_context_create() failed\n");
+         exit(1);
+      }
+
+      if (spe_program_load(cell_global.spe_contexts[i], &g3d_spu)) {
+         fprintf(stderr, "spe_program_load() failed\n");
+         exit(1);
+      }
+
+      /* pthread_create() returns a non-zero error number on failure.
+       * Treat it like the other start-up failures above: without the
+       * thread this SPU never runs and cell_spu_exit() would later
+       * join an invalid handle.
+       */
+      if (pthread_create(&cell_global.spe_threads[i], /* returned thread handle */
+                         NULL,                        /* pthread attribs */
+                         &cell_thread_function,       /* start routine */
+                         &cell_global.inits[i])       /* thread argument */
+          != 0) {
+         fprintf(stderr, "pthread_create() failed\n");
+         exit(1);
+      }
+   }
+}
+
+
+/**
+ * Tell all the SPUs to stop/exit.
+ * This is done when the driver's exiting / cleaning up.
+ *
+ * CELL_CMD_EXIT is posted to every SPU first; then each SPU thread is
+ * joined and its SPE context is destroyed.
+ */
+void
+cell_spu_exit(struct cell_context *cell)
+{
+   uint i;
+
+   for (i = 0; i < cell->num_spus; i++) {
+      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_EXIT);
+   }
+
+   /* wait for threads to exit, then release the per-SPU resources */
+   for (i = 0; i < cell->num_spus; i++) {
+      void *value;
+      pthread_join(cell_global.spe_threads[i], &value);
+      cell_global.spe_threads[i] = 0;
+
+      /* Once the thread has been joined, the spe_context_run() call it
+       * made has finished, so the context obtained from
+       * spe_context_create() in cell_start_spus() can be destroyed
+       * rather than leaked by merely clearing the handle.
+       */
+      if (cell_global.spe_contexts[i]) {
+         spe_context_destroy(cell_global.spe_contexts[i]);
+      }
+      cell_global.spe_contexts[i] = 0;
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
new file mode 100644
index 0000000000..c93958a9ed
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_SPU
+#define CELL_SPU
+
+
+#include <libspe2.h>
+#include <pthread.h>
+#include "cell/common.h"
+
+#include "cell_context.h"
+
+
+/**
+ * Global vars, for now anyway.
+ *
+ * Per-SPU bookkeeping: every array has CELL_MAX_SPUS slots and slot i
+ * holds the data for SPU i.
+ */
+struct cell_global_info
+{
+   /**
+    * SPU/SPE handles, etc
+    */
+   spe_context_ptr_t spe_contexts[CELL_MAX_SPUS];  /**< libspe2 context per SPU */
+   pthread_t spe_threads[CELL_MAX_SPUS];           /**< thread created for each SPU */
+
+   /**
+    * Data sent to SPUs at start-up
+    * (a pointer to inits[i] is the argument of SPU i's thread)
+    */
+   struct cell_init_info inits[CELL_MAX_SPUS];
+};
+
+
+/** Global per-SPU bookkeeping; the definition lives outside this header. */
+extern struct cell_global_info cell_global;
+
+
+/** This is the handle for the actual SPE code */
+extern spe_program_handle_t g3d_spu;
+
+
+/**
+ * Send the message \p msg to the SPU behind \p ctx
+ * (cell_spu_exit() uses this to deliver CELL_CMD_EXIT).
+ */
+extern void
+send_mbox_message(spe_context_ptr_t ctx, unsigned int msg);
+
+/* NOTE(review): implementation not visible here; presumably waits for a
+ * mailbox message from the SPU behind ctx and returns it -- confirm.
+ */
+extern uint
+wait_mbox_message(spe_context_ptr_t ctx);
+
+
+/**
+ * For each of cell->num_spus SPUs: fill in its init info, create an SPE
+ * context, load g3d_spu into it and start a thread for it.
+ */
+extern void
+cell_start_spus(struct cell_context *cell);
+
+
+/** Send CELL_CMD_EXIT to every SPU and join the SPU threads. */
+extern void
+cell_spu_exit(struct cell_context *cell);
+
+
+#endif /* CELL_SPU */
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
new file mode 100644
index 0000000000..7adedcde57
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_STATE_H
+#define CELL_STATE_H
+
+
+/*
+ * Dirty-state flags, OR'ed together in cell_context::dirty.
+ * Each flag occupies its own bit.
+ */
+#define CELL_NEW_VIEWPORT      (1 << 0)
+#define CELL_NEW_RASTERIZER    (1 << 1)
+#define CELL_NEW_FS            (1 << 2)
+#define CELL_NEW_BLEND         (1 << 3)
+#define CELL_NEW_CLIP          (1 << 4)
+#define CELL_NEW_SCISSOR       (1 << 5)
+#define CELL_NEW_STIPPLE       (1 << 6)
+#define CELL_NEW_FRAMEBUFFER   (1 << 7)
+#define CELL_NEW_ALPHA_TEST    (1 << 8)
+#define CELL_NEW_DEPTH_STENCIL (1 << 9)
+#define CELL_NEW_SAMPLER       (1 << 10)
+#define CELL_NEW_TEXTURE       (1 << 11)
+#define CELL_NEW_VERTEX        (1 << 12)
+#define CELL_NEW_VS            (1 << 13)
+#define CELL_NEW_VS_CONSTANTS  (1 << 14)
+#define CELL_NEW_FS_CONSTANTS  (1 << 15)
+#define CELL_NEW_VERTEX_INFO   (1 << 16)
+
+
+/**
+ * Recompute derived state where needed, emit the dirty state via
+ * cell_emit_state() and clear cell->dirty.
+ */
+extern void
+cell_update_derived( struct cell_context *cell );
+
+
+/* NOTE(review): implementation not visible here; presumably installs the
+ * shader-state entry points on cell -- confirm.
+ */
+extern void
+cell_init_shader_functions(struct cell_context *cell);
+
+
+/* NOTE(review): implementation not visible here; presumably installs the
+ * vertex-state entry points on cell -- confirm.
+ */
+extern void
+cell_init_vertex_functions(struct cell_context *cell);
+
+
+#endif /* CELL_STATE_H */
+
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
new file mode 100644
index 0000000000..b723e794e7
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -0,0 +1,170 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_state.h"
+#include "cell_state_emit.h"
+
+
+/**
+ * Work out which vertex shader outputs feed the current fragment shader
+ * and record them, with their interpolation modes, in cell->vertex_info.
+ * The result describes the post-transform vertex attributes from which
+ * the triangle interpolation coefficients will be computed.
+ *
+ * Position is always emitted first; the other attributes follow in the
+ * order of the fragment shader's inputs.  CELL_NEW_VERTEX_INFO is set in
+ * cell->dirty on every call.
+ */
+static void
+calculate_vertex_layout( struct cell_context *cell )
+{
+   const struct cell_fragment_shader_state *fs = cell->fs;
+   struct vertex_info *vinfo = &cell->vertex_info;
+   enum interp_mode colorInterp;
+   uint input;
+   int vs_out;
+
+   /* with flat shading, colors are constant rather than interpolated */
+   if (cell->rasterizer->flatshade)
+      colorInterp = INTERP_CONSTANT;
+   else
+      colorInterp = INTERP_LINEAR;
+
+#if 0
+   if (cell->vbuf) {
+      /* if using the post-transform vertex buffer, tell draw_vbuf to
+       * simply emit the whole post-xform vertex as-is:
+       */
+      struct vertex_info *vinfo_vbuf = &cell->vertex_info_vbuf;
+      vinfo_vbuf->num_attribs = 0;
+      draw_emit_vertex_attr(vinfo_vbuf, EMIT_ALL, INTERP_NONE, 0);
+      vinfo_vbuf->size = 4 * vs->num_outputs + sizeof(struct vertex_header)/4;
+   }
+#endif
+
+   /* start from an empty attribute list */
+   vinfo->num_attribs = 0;
+
+   /* vertex position is emitted regardless of what the shader reads */
+   vs_out = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
+   assert(vs_out >= 0);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, vs_out);
+
+   /*
+    * Walk the fragment shader inputs and pair each one with the vertex
+    * shader output that carries the same semantic.
+    */
+   for (input = 0; input < fs->info.num_inputs; input++) {
+      const uint sem_name = fs->info.input_semantic_name[input];
+
+      if (sem_name == TGSI_SEMANTIC_POSITION) {
+         /* handled before the loop */
+         continue;
+      }
+
+      if (sem_name == TGSI_SEMANTIC_COLOR ||
+          sem_name == TGSI_SEMANTIC_GENERIC) {
+         /* generic includes texcoords and varying vars; only colors
+          * follow the flat-shade setting
+          */
+         const enum interp_mode interp = (sem_name == TGSI_SEMANTIC_COLOR)
+            ? colorInterp : INTERP_PERSPECTIVE;
+
+         vs_out = draw_find_shader_output(cell->draw, sem_name,
+                                          fs->info.input_semantic_index[input]);
+         assert(vs_out >= 0);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, vs_out);
+      }
+      else if (sem_name == TGSI_SEMANTIC_FOG) {
+         vs_out = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
+#if 1
+         if (vs_out < 0) /* XXX temp hack, try demos/fogcoord.c with this */
+            vs_out = 0;
+#endif
+         assert(vs_out >= 0);
+         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, vs_out);
+      }
+      else {
+         /* no other input semantic is expected here */
+         assert(0);
+      }
+   }
+
+   draw_compute_vertex_size(vinfo);
+
+   /* XXX only signal this if format really changes */
+   cell->dirty |= CELL_NEW_VERTEX_INFO;
+}
+
+
+#if 0
+/**
+ * Recompute cliprect from scissor bounds, scissor enable and surface size.
+ *
+ * Currently compiled out, as is the call to it in cell_update_derived().
+ * With scissoring enabled the rect is the scissor box clamped to the
+ * surface (min edges clamped at 0, max edges at the surface width/height);
+ * otherwise it is the whole surface.
+ */
+static void
+compute_cliprect(struct cell_context *sp)
+{
+   uint surfWidth = sp->framebuffer.width;
+   uint surfHeight = sp->framebuffer.height;
+
+   if (sp->rasterizer->scissor) {
+      /* clip to scissor rect */
+      sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
+      sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
+      sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
+      sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight);
+   }
+   else {
+      /* clip to surface bounds */
+      sp->cliprect.minx = 0;
+      sp->cliprect.miny = 0;
+      sp->cliprect.maxx = surfWidth;
+      sp->cliprect.maxy = surfHeight;
+   }
+}
+#endif
+
+
+
+/**
+ * Bring derived state up to date and emit the current state prior to
+ * rendering.  All dirty flags are cleared on return.
+ */
+void cell_update_derived( struct cell_context *cell )
+{
+   /* state groups that the vertex layout is derived from */
+   const uint layout_inputs = CELL_NEW_RASTERIZER | CELL_NEW_FS | CELL_NEW_VS;
+
+   if (cell->dirty & layout_inputs) {
+      calculate_vertex_layout( cell );
+   }
+
+#if 0
+   if (cell->dirty & (CELL_NEW_SCISSOR |
+                      CELL_NEW_DEPTH_STENCIL_ALPHA |
+                      CELL_NEW_FRAMEBUFFER))
+      compute_cliprect(cell);
+#endif
+
+   cell_emit_state(cell);
+
+   /* everything dirty has been emitted */
+   cell->dirty = 0;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
new file mode 100644
index 0000000000..bb11c68fa2
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -0,0 +1,343 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+#include "cell_state.h"
+#include "cell_state_emit.h"
+#include "cell_batch.h"
+#include "cell_texture.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+
+/**
+ * Append lnop instructions to \p f until its code size is a multiple of
+ * 8 bytes.  This is required to ensure that the dual pipe instruction
+ * alignment is correct.  It's also important for the SPU unpacking,
+ * which assumes 8-byte boundaries.
+ *
+ * \return the resulting code size, in bytes
+ */
+static unsigned int
+pad_code_to_8_bytes(struct spe_function *f)
+{
+   unsigned int size = spe_code_size(f);
+   while (size % 8 != 0) {
+      spe_lnop(f);
+      size = spe_code_size(f);
+   }
+   return size;
+}
+
+
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ *
+ * The cache key is built from the blend state, blend color,
+ * depth/stencil state and the color and Z/stencil surface formats.  On a
+ * cache miss, SPE code is generated for front- and back-facing fragments,
+ * packed into a new command object and inserted into
+ * cell->fragment_ops_cache.
+ *
+ * \param cell  context supplying the current state and the cache
+ * \return the cached or newly created command, or NULL if a new command
+ *         could not be allocated
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+
+   /*
+    * Build key
+    */
+   memset(&key, 0, sizeof(key));
+   key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
+   key.dsa = *cell->depth_stencil;
+
+   /* either surface may be absent */
+   if (cell->framebuffer.cbufs[0])
+      key.color_format = cell->framebuffer.cbufs[0]->format;
+   else
+      key.color_format = PIPE_FORMAT_NONE;
+
+   if (cell->framebuffer.zsbuf)
+      key.zs_format = cell->framebuffer.zsbuf->format;
+   else
+      key.zs_format = PIPE_FORMAT_NONE;
+
+   /*
+    * Look up key in cache.
+    */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+   /*
+    * If not found, create/save new fragment ops command.
+    */
+   if (!ops) {
+      struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
+      unsigned int front_code_size, back_code_size;
+
+      if (0)
+         debug_printf("**** Create New Fragment Ops\n");
+
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
+
+      /* Generate new code.  Always generate new code for both front-facing
+       * and back-facing fragments, even if it's the same code in both
+       * cases.
+       */
+      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
+      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
+
+      /* Make sure each piece of code is a multiple of 8 bytes long. */
+      front_code_size = pad_code_to_8_bytes(&spe_code_front);
+      back_code_size = pad_code_to_8_bytes(&spe_code_back);
+
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size &&
+          memcmp(spe_code_front.store, spe_code_back.store,
+                 front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
+      if (!ops) {
+         /* Out of memory: don't write through a NULL pointer, and don't
+          * leak the rtasm buffers on the way out.
+          */
+         spe_release_func(&spe_code_front);
+         spe_release_func(&spe_code_back);
+         return NULL;
+      }
+
+      /* populate the new cell_command_fragment_ops object */
+      ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS;
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
+      ops->dsa = *cell->depth_stencil;
+      ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
+
+      /* insert cell_command_fragment_ops object into keymap/cache */
+      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+      /* release rtasm buffer */
+      spe_release_func(&spe_code_front);
+      spe_release_func(&spe_code_back);
+   }
+   else {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+   }
+
+   return ops;
+}
+
+
+
+/**
+ * Append a simple state command to the current batch buffer: the command
+ * ID in the first 32-bit word, followed by a copy of the state data.
+ *
+ * \param cell        context owning the batch buffer
+ * \param cmd         command ID to store in the first word
+ * \param state       state data to copy into the command
+ * \param state_size  size of the state data, in bytes
+ */
+static void
+emit_state_cmd(struct cell_context *cell, uint cmd,
+               const void *state, uint state_size)
+{
+   uint32_t *words;
+
+   /* room for the opcode plus the payload, rounded up to 16 bytes */
+   words = (uint32_t *)
+      cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size));
+
+   words[0] = cmd;
+
+   /* the payload starts four 32-bit words (16 bytes) into the command */
+   memcpy(&words[4], state, state_size);
+}
+
+
+/**
+ * For state marked as 'dirty', construct a state-update command block
+ * and insert it into the current batch buffer.
+ *
+ * Commands are emitted per dirty state group, selected by the CELL_NEW_*
+ * bits in cell->dirty.  The per-unit dirty_samplers and dirty_textures
+ * masks are cleared here; cell->dirty itself is not modified.
+ *
+ * \param cell  context whose dirty state is to be emitted
+ */
+void
+cell_emit_state(struct cell_context *cell)
+{
+   if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
+      struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
+      struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
+      STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0);
+      struct cell_command_framebuffer *fb
+         = cell_batch_alloc16(cell, sizeof(*fb));
+      fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER;
+      fb->color_start = cell->cbuf_map[0];
+      /* Either surface may be absent (lookup_fragment_ops() allows for
+       * that too), so never dereference one unconditionally.
+       */
+      fb->color_format = cbuf ? cbuf->format : PIPE_FORMAT_NONE;
+      fb->depth_start = cell->zsbuf_map;
+      fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
+      fb->width = cell->framebuffer.width;
+      fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", util_format_name(fb->color_format));
+      printf("EMIT depth format %s\n", util_format_name(fb->depth_format));
+#endif
+   }
+
+   if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0);
+      struct cell_command_rasterizer *rast =
+         cell_batch_alloc16(cell, sizeof(*rast));
+      rast->opcode[0] = CELL_CMD_STATE_RASTERIZER;
+      rast->rasterizer = *cell->rasterizer;
+   }
+
+   if (cell->dirty & (CELL_NEW_FS)) {
+      /* Send new fragment program to SPUs */
+      STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0);
+      struct cell_command_fragment_program *fp
+            = cell_batch_alloc16(cell, sizeof(*fp));
+      fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+      fp->num_inst = cell->fs->code.num_inst;
+      /* NOTE(review): this copies the maximum program size regardless of
+       * num_inst; presumably code.store is always at least that large --
+       * confirm.
+       */
+      memcpy(&fp->code, cell->fs->code.store,
+             SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
+      if (0) {
+         int i;
+         printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n");
+         for (i = 0; i < fp->num_inst; i++) {
+            printf(" %3d: 0x%08x\n", i, fp->code[i]);
+         }
+      }
+   }
+
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader]->width0 / sizeof(float);
+      uint i, j;
+      /* 32 bytes of header (8 words) precede the constants themselves */
+      float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
+      uint32_t *ibuf = (uint32_t *) buf;
+      const float *constants = cell->mapped_constants[shader];
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[4] = num_const;
+      j = 8;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+   }
+
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
+      struct cell_command_fragment_ops *fops, *fops_cmd;
+      /* Note that cell_command_fragment_ops is a variant-sized record */
+      fops = lookup_fragment_ops(cell);
+      /* Guard against a NULL result rather than dereferencing it. */
+      if (fops) {
+         fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size));
+         memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
+      }
+   }
+
+   if (cell->dirty & CELL_NEW_SAMPLER) {
+      uint i;
+      for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->dirty_samplers & (1 << i)) {
+            /* nothing is emitted for a dirty unit with no sampler bound */
+            if (cell->sampler[i]) {
+               STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0);
+               struct cell_command_sampler *sampler
+                  = cell_batch_alloc16(cell, sizeof(*sampler));
+               sampler->opcode[0] = CELL_CMD_STATE_SAMPLER;
+               sampler->unit = i;
+               sampler->state = *cell->sampler[i];
+            }
+         }
+      }
+      cell->dirty_samplers = 0x0;
+   }
+
+   if (cell->dirty & CELL_NEW_TEXTURE) {
+      uint i;
+      for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
+         if (cell->dirty_textures & (1 << i)) {
+            STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0);
+            struct cell_command_texture *texture =
+               (struct cell_command_texture *)
+               cell_batch_alloc16(cell, sizeof(*texture));
+
+            texture->opcode[0] = CELL_CMD_STATE_TEXTURE;
+            texture->unit = i;
+            if (cell->texture[i]) {
+               struct cell_resource *ct = cell->texture[i];
+               uint level;
+               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+                  texture->start[level] = (ct->mapped +
+                                           ct->level_offset[level]);
+                  texture->width[level] = u_minify(ct->base.width0, level);
+                  texture->height[level] = u_minify(ct->base.height0, level);
+                  texture->depth[level] = u_minify(ct->base.depth0, level);
+               }
+               texture->target = ct->base.target;
+            }
+            else {
+               /* unit has no texture: send an all-zero description */
+               uint level;
+               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+                  texture->start[level] = NULL;
+                  texture->width[level] = 0;
+                  texture->height[level] = 0;
+                  texture->depth[level] = 0;
+               }
+               texture->target = 0;
+            }
+         }
+      }
+      cell->dirty_textures = 0x0;
+   }
+
+   if (cell->dirty & CELL_NEW_VERTEX_INFO) {
+      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
+                     &cell->vertex_info, sizeof(struct vertex_info));
+   }
+
+#if 0
+   if (cell->dirty & CELL_NEW_VS) {
+      const struct draw_context *const draw = cell->draw;
+      struct cell_shader_info info;
+
+      info.num_outputs = draw_num_shader_outputs(draw);
+      info.declarations = (uintptr_t) draw->vs.machine.Declarations;
+      info.num_declarations = draw->vs.machine.NumDeclarations;
+      info.instructions = (uintptr_t) draw->vs.machine.Instructions;
+      info.num_instructions = draw->vs.machine.NumInstructions;
+      info.immediates = (uintptr_t) draw->vs.machine.Imms;
+      info.num_immediates = draw->vs.machine.ImmLimit / 4;
+
+      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info));
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.h b/src/gallium/drivers/cell/ppu/cell_state_emit.h
new file mode 100644
index 0000000000..59f8affe8d
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.h
@@ -0,0 +1,36 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_STATE_EMIT_H
+#define CELL_STATE_EMIT_H
+
+
+/**
+ * For state marked as 'dirty' in \p cell, construct state-update
+ * commands and insert them into the current batch buffer.
+ */
+extern void
+cell_emit_state(struct cell_context *cell);
+
+
+#endif /* CELL_STATE_EMIT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
new file mode 100644
index 0000000000..dc33e7ccc2
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -0,0 +1,1432 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Generate code to perform all per-fragment operations.
+ *
+ * Code generated by these functions performs alpha, depth, and stencil
+ * testing as well as alpha blending.
+ *
+ * \note
+ * Occlusion query is not supported, but this is the right place to add that
+ * support.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "cell_context.h"
+
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Generate code to perform alpha testing.
+ *
+ * The code generated by this function uses the register specified by
+ * \c mask as both an input and an output: the bits of fragments that fail
+ * the test "alpha FUNC ref" are cleared in \c mask.
+ *
+ * \param dsa    Current alpha-test state
+ * \param f      Function to which code should be appended
+ * \param mask   Index of register containing active fragment mask
+ * \param alphas Index of register containing per-fragment alpha values
+ *
+ * \note Emits a maximum of 6 instructions.
+ */
+static void
+emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int alphas)
+{
+   /* If the alpha function is either NEVER or ALWAYS, there is no need to
+    * load the reference value into a register.  ALWAYS is a fairly common
+    * case, and this optimization saves 2 instructions.
+    */
+   if (dsa->alpha.enabled
+       && (dsa->alpha.func != PIPE_FUNC_NEVER)
+       && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      int ref = spe_allocate_available_register(f);
+      int tmp_a = spe_allocate_available_register(f);
+      int tmp_b = spe_allocate_available_register(f);
+      union {
+         float f;
+         unsigned u;
+      } ref_val;
+      boolean complement = FALSE;
+
+      ref_val.f = dsa->alpha.ref;
+
+      /* Build the 32-bit float bit pattern from two 16-bit immediates.
+       * "ilhu" places its immediate in the upper halfword of each word and
+       * zeroes the lower halfword; "iohl" then ORs the lower halfword in.
+       * (An "il" followed by "ilh" does not work: "ilh" rewrites both
+       * halfwords of every word with the same immediate.)
+       */
+      spe_ilhu(f, ref, ref_val.u >> 16);
+      spe_iohl(f, ref, ref_val.u & 0x0000ffff);
+
+      /* Only "alpha == ref" and "alpha > ref" are computed directly.  The
+       * remaining functions are obtained by combining and / or complementing
+       * those two results.  The incoming alpha is always the first compare
+       * operand so that the test reads "alpha FUNC ref".
+       */
+      switch (dsa->alpha.func) {
+      case PIPE_FUNC_NOTEQUAL:
+         complement = TRUE;
+         /* FALLTHROUGH */
+
+      case PIPE_FUNC_EQUAL:
+         /* tmp_a[i] = (alphas[i] == ref) */
+         spe_fceq(f, tmp_a, alphas, ref);
+         break;
+
+      case PIPE_FUNC_LEQUAL:
+         complement = TRUE;
+         /* FALLTHROUGH */
+
+      case PIPE_FUNC_GREATER:
+         /* tmp_a[i] = (alphas[i] > ref) */
+         spe_fcgt(f, tmp_a, alphas, ref);
+         break;
+
+      case PIPE_FUNC_LESS:
+         complement = TRUE;
+         /* FALLTHROUGH */
+
+      case PIPE_FUNC_GEQUAL:
+         /* tmp_a[i] = (alphas[i] > ref) | (alphas[i] == ref) */
+         spe_fcgt(f, tmp_a, alphas, ref);
+         spe_fceq(f, tmp_b, alphas, ref);
+         spe_or(f, tmp_a, tmp_b, tmp_a);
+         break;
+
+      case PIPE_FUNC_ALWAYS:
+      case PIPE_FUNC_NEVER:
+      default:
+         /* Filtered out by the enclosing condition. */
+         assert(0);
+         break;
+      }
+
+      /* Remove the failing fragments from the active mask. */
+      if (complement) {
+         spe_andc(f, mask, mask, tmp_a);
+      } else {
+         spe_and(f, mask, mask, tmp_a);
+      }
+
+      spe_release_register(f, ref);
+      spe_release_register(f, tmp_a);
+      spe_release_register(f, tmp_b);
+   } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) {
+      /* Nothing passes: clear the whole mask. */
+      spe_il(f, mask, 0);
+   }
+}
+
+
+/**
+ * Emit the depth (Z) comparison for a group of four fragments.
+ *
+ * \param dsa        Depth-test state; a disabled depth test is treated as
+ *                   \c PIPE_FUNC_ALWAYS
+ * \param f          Function to which code should be appended
+ * \param mask       Index of register that receives the comparison result
+ * \param stored     Index of register containing values from depth buffer
+ * \param calculated Index of register containing per-fragment depth values
+ *
+ * \return
+ * \c FALSE when the value written to \c mask can be used as the depth-pass
+ * mask directly, \c TRUE when the caller must use its complement instead.
+ *
+ * \note Emits a maximum of 3 instructions.
+ */
+static boolean
+emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
+                struct spe_function *f, int mask, int stored, int calculated)
+{
+   unsigned func = PIPE_FUNC_ALWAYS;
+   boolean inverted;
+   int equal;
+
+   if (dsa->depth.enabled) {
+      func = dsa->depth.func;
+   }
+
+   /* Scratch register for the "equal" half of the GEQUAL / LESS test. */
+   equal = spe_allocate_available_register(f);
+
+   /* NOTEQUAL, LEQUAL and LESS are generated as their opposites (EQUAL,
+    * GREATER and GEQUAL); the caller is told to invert the result.
+    */
+   inverted = (func == PIPE_FUNC_NOTEQUAL
+               || func == PIPE_FUNC_LEQUAL
+               || func == PIPE_FUNC_LESS);
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      spe_il(f, mask, ~0);
+      break;
+
+   case PIPE_FUNC_EQUAL:
+   case PIPE_FUNC_NOTEQUAL:
+      /* mask[i] = (calculated[i] == stored[i]) */
+      spe_ceq(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_GREATER:
+   case PIPE_FUNC_LEQUAL:
+      /* mask[i] = (calculated[i] > stored[i]) */
+      spe_clgt(f, mask, calculated, stored);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+   case PIPE_FUNC_LESS:
+      /* mask[i] = (calculated[i] > stored[i]) | (calculated[i] == stored[i]) */
+      spe_clgt(f, mask, calculated, stored);
+      spe_ceq(f, equal, calculated, stored);
+      spe_or(f, mask, mask, equal);
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   spe_release_register(f, equal);
+
+   return inverted;
+}
+
+
+/**
+ * Generate code to apply the stencil operation (after testing).
+ *
+ * \param f     Function to which code should be appended
+ * \param out   Index of register receiving the updated stencil values
+ * \param in    Index of register containing the current stencil values
+ * \param mask  Index of register selecting the fragments to update; where
+ *              the mask is clear the value from \c in is passed through
+ * \param op    Stencil operation (\c PIPE_STENCIL_OP_x).  Callers are
+ *              expected to filter out \c PIPE_STENCIL_OP_KEEP.
+ * \param ref   Stencil reference value, used by \c PIPE_STENCIL_OP_REPLACE
+ *
+ * \note Emits a maximum of 5 instructions.
+ *
+ * \warning
+ * Since \c out and \c in might be the same register, this routine cannot
+ * generate code that uses \c out as a temporary.
+ */
+static void
+emit_stencil_op(struct spe_function *f,
+                int out, int in, int mask, unsigned op, unsigned ref)
+{
+   const int clamp = spe_allocate_available_register(f);
+   const int clamp_mask = spe_allocate_available_register(f);
+   const int result = spe_allocate_available_register(f);
+
+   switch(op) {
+   case PIPE_STENCIL_OP_ZERO:
+      spe_il(f, result, 0);
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      spe_il(f, result, ref);
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      /* clamp = [0xff, 0xff, 0xff, 0xff] */
+      spe_il(f, clamp, 0x0ff);
+      /* result[i] = in[i] + 1 */
+      spe_ai(f, result, in, 1);
+      /* clamp_mask[i] = (result[i] > 0xff) */
+      spe_clgti(f, clamp_mask, result, 0x0ff);
+      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */
+      spe_selb(f, result, result, clamp, clamp_mask);
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      spe_il(f, clamp, 0);
+      spe_ai(f, result, in, -1);
+
+      /* If "(s-1) < 0" in signed arithmetic, then "(s-1) > MAX" in unsigned
+       * arithmetic.
+       */
+      spe_clgti(f, clamp_mask, result, 0x0ff);
+      spe_selb(f, result, result, clamp, clamp_mask);
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* The clamping cases above treat each element as a value in
+       * [0, 0xff], so wrapping must be done modulo 256: discard everything
+       * above the low 8 bits.
+       */
+      spe_ai(f, result, in, 1);
+      spe_andi(f, result, result, 0x0ff);
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      spe_ai(f, result, in, -1);
+      spe_andi(f, result, result, 0x0ff);
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      /* result[i] = ~in[i], restricted to the 8 stencil bits */
+      spe_nor(f, result, in, in);
+      spe_andi(f, result, result, 0x0ff);
+      break;
+   case PIPE_STENCIL_OP_KEEP:
+   default:
+      /* Not expected here.  If asserts are compiled out, behave like KEEP
+       * (result = in) rather than zeroing the stencil or selecting from an
+       * uninitialized register.
+       */
+      assert(0);
+      spe_or(f, result, in, in);
+      break;
+   }
+
+   /* out[i] = mask[i] ? result[i] : in[i] */
+   spe_selb(f, out, in, result, mask);
+
+   spe_release_register(f, result);
+   spe_release_register(f, clamp_mask);
+   spe_release_register(f, clamp);
+}
+
+
+/**
+ * Generate code to do stencil test.  Four pixels are tested at once.
+ *
+ * The test is "(ref & valuemask) FUNC (stencil & valuemask)": the reference
+ * value is the left-hand operand of the comparison.
+ *
+ * \param dsa        Depth / stencil test state
+ * \param sr         Stencil reference values, indexed by face
+ * \param face       0 for front face, 1 for back face
+ * \param f          Function to append instructions to
+ * \param mask       Register containing mask of fragments passing the
+ *                   alpha test
+ * \param depth_mask Register containing mask of fragments passing the
+ *                   depth test
+ * \param depth_complement  Is \c depth_mask the complement of the actual mask?
+ * \param stencil    Register containing values from stencil buffer
+ * \param depth_pass Register to store mask of fragments passing stencil test
+ *                   and depth test
+ *
+ * \return
+ * Index of the register holding the resulting stencil values.  This is
+ * \c stencil itself when no stencil-update code was generated; otherwise it
+ * is a register allocated here, which the caller must release.
+ *
+ * \note
+ * Emits a maximum of 10 + (3 * 5) = 25 instructions.
+ */
+static int
+emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
+                  struct pipe_stencil_ref *sr,
+                  unsigned face,
+                  struct spe_function *f,
+                  int mask,
+                  int depth_mask,
+                  boolean depth_complement,
+                  int stencil,
+                  int depth_pass)
+{
+   int stencil_fail = spe_allocate_available_register(f);
+   int depth_fail = spe_allocate_available_register(f);
+   int stencil_mask = spe_allocate_available_register(f);
+   int stencil_pass = spe_allocate_available_register(f);
+   int face_stencil = spe_allocate_available_register(f);
+   int stencil_src = stencil;
+   const unsigned ref = (sr->ref_value[face]
+                         & dsa->stencil[face].valuemask);
+   boolean complement = FALSE;
+   int stored;
+   int tmp = spe_allocate_available_register(f);
+
+
+   /* Apply the value mask to the stored stencil values, unless the test
+    * does not look at them (NEVER / ALWAYS) or the mask keeps all 8 bits.
+    */
+   if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+       && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+       && (dsa->stencil[face].valuemask != 0x0ff)) {
+      stored = spe_allocate_available_register(f);
+      spe_andi(f, stored, stencil, dsa->stencil[face].valuemask);
+   } else {
+      stored = stencil;
+   }
+
+
+   /* The only comparisons emitted are "stored == ref" and the unsigned
+    * "stored > ref".  Expressed with those, the test "ref FUNC stored" is:
+    *
+    *    LESS      ref <  stored  ==    (stored > ref)
+    *    GEQUAL    ref >= stored  ==   !(stored > ref)
+    *    LEQUAL    ref <= stored  ==    (stored > ref) | (stored == ref)
+    *    GREATER   ref >  stored  ==  !((stored > ref) | (stored == ref))
+    *
+    * "complement" records that stencil_mask holds the inverse of the
+    * stencil-pass condition.
+    */
+   switch (dsa->stencil[face].func) {
+   case PIPE_FUNC_NEVER:
+      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_EQUAL:
+      /* stencil_mask[i] = (stored[i] == ref) */
+      spe_ceqi(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_LESS:
+      /* stencil_mask[i] = (stored[i] > ref) */
+      spe_clgti(f, stencil_mask, stored, ref);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      complement = TRUE;
+      /* FALLTHROUGH */
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_mask[i] = (stored[i] > ref) */
+      spe_clgti(f, stencil_mask, stored, ref);
+      /* tmp[i] = (stored[i] == ref) */
+      spe_ceqi(f, tmp, stored, ref);
+      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */
+      spe_or(f, stencil_mask, stencil_mask, tmp);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* See comment below. */
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   if (stored != stencil) {
+      spe_release_register(f, stored);
+   }
+   spe_release_register(f, tmp);
+
+
+   /* ALWAYS is a very common stencil-test, so some effort is applied to
+    * optimize that case.  The stencil-pass mask is the same as the input
+    * fragment mask.  This makes the stencil-test (above) a no-op, and the
+    * input fragment mask can be "renamed" the stencil-pass mask.
+    */
+   if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) {
+      spe_release_register(f, stencil_pass);
+      stencil_pass = mask;
+   } else {
+      if (complement) {
+         spe_andc(f, stencil_pass, mask, stencil_mask);
+      } else {
+         spe_and(f, stencil_pass, mask, stencil_mask);
+      }
+   }
+
+   /* depth_pass = fragments passing both the stencil and the depth test */
+   if (depth_complement) {
+      spe_andc(f, depth_pass, stencil_pass, depth_mask);
+   } else {
+      spe_and(f, depth_pass, stencil_pass, depth_mask);
+   }
+
+
+   /* Conditionally emit code to update the stencil value under various
+    * conditions.  Note that there is no need to generate code under the
+    * following circumstances:
+    *
+    * - Stencil write mask is zero.
+    * - For stencil-fail if the stencil test is ALWAYS
+    * - For depth-fail if the stencil test is NEVER
+    * - For depth-pass if the stencil test is NEVER
+    * - Any of the 3 conditions if the operation is KEEP
+    */
+   if (dsa->stencil[face].writemask != 0) {
+      if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS)
+          && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) {
+         /* stencil_fail = active fragments that failed the stencil test */
+         if (complement) {
+            spe_and(f, stencil_fail, mask, stencil_mask);
+         } else {
+            spe_andc(f, stencil_fail, mask, stencil_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, stencil_fail,
+                         dsa->stencil[face].fail_op,
+                         sr->ref_value[face]);
+
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) {
+         /* depth_fail = fragments that passed stencil but failed depth */
+         if (depth_complement) {
+            spe_and(f, depth_fail, stencil_pass, depth_mask);
+         } else {
+            spe_andc(f, depth_fail, stencil_pass, depth_mask);
+         }
+
+         emit_stencil_op(f, face_stencil, stencil_src, depth_fail,
+                         dsa->stencil[face].zfail_op,
+                         sr->ref_value[face]);
+         stencil_src = face_stencil;
+      }
+
+      if ((dsa->stencil[face].func != PIPE_FUNC_NEVER)
+          && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) {
+         emit_stencil_op(f, face_stencil, stencil_src, depth_pass,
+                         dsa->stencil[face].zpass_op,
+                         sr->ref_value[face]);
+         stencil_src = face_stencil;
+      }
+   }
+
+   spe_release_register(f, stencil_fail);
+   spe_release_register(f, depth_fail);
+   spe_release_register(f, stencil_mask);
+   if (stencil_pass != mask) {
+      spe_release_register(f, stencil_pass);
+   }
+
+   /* If all of the stencil operations were KEEP or the stencil write mask was
+    * zero, "stencil_src" will still be set to "stencil".  In this case
+    * release the "face_stencil" register.  Otherwise apply the stencil write
+    * mask to select bits from the calculated stencil value and the previous
+    * stencil value.
+    */
+   if (stencil_src == stencil) {
+      spe_release_register(f, face_stencil);
+   } else if (dsa->stencil[face].writemask != 0x0ff) {
+      int write_bits = spe_allocate_available_register(f);
+
+      spe_il(f, write_bits, dsa->stencil[face].writemask);
+      spe_selb(f, stencil_src, stencil, stencil_src, write_bits);
+
+      spe_release_register(f, write_bits);
+   }
+
+   return stencil_src;
+}
+
+
+/**
+ * Generate the SPE routine that performs the alpha, depth and stencil tests
+ * described by \c cdsa.  The code is appended to \c cdsa->code.
+ *
+ * \param cdsa  Depth / stencil / alpha state plus the code buffer to fill
+ * \param sr    Stencil reference values, indexed by face
+ */
+void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa,
+                                 struct pipe_stencil_ref *sr)
+{
+   struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base;
+   struct spe_function *const f = &cdsa->code;
+
+   /* Worst-case code size: 6 (alpha test) + 3 (depth test)
+    * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions,
+    * rounded up to the next power of two.
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   /* The generated function returns a structure.  Returned structures start
+    * with register 3 and the structure fields are ordered to match up
+    * exactly with the input parameters, so the parameter registers double
+    * as the result registers.
+    */
+   const int r_mask = spe_allocate_register(f, 3);
+   const int r_depth = spe_allocate_register(f, 4);
+   const int r_stencil = spe_allocate_register(f, 5);
+   const int r_zvals = spe_allocate_register(f, 6);
+   const int r_alpha = spe_allocate_register(f, 7);
+   const int r_facing = spe_allocate_register(f, 8);
+
+   const int r_zmask = spe_allocate_available_register(f);
+
+   boolean z_inverted;
+
+
+   emit_alpha_test(dsa, f, r_mask, r_alpha);
+
+   z_inverted = emit_depth_test(dsa, f, r_zmask, r_depth, r_zvals);
+
+   if (!dsa->stencil[0].enabled) {
+      /* No stencil: fold the depth result straight into the fragment mask. */
+      if (dsa->depth.enabled) {
+         if (z_inverted) {
+            spe_andc(f, r_mask, r_mask, r_zmask);
+         } else {
+            spe_and(f, r_mask, r_mask, r_zmask);
+         }
+      }
+   } else {
+      const int front_pass = spe_allocate_available_register(f);
+      const int front_sten = emit_stencil_test(dsa, sr, 0, f, r_mask,
+                                               r_zmask, z_inverted,
+                                               r_stencil, front_pass);
+
+      if (!dsa->stencil[1].enabled) {
+         /* One-sided stencil: copy the front-face results into place. */
+         if (front_sten != r_stencil) {
+            spe_or(f, r_stencil, front_sten, front_sten);
+            spe_release_register(f, front_sten);
+         }
+         spe_or(f, r_mask, front_pass, front_pass);
+      } else {
+         const int back_pass = spe_allocate_available_register(f);
+         const int back_sten = emit_stencil_test(dsa, sr, 1, f, r_mask,
+                                                 r_zmask, z_inverted,
+                                                 r_stencil, back_pass);
+
+         /* When both faces return the same register the stencil value was
+          * not modified by either (write masks of zero, KEEP operations,
+          * etc.), so nothing needs to be selected by facing.
+          */
+         if (front_sten != back_sten) {
+            spe_selb(f, r_stencil, back_sten, front_sten, r_facing);
+         }
+
+         if (back_sten != r_stencil) {
+            spe_release_register(f, back_sten);
+         }
+
+         if (front_sten != r_stencil) {
+            spe_release_register(f, front_sten);
+         }
+
+         spe_selb(f, r_mask, back_pass, front_pass, r_facing);
+
+         spe_release_register(f, back_pass);
+      }
+
+      spe_release_register(f, front_pass);
+   }
+
+   /* Store the new Z values for the fragments that survived. */
+   if (dsa->depth.writemask) {
+      spe_selb(f, r_depth, r_depth, r_zvals, r_mask);
+   }
+
+   spe_bi(f, 0, 0, 0);  /* return from function call */
+
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# alpha (%sabled)\n",
+             (dsa->alpha.enabled) ? "en" : "dis");
+      printf("#    func: %u\n", dsa->alpha.func);
+      printf("#    ref: %.2f\n", dsa->alpha.ref);
+
+      printf("# depth (%sabled)\n",
+             (dsa->depth.enabled) ? "en" : "dis");
+      printf("#    func: %u\n", dsa->depth.func);
+
+      for (i = 0; i < 2; i++) {
+         printf("# %s stencil (%sabled)\n",
+                (i == 0) ? "front" : "back",
+                (dsa->stencil[i].enabled) ? "en" : "dis");
+
+         printf("#    func: %u\n", dsa->stencil[i].func);
+         printf("#    op (sf, zf, zp): %u %u %u\n",
+                dsa->stencil[i].fail_op,
+                dsa->stencil[i].zfail_op,
+                dsa->stencil[i].zpass_op);
+         printf("#    ref value / value mask / write mask: %02x %02x %02x\n",
+                sr->ref_value[i],
+                dsa->stencil[i].valuemask,
+                dsa->stencil[i].writemask);
+      }
+
+      printf("\t.text\n");
+      for (/* empty */; p < f->csr; p++) {
+         printf("\t.long\t0x%04x\n", *p);
+      }
+      fflush(stdout);
+   }
+#endif
+}
+
+
+/**
+ * Emit code that produces the blend factor for the alpha channel.
+ *
+ * \param f            Function to which code should be appended
+ * \param factor       Blend factor (\c PIPE_BLENDFACTOR_x)
+ * \param src_alpha    Index of register containing the source alpha
+ * \param dst_alpha    Index of register containing the destination alpha
+ * \param const_alpha  Index of register containing the constant alpha
+ *
+ * \return
+ * Index of the register holding the factor, or -1 when no register is
+ * needed (ONE, ZERO, SRC_ALPHA_SATURATE, or an unsupported factor).  For
+ * DST_ALPHA and CONST_ALPHA the input register itself is returned; in the
+ * other cases a newly allocated register is returned.
+ *
+ * \note Emits a maximum of 3 instructions
+ */
+static int
+emit_alpha_factor_calculation(struct spe_function *f,
+                              unsigned factor,
+                              int src_alpha, int dst_alpha, int const_alpha)
+{
+   int result;
+   int one;
+
+   switch (factor) {
+   /* Factors that need no register at all. */
+   case PIPE_BLENDFACTOR_ONE:
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   case PIPE_BLENDFACTOR_ZERO:
+      return -1;
+
+   /* Factors that are already sitting in an input register. */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return dst_alpha;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return const_alpha;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* Copy the source alpha into a register of its own. */
+      result = spe_allocate_available_register(f);
+      spe_or(f, result, src_alpha, src_alpha);
+      return result;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      result = spe_allocate_available_register(f);
+      one = spe_allocate_available_register(f);
+
+      /* one = 1.0; result = one - const_alpha */
+      spe_il(f, one, 1);
+      spe_cuflt(f, one, one, 0);
+      spe_fs(f, result, one, const_alpha);
+
+      spe_release_register(f, one);
+      return result;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      one = spe_allocate_available_register(f);
+      result = spe_allocate_available_register(f);
+
+      /* one = 1.0; result = one - src_alpha */
+      spe_il(f, one, 1);
+      spe_cuflt(f, one, one, 0);
+      spe_fs(f, result, one, src_alpha);
+
+      spe_release_register(f, one);
+      return result;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      one = spe_allocate_available_register(f);
+      result = spe_allocate_available_register(f);
+
+      /* one = 1.0; result = one - dst_alpha */
+      spe_il(f, one, 1);
+      spe_cuflt(f, one, one, 0);
+      spe_fs(f, result, one, dst_alpha);
+
+      spe_release_register(f, one);
+      return result;
+
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      /* Not supported. */
+      assert(0);
+      return -1;
+   }
+}
+
+
+/**
+ * Emit code that produces the blend factors for the color (RGB) channels.
+ *
+ * \param f            Function to which code should be appended
+ * \param sF           Blend factor (\c PIPE_BLENDFACTOR_x)
+ * \param mask         Channel mask; bit \c i enables channel \c i.  It is
+ *                     consulted only for the SRC_COLOR, INV_SRC_COLOR and
+ *                     INV_DST_COLOR factors.
+ * \param src          Indices of registers containing the source RGBA
+ * \param dst          Indices of registers containing the destination RGBA
+ * \param const_color  Indices of registers containing the constant RGBA
+ * \param factor       Output: four register indices.  Entries 0..2 receive
+ *                     the register holding the factor for that channel, or
+ *                     -1 when no register is needed; entry 3 is always set
+ *                     to -1.  Entries may refer to input registers, and
+ *                     several entries may share one register.
+ *
+ * \note Emits a maximum of 6 instructions
+ */
+static void
+emit_color_factor_calculation(struct spe_function *f,
+                              unsigned sF, unsigned mask,
+                              const int *src,
+                              const int *dst,
+                              const int *const_color,
+                              int *factor)
+{
+   int tmp;
+   unsigned i;
+
+
+   factor[0] = -1;
+   factor[1] = -1;
+   factor[2] = -1;
+   factor[3] = -1;
+
+   switch (sF) {
+   case PIPE_BLENDFACTOR_ONE:
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* Each enabled channel gets a copy of the source color. */
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_or(f, factor[i], src[i], src[i]);
+         }
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      spe_or(f, factor[0], src[3], src[3]);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      factor[0] = dst[3];
+      factor[1] = dst[3];
+      factor[2] = dst[3];
+      break;
+
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      factor[0] = dst[0];
+      factor[1] = dst[1];
+      factor[2] = dst[2];
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* Alpha saturate means min(As, 1-Ad).
+       */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      /* tmp = 1 - Ad */
+      spe_fs(f, tmp, tmp, dst[3]);
+      /* factor[0] = ((1 - Ad) > As) */
+      spe_fcgt(f, factor[0], tmp, src[3]);
+      /* selb takes its third register operand where the selector is set,
+       * so: factor[0] = ((1 - Ad) > As) ? As : (1 - Ad), i.e. the minimum.
+       */
+      spe_selb(f, factor[0], tmp, src[3], factor[0]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* tmp = 1.0 */
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; i++) {
+         factor[i] = spe_allocate_available_register(f);
+
+         spe_fs(f, factor[i], tmp, const_color[i]);
+      }
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      for (i = 0; i < 3; i++) {
+         factor[i] = const_color[i];
+      }
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* factor[0] = 1.0 - constant alpha */
+      tmp = spe_allocate_available_register(f);
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, const_color[3]);
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      factor[0] = const_color[3];
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+      break;
+
+   case PIPE_BLENDFACTOR_ZERO:
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      /* tmp = 1.0 */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, src[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* factor[0] = 1.0 - source alpha */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, src[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      tmp = spe_allocate_available_register(f);
+      factor[0] = spe_allocate_available_register(f);
+      factor[1] = factor[0];
+      factor[2] = factor[0];
+
+      /* factor[0] = 1.0 - destination alpha */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+      spe_fs(f, factor[0], tmp, dst[3]);
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      tmp = spe_allocate_available_register(f);
+
+      /* tmp = 1.0 */
+      spe_il(f, tmp, 1);
+      spe_cuflt(f, tmp, tmp, 0);
+
+      for (i = 0; i < 3; ++i) {
+         if ((mask & (1U << i)) != 0) {
+            factor[i] = spe_allocate_available_register(f);
+            spe_fs(f, factor[i], tmp, dst[i]);
+         }
+      }
+
+      spe_release_register(f, tmp);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+   default:
+      /* Not supported; all factors are left at -1. */
+      assert(0);
+   }
+}
+
+
+/**
+ * Emit the blend equation for one channel.  The result replaces the value
+ * in register \c src.
+ *
+ *    PIPE_BLEND_ADD:               src = src * sF + dst * dF
+ *    PIPE_BLEND_SUBTRACT:          src = src * sF - dst * dF
+ *    PIPE_BLEND_REVERSE_SUBTRACT:  src = dst * dF - src * sF
+ *    PIPE_BLEND_MIN:               src = min(src, dst)
+ *    PIPE_BLEND_MAX:               src = max(src, dst)
+ *
+ * \param f           Function to which code should be appended
+ * \param func        Blend equation (\c PIPE_BLEND_x)
+ * \param sF          Source blend factor (\c PIPE_BLENDFACTOR_x)
+ * \param dF          Destination blend factor (\c PIPE_BLENDFACTOR_x)
+ * \param src         Index of register containing the source value; also
+ *                    receives the result
+ * \param src_factor  Index of register containing the source factor.  Not
+ *                    read when \c sF is ONE or ZERO.
+ * \param dst         Index of register containing the destination value
+ * \param dst_factor  Index of register containing the destination factor.
+ *                    Not read when \c dF is ONE or ZERO.
+ *
+ * \note
+ * Emits a maximum of 3 instructions.  Callers sizing the code buffer should
+ * budget for that many per channel.
+ */
+static void
+emit_blend_calculation(struct spe_function *f,
+                       unsigned func, unsigned sF, unsigned dF,
+                       int src, int src_factor, int dst, int dst_factor)
+{
+   /* The factor registers are not meaningful for ONE and ZERO, so every
+    * combination involving those factors is handled without touching them.
+    */
+   const boolean src_zero = (sF == PIPE_BLENDFACTOR_ZERO);
+   const boolean src_one = (sF == PIPE_BLENDFACTOR_ONE);
+   const boolean dst_zero = (dF == PIPE_BLENDFACTOR_ZERO);
+   const boolean dst_one = (dF == PIPE_BLENDFACTOR_ONE);
+   int tmp = spe_allocate_available_register(f);
+
+   switch (func) {
+   case PIPE_BLEND_ADD:
+      if (src_zero) {
+         if (dst_zero) {
+            spe_il(f, src, 0);
+         } else if (dst_one) {
+            spe_or(f, src, dst, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dst_zero) {
+         if (!src_one) {
+            spe_fm(f, src, src, src_factor);
+         }
+         /* ONE / ZERO: the result is src itself, nothing to emit. */
+      } else if (src_one) {
+         if (dst_one) {
+            spe_fa(f, src, src, dst);
+         } else {
+            /* src = dst * dF + src */
+            spe_fma(f, src, dst, dst_factor, src);
+         }
+      } else if (dst_one) {
+         /* src = src * sF + dst */
+         spe_fma(f, src, src, src_factor, dst);
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fma(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_SUBTRACT:
+      if (dst_zero) {
+         if (src_zero) {
+            spe_il(f, src, 0);
+         } else if (!src_one) {
+            spe_fm(f, src, src, src_factor);
+         }
+         /* ONE / ZERO: the result is src itself, nothing to emit. */
+      } else if (src_zero) {
+         /* src = 0 - dst * dF */
+         if (dst_one) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, dst);
+         } else {
+            spe_fm(f, tmp, dst, dst_factor);
+            spe_il(f, src, 0);
+            spe_fs(f, src, src, tmp);
+         }
+      } else if (src_one) {
+         if (dst_one) {
+            spe_fs(f, src, src, dst);
+         } else {
+            /* src = src - dst * dF */
+            spe_fm(f, tmp, dst, dst_factor);
+            spe_fs(f, src, src, tmp);
+         }
+      } else if (dst_one) {
+         /* src = src * sF - dst */
+         spe_fms(f, src, src, src_factor, dst);
+      } else {
+         spe_fm(f, tmp, dst, dst_factor);
+         spe_fms(f, src, src, src_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      if (src_zero) {
+         if (dst_zero) {
+            spe_il(f, src, 0);
+         } else if (dst_one) {
+            spe_or(f, src, dst, dst);
+         } else {
+            spe_fm(f, src, dst, dst_factor);
+         }
+      } else if (dst_zero) {
+         /* src = 0 - src * sF */
+         if (src_one) {
+            spe_il(f, tmp, 0);
+            spe_fs(f, src, tmp, src);
+         } else {
+            spe_fm(f, tmp, src, src_factor);
+            spe_il(f, src, 0);
+            spe_fs(f, src, src, tmp);
+         }
+      } else if (dst_one) {
+         if (src_one) {
+            spe_fs(f, src, dst, src);
+         } else {
+            /* src = dst - src * sF */
+            spe_fm(f, tmp, src, src_factor);
+            spe_fs(f, src, dst, tmp);
+         }
+      } else if (src_one) {
+         /* src = dst * dF - src */
+         spe_fms(f, src, dst, dst_factor, src);
+      } else {
+         /* src = dst * dF - src * sF */
+         spe_fm(f, tmp, src, src_factor);
+         spe_fms(f, src, dst, dst_factor, tmp);
+      }
+      break;
+
+   case PIPE_BLEND_MIN:
+      /* The operands are floats, so use the floating-point compare.
+       * tmp = (src > dst); src = tmp ? dst : src
+       */
+      spe_fcgt(f, tmp, src, dst);
+      spe_selb(f, src, src, dst, tmp);
+      break;
+
+   case PIPE_BLEND_MAX:
+      /* tmp = (src > dst); src = tmp ? src : dst */
+      spe_fcgt(f, tmp, src, dst);
+      spe_selb(f, src, dst, src, tmp);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   spe_release_register(f, tmp);
+}
+
+
+/**
+ * Generate code to perform alpha blending on the SPE
+ */
+void
+cell_generate_alpha_blend(struct cell_blend_state *cb)
+{
+   struct pipe_blend_state *const b = &cb->base;
+   struct spe_function *const f = &cb->code;
+
+   /* This code generates a maximum of 3 (source alpha factor)
+    * + 3 (destination alpha factor) + (3 * 6) (source color factor)
+    * + (3 * 6) (destination color factor) + (4 * 2) (blend equation)
+    * + 4 (fragment mask) + 1 (return) = 55 instructions.  Round up to 64 to
+    * make it a happy power-of-two.
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   const int frag[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+   const int pixel[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+   const int const_color[4] = {
+      spe_allocate_register(f, 11),
+      spe_allocate_register(f, 12),
+      spe_allocate_register(f, 13),
+      spe_allocate_register(f, 14),
+   };
+   unsigned func[4];
+   unsigned sF[4];
+   unsigned dF[4];
+   unsigned i;
+   int src_factor[4];
+   int dst_factor[4];
+
+
+   /* Does the selected blend mode make use of the source / destination
+    * color (RGB) blend factors?
+    */
+   boolean need_color_factor = b->rt[0].blend_enable
+       && (b->rt[0].rgb_func != PIPE_BLEND_MIN)
+       && (b->rt[0].rgb_func != PIPE_BLEND_MAX);
+
+   /* Does the selected blend mode make use of the source / destination
+    * alpha blend factors?
+    */
+   boolean need_alpha_factor = b->rt[0].blend_enable
+       && (b->rt[0].alpha_func != PIPE_BLEND_MIN)
+       && (b->rt[0].alpha_func != PIPE_BLEND_MAX);
+
+
+   if (b->rt[0].blend_enable) {
+      sF[0] = b->rt[0].rgb_src_factor;
+      sF[1] = sF[0];
+      sF[2] = sF[0];
+      switch (b->rt[0].alpha_src_factor & 0x0f) {
+      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+         sF[3] = PIPE_BLENDFACTOR_ONE;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+      case PIPE_BLENDFACTOR_DST_COLOR:
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+      case PIPE_BLENDFACTOR_SRC1_COLOR:
+         sF[3] = b->rt[0].alpha_src_factor + 1;
+         break;
+      default:
+         sF[3] = b->rt[0].alpha_src_factor;
+      }
+
+      dF[0] = b->rt[0].rgb_dst_factor;
+      dF[1] = dF[0];
+      dF[2] = dF[0];
+      switch (b->rt[0].alpha_dst_factor & 0x0f) {
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+      case PIPE_BLENDFACTOR_DST_COLOR:
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+      case PIPE_BLENDFACTOR_SRC1_COLOR:
+         dF[3] = b->rt[0].alpha_dst_factor + 1;
+         break;
+      default:
+         dF[3] = b->rt[0].alpha_dst_factor;
+      }
+
+      func[0] = b->rt[0].rgb_func;
+      func[1] = func[0];
+      func[2] = func[0];
+      func[3] = b->rt[0].alpha_func;
+   } else {
+      sF[0] = PIPE_BLENDFACTOR_ONE;
+      sF[1] = PIPE_BLENDFACTOR_ONE;
+      sF[2] = PIPE_BLENDFACTOR_ONE;
+      sF[3] = PIPE_BLENDFACTOR_ONE;
+      dF[0] = PIPE_BLENDFACTOR_ZERO;
+      dF[1] = PIPE_BLENDFACTOR_ZERO;
+      dF[2] = PIPE_BLENDFACTOR_ZERO;
+      dF[3] = PIPE_BLENDFACTOR_ZERO;
+
+      func[0] = PIPE_BLEND_ADD;
+      func[1] = PIPE_BLEND_ADD;
+      func[2] = PIPE_BLEND_ADD;
+      func[3] = PIPE_BLEND_ADD;
+   }
+
+
+   /* If alpha writing is enabled and the alpha blend mode requires use of
+    * the alpha factor, calculate the alpha factor.
+    */
+   if (((b->rt[0].colormask & 8) != 0) && need_alpha_factor) {
+      src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3],
+                                                    frag[3], pixel[3]);
+
+      /* If the alpha destination blend factor is the same as the alpha source
+       * blend factor, re-use the previously calculated value.
+       */
+      dst_factor[3] = (dF[3] == sF[3])
+          ? src_factor[3]
+          : emit_alpha_factor_calculation(f, dF[3], const_color[3],
+                                          frag[3], pixel[3]);
+   }
+
+
+   if (sF[0] == sF[3]) {
+      src_factor[0] = src_factor[3];
+      src_factor[1] = src_factor[3];
+      src_factor[2] = src_factor[3];
+   } else if (sF[0] == dF[3]) {
+      src_factor[0] = dst_factor[3];
+      src_factor[1] = dst_factor[3];
+      src_factor[2] = dst_factor[3];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rt[0].rgb_src_factor,
+                                    b->rt[0].colormask,
+                                    frag, pixel, const_color, src_factor);
+   }
+
+
+   if (dF[0] == sF[3]) {
+      dst_factor[0] = src_factor[3];
+      dst_factor[1] = src_factor[3];
+      dst_factor[2] = src_factor[3];
+   } else if (dF[0] == dF[3]) {
+      dst_factor[0] = dst_factor[3];
+      dst_factor[1] = dst_factor[3];
+      dst_factor[2] = dst_factor[3];
+   } else if (dF[0] == sF[0]) {
+      dst_factor[0] = src_factor[0];
+      dst_factor[1] = src_factor[1];
+      dst_factor[2] = src_factor[2];
+   } else if (need_color_factor) {
+      emit_color_factor_calculation(f,
+                                    b->rt[0].rgb_dst_factor,
+                                    b->rt[0].colormask,
+                                    frag, pixel, const_color, dst_factor);
+   }
+
+
+
+   for (i = 0; i < 4; ++i) {
+      if ((b->rt[0].colormask & (1U << i)) != 0) {
+         emit_blend_calculation(f,
+                                func[i], sF[i], dF[i],
+                                frag[i], src_factor[i],
+                                pixel[i], dst_factor[i]);
+      }
+   }
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+
+      printf("# %u instructions\n", f->csr - f->store);
+      printf("# blend (%sabled)\n",
+             (cb->base.blend_enable) ? "en" : "dis");
+      printf("#    RGB func / sf / df: %u %u %u\n",
+             cb->base.rgb_func,
+             cb->base.rgb_src_factor,
+             cb->base.rgb_dst_factor);
+      printf("#    ALP func / sf / df: %u %u %u\n",
+             cb->base.alpha_func,
+             cb->base.alpha_src_factor,
+             cb->base.alpha_dst_factor);
+
+      printf("\t.text\n");
+      for (/* empty */; p < f->csr; p++) {
+         printf("\t.long\t0x%04x\n", *p);
+      }
+      fflush(stdout);
+   }
+#endif
+}
+
+
+static int
+PC_OFFSET(const struct spe_function *f, const void *d)
+{
+   /* Distance from the next instruction slot to d's 16-byte-aligned base. */
+   const intptr_t next_slot = (intptr_t) (f->store + f->num_inst);
+   const intptr_t aligned_target = (intptr_t) d & ~0x0f;
+   return (aligned_target - next_slot) >> 2;   /* bytes -> 4-byte units */
+}
+
+
+/**
+ * Generate SPE code to perform color conversion and logic op.
+ *
+ * \param f      code buffer to fill; (re)initialised here via spe_init_func()
+ * \param blend  supplies logicop_enable/logicop_func and rt[0].colormask
+ * \param surf   render target; only its format is read
+ *
+ * \bug The generated code should also perform dithering.
+ * \bug The generated code should also perform color-write masking (the
+ *      colormask is only tested against zero).
+ * \bug Only two framebuffer formats are supported at this time; any other
+ *      format is reported on stderr and trips ASSERT().
+ */
+void
+cell_generate_logic_op(struct spe_function *f,
+                       const struct pipe_blend_state *blend,
+                       struct pipe_surface *surf)
+{
+   const unsigned logic_op = (blend->logicop_enable)
+       ? blend->logicop_func : PIPE_LOGICOP_COPY;
+
+   /* This code generates a maximum of 37 instructions.  An additional 32
+    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
+    * to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, SPE_INST_SIZE * 64);
+
+
+   /* Pixel colors in framebuffer format in AoS layout.
+    */
+   const int pixel[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+
+   /* Fragment colors stored as floats in SoA layout.
+    */
+   const int frag[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+
+   const int mask = spe_allocate_register(f, 11);
+
+
+   /* Short-circuit the noop and invert cases.  NOTE(review): the invert path
+    * below rewrites all four pixel registers without consulting 'mask'. */
+   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->rt[0].colormask == 0)) {
+      spe_bi(f, 0, 0, 0);
+      return;
+   } else if (logic_op == PIPE_LOGICOP_INVERT) {
+      spe_nor(f, pixel[0], pixel[0], pixel[0]);
+      spe_nor(f, pixel[1], pixel[1], pixel[1]);
+      spe_nor(f, pixel[2], pixel[2], pixel[2]);
+      spe_nor(f, pixel[3], pixel[3], pixel[3]);
+      spe_bi(f, 0, 0, 0);
+      return;
+   }
+
+
+   const int tmp[4] = {
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+   };
+
+   const int shuf_xpose_hi = spe_allocate_available_register(f);
+   const int shuf_xpose_lo = spe_allocate_available_register(f);
+   const int shuf_color = spe_allocate_available_register(f);
+
+
+   /* Pointer to the beginning of the function's private data area.
+    */
+   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);
+
+
+   /* Convert fragment colors to framebuffer format in AoS layout.
+    */
+   switch (surf->format) {
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      data[0] = 0x00010203;
+      data[1] = 0x10111213;
+      data[2] = 0x04050607;
+      data[3] = 0x14151617;
+      data[4] = 0x0c000408;
+      data[5] = 0x80808080;
+      data[6] = 0x80808080;
+      data[7] = 0x80808080;
+      break;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      data[0] = 0x03020100;
+      data[1] = 0x13121110;
+      data[2] = 0x07060504;
+      data[3] = 0x17161514;
+      data[4] = 0x0804000c;
+      data[5] = 0x80808080;
+      data[6] = 0x80808080;
+      data[7] = 0x80808080;
+      break;
+   default:
+      fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()\n");
+      ASSERT(0);
+   }
+
+   spe_ilh(f, tmp[0], 0x0808);
+   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
+   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
+   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);
+
+   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
+   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
+   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
+   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);
+
+   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
+   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
+   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
+   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);
+
+   spe_cfltu(f, frag[0], frag[0], 32);
+   spe_cfltu(f, frag[1], frag[1], 32);
+   spe_cfltu(f, frag[2], frag[2], 32);
+   spe_cfltu(f, frag[3], frag[3], 32);
+
+   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
+   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
+   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
+   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);
+
+
+   /* If logic op is enabled, perform the requested logical operation on the
+    * converted fragment colors and the pixel colors.
+    */
+   switch (logic_op) {
+   case PIPE_LOGICOP_CLEAR:
+      spe_il(f, frag[0], 0);
+      spe_il(f, frag[1], 0);
+      spe_il(f, frag[2], 0);
+      spe_il(f, frag[3], 0);
+      break;
+   case PIPE_LOGICOP_NOR:
+      spe_nor(f, frag[0], frag[0], pixel[0]);
+      spe_nor(f, frag[1], frag[1], pixel[1]);
+      spe_nor(f, frag[2], frag[2], pixel[2]);
+      spe_nor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      spe_andc(f, frag[0], pixel[0], frag[0]);
+      spe_andc(f, frag[1], pixel[1], frag[1]);
+      spe_andc(f, frag[2], pixel[2], frag[2]);
+      spe_andc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      spe_nor(f, frag[0], frag[0], frag[0]);
+      spe_nor(f, frag[1], frag[1], frag[1]);
+      spe_nor(f, frag[2], frag[2], frag[2]);
+      spe_nor(f, frag[3], frag[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      spe_andc(f, frag[0], frag[0], pixel[0]);
+      spe_andc(f, frag[1], frag[1], pixel[1]);
+      spe_andc(f, frag[2], frag[2], pixel[2]);
+      spe_andc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_XOR:
+      spe_xor(f, frag[0], frag[0], pixel[0]);
+      spe_xor(f, frag[1], frag[1], pixel[1]);
+      spe_xor(f, frag[2], frag[2], pixel[2]);
+      spe_xor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_NAND:
+      spe_nand(f, frag[0], frag[0], pixel[0]);
+      spe_nand(f, frag[1], frag[1], pixel[1]);
+      spe_nand(f, frag[2], frag[2], pixel[2]);
+      spe_nand(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND:
+      spe_and(f, frag[0], frag[0], pixel[0]);
+      spe_and(f, frag[1], frag[1], pixel[1]);
+      spe_and(f, frag[2], frag[2], pixel[2]);
+      spe_and(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      spe_eqv(f, frag[0], frag[0], pixel[0]);
+      spe_eqv(f, frag[1], frag[1], pixel[1]);
+      spe_eqv(f, frag[2], frag[2], pixel[2]);
+      spe_eqv(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      spe_orc(f, frag[0], pixel[0], frag[0]);
+      spe_orc(f, frag[1], pixel[1], frag[1]);
+      spe_orc(f, frag[2], pixel[2], frag[2]);
+      spe_orc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY:
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      spe_orc(f, frag[0], frag[0], pixel[0]);
+      spe_orc(f, frag[1], frag[1], pixel[1]);
+      spe_orc(f, frag[2], frag[2], pixel[2]);
+      spe_orc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR:
+      spe_or(f, frag[0], frag[0], pixel[0]);
+      spe_or(f, frag[1], frag[1], pixel[1]);
+      spe_or(f, frag[2], frag[2], pixel[2]);
+      spe_or(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_SET:
+      spe_il(f, frag[0], ~0);
+      spe_il(f, frag[1], ~0);
+      spe_il(f, frag[2], ~0);
+      spe_il(f, frag[3], ~0);
+      break;
+
+   /* These two cases are short-circuited above.
+    */
+   case PIPE_LOGICOP_INVERT:
+   case PIPE_LOGICOP_NOOP:
+   default:
+      assert(0);
+   }
+
+
+   /* Apply fragment mask.
+    */
+   spe_ilh(f, tmp[0], 0x0000);
+   spe_ilh(f, tmp[1], 0x0404);
+   spe_ilh(f, tmp[2], 0x0808);
+   spe_ilh(f, tmp[3], 0x0c0c);
+
+   spe_shufb(f, tmp[0], mask, mask, tmp[0]);
+   spe_shufb(f, tmp[1], mask, mask, tmp[1]);
+   spe_shufb(f, tmp[2], mask, mask, tmp[2]);
+   spe_shufb(f, tmp[3], mask, mask, tmp[3]);
+
+   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]);
+   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]);
+   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]);
+   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]);
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# %u instructions\n", f->csr - f->store);
+
+      printf("\t.text\n");
+      for (i = 0; i < 64; i++) {
+         printf("\t.long\t0x%04x\n", p[i]);
+      }
+      fflush(stdout);
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
new file mode 100644
index 0000000000..a8267a5133
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -0,0 +1,49 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CELL_STATE_PER_FRAGMENT_H
+#define CELL_STATE_PER_FRAGMENT_H
+
+/* Forward declarations so this header can be included on its own; only
+ * pointers to these types appear in the prototypes below.
+ */
+struct cell_blend_state;
+struct cell_depth_stencil_alpha_state;
+struct pipe_blend_state;
+struct pipe_surface;
+struct spe_function;
+
+extern void
+cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
+
+extern void
+cell_generate_alpha_blend(struct cell_blend_state *cb);
+
+/* Emit into f the code for color conversion and the logic op. */
+extern void
+cell_generate_logic_op(struct spe_function *f,
+                       const struct pipe_blend_state *blend,
+                       struct pipe_surface *surf);
+
+#endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
new file mode 100644
index 0000000000..ddf1477268
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -0,0 +1,229 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "draw/draw_context.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "cell_context.h"
+#include "cell_state.h"
+#include "cell_gen_fp.h"
+#include "cell_texture.h"
+
+
+/** Cast wrapper: view an opaque shader handle as fragment shader state. */
+static INLINE struct cell_fragment_shader_state *
+cell_fragment_shader_state(void *shader)
+{
+   return (struct cell_fragment_shader_state *) shader;
+}
+
+
+/** Cast wrapper: view an opaque shader handle as vertex shader state. */
+static INLINE struct cell_vertex_shader_state *
+cell_vertex_shader_state(void *shader)
+{
+   return (struct cell_vertex_shader_state *) shader;
+}
+
+
+/**
+ * pipe->create_fs_state() hook: build the fragment shader state object.
+ * Returns NULL when CALLOC_STRUCT() or tgsi_dup_tokens() fails.
+ */
+static void *
+cell_create_fs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct cell_fragment_shader_state *fs_state =
+      CALLOC_STRUCT(cell_fragment_shader_state);
+
+   if (fs_state == NULL)
+      return NULL;
+
+   /* The state object keeps its own duplicate of the TGSI tokens. */
+   fs_state->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (fs_state->shader.tokens == NULL) {
+      FREE(fs_state);
+      return NULL;
+   }
+
+   tgsi_scan_shader(templ->tokens, &fs_state->info);
+   cell_gen_fragment_program(cell, fs_state->shader.tokens, &fs_state->code);
+
+   return fs_state;
+}
+
+
+/**
+ * pipe->bind_fs_state() hook: select 'fs' and raise CELL_NEW_FS.
+ */
+static void
+cell_bind_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct cell_context *const ctx = cell_context(pipe);
+   struct cell_fragment_shader_state *const bound = fs;
+
+   ctx->fs = bound;
+   ctx->dirty |= CELL_NEW_FS;
+}
+
+
+/**
+ * pipe->delete_fs_state() hook: release code, tokens and the state itself.
+ */
+static void
+cell_delete_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct cell_fragment_shader_state *const doomed =
+      (struct cell_fragment_shader_state *) fs;
+
+   spe_release_func(&doomed->code);
+   FREE((void *) doomed->shader.tokens);
+   FREE(doomed);
+}
+
+
+/**
+ * pipe->create_vs_state() hook: build the vertex shader state object.
+ * Returns NULL when any of the allocation/creation steps fails.
+ */
+static void *
+cell_create_vs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct cell_context *cell = cell_context(pipe);
+   struct cell_vertex_shader_state *vs_state =
+      CALLOC_STRUCT(cell_vertex_shader_state);
+
+   if (vs_state == NULL)
+      return NULL;
+
+   vs_state->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (vs_state->shader.tokens == NULL) {
+      FREE(vs_state);
+      return NULL;
+   }
+
+   tgsi_scan_shader(templ->tokens, &vs_state->info);
+
+   /* The draw module gets its own shader object; undo everything on failure. */
+   vs_state->draw_data = draw_create_vertex_shader(cell->draw, &vs_state->shader);
+   if (!vs_state->draw_data) {
+      FREE( (void *) vs_state->shader.tokens );
+      FREE( vs_state );
+      return NULL;
+   }
+   return vs_state;
+}
+
+
+/**
+ * pipe->bind_vs_state() hook; a NULL 'vs' is forwarded to draw as NULL.
+ */
+static void
+cell_bind_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct cell_context *ctx = cell_context(pipe);
+   struct cell_vertex_shader_state *bound = vs;
+   ctx->vs = bound;
+   if (bound)
+      draw_bind_vertex_shader(ctx->draw, bound->draw_data);
+   else
+      draw_bind_vertex_shader(ctx->draw, NULL);
+   ctx->dirty |= CELL_NEW_VS;
+}
+
+
+/**
+ * pipe->delete_vs_state() hook: drop the draw shader, tokens and state.
+ */
+static void
+cell_delete_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct cell_vertex_shader_state *const doomed = vs;
+
+   draw_delete_vertex_shader(cell_context(pipe)->draw, doomed->draw_data);
+   /* Tokens were duplicated at creation time, so they are freed here too. */
+   FREE( (void *) doomed->shader.tokens );
+   FREE( doomed );
+}
+
+
+/**
+ * pipe->set_constant_buffer() hook; 'index' is asserted to be 0.
+ */
+static void
+cell_set_constant_buffer(struct pipe_context *pipe,
+                         uint shader, uint index,
+                         struct pipe_resource *constants)
+{
+   struct cell_context *cell = cell_context(pipe);
+   const void *data = NULL;
+   unsigned size = 0;
+   if (constants) {
+      size = constants->width0;
+      data = cell_resource(constants)->data;
+   }
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   /* Nothing to do when the same resource is bound again. */
+   if (constants == cell->constants[shader])
+      return;
+
+   draw_flush(cell->draw);
+   pipe_resource_reference(&cell->constants[shader], constants);  /* ref-counted */
+
+   if (shader == PIPE_SHADER_VERTEX)
+      draw_set_mapped_constant_buffer(cell->draw, PIPE_SHADER_VERTEX, 0,
+                                      data, size);
+   cell->mapped_constants[shader] = data;
+
+   if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
+   else if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+}
+
+/** Install the shader-state and constant-buffer hooks into cell->pipe. */
+void
+cell_init_shader_functions(struct cell_context *cell)
+{
+   cell->pipe.create_vs_state = cell_create_vs_state;
+   cell->pipe.bind_vs_state = cell_bind_vs_state;
+   cell->pipe.delete_vs_state = cell_delete_vs_state;
+   /* Fragment shader state hooks. */
+   cell->pipe.create_fs_state = cell_create_fs_state;
+   cell->pipe.bind_fs_state = cell_bind_fs_state;
+   cell->pipe.delete_fs_state = cell_delete_fs_state;
+   /* Constant buffer binding. */
+   cell->pipe.set_constant_buffer = cell_set_constant_buffer;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
new file mode 100644
index 0000000000..9510ea9ac2
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -0,0 +1,101 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "cell_context.h"
+#include "cell_state.h"
+
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+/** Allocate a velems object holding a copy of 'count' entries of 'attribs'. */
+void *
+cell_create_vertex_elements_state(struct pipe_context *pipe,
+                                  unsigned count,
+                                  const struct pipe_vertex_element *attribs)
+{
+   struct cell_velems_state *state;
+   assert(count <= PIPE_MAX_ATTRIBS);
+   state = (struct cell_velems_state *) MALLOC(sizeof *state);
+   if (state == NULL)
+      return NULL;   /* allocation failed */
+   state->count = count;
+   memcpy(state->velem, attribs, count * sizeof attribs[0]);
+   return state;
+}
+
+void
+cell_bind_vertex_elements_state(struct pipe_context *pipe,
+                                void *velems)
+{
+   struct cell_context *ctx = cell_context(pipe);
+   struct cell_velems_state *state = velems;
+
+   ctx->velems = state;
+   ctx->dirty |= CELL_NEW_VERTEX;
+   /* The draw module is only told about non-NULL element state. */
+   if (!state)
+      return;
+   draw_set_vertex_elements(ctx->draw, state->count, state->velem);
+}
+
+void
+cell_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   FREE( velems );   /* frees the velems object; 'pipe' is not used */
+}
+
+
+static void
+cell_set_vertex_buffers(struct pipe_context *pipe,
+                        unsigned count,
+                        const struct pipe_vertex_buffer *buffers)
+{
+   struct cell_context *const ctx = cell_context(pipe);
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   /* Keep our own copy of the bindings, then forward them to draw. */
+   ctx->num_vertex_buffers = count;
+   memcpy(ctx->vertex_buffer, buffers, count * sizeof *buffers);
+   ctx->dirty |= CELL_NEW_VERTEX;
+
+   draw_set_vertex_buffers(ctx->draw, count, buffers);
+}
+
+/** Install the vertex buffer and vertex element hooks into cell->pipe. */
+void
+cell_init_vertex_functions(struct cell_context *cell)
+{
+   cell->pipe.create_vertex_elements_state = cell_create_vertex_elements_state;
+   cell->pipe.bind_vertex_elements_state = cell_bind_vertex_elements_state;
+   cell->pipe.delete_vertex_elements_state = cell_delete_vertex_elements_state;
+   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
new file mode 100644
index 0000000000..777454479b
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_surface.h"
+#include "cell_context.h"
+#include "cell_surface.h"
+
+/** Install surface hooks: region copies use util_resource_copy_region(). */
+void
+cell_init_surface_functions(struct cell_context *cell)
+{
+   cell->pipe.resource_copy_region = util_resource_copy_region;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.h b/src/gallium/drivers/cell/ppu/cell_surface.h
new file mode 100644
index 0000000000..9e58f32944
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_surface.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef CELL_SURFACE_H
+#define CELL_SURFACE_H
+
+
+struct cell_context;
+
+
+extern void
+cell_init_surface_functions(struct cell_context *cell);
+
+
+#endif /* CELL_SURFACE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
new file mode 100644
index 0000000000..b3042df779
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -0,0 +1,660 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  *   Brian Paul
+  */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "cell_context.h"
+#include "cell_screen.h"
+#include "cell_state.h"
+#include "cell_texture.h"
+
+#include "state_tracker/sw_winsys.h"
+
+
+
+/** Lay out all mipmap levels (tile-aligned) and allocate the storage.
+ *  \return TRUE on success, FALSE on too many levels or out of memory. */
+static boolean
+cell_resource_layout(struct pipe_screen *screen,
+		     struct cell_resource *ct)
+{
+   struct pipe_resource *pt = &ct->base;
+   unsigned level;
+   unsigned width = pt->width0;
+   unsigned height = pt->height0;
+   unsigned depth = pt->depth0;
+
+   /* stride[] and level_offset[] hold CELL_MAX_TEXTURE_LEVELS entries;
+    * reject deeper mipmap chains even when asserts are compiled out. */
+   if (pt->last_level >= CELL_MAX_TEXTURE_LEVELS)
+      return FALSE;
+
+   ct->buffer_size = 0;
+   for (level = 0; level <= pt->last_level; level++) {
+      unsigned size;
+      unsigned w_tile, h_tile;
+
+      /* width, height, rounded up to tile size */
+      w_tile = align(width, TILE_SIZE);
+      h_tile = align(height, TILE_SIZE);
+      ct->stride[level] = util_format_get_stride(pt->format, w_tile);
+      /* this level starts where the previous one ended */
+      ct->level_offset[level] = ct->buffer_size;
+      /* one image per cube face or per depth slice */
+      size = ct->stride[level] * util_format_get_nblocksy(pt->format, h_tile);
+      if (pt->target == PIPE_TEXTURE_CUBE)
+         size *= 6;
+      else
+         size *= depth;
+      ct->buffer_size += size;
+      width = u_minify(width, 1);
+      height = u_minify(height, 1);
+      depth = u_minify(depth, 1);
+   }
+
+   ct->data = align_malloc(ct->buffer_size, 16);
+   return ct->data != NULL;
+}
+
+
+/**
+ * Create the winsys display target that backs a displayable resource.
+ * \return TRUE if the display target was created, FALSE otherwise.
+ */
+static boolean
+cell_displaytarget_layout(struct pipe_screen *screen,
+                          struct cell_resource * ct)
+{
+   struct sw_winsys *ws = cell_screen(screen)->winsys;
+   const struct pipe_resource *res = &ct->base;
+   const unsigned alignment = 16;
+
+   /* XXX Round up the surface size to a multiple of the tile size? */
+   ct->dt = ws->displaytarget_create(ws, res->bind, res->format,
+                                     res->width0, res->height0,
+                                     alignment, &ct->dt_stride);
+   if (!ct->dt)
+      return FALSE;
+
+   return TRUE;
+}
+
+/**
+ * Create a new resource from a template (pipe_screen::resource_create).
+ * \return the new resource, or NULL on failure.
+ */
+static struct pipe_resource *
+cell_resource_create(struct pipe_screen *screen,
+                    const struct pipe_resource *templat)
+{
+   const unsigned dt_binds = (PIPE_BIND_DISPLAY_TARGET |
+                              PIPE_BIND_SCANOUT |
+                              PIPE_BIND_SHARED);
+   struct cell_resource *ct = CALLOC_STRUCT(cell_resource);
+
+   if (ct == NULL)
+      return NULL;
+
+   ct->base = *templat;
+   pipe_reference_init(&ct->base.reference, 1);
+   ct->base.screen = screen;
+
+   /* Displayable resources get both a linear displaytarget and the regular
+    * twiddled texture; twiddled->linear happens at flush_frontbuffer time.
+    */
+   if ((ct->base.bind & dt_binds) == 0 ||
+       cell_displaytarget_layout(screen, ct)) {
+      if (cell_resource_layout(screen, ct))
+         return &ct->base;
+   }
+
+   /* failure: undo whatever was created above */
+   if (ct->dt) {
+      struct sw_winsys *winsys = cell_screen(screen)->winsys;
+      winsys->displaytarget_destroy(winsys, ct->dt);
+   }
+   FREE(ct);
+   return NULL;
+}
+
+
+/** Free a resource: its display target (if any), texture storage and struct. */
+static void
+cell_resource_destroy(struct pipe_screen *scrn, struct pipe_resource *pt)
+{
+   struct sw_winsys *winsys = cell_screen(scrn)->winsys;
+   struct cell_resource *ct = cell_resource(pt);
+
+   if (ct->dt)
+      winsys->displaytarget_destroy(winsys, ct->dt);
+
+   /* cell_resource_create() allocates ct->data even when a display target
+    * exists, so free it regardless of ct->dt.  User buffers are not ours.
+    */
+   if (ct->data && !ct->userBuffer)
+      align_free(ct->data);
+   FREE(ct);
+}
+
+
+
+/** Convert image from linear layout to tiled layout.  4-byte pixels.
+ * w and h are in pixels, src_stride is in bytes.  Tiles are written to dst
+ * in row-major tile order, tile_size * tile_size uints per tile. */
+static void
+twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                   uint src_stride, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
+
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   src_stride /= 4; /* convert from bytes to pixels */
+
+   /* loop over dest tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of dest tile: */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
+
+         /* loop over texels in the tile; partial edge tiles are only
+          * written for the texels that exist in the source image */
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               ASSERT(srci < h);
+               ASSERT(srcj < w);
+               tdst[i * tile_size + j] = src[srci * src_stride + srcj];
+            }
+         }
+      }
+   }
+}
+
+
+/**
+ * For Cell.  Turn each run of four texels into a 2x2 quad, i.e. from:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ * to:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+   int row, col;
+   for (row = 0; row < TILE_SIZE; row += 2) {
+      uint *upper = tileOut + row * TILE_SIZE;  /* output row of p0, p1 */
+      uint *lower = upper + TILE_SIZE;          /* output row of p2, p3 */
+      for (col = 0; col < TILE_SIZE; col += 2) {
+         const uint *quad = tileIn + 4 * (row/2 * TILE_SIZE/2 + col/2);
+         upper[col + 0] = quad[0];
+         upper[col + 1] = quad[1];
+         lower[col + 0] = quad[2];
+         lower[col + 1] = quad[3];
+      }
+   }
+}
+
+
+/**
+ * Convert image from tiled layout to linear layout.  4-byte pixels.
+ * dst_stride is in bytes.  No-op if the scratch tile can't be allocated.
+ */
+static void
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                     uint dst_stride, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
+   uint *tile_buf;
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   dst_stride /= 4; /* convert from bytes to pixels */
+   tile_buf = align_malloc(tile_size2 * 4, 16);
+   if (!tile_buf)
+      return; /* out of memory; twiddle_tile() must not be handed NULL */
+
+   /* loop over src tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of src tile: */
+         const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+         
+         twiddle_tile(tsrc, tile_buf);
+         tsrc = tile_buf;
+
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
+
+         /* loop over texels in the tile */
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               uint dsti = it * tile_size + i;
+               uint dstj = jt * tile_size + j;
+               ASSERT(dsti < h);
+               ASSERT(dstj < w);
+               dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
+            }
+         }
+      }
+   }
+   align_free(tile_buf);
+}
+
+
+/** Create a surface for one face/level/zslice image (NULL if out of memory). */
+static struct pipe_surface *
+cell_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_resource *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned usage)
+{
+   struct cell_resource *ct = cell_resource(pt);
+   struct pipe_surface *ps = CALLOC_STRUCT(pipe_surface);
+
+   if (!ps)
+      return NULL;
+
+   pipe_reference_init(&ps->reference, 1);
+   pipe_resource_reference(&ps->texture, pt);
+   ps->format = pt->format;
+   ps->width = u_minify(pt->width0, level);
+   ps->height = u_minify(pt->height0, level);
+   ps->offset = ct->level_offset[level];
+   /* XXX may need to override usage flags (see sp_texture.c) */
+   ps->usage = usage;
+   ps->face = face;
+   ps->level = level;
+   ps->zslice = zslice;
+
+   if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
+      /* skip the images of the preceding cube faces / depth slices */
+      const unsigned image = (pt->target == PIPE_TEXTURE_CUBE) ? face : zslice;
+      const unsigned h_tile = align(ps->height, TILE_SIZE);
+      ps->offset += image * util_format_get_nblocksy(ps->format, h_tile) * ct->stride[level];
+   }
+   else {
+      assert(face == 0);
+      assert(zslice == 0);
+   }
+
+   return ps;
+}
+
+
+static void 
+cell_tex_surface_destroy(struct pipe_surface *surf)
+{
+   pipe_resource_reference(&surf->texture, NULL); /* drop the texture ref */
+   FREE(surf);
+}
+
+
+/**
+ * Create a new pipe_transfer object describing access to one box of a
+ * resource level (used to upload texture data and to read it back).
+ * \return the transfer, or NULL if out of memory.
+ */
+static struct pipe_transfer *
+cell_get_transfer(struct pipe_context *ctx,
+		  struct pipe_resource *resource,
+		  struct pipe_subresource sr,
+		  unsigned usage,
+		  const struct pipe_box *box)
+{
+   struct cell_resource *ct = cell_resource(resource);
+   enum pipe_format format = resource->format;
+   struct cell_transfer *ctrans;
+   struct pipe_transfer *pt;
+
+   assert(resource);
+   assert(sr.level <= resource->last_level);
+
+   /* make sure the requested region is in the image bounds */
+   assert(box->x + box->width <= u_minify(resource->width0, sr.level));
+   assert(box->y + box->height <= u_minify(resource->height0, sr.level));
+   assert(box->z + box->depth <= u_minify(resource->depth0, sr.level));
+
+   ctrans = CALLOC_STRUCT(cell_transfer);
+   if (!ctrans)
+      return NULL;
+
+   pt = &ctrans->base;
+   pipe_resource_reference(&pt->resource, resource);
+   pt->sr = sr;
+   pt->usage = usage;
+   pt->box = *box;
+   pt->stride = ct->stride[sr.level];
+   ctrans->offset = ct->level_offset[sr.level];
+
+   if (resource->target == PIPE_TEXTURE_CUBE ||
+       resource->target == PIPE_TEXTURE_3D) {
+      /* step over the preceding cube faces / depth slices of this level */
+      const unsigned image =
+         (resource->target == PIPE_TEXTURE_CUBE) ? sr.face : box->z;
+      const unsigned h_tile = align(u_minify(resource->height0, sr.level), TILE_SIZE);
+      ctrans->offset += image * util_format_get_nblocksy(format, h_tile) * pt->stride;
+   }
+   else {
+      assert(sr.face == 0);
+      assert(box->z == 0);
+   }
+
+   return pt;
+}
+
+
+/** pipe_context::transfer_destroy: drop the resource reference and free. */
+static void
+cell_transfer_destroy(struct pipe_context *ctx, struct pipe_transfer *t)
+{
+   struct cell_transfer *ctrans = cell_transfer(t);
+   /* Post-processing of texture images into the hardware layout would
+    * belong here (the texture_update work); for cell, nothing to do.
+    */
+   assert(t->resource);
+   pipe_resource_reference(&t->resource, NULL);
+   FREE(ctrans);
+}
+
+
+/** Return pointer to texture image data in linear layout (a temporary
+ * malloc'ed copy for textures, a pointer into the storage for buffers),
+ * or NULL if out of memory. */
+static void *
+cell_transfer_map(struct pipe_context *ctx, struct pipe_transfer *transfer)
+{
+   struct cell_transfer *ctrans = cell_transfer(transfer);
+   struct pipe_resource *pt = transfer->resource;
+   struct cell_resource *ct = cell_resource(pt);
+
+   assert(transfer->resource);
+
+   if (ct->mapped == NULL) {
+      ct->mapped = ct->data; /* set on first map; not reset in this file */
+   }
+
+   /* Textures get a temporary linear copy; buffers are mapped in place. */
+   /* Better test would be resource->is_linear
+    */
+   if (transfer->resource->target != PIPE_BUFFER) {
+      const uint level = ctrans->base.sr.level;
+      const uint texWidth = u_minify(pt->width0, level);
+      const uint texHeight = u_minify(pt->height0, level);
+      unsigned size;
+      /* NOTE(review): without PIPE_TRANSFER_READ the copy below starts out
+       * uninitialized - confirm writers always fill the whole level. */
+      /*
+       * Create a buffer of ordinary memory for the linear texture.
+       * This is the memory that the user will read/write.
+       */
+      size = (util_format_get_stride(pt->format, align(texWidth, TILE_SIZE)) *
+	      util_format_get_nblocksy(pt->format, align(texHeight, TILE_SIZE)));
+
+      ctrans->map = align_malloc(size, 16);
+      if (!ctrans->map)
+	 return NULL; /* out of memory */
+
+      if (transfer->usage & PIPE_TRANSFER_READ) {
+	 /* Textures always stored twiddled, need to untwiddle the
+	  * texture to make a linear version.
+	  */
+	 const uint bpp = util_format_get_blocksize(ct->base.format);
+	 if (bpp == 4) {
+	    const uint *src = (uint *) (ct->mapped + ctrans->offset);
+	    uint *dst = ctrans->map;
+	    untwiddle_image_uint(texWidth, texHeight, TILE_SIZE,
+				 dst, transfer->stride, src);
+	 }
+	 else {
+	    /* XXX fix: texels that are not 4 bytes are never untwiddled */
+	 }
+      }
+   }
+   else {
+      unsigned stride = transfer->stride;
+      enum pipe_format format = pt->format;
+      unsigned blocksize = util_format_get_blocksize(format);
+
+      ctrans->map = (ct->mapped + 
+		     ctrans->offset +
+		     ctrans->base.box.y / util_format_get_blockheight(format) * stride +
+		     ctrans->base.box.x / util_format_get_blockwidth(format) * blocksize);
+   }
+
+
+   return ctrans->map;
+}
+
+
+/**
+ * Called when user is done reading/writing texture data.
+ * If new data was written, this is where we convert the linear data
+ * to tiled data.  The temporary linear copy handed out for textures by
+ * cell_transfer_map() is freed here; buffers need no work.
+ */
+static void
+cell_transfer_unmap(struct pipe_context *ctx,
+                    struct pipe_transfer *transfer)
+{
+   struct cell_transfer *ctrans = cell_transfer(transfer);
+   struct pipe_resource *pt = transfer->resource;
+   struct cell_resource *ct = cell_resource(pt);
+
+   if (!ct->mapped) {
+      assert(0);
+      return;
+   }
+
+   if (pt->target != PIPE_BUFFER) {
+      const boolean wrote = (transfer->usage & PIPE_TRANSFER_WRITE) != 0;
+
+      /* The user wrote new texture data into the mapped buffer: convert
+       * the linear data back to the twiddled/tiled format.
+       * XXX only 4-byte texels are handled.
+       */
+      if (wrote && util_format_get_blocksize(ct->base.format) == 4) {
+         const uint level = transfer->sr.level;
+         const uint texWidth = u_minify(pt->width0, level);
+         const uint texHeight = u_minify(pt->height0, level);
+         const uint *linear = ctrans->map;
+         uint *tiled = (uint *) (ct->mapped + ctrans->offset);
+
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE,
+                            tiled, ct->stride[level], linear);
+      }
+
+      /* done with the temporary linear copy */
+      align_free(ctrans->map);
+   }
+   else {
+      /* buffers are mapped in place: nothing to do */
+   }
+
+   ctrans->map = NULL;
+}
+
+
+
+/* This used to be overridden by the co-state tracker, but really needs
+ * to be active with sw_winsys.
+ *
+ * Contrasting with llvmpipe and softpipe, this is the only place
+ * where we use the ct->dt display target in any real sense.
+ *
+ * Basically just untwiddle our local data into the linear
+ * displaytarget.  Nothing is displayed if the resource has no display
+ * target or the display target cannot be mapped.
+ */
+static void
+cell_flush_frontbuffer(struct pipe_screen *_screen,
+                       struct pipe_surface *surface,
+                       void *context_private)
+{
+   struct cell_screen *screen = cell_screen(_screen);
+   struct sw_winsys *winsys = screen->winsys;
+   struct cell_resource *ct = cell_resource(surface->texture);
+   const unsigned *src;
+   unsigned *map;
+
+   if (!ct->dt)
+      return;
+
+   map = winsys->displaytarget_map(winsys, ct->dt,
+                                   (PIPE_TRANSFER_READ |
+                                    PIPE_TRANSFER_WRITE));
+   if (!map)
+      return; /* mapping failed; don't untwiddle through a NULL pointer */
+
+   /* Need to untwiddle from our internal representation here:
+    */
+   src = (const unsigned *) ((const ubyte *) ct->data
+                             + ct->level_offset[surface->level]);
+
+   untwiddle_image_uint(surface->width, surface->height, TILE_SIZE,
+                        map, ct->dt_stride, src);
+   winsys->displaytarget_unmap(winsys, ct->dt);
+
+   winsys->displaytarget_display(winsys, ct->dt, context_private);
+}
+
+
+
+/**
+ * Create buffer which wraps user-space data (referenced, not copied).
+ */
+static struct pipe_resource *
+cell_user_buffer_create(struct pipe_screen *screen,
+                            void *ptr,
+                            unsigned bytes,
+			    unsigned bind_flags)
+{
+   struct cell_resource *buf = CALLOC_STRUCT(cell_resource);
+   struct pipe_resource *base;
+
+   if (buf == NULL)
+      return NULL;
+
+   base = &buf->base;
+   pipe_reference_init(&base->reference, 1);
+   base->screen = screen;
+   base->format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   base->bind = PIPE_BIND_TRANSFER_READ | bind_flags;
+   base->usage = PIPE_USAGE_IMMUTABLE;
+   base->flags = 0;
+   base->width0 = bytes;
+   base->height0 = 1;
+   base->depth0 = 1;
+   buf->userBuffer = TRUE;   /* tells destroy not to free buf->data */
+   buf->data = ptr;
+   return base;
+}
+
+
+static struct pipe_resource *
+cell_resource_from_handle(struct pipe_screen *screen,
+                          const struct pipe_resource *templat,
+                          struct winsys_handle *handle)
+{
+   /* XXX todo: importing a resource from a winsys handle is unimplemented */
+   return NULL;
+}
+
+
+static boolean 
+cell_resource_get_handle(struct pipe_screen *scree,
+                         struct pipe_resource *tex,
+                         struct winsys_handle *handle)
+{
+   /* XXX todo: exporting a winsys handle is unimplemented; always fails */
+   return FALSE;
+}
+
+
+void
+cell_init_screen_texture_funcs(struct pipe_screen *screen)
+{
+   screen->resource_create = cell_resource_create;
+   screen->resource_destroy = cell_resource_destroy;
+   screen->resource_from_handle = cell_resource_from_handle;
+   screen->resource_get_handle = cell_resource_get_handle;
+   screen->user_buffer_create = cell_user_buffer_create;
+   /* surfaces (views of a single texture image) */
+   screen->get_tex_surface = cell_get_tex_surface;
+   screen->tex_surface_destroy = cell_tex_surface_destroy;
+   /* untwiddles into the display target and presents it */
+   screen->flush_frontbuffer = cell_flush_frontbuffer;
+}
+
+void
+cell_init_texture_transfer_funcs(struct cell_context *cell)
+{
+   cell->pipe.get_transfer = cell_get_transfer;
+   cell->pipe.transfer_destroy = cell_transfer_destroy;
+   cell->pipe.transfer_map = cell_transfer_map;
+   cell->pipe.transfer_unmap = cell_transfer_unmap;
+   /* the remaining hooks use the u_default_* helpers */
+   cell->pipe.transfer_flush_region = u_default_transfer_flush_region;
+   cell->pipe.transfer_inline_write = u_default_transfer_inline_write;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
new file mode 100644
index 0000000000..bd8224b3b7
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -0,0 +1,102 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_TEXTURE_H
+#define CELL_TEXTURE_H
+
+#include "cell/common.h"
+
+struct cell_context;
+struct pipe_resource;
+
+
+/**
+ * Subclass of pipe_resource
+ */
+struct cell_resource
+{
+   struct pipe_resource base;  /**< must be first: cast to/from pipe_resource */
+
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS]; /**< byte offset in data */
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];       /**< row stride, bytes */
+
+   /**
+    * Display target, for textures with the PIPE_BIND_DISPLAY_TARGET
+    * usage.
+    */
+   struct sw_displaytarget *dt;
+   unsigned dt_stride;  /**< stride returned by displaytarget_create() */
+
+   /**
+    * Malloc'ed data for regular textures, or a mapping to dt above.
+    */
+   void *data;
+   boolean userBuffer;  /**< data wraps user memory and must not be freed */
+
+   /* Size of the linear buffer??
+    */
+   unsigned long buffer_size;
+
+   /** The buffer above, mapped.  This is the memory from which the
+    * SPUs will fetch texels.  This texture data is in the tiled layout.
+    */
+   ubyte *mapped;
+};
+
+
+struct cell_transfer
+{
+   struct pipe_transfer base;  /* must be first: cast to/from pipe_transfer */
+
+   unsigned long offset;  /* byte offset of the transfer's image in the data */
+   void *map;             /* pointer handed out by transfer_map, or NULL */
+};
+
+
+/** cast wrapper: pipe_resource -> enclosing cell_resource (unchecked) */
+static INLINE struct cell_resource *
+cell_resource(struct pipe_resource *pt)
+{
+   return (struct cell_resource *) pt;
+}
+
+
+/** cast wrapper: pipe_transfer -> enclosing cell_transfer (unchecked) */
+static INLINE struct cell_transfer *
+cell_transfer(struct pipe_transfer *pt)
+{
+   return (struct cell_transfer *) pt;
+}
+
+
+extern void
+cell_init_screen_texture_funcs(struct pipe_screen *screen);
+
+extern void
+cell_init_texture_transfer_funcs(struct cell_context *cell);
+
+#endif /* CELL_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
new file mode 100644
index 0000000000..37b7195648
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -0,0 +1,332 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Vertex buffer code.  The draw module transforms vertices to window
+ * coords, etc. and emits the vertices into buffer supplied by this module.
+ * When a vertex buffer is full, or we flush, we'll send the vertex data
+ * to the SPUs.
+ *
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include "cell_batch.h"
+#include "cell_context.h"
+#include "cell_fence.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_vbuf.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
+
+
+/** Allow vertex data to be inlined after RENDER command */
+#define ALLOW_INLINE_VERTS 1
+
+
+/**
+ * Subclass of vbuf_render because we need a cell_context pointer in
+ * a few places.
+ */
+struct cell_vbuf_render
+{
+   struct vbuf_render base;  /**< must be first: cast to/from vbuf_render */
+   struct cell_context *cell;
+   uint prim;            /**< PIPE_PRIM_x */
+   uint vertex_size;     /**< in bytes */
+   void *vertex_buffer;  /**< just for debug, really */
+   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1], or ~0 if none */
+   uint vertex_buffer_size;  /**< size in bytes */
+};
+
+
+/** cast wrapper: vbuf_render -> enclosing cell_vbuf_render (unchecked) */
+static struct cell_vbuf_render *
+cell_vbuf_render(struct vbuf_render *vbr)
+{
+   return (struct cell_vbuf_render *) vbr;
+}
+
+
+
+/** vbuf_render::get_vertex_info: return the owning context's vertex_info. */
+static const struct vertex_info *
+cell_vbuf_get_vertex_info(struct vbuf_render *vbr)
+{
+   return &cell_vbuf_render(vbr)->cell->vertex_info;
+}
+
+
+/** vbuf_render::allocate_vertices: take an empty buffer from the context. */
+static boolean
+cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
+                            ushort vertex_size, ushort nr_vertices)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /* the previous vertex buffer must have been released already */
+   assert(cvbr->vertex_buf == ~0);
+   cvbr->vertex_buf = cell_get_empty_buffer(cell);
+   cvbr->vertex_buffer = cell->buffer[cvbr->vertex_buf];
+   cvbr->vertex_buffer_size = vertex_size * nr_vertices;
+   cvbr->vertex_size = vertex_size;
+   return cvbr->vertex_buffer != NULL;
+}
+
+
+/**
+ * vbuf_render::release_vertices: give up the current vertex buffer.
+ * Queues a RELEASE_VERTS command if a buffer is held, then flushes.
+ */
+static void
+cell_vbuf_release_vertices(struct vbuf_render *vbr)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /* Texture buffers must not be released until we're done rendering
+    * with them.
+    */
+   cell_add_fenced_textures(cell);
+
+   if (cvbr->vertex_buf != ~0U) {
+      /* Tell SPUs they can release the vert buf */
+      struct cell_command_release_verts *cmd;
+
+      STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0);
+      cmd = (struct cell_command_release_verts *)
+         cell_batch_alloc16(cell, sizeof(*cmd));
+      cmd->opcode[0] = CELL_CMD_RELEASE_VERTS;
+      cmd->vertex_buf = cvbr->vertex_buf;
+   }
+
+   cvbr->vertex_buf = ~0;  /* no vertex buffer held */
+   cell_flush_int(cell, 0x0);
+
+   cvbr->vertex_buffer = NULL;
+}
+
+
+/** vbuf_render::map_vertices: the buffer chosen by allocate_vertices. */
+static void *
+cell_vbuf_map_vertices(struct vbuf_render *vbr)
+{
+   return cell_vbuf_render(vbr)->vertex_buffer;
+}
+
+
+static void 
+cell_vbuf_unmap_vertices(struct vbuf_render *vbr, 
+                         ushort min_index,
+                         ushort max_index )
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   assert( cvbr->vertex_buffer_size >= (max_index+1) * cvbr->vertex_size );
+   /* nothing to unmap; the assert only checks that max_index fits the buffer */
+}
+
+
+
+/** vbuf_render::set_primitive: record the PIPE_PRIM_x type; always succeeds. */
+static boolean
+cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
+{
+   struct cell_vbuf_render *render = cell_vbuf_render(vbr);
+   render->prim = prim;
+   return TRUE;
+}
+
+
+/** vbuf_render::draw_elements: queue a RENDER command whose payload is the
+ * index list and, when they fit in the batch, the vertices as well.
+ * Only PIPE_PRIM_TRIANGLES is rendered for now. */
+static void
+cell_vbuf_draw_elements(struct vbuf_render *vbr,
+                        const ushort *indices,
+                        uint nr_indices)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+   float xmin, ymin, xmax, ymax;
+   uint i;
+   uint nr_vertices = 0, min_index = ~0;
+   const void *vertices = cvbr->vertex_buffer;
+   const uint vertex_size = cvbr->vertex_size;
+
+   for (i = 0; i < nr_indices; i++) {
+      if (indices[i] > nr_vertices)
+         nr_vertices = indices[i];
+      if (indices[i] < min_index)
+         min_index = indices[i];
+   }
+   nr_vertices++;  /* highest referenced index + 1 */
+
+#if 0
+   /*if (min_index > 0)*/
+      printf("%s min_index = %u\n", __FUNCTION__, min_index);
+#endif
+#if 0
+   printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n",
+          nr_indices, nr_vertices);
+   printf("  ");
+   for (i = 0; i < nr_indices; i += 3) {
+      printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
+   }
+   printf("\n");
+#elif 0
+   printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
+          nr_indices, nr_vertices,
+          indices[0], indices[1], indices[2]);
+   printf("ind space = %u, vert space = %u, space = %u\n",
+          nr_indices * 2,
+          nr_vertices * 4 * cell->vertex_info.size,
+          cell_batch_free_space(cell));
+#endif
+
+   /* compute x/y bounding box */
+   xmin = ymin = 1e50;
+   xmax = ymax = -1e50;
+   for (i = min_index; i < nr_vertices; i++) {
+      const float *v = (float *) ((ubyte *) vertices + i * vertex_size);
+      if (v[0] < xmin)
+         xmin = v[0];
+      if (v[0] > xmax)
+         xmax = v[0];
+      if (v[1] < ymin)
+         ymin = v[1];
+      if (v[1] > ymax)
+         ymax = v[1];
+   }
+#if 0
+   printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax);
+   fflush(stdout);
+#endif
+
+   if (cvbr->prim != PIPE_PRIM_TRIANGLES)
+      return; /* only render tris for now */
+
+   /* build/insert batch RENDER command */
+   {
+      const uint index_bytes = ROUNDUP16(nr_indices * 2);
+      const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size);
+      STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0);
+      const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
+
+      struct cell_command_render *render
+         = (struct cell_command_render *)
+         cell_batch_alloc16(cell, batch_size);
+      render->opcode[0] = CELL_CMD_RENDER;
+      render->prim_type = cvbr->prim;
+      render->num_indexes = nr_indices;
+      render->min_index = min_index;
+
+      /* append indices after render command */
+      memcpy(render + 1, indices, nr_indices * 2);
+
+      /* if there's room, append vertices after the indices, else leave
+       * vertices in the original/separate buffer.
+       */
+      render->vertex_size = 4 * cell->vertex_info.size;
+      render->num_verts = nr_vertices;
+      if (ALLOW_INLINE_VERTS &&
+          min_index == 0 &&
+          vertex_bytes + 16 <= cell_batch_free_space(cell)) {
+         /* vertex data inlined, after indices, at 16-byte boundary */
+         void *dst = cell_batch_alloc16(cell, vertex_bytes);
+         memcpy(dst, vertices, vertex_bytes);
+         render->inline_verts = TRUE;
+         render->vertex_buf = ~0;
+      }
+      else {
+         /* vertex data in separate buffer; ~0 means none is allocated */
+         render->inline_verts = FALSE;
+         ASSERT(cvbr->vertex_buf != ~0U);
+         render->vertex_buf = cvbr->vertex_buf;
+      }
+
+      render->xmin = xmin;
+      render->ymin = ymin;
+      render->xmax = xmax;
+      render->ymax = ymax;
+   }
+
+#if 0
+   /* helpful for debug */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+#endif
+}
+
+
+static void
+cell_vbuf_destroy(struct vbuf_render *vbr)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   cvbr->cell->vbuf_render = NULL;  /* context must not keep a dangling pointer */
+   FREE(cvbr);
+}
+
+
+/** Initialize the post-transform vertex buffer information for the given
+ * context.  On allocation failure cell->vbuf_render is left NULL and
+ * cell->vbuf is not set. */
+void
+cell_init_vbuf(struct cell_context *cell)
+{
+   struct cell_vbuf_render *cvbr;
+
+   assert(cell->draw);
+
+   cell->vbuf_render = cvbr = CALLOC_STRUCT(cell_vbuf_render);
+   if (!cvbr)
+      return; /* out of memory: don't dereference the NULL renderer */
+
+   /* The max number of indexes is what can fit into a batch buffer,
+    * minus the render and release-verts commands.
+    */
+   cvbr->base.max_indices = (CELL_BUFFER_SIZE
+                             - sizeof(struct cell_command_render)
+                             - sizeof(struct cell_command_release_verts))
+                            / sizeof(ushort);
+   cvbr->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
+
+   cvbr->base.get_vertex_info = cell_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = cell_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = cell_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = cell_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = cell_vbuf_set_primitive;
+   cvbr->base.draw_elements = cell_vbuf_draw_elements;
+   cvbr->base.release_vertices = cell_vbuf_release_vertices;
+   cvbr->base.destroy = cell_vbuf_destroy;
+
+   cvbr->cell = cell;
+   cvbr->vertex_buf = ~0;   /* no vertex buffer allocated yet */
+
+   cell->vbuf = draw_vbuf_stage(cell->draw, &cvbr->base);
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.h b/src/gallium/drivers/cell/ppu/cell_vbuf.h
new file mode 100644
index 0000000000..d265cbf770
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef CELL_VBUF_H
+#define CELL_VBUF_H
+
+
+struct cell_context;
+
+/**
+ * Initialize the post-transform vertex buffer (vbuf) information for the
+ * given context.
+ */
+extern void
+cell_init_vbuf(struct cell_context *cell);
+
+
+#endif /* CELL_VBUF_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
new file mode 100644
index 0000000000..9cba537d9e
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -0,0 +1,346 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <inttypes.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+
+#include "../auxiliary/draw/draw_context.h"
+#include "../auxiliary/draw/draw_private.h"
+
+#include "cell_context.h"
+#include "rtasm/rtasm_ppc_spe.h"
+
+
+/**
+ * Emit a 4x4 matrix transpose operation
+ *
+ * \param p         Function that the transpose operation is to be appended to
+ * \param row0      Register containing row 0 of the source matrix
+ * \param row1      Register containing row 1 of the source matrix
+ * \param row2      Register containing row 2 of the source matrix
+ * \param row3      Register containing row 3 of the source matrix
+ * \param dest_ptr  Register containing the address of the destination matrix
+ * \param shuf_ptr  Register containing the address of the shuffle data
+ *                  (the two masks used here are loaded from byte offsets
+ *                  3*16 and 4*16 relative to this address)
+ * \param count     Number of columns to actually be written to the
+ *                  destination.  Values outside 1..4 emit no stores.
+ *
+ * \note
+ * This function assumes that the registers named by \c row0, \c row1,
+ * \c row2, and \c row3 are scratch and can be modified by the generated code.
+ * Furthermore, these registers will be released, via calls to
+ * \c spe_release_register, by this function.
+ * 
+ * \note
+ * This function requires that four temporary registers are available on
+ * entry.
+ */
+static void
+emit_matrix_transpose(struct spe_function *p,
+		      unsigned row0, unsigned row1, unsigned row2,
+		      unsigned row3, unsigned dest_ptr,
+		      unsigned shuf_ptr, unsigned count)
+{
+   /* The four temporaries mentioned in the header comment. */
+   int shuf_hi = spe_allocate_available_register(p);
+   int shuf_lo = spe_allocate_available_register(p);
+   int t1 = spe_allocate_available_register(p);
+   int t2 = spe_allocate_available_register(p);
+   /* The remaining names are aliases for registers that become free as the
+    * transpose proceeds; they are assigned below, not allocated.
+    */
+   int t3;
+   int t4;
+   int col0;
+   int col1;
+   int col2;
+   int col3;
+
+
+   /* Load the two shuffle masks and combine rows 0 and 2. */
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
+   spe_shufb(p, t1, row0, row2, shuf_hi);
+   spe_shufb(p, t2, row0, row2, shuf_lo);
+
+
+   /* row0 and row2 are now no longer needed.  Re-use those registers as
+    * temporaries.
+    */
+   t3 = row0;
+   t4 = row2;
+
+   spe_shufb(p, t3, row1, row3, shuf_hi);
+   spe_shufb(p, t4, row1, row3, shuf_lo);
+
+
+   /* row1 and row3 are now no longer needed.  Re-use those registers as
+    * temporaries.
+    */
+   col0 = row1;
+   col1 = row3;
+
+   /* Only the columns that will actually be stored are computed. */
+   spe_shufb(p, col0, t1, t3, shuf_hi);
+   if (count > 1) {
+      spe_shufb(p, col1, t1, t3, shuf_lo);
+   }
+
+   /* t1 and t3 are now no longer needed.  Re-use those registers as
+    * temporaries.
+    */
+   col2 = t1;
+   col3 = t3;
+
+   if (count > 2) {
+      spe_shufb(p, col2, t2, t4, shuf_hi);
+   }
+
+   if (count > 3) {
+      spe_shufb(p, col3, t2, t4, shuf_lo);
+   }
+
+
+   /* Store the results.  Remember that the stqd instruction is encoded using
+    * the qword offset (stand-alone assemblers do the byte-offset to
+    * qword-offset conversion for you), so the byte-offset needs to be divided
+    * by 16.
+    * NOTE(review): the calls below pass byte offsets (n * 16); presumably
+    * spe_stqd() performs that division itself -- confirm against
+    * rtasm_ppc_spe.
+    *
+    * The cases deliberately fall through: a count of N stores columns
+    * N-1 down to 0.
+    */
+   switch (count) {
+   case 4:
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
+      /* fall through */
+   case 3:
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
+      /* fall through */
+   case 2:
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
+      /* fall through */
+   case 1:
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
+   }
+
+
+   /* Release all of the temporary registers used.
+    * col0..col3 alias row1, row3, t1 and row0 (via t3), and t4 aliases row2,
+    * so these eight calls cover the four input rows plus the four
+    * temporaries allocated above, each exactly once.
+    */
+   spe_release_register(p, col0);
+   spe_release_register(p, col1);
+   spe_release_register(p, col2);
+   spe_release_register(p, col3);
+   spe_release_register(p, shuf_hi);
+   spe_release_register(p, shuf_lo);
+   spe_release_register(p, t2);
+   spe_release_register(p, t4);
+}
+
+
+#if 0
+/* This appears to not be used currently */
+/**
+ * Emit SPE code that loads four quadwords of attribute data, converts them
+ * according to \c format, and writes the transposed result to \c out_ptr.
+ * Components missing from the format are filled in after the transpose
+ * with 0.0 (second and third quadword) and 1.0 (fourth quadword).
+ *
+ * \param p         Function the code is appended to
+ * \param in_ptr    Register holding the address of the input data
+ * \param offset    In/out: qword index of the next input to read;
+ *                  advanced by 4 here
+ * \param out_ptr   Register holding the address of the output
+ * \param shuf_ptr  Register holding the address of the shuffle data
+ * \param format    Source format of the attribute
+ *
+ * NOTE: this whole function is compiled out (#if 0); the review notes
+ * below record suspicious spots to check before it is re-enabled.
+ */
+static void
+emit_fetch(struct spe_function *p,
+	   unsigned in_ptr, unsigned *offset,
+	   unsigned out_ptr, unsigned shuf_ptr,
+	   enum pipe_format format)
+{
+   /* number of components with a non-zero size in this format */
+   const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0)
+       + (pf_size_z(format) != 0) + (pf_size_w(format) != 0);
+   const unsigned type = pf_type(format);
+   /* size of the X component, used as the per-component size for all
+    * (presumably in bytes, given the 1/2/4 switch below -- TODO confirm)
+    */
+   const unsigned bytes = pf_size_x(format);
+
+   int v0 = spe_allocate_available_register(p);
+   int v1 = spe_allocate_available_register(p);
+   int v2 = spe_allocate_available_register(p);
+   int v3 = spe_allocate_available_register(p);
+   int tmp = spe_allocate_available_register(p);
+   /* -1 means "not allocated"; checked before release at the end */
+   int float_zero = -1;
+   int float_one = -1;
+   float scale_signed = 0.0;
+   float scale_unsigned = 0.0;
+
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
+   offset[0] += 4;
+   
+   /* Pick normalization scales and, for 1- and 2-byte components, rearrange
+    * the loaded data using a shuffle mask taken from shuf_ptr.
+    */
+   switch (bytes) {
+   case 1:
+      scale_signed = 1.0f / 127.0f;
+      scale_unsigned = 1.0f / 255.0f;
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
+      spe_shufb(p, v0, v0, v0, tmp);
+      spe_shufb(p, v1, v1, v1, tmp);
+      spe_shufb(p, v2, v2, v2, tmp);
+      spe_shufb(p, v3, v3, v3, tmp);
+      break;
+   case 2:
+      scale_signed = 1.0f / 32767.0f;
+      scale_unsigned = 1.0f / 65535.0f;
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
+      spe_shufb(p, v0, v0, v0, tmp);
+      spe_shufb(p, v1, v1, v1, tmp);
+      spe_shufb(p, v2, v2, v2, tmp);
+      spe_shufb(p, v3, v3, v3, tmp);
+      break;
+   case 4:
+      scale_signed = 1.0f / 2147483647.0f;
+      scale_unsigned = 1.0f / 4294967295.0f;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   /* Convert to float according to the format's type.
+    * NOTE(review): only v0 is converted/scaled below; v1..v3 are passed to
+    * the transpose unconverted -- looks incomplete, confirm intent.
+    * NOTE(review): "(unsigned) scale_*" is a C value conversion of a float
+    * that is always < 1, so it yields 0 rather than the float's bit
+    * pattern; the ilhu/iohl pair therefore loads 0 -- confirm and fix
+    * before enabling.
+    */
+   switch (type) {
+   case PIPE_FORMAT_TYPE_FLOAT:
+      break;
+   case PIPE_FORMAT_TYPE_UNORM:
+      spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16);
+      spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff);
+      spe_cuflt(p, v0, v0, 0);
+      spe_fm(p, v0, v0, tmp);
+      break;
+   case PIPE_FORMAT_TYPE_SNORM:
+      spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16);
+      spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff);
+      spe_csflt(p, v0, v0, 0);
+      spe_fm(p, v0, v0, tmp);
+      break;
+   case PIPE_FORMAT_TYPE_USCALED:
+      spe_cuflt(p, v0, v0, 0);
+      break;
+   case PIPE_FORMAT_TYPE_SSCALED:
+      spe_csflt(p, v0, v0, 0);
+      break;
+   }
+
+
+   /* Build the constants used to fill in missing components:
+    * 1.0 whenever fewer than 4 components exist, 0.0 when fewer than 3.
+    */
+   if (count < 4) {
+      float_one = spe_allocate_available_register(p);
+      spe_il(p, float_one, 1);
+      spe_cuflt(p, float_one, float_one, 0);
+      
+      if (count < 3) {
+	 float_zero = spe_allocate_available_register(p);
+	 spe_il(p, float_zero, 0);
+      }
+   }
+
+   /* tmp is released here, before the transpose allocates its own
+    * temporaries
+    */
+   spe_release_register(p, tmp);
+
+   /* The transpose releases v0..v3 itself. */
+   emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
+
+   /* Fill the columns the transpose did not write.  The cases fall through
+    * on purpose: count 1 writes 0, 0, 1 to columns 1..3; count 2 writes
+    * 0, 1 to columns 2..3; count 3 writes 1 to column 3.
+    */
+   switch (count) {
+   case 1:
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
+      /* fall through */
+   case 2:
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
+      /* fall through */
+   case 3:
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
+   }
+
+   if (float_zero != -1) {
+      spe_release_register(p, float_zero);
+   }
+
+   if (float_one != -1) {
+      spe_release_register(p, float_one);
+   }
+}
+#endif
+
+
+/**
+ * Regenerate the SPE attribute-fetch code for the draw context's current
+ * vertex elements.
+ *
+ * The implementation is currently compiled out (#if 0), so calling this
+ * function only reaches the assert(0) in the #else branch.
+ *
+ * The disabled code generates one fetch function per distinct source
+ * format and records, for every attribute, the byte offset of its fetch
+ * function in cell->attrib_fetch_offsets[].
+ */
+void cell_update_vertex_fetch(struct draw_context *draw)
+{
+#if 0
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct spe_function *p = &cell->attrib_fetch;
+   /* function_index[i] == lowest attribute index with the same format as i */
+   unsigned function_index[PIPE_MAX_ATTRIBS];
+   unsigned unique_attr_formats;
+   int out_ptr;
+   int in_ptr;
+   int shuf_ptr;
+   unsigned i;
+   unsigned j;
+
+
+   /* Determine how many unique input attribute formats there are.  At the
+    * same time, store the index of the lowest numbered attribute that has
+    * the same format as any non-unique format.
+    */
+   unique_attr_formats = 1;
+   function_index[0] = 0;
+   for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) {
+      const enum pipe_format curr_fmt = draw->vertex_element[i].src_format;
+
+      for (j = 0; j < i; j++) {
+	 if (curr_fmt == draw->vertex_element[j].src_format) {
+	    break;
+	 }
+      }
+      
+      /* j == i means no earlier attribute shares this format */
+      if (j == i) {
+	 unique_attr_formats++;
+      }
+
+      function_index[i] = j;
+   }
+
+
+   /* Each fetch function can be a maximum of 34 instructions (note: this is
+    * actually a slight over-estimate).
+    */
+   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
+
+
+   /* Allocate registers for the function's input parameters.
+    */
+   out_ptr = spe_allocate_register(p, 3);
+   in_ptr = spe_allocate_register(p, 4);
+   shuf_ptr = spe_allocate_register(p, 5);
+
+
+   /* Generate code for the individual attribute fetch functions.
+    */
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      unsigned offset;
+
+      if (function_index[i] == i) {
+	 /* byte offset of this function from the start of the code store */
+	 cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr 
+						     - (void *) p->store);
+
+	 offset = 0;
+	 emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr,
+		    draw->vertex_element[i].src_format);
+	 spe_bi(p, 0, 0, 0);
+
+	 /* Round up to the next 16-byte boundary.
+	  * NOTE(review): this adds (store & 0x0f) rather than
+	  * 16 - (store & 0x0f), and it moves p->store (the base used for
+	  * the offset above) rather than p->csr -- looks wrong on both
+	  * counts; confirm before re-enabling.
+	  */
+	 if ((((unsigned) p->store) & 0x0f) != 0) {
+	    const unsigned align = ((unsigned) p->store) & 0x0f;
+	    p->store = (uint32_t *) (((void *) p->store) + align);
+	 }
+      } else {
+	 /* Use the same function entry-point as a previously seen attribute
+	  * with the same format.
+	  */
+	 cell->attrib_fetch_offsets[i] = 
+	     cell->attrib_fetch_offsets[function_index[i]];
+      }
+   }
+#else
+   /* vertex fetch code generation is disabled; this must not be called */
+   assert(0);
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
new file mode 100644
index 0000000000..3d389d6ea3
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -0,0 +1,145 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file cell_vertex_shader.c
+ * Vertex shader interface routines for Cell.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "util/u_math.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_flush.h"
+#include "cell_spu.h"
+#include "cell_batch.h"
+
+#include "cell/common.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertex cache needs to be flushed.
+ *
+ * The implementation is currently compiled out (#if 0), so calling this
+ * function only reaches the assert(0) in the #else branch.
+ */
+void
+cell_vertex_shader_queue_flush(struct draw_context *draw)
+{
+#if 0
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct cell_command_vs *const vs = &cell_global.command[0].vs;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
+   unsigned i, j;
+   struct cell_attribute_fetch_code *cf;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch(draw);
+   cell_update_vertex_fetch(draw);
+
+
+   /* Tell the SPUs where the generated attribute-fetch code lives:
+    * one 64-bit opcode word followed by the fetch-code descriptor.
+    */
+   batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf));
+   batch[0] = CELL_CMD_STATE_ATTRIB_FETCH;
+   cf = (struct cell_attribute_fetch_code *) (&batch[1]);
+   cf->base = (uint64_t) cell->attrib_fetch.store;
+   cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr 
+				   - (void *) cell->attrib_fetch.store));
+
+
+   /* One VS_ARRAY_INFO command per vertex attribute. */
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      const enum pipe_format format = draw->vertex_element[i].src_format;
+      /* number of components with a non-zero size */
+      const unsigned count = ((pf_size_x(format) != 0)
+			      + (pf_size_y(format) != 0)
+			      + (pf_size_z(format) != 0)
+			      + (pf_size_w(format) != 0));
+      /* X component size times component count */
+      const unsigned size = pf_size_x(format) * count;
+
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
+
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->size = size;
+      array_info->function_offset = cell->attrib_fetch_offsets[i];
+   }
+
+   /* Viewport state is copied into the batch verbatim. */
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
+                            + sizeof(struct pipe_viewport_state));
+   batch[0] = CELL_CMD_STATE_VIEWPORT;
+   (void) memcpy(&batch[1], &draw->viewport,
+                 sizeof(struct pipe_viewport_state));
+
+   /* Uniforms are passed by address, not copied. */
+   {
+      uint64_t uniforms = (uintptr_t) draw->user.constants;
+
+      batch = cell_batch_alloc(cell, 2 *sizeof(batch[0]));
+      batch[0] = CELL_CMD_STATE_UNIFORMS;
+      batch[1] = uniforms;
+   }
+
+   cell_batch_flush(cell);
+
+   vs->opcode = CELL_CMD_VS_EXECUTE;
+   vs->nr_attrs = draw->vertex_fetch.nr_attrs;
+
+   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
+   vs->nr_planes = draw->nr_planes;
+
+   /* Shade the queue in chunks of SPU_VERTS_PER_BATCH vertices, waiting
+    * for each chunk to complete before sending the next.
+    */
+   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) {
+      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         vs->elts[j] = draw->vs.queue[i + j].elt;
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex;
+      }
+
+      /* Pad a short final chunk by repeating element 0.
+       * NOTE(review): the vOut padding reads draw->vs.queue[i + j] for
+       * entries at or beyond queue_nr -- confirm those slots are valid
+       * (and inside the queue array) before re-enabling.
+       */
+      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) {
+         vs->elts[j] = vs->elts[0];
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex;
+      }
+
+      vs->num_elts = n;
+      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
+
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
+   }
+
+   draw->vs.post_nr = draw->vs.queue_nr;
+   draw->vs.queue_nr = 0;
+#else
+   /* SPU vertex shading is disabled; this must not be called */
+   assert(0);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 0000000000..2be9a2d324
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
new file mode 100644
index 0000000000..3cc52301da
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -0,0 +1,83 @@
+# Gallium3D Cell driver: SPU code
+
+# This makefile builds the g3d_spu.a file that's linked into the
+# PPU code/library.
+#
+# Build chain:
+#   *.c -> *.o -> g3d_spu (SPU executable)
+#       -> g3d_spu-embed.o (SPU image wrapped as a PPU object)
+#       -> g3d_spu.a
+
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+
+PROG = g3d
+
+PROG_SPU = $(PROG)_spu
+PROG_SPU_A = $(PROG)_spu.a
+PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
+
+
+SOURCES = \
+	spu_command.c \
+	spu_dcache.c \
+	spu_funcs.c \
+	spu_main.c \
+	spu_per_fragment_op.c \
+	spu_render.c \
+	spu_texture.c \
+	spu_tile.c \
+	spu_tri.c
+
+# Not built: kept only as a record of sources that used to be compiled.
+OLD_SOURCES = \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
+
+
+SPU_OBJECTS = $(SOURCES:.c=.o)
+
+SPU_ASM_OUT = $(SOURCES:.c=.s)
+
+
+# Only passed to $(MKDEP) below; the compile rules take their include
+# paths from $(SPU_CFLAGS).
+INCLUDE_DIRS = \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/drivers
+
+
+.c.o:
+	$(SPU_CC) $(SPU_CFLAGS) -c $<
+
+.c.s:
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
+
+
+# Targets that name actions rather than files.  Declared phony so that a
+# stray file called "default", "asmfiles" or "clean" cannot mask them.
+# ("depend" is deliberately not listed: it is a real file, included below.)
+.PHONY: default asmfiles clean
+
+# The .a file will be linked into the main/PPU executable
+default: $(PROG_SPU_A)
+
+$(PROG_SPU_A): $(PROG_SPU_EMBED_O)
+	$(SPU_AR) $(SPU_AR_FLAGS) $(PROG_SPU_A) $(PROG_SPU_EMBED_O)
+
+$(PROG_SPU_EMBED_O): $(PROG_SPU)
+	$(SPU_EMBED) $(SPU_EMBED_FLAGS) $(PROG_SPU) $(PROG_SPU) $(PROG_SPU_EMBED_O)
+
+$(PROG_SPU): $(SPU_OBJECTS)
+	$(SPU_CC) -o $(PROG_SPU) $(SPU_OBJECTS) $(SPU_LFLAGS)
+
+
+
+# Convenience target: generate assembly listings for all sources
+asmfiles: $(SPU_ASM_OUT)
+
+
+clean:
+	rm -f *~ *.o *.a *.d *.s $(PROG_SPU)
+
+
+
+# Regenerate the dependency file; the include below makes make rebuild it
+# whenever a source file is newer.
+depend: $(SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null
+
+include depend
+
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..d7ce005524
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -0,0 +1,145 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <transpose_matrix4x4.h>
+#include <spu_intrinsics.h>
+
+
+/**
+ * Pack a float RGBA color into one 32-bit word with the channels in
+ * R, G, B, A byte order (byte 0 first).
+ */
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  /* take byte 0 of each converted word: R, G, B, A */
+  const vector unsigned char pick = {
+     0, 4, 8, 12, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0 };
+  vector unsigned int fixed = spu_convtu(rgba, 32);
+  vector unsigned int packed = spu_shuffle(fixed, fixed, pick);
+
+  return spu_extract(packed, 0);
+}
+
+
+/**
+ * Pack a float RGBA color into one 32-bit word with the channels in
+ * A, R, G, B byte order (byte 0 first).
+ */
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  /* take byte 0 of each converted word, alpha first: A, R, G, B */
+  const vector unsigned char pick = {
+     12, 0, 4, 8, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0 };
+  vector unsigned int fixed = spu_convtu(rgba, 32);
+  vector unsigned int packed = spu_shuffle(fixed, fixed, pick);
+
+  return spu_extract(packed, 0);
+}
+
+
+/**
+ * Pack a float RGBA color into one 32-bit word with the channels in
+ * B, G, R, A byte order (byte 0 first).
+ */
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  /* take byte 0 of each converted word, red/blue swapped: B, G, R, A */
+  const vector unsigned char pick = {
+     8, 4, 0, 12, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0 };
+  vector unsigned int fixed = spu_convtu(rgba, 32);
+  vector unsigned int packed = spu_shuffle(fixed, fixed, pick);
+
+  return spu_extract(packed, 0);
+}
+
+
+/**
+ * Pack a float RGBA color into one 32-bit word, with the byte order
+ * chosen by the caller-supplied shuffle pattern.
+ */
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int fixed = spu_convtu(rgba, 32);
+  vector unsigned int packed = spu_shuffle(fixed, fixed, shuffle);
+
+  return spu_extract(packed, 0);
+}
+
+
+/**
+ * Unpack a 32-bit B8G8R8A8 color into a float vector.
+ * Source bytes 2, 1, 0, 3 feed result words 0..3, i.e. R, G, B, A order
+ * given the byte layout written by spu_pack_B8G8R8A8().
+ */
+static INLINE vector float
+spu_unpack_B8G8R8A8(uint color)
+{
+   /* replicate one source byte across each 32-bit word */
+   const vector unsigned char spread = {
+      2, 2, 2, 2,
+      1, 1, 1, 1,
+      0, 0, 0, 0,
+      3, 3, 3, 3 };
+   vector unsigned int splat = spu_splats(color);
+   vector unsigned int channels = spu_shuffle(splat, splat, spread);
+
+   return spu_convtf(channels, 32);
+}
+
+
+/**
+ * Unpack a 32-bit A8R8G8B8 color into a float vector.
+ * Source bytes 1, 2, 3, 0 feed result words 0..3, i.e. R, G, B, A order
+ * given the byte layout written by spu_pack_A8R8G8B8().
+ */
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   /* replicate one source byte across each 32-bit word */
+   const vector unsigned char spread = {
+      1, 1, 1, 1,
+      2, 2, 2, 2,
+      3, 3, 3, 3,
+      0, 0, 0, 0 };
+   vector unsigned int splat = spu_splats(color);
+   vector unsigned int channels = spu_shuffle(splat, splat, spread);
+
+   return spu_convtf(channels, 32);
+}
+
+
+/**
+ * Unpack sixteen packed ARGB colors and transpose the result.
+ *
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+                               vector float color_out[4])
+{
+   /* same byte pattern for every input vector: bytes 1, 2, 3, 0 of the
+    * first word, each replicated across one output word
+    */
+   const vector unsigned char spread = {
+      1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0 };
+   unsigned int i;
+
+   for (i = 0; i < 4; i++) {
+      vector unsigned int channels =
+         spu_shuffle(color_in[i], color_in[i], spread);
+      color_out[i] = spu_convtf(channels, 32);
+   }
+
+   /* in-place transpose of the four result vectors */
+   _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 0000000000..f16cabc027
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,810 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "cell/common.h"
+
+
+/* Vertex shader context.  Not static, so it has external linkage. */
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ * 136 bytes per vertex attribute, 16-byte aligned.
+ * NOTE(review): 136 presumably corresponds to the PPU side's estimate of
+ * 34 instructions per fetch function (34 * 4 bytes) -- confirm.
+ */
+PIPE_ALIGN_VAR(16) static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS];
+
+
+
+/**
+ * Round value up using the mask (alignment - 1).  The result is a true
+ * multiple of alignment only when alignment is a power of two.
+ */
+static INLINE int
+align(int value, int alignment)
+{
+   const int mask = alignment - 1;
+
+   return (value + mask) & ~mask;
+}
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writing a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const vector unsigned int free_status = {CELL_BUFFER_STATUS_FREE,
+                                                   CELL_BUFFER_STATUS_FREE,
+                                                   CELL_BUFFER_STATUS_FREE,
+                                                   CELL_BUFFER_STATUS_FREE};
+   /* four uints (16 bytes) of status per buffer, CELL_NUM_BUFFERS
+    * buffers per SPU
+    */
+   const uint slot = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *main_mem_dst = spu.init.buffer_status + slot;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   /* DMA local store -> main memory; the tag is unimportant here */
+   mfc_put((void *) &free_status,
+           (unsigned int) main_mem_dst,
+           sizeof(free_status),
+           TAG_MISC,
+           0,   /* tid */
+           0);  /* rid */
+}
+
+
+/**
+ * Signal a fence: write CELL_FENCE_SIGNALLED into this SPU's status qword
+ * of the fence object in main memory.  Each SPU owns one qword of status.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+   static const vector unsigned int signalled = {CELL_FENCE_SIGNALLED,
+                                                 CELL_FENCE_SIGNALLED,
+                                                 CELL_FENCE_SIGNALLED,
+                                                 CELL_FENCE_SIGNALLED};
+   /* main store/memory address, not local store; step over one qword
+    * (four uints) for each lower-numbered SPU
+    */
+   uint *dst = ((uint *) fence_cmd->fence) + 4 * spu.init.id;
+
+   ASSERT_ALIGN16(dst);
+
+   /* DMA local store -> main memory */
+   mfc_put((void *) &signalled,
+           (unsigned int) dst,
+           sizeof(signalled),
+           TAG_FENCE,
+           0,   /* tid */
+           0);  /* rid */
+}
+
+
+/**
+ * Process a clear-surface command.
+ * Surface 0 is the color buffer; any other value is treated as the depth
+ * buffer.  The clear value is recorded in spu.fb and, with CLEAR_OPT
+ * enabled (the default), the per-tile status arrays are simply marked
+ * TILE_STATUS_CLEAR rather than writing any tile data now.
+ */
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         /* Debug aid: XOR the SPU id into the clear color (bit offsets
+          * 4, 12, 20 and 28), presumably so each SPU's tiles are visibly
+          * tinted differently.
+          */
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+   }
+
+/* Compile-time switch: 1 = deferred clear (status flags only),
+ * 0 = write the clear value to every tile immediately.
+ * Note this macro is defined mid-function and stays defined for the rest
+ * of the file.
+ */
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+   }
+
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    * (Currently compiled out because CLEAR_OPT is 1.)
+    */
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+   }
+
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      /* tiles are interleaved across SPUs: this SPU handles tile indices
+       * id, id + num_spus, id + 2*num_spus, ...
+       */
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   /* in sync-debug mode, wait for the tile writes to complete */
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
+
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
+}
+
+
+/**
+ * Process a release-verts command: hand the named vertex buffer back to
+ * the PPU via release_buffer().
+ */
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
+   /* ~0 is the "no buffer" marker and must never be released */
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Handle a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * Besides recording the new per-fragment state, this installs the SPU
+ * fragment ops code carried by the command.  As long as no such command
+ * has been processed, the regular C fallback function handles fragments.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
+
+   /* Keep copies of the state blocks.  Only the fallback case needs them,
+    * so this can go away once the fallback case itself goes away.
+    */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
+
+   /* Record whether the depth/stencil buffer has to be read when the SPU
+    * is told to pull tiles: only if depth or stencil testing is enabled.
+    */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled ||
+                             spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* Debug override: when the fallback is being forced, the incoming SPU
+    * code is ignored and the C fallback function is installed instead.
+    */
+   if (spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) {
+      static unsigned int warning_issued = 0;
+
+      if (warning_issued == 0) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warning_issued = 1;
+      }
+      /* Re-installing the pointers on every command is redundant when
+       * the debug flags never change and the pointers were initialized
+       * correctly, but it lets a person flip the flag from inside a
+       * debugger in the middle of a run.
+       */
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      return;
+   }
+
+   /* Grow the SPU code buffer when the incoming code does not fit.  Plain
+    * malloc()/free() are used because align_malloc() and align_free()
+    * are *not* available in SPU code.
+    */
+   if (fops->total_code_size > spu.fragment_ops_code_size) {
+      /* free() accepts NULL, so no check is needed before the first use */
+      free(spu.fragment_ops_code);
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (!spu.fragment_ops_code) {
+         /* Out of memory: say so and stay on the C fallback function. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
+
+   /* Bring the SPU code over from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Each facing gets an entry point at its own offset inside the copied
+    * code.  Identical front-facing and back-facing code simply arrives
+    * with identical offsets.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func)
+      &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func)
+      &spu.fragment_ops_code[fops->back_code_index];
+}
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   /* Process CELL_CMD_STATE_FRAGMENT_PROGRAM: copy the SPU code from the
+    * batch buffer into the spu buffer (always the full fixed-size amount)
+    * and point the fragment program function pointer at the new code.
+    */
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
+   memcpy(spu.fragment_program_code,
+          fp->code, SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+   spu.fragment_program = (spu_fragment_program_func) spu.fragment_program_code;
+}
+
+
+static uint
+cmd_state_fs_constants(const qword *buffer, uint pos)
+{
+   /* buffer[pos+1] word 0 holds the count; floats start at buffer[pos+2] */
+   const float *src = (const float *) (buffer + pos + 2);
+   const uint count = spu_extract((vector unsigned int) buffer[pos + 1], 0);
+   uint k;
+
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", count);
+
+   /* Replicate every scalar into a float[4] vector for SOA execution */
+   for (k = 0; k != count; ++k) {
+      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", k, src[k]);
+      spu.constants[k] = spu_splats(src[k]);
+   }
+   /* New buffer pos, in 16-byte words: 2 header words + padded floats */
+   return pos + 2 + (ROUNDUP16(count * sizeof(float)) / 16);
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   /* Process CELL_CMD_STATE_FRAMEBUFFER: latch the surface addresses,
+    * formats and size into spu.fb, then derive tile counts and Z layout.
+    */
+   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             cmd->width, cmd->height, cmd->color_start,
+             cmd->color_format, cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
+   default:
+      spu.fb.zsize = 0;   /* zscale keeps its previous value here */
+      break;
+   }
+}
+
+
+/**
+ * Recompute the per-level mask_s/t and scale_s/t fields of a texture.
+ * They depend on each level's size and on the sampler's wrap modes and
+ * normalized_coords setting; both cmd_state_sampler() and
+ * cmd_state_texture() call this after changing their half of the state.
+ *
+ * \param texture  texture whose level[] entries are updated in place
+ * \param sampler  sampler state supplying wrap_s, wrap_t, normalized_coords
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler)
+{
+   const int repeat_s = (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT);
+   const int repeat_t = (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT);
+   uint lvl;
+
+   /* All CELL_MAX_TEXTURE_LEVELS entries are processed, in use or not */
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      const int w = texture->level[lvl].width;
+      const int h = texture->level[lvl].height;
+      const float sx = sampler->normalized_coords ? (float) w : 1.0f;
+      const float sy = sampler->normalized_coords ? (float) h : 1.0f;
+
+      /* PIPE_TEX_WRAP_REPEAT uses (size - 1) as the mask; every other
+       * wrap mode gets an all-ones mask. */
+      texture->level[lvl].mask_s = spu_splats(repeat_s ? w - 1 : ~0);
+      texture->level[lvl].mask_t = spu_splats(repeat_t ? h - 1 : ~0);
+
+      /* Normalized coords are scaled by the level size, others by 1.0 */
+      texture->level[lvl].scale_s = spu_splats(sx);
+      texture->level[lvl].scale_t = spu_splats(sy);
+   }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+   /* Process CELL_CMD_STATE_SAMPLER: pick sampling functions for a unit */
+   const uint unit = sampler->unit;
+   const struct pipe_sampler_state *state = &spu.sampler[unit];
+
+   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
+   spu.sampler[unit] = sampler->state;
+
+   /* minification filter */
+   if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
+   }
+   else if (state->min_img_filter == PIPE_TEX_FILTER_NEAREST) {
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* magnification filter */
+   if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR) {
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
+   }
+   else if (state->mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* mip filter: NONE reuses the magnification function chosen above */
+   if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST ||
+       state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      spu.sample_texture_2d[unit] = sample_texture_2d_lod;
+   }
+   else if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
+   }
+   else {
+      ASSERT(0);
+   }
+   update_tex_masks(&spu.texture[unit], state);
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   /* Process CELL_CMD_STATE_TEXTURE: record all mipmap levels of one unit */
+   const uint unit = texture->unit;
+   struct spu_texture *tex = &spu.texture[unit];
+   uint lvl;
+
+   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
+
+   tex->target = texture->target;
+   tex->max_level = 0;
+
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      const uint w = texture->width[lvl];
+      const uint h = texture->height[lvl];
+      const uint d = texture->depth[lvl];
+
+      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", lvl,
+             texture->start[lvl], texture->width[lvl], texture->height[lvl]);
+
+      tex->level[lvl].start = texture->start[lvl];
+      tex->level[lvl].width = w;
+      tex->level[lvl].height = h;
+      tex->level[lvl].depth = d;
+      /* tiles_per_row rounds the width up to a whole number of tiles */
+      tex->level[lvl].tiles_per_row = (w + TILE_SIZE - 1) / TILE_SIZE;
+      tex->level[lvl].bytes_per_image =
+         4 * align(w, TILE_SIZE) * align(h, TILE_SIZE) * d;
+      tex->level[lvl].max_s = spu_splats((int) w - 1);
+      tex->level[lvl].max_t = spu_splats((int) h - 1);
+
+      /* the last level with a non-null start address becomes max_level */
+      if (texture->start[lvl])
+         tex->max_level = lvl;
+   }
+
+   update_tex_masks(tex, &spu.sampler[unit]);
+}
+
+/** Process CELL_CMD_STATE_VERTEX_INFO: copy *vinfo into spu.vertex_info. */
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);  /* NOTE(review): hard-coded limit - confirm its origin */
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+/** Process CELL_CMD_STATE_VS_ARRAY_INFO: record one attribute's fetch info. */
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+   draw.vertex_fetch.dirty = 1;  /* flag the vertex fetch state as changed */
+}
+
+/** Process CELL_CMD_STATE_ATTRIB_FETCH: DMA in new attribute-fetch code. */
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);  /* presumably blocks until that DMA is done */
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+/** Process CELL_CMD_FINISH: drain outstanding DMAs, then notify the PPU. */
+static void
+cmd_finish(void)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
+   really_clear_tiles(0);  /* NOTE(review): presumably performs pending tile clears - confirm */
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ * An unrecognized command ends processing of the rest of the batch.
+ * \param opcode  bits 8..15: batch buffer index; bits 16..31: size in bytes
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   PIPE_ALIGN_VAR(16) qword buffer[CELL_BUFFER_SIZE / 16];
+   const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
+   uint pos;
+
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+   /* The DMA below must not write past the end of the local copy */
+   ASSERT(size <= sizeof(buffer));
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
+   release_buffer(buf);
+
+   /*
+    * Loop over commands in the batch buffer; pos counts 16-byte qwords
+    */
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (si_to_uint(buffer[pos])) {
+      /*
+       * rendering commands
+       */
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 16;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return
+         }
+         break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 16;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            /* This is a variant-sized command */
+            pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 16;
+         }
+         break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
+      case CELL_CMD_STATE_RASTERIZER:
+         {
+            struct cell_command_rasterizer *rast =
+               (struct cell_command_rasterizer *) &buffer[pos];
+            spu.rasterizer = rast->rasterizer;
+            pos += sizeof(*rast) / 16;
+         }
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 16;
+         }
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 16;
+         }
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16;
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16;
+         break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0);
+         pos += 2;
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16;
+         break;
+      case CELL_CMD_STATE_BIND_VS:
+#if 0
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+         pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16;
+         break;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16;
+         break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_FENCE:
+         {
+            struct cell_command_fence *fence_cmd =
+               (struct cell_command_fence *) &buffer[pos];
+            cmd_fence(fence_cmd);
+            pos += sizeof(*fence_cmd) / 16;
+         }
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 16;
+         }
+         break;
+      case CELL_CMD_FLUSH_BUFFER_RANGE:
+         {
+            struct cell_buffer_range *br
+               = (struct cell_buffer_range *) &buffer[pos+1];
+            spu_dcache_mark_dirty((unsigned) br->base, br->size);
+            pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16;
+         }
+         break;
+      default:
+         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos]));
+         ASSERT(0);
+         return;  /* unknown command size: cannot advance pos, drop the batch */
+      }
+   }
+
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
+}
+
+
+#define PERF 0
+
+
+/**
+ * Main loop for SPEs: fetch a command from the mailbox, run it, repeat.
+ */
+void
+command_loop(void)
+{
+   uint t_cmd, t_done;
+   int done = 0;
+
+   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
+
+   do {
+      unsigned mbox_word;
+
+      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
+
+      if (PERF)
+         spu_write_decrementer(~0);
+
+      /* read/wait from mailbox */
+      mbox_word = (unsigned int) spu_read_in_mbox();
+      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", mbox_word);
+
+      if (PERF)
+         t_cmd = spu_read_decrementer();
+
+      switch (mbox_word & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_BATCH:
+         cmd_batch(mbox_word);
+         break;
+      case CELL_CMD_VS_EXECUTE:
+#if 0
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+         break;
+      case CELL_CMD_EXIT:
+         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
+         done = 1;
+         break;
+      default:
+         printf("Bad opcode 0x%x!\n", mbox_word & CELL_CMD_OPCODE_MASK);
+      }
+
+      if (PERF) {
+         t_done = spu_read_decrementer();
+         printf("wait mbox time: %gms   batch time: %gms\n",
+                (~0u - t_cmd) * spu.init.inv_timebase,
+                (t_cmd - t_done) * spu.init.inv_timebase);
+      }
+   } while (!done);
+
+   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
+
+   if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+      spu_dcache_report();
+}
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Start out with no buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code_size = 0;
+   spu.fragment_ops_code = NULL;
+
+   /* Both facings begin with the default/fallback fragment processing
+    * function.  cmd_state_fragment_ops() normally overrides it with a
+    * code-gen'd function, unless CELL_DEBUG_FRAGMENT_OP_FALLBACK is set.
+    */
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+}
+
+void
+spu_command_close(void)
+{
+   /* Shut this module down: release the code-gen buffer for fragment
+    * ops (free() ignores a NULL pointer), then let spu_command_init()
+    * put the buffer bookkeeping and the fragment ops functions back
+    * to their initial setting, just to leave things in a good state.
+    */
+   free(spu.fragment_ops_code);
+
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 0000000000..83dcdade28
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_COMMAND_H
+#define SPU_COMMAND_H
+
+extern void command_loop(void);       /* SPE main loop; returns after CELL_CMD_EXIT */
+extern void spu_command_init(void);   /* install fallback fragment ops, no code buffer */
+extern void spu_command_close(void);  /* free the fragment ops code buffer, re-init */
+
+#endif /* SPU_COMMAND_H */
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
new file mode 100644
index 0000000000..a6d67634fd
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -0,0 +1,145 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_dcache.h"
+
+#define CACHELINE_LOG2SIZE    7
+#define LINE_SIZE             (1U << 7)            /* 128, i.e. 1 << CACHELINE_LOG2SIZE */
+#define ALIGN_MASK            (~(LINE_SIZE - 1))   /* clears the offset within a line */
+/* Settings presumably consumed by <cache-api.h>, which is included below */
+#define CACHE_NAME            data
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
+#define CACHE_LOG2NNWAY       2
+#define CACHE_LOG2NSETS       6
+#ifdef DEBUG
+#define CACHE_STATS           1
+#endif
+#include <cache-api.h>
+
+/* Ugly: redefine the cache geometry (4 ways, 1 << 6 sets, matching the LOG2
+ * values above) so spu_dcache_mark_dirty() can walk all of __cache_dir[]. */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            data
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
+
+/**
+ * Fetch an arbitrary number of bytes from an unaligned address
+ *
+ * \param dst   Destination data buffer
+ * \param ea    Main memory effective address of source data
+ * \param size  Number of bytes to read
+ *
+ * \warning
+ * As is hinted by the type of the \c dst pointer, this function writes
+ * multiples of 16-bytes.
+ */
+void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
+{
+   const int shift = ea & 0x0f;   /* byte offset of ea within its 16-byte qword */
+   const unsigned read_size = ROUNDUP16(size + shift);
+   const unsigned last_read = ROUNDUP16(ea + size);
+   const qword *const last_write = dst + (ROUNDUP16(size) / 16);
+   unsigned i;
+
+   /* last_read / last_write: expected end of the reads and of the writes */
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < size; i += 16) {
+         *(dst++) = cache_rd(data, ea + i);
+      }
+   } else {
+      qword hi;
+
+
+      /* Please exercise extreme caution when modifying this code.  This code
+       * must not read past the end of the page containing the source data,
+       * and it must not write more than ((size + 15) / 16) qwords to the
+       * destination buffer.
+       */
+      ea &= ~0x0f;
+      hi = cache_rd(data, ea);
+      for (i = 16; i < read_size; i += 16) {
+         qword lo = cache_rd(data, ea + i);
+         /* splice the shifted 'hi' with the leading bytes of 'lo' (byte shifts) */
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift),
+                          (qword) spu_rlmaskqwbyte(lo, shift - 16));
+         hi = lo;
+      }
+
+      if (dst != last_write) {   /* one output qword is still owed */
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0));
+      }
+   }
+   
+   ASSERT((ea + i) == last_read);
+   ASSERT(dst == last_write);
+}
+
+
+/**
+ * Notify the cache that a range of main memory may have been modified:
+ * directory entries inside the line-aligned range lose their valid bit.
+ */
+void
+spu_dcache_mark_dirty(unsigned ea, unsigned size)
+{
+   const unsigned first_line = ea & ALIGN_MASK;
+   const unsigned past_last_line = (ea + size + (LINE_SIZE - 1)) & ALIGN_MASK;
+   unsigned slot;
+
+   /* Visit all CACHE_NWAY * CACHE_NSETS directory entries */
+   for (slot = 0; slot < (CACHE_NWAY * CACHE_NSETS); slot++) {
+      const unsigned tag = __cache_dir[slot];
+      const unsigned line_ea = tag & ~0x0f;
+
+      if ((line_ea >= first_line) && (line_ea < past_last_line))
+         __cache_dir[slot] = tag & ~CACHELINE_VALID;
+   }
+}
+
+
+/**
+ * Print cache utilization report (SPU 0 only, and only with CACHE_STATS)
+ */
+void
+spu_dcache_report(void)
+{
+#ifdef CACHE_STATS
+   if (spu.init.id != 0)
+      return;
+   printf("SPU 0: Texture cache report:\n");
+   cache_pr_stats(data);
+#endif
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h
new file mode 100644
index 0000000000..39a19eb31b
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.h
@@ -0,0 +1,37 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_DCACHE_H
+#define SPU_DCACHE_H
+/* Read 'size' bytes starting at unaligned main-memory address 'ea' into dst */
+extern void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size);
+/* Notify the cache that main memory in [ea, ea + size) may have changed */
+extern void
+spu_dcache_mark_dirty(unsigned ea, unsigned size);
+/* Print cache utilization report */
+extern void
+spu_dcache_report(void);
+
+#endif /* SPU_DCACHE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..e4ebeb595c
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -0,0 +1,1870 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpreter/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from four other masks (CondMask, LoopMask,
+ * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by the
+ * flow control instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include <transpose_matrix4x4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "spu_exec.h"
+#include "spu_main.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "cell/common.h"
+
+/*
+ * Lane index of each pixel within a quad; used below to build the shuffle
+ * patterns for the derivative (DDX/DDY) helpers.
+ */
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ *
+ * Each pair addresses one channel of mach->Temps[], i.e.
+ * mach->Temps[TEMP_x_I].xyzw[TEMP_x_C].
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+/** Loop header iterating CHAN over the four channels 0..3. */
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+/** Non-zero if channel CHAN is set in the write mask of destination 0. */
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
+
+/** Same as IS_CHANNEL_ENABLED, but for destination 1. */
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
+
+/**
+ * Loop header that runs the following statement only for channels enabled
+ * in destination 0's write mask.  Expands to a 'for' followed by a bare
+ * 'if', so a following 'else' would bind to that hidden 'if'.
+ */
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+/** Same as FOR_EACH_ENABLED_CHANNEL, but for destination 1. */
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/**
+ * Recompute the execution mask.  A quad lane stays enabled only if its bit
+ * is set in all four of CondMask, LoopMask, ContMask and FuncMask.
+ * MACH is parenthesised so that any pointer expression may be passed.
+ */
+#define UPDATE_EXEC_MASK(MACH) \
+      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
+                          (MACH)->ContMask & (MACH)->FuncMask)
+
+
+/* Channel (component) indices, as passed to FETCH/STORE and IS_CHANNEL_ENABLED. */
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize the shader-independent parts of the machine state: the
+ * sampler table, the processor type, the location of the address registers
+ * and the constant utility temporaries (0, ~0, 0x7fffffff, 0x80000000,
+ * 1.0, 2.0, 128.0 and -128.0).
+ * After this, we can call spu_exec_machine_run() many times.
+ *
+ * \param mach         machine to initialize
+ * \param numSamplers  unused
+ * \param samplers     sampler array; stored by reference, not copied
+ * \param processor    processor type, stored in mach->Processor
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   const qword zero = si_il(0);
+   const qword not_zero = si_il(~0);
+
+   (void) numSamplers;
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   /* Address registers are stored right after the regular temporaries. */
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   /* Setup constants. */
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   /* 0x7fffffff needs a logical shift RIGHT by one.  A left shift with a
+    * count of -1 is taken as count 63 and yields zero, so use rotmi, which
+    * shifts right by the negated count (same idiom as micro_abs).
+    */
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_rotmi(not_zero, -1);
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
+}
+
+
+/** Per-lane absolute value: clears the top bit of every 32-bit word. */
+static INLINE qword
+micro_abs(qword src)
+{
+   /* Push the sign bit out of the top, then shift everything back down. */
+   const qword sign_dropped = si_shli(src, 1);
+
+   return si_rotmi(sign_dropped, -1);
+}
+
+/** Per-lane ceiling, computed with simdmath's _ceilf4(). */
+static INLINE qword
+micro_ceil(qword src)
+{
+   const vec_float4 value = (vec_float4) src;
+
+   return (qword) _ceilf4(value);
+}
+
+/** Per-lane cosine, computed with simdmath's _cosf4(). */
+static INLINE qword
+micro_cos(qword src)
+{
+   const vec_float4 angle = (vec_float4) src;
+
+   return (qword) _cosf4(angle);
+}
+
+/*
+ * shufb pattern that broadcasts the TILE_BOTTOM_RIGHT lane to all four
+ * lanes.  Pattern entries are BYTE indices into the source quadword, so the
+ * 32-bit lane number must be scaled by 4 (lane 3 -> bytes 12..15).
+ */
+static const qword br_shuf = {
+   TILE_BOTTOM_RIGHT * 4 + 0, TILE_BOTTOM_RIGHT * 4 + 1,
+   TILE_BOTTOM_RIGHT * 4 + 2, TILE_BOTTOM_RIGHT * 4 + 3,
+   TILE_BOTTOM_RIGHT * 4 + 0, TILE_BOTTOM_RIGHT * 4 + 1,
+   TILE_BOTTOM_RIGHT * 4 + 2, TILE_BOTTOM_RIGHT * 4 + 3,
+   TILE_BOTTOM_RIGHT * 4 + 0, TILE_BOTTOM_RIGHT * 4 + 1,
+   TILE_BOTTOM_RIGHT * 4 + 2, TILE_BOTTOM_RIGHT * 4 + 3,
+   TILE_BOTTOM_RIGHT * 4 + 0, TILE_BOTTOM_RIGHT * 4 + 1,
+   TILE_BOTTOM_RIGHT * 4 + 2, TILE_BOTTOM_RIGHT * 4 + 3,
+};
+
+/*
+ * shufb pattern that broadcasts the TILE_BOTTOM_LEFT lane to all four
+ * lanes.  Pattern entries are BYTE indices into the source quadword, so the
+ * 32-bit lane number must be scaled by 4 (lane 2 -> bytes 8..11).
+ */
+static const qword bl_shuf = {
+   TILE_BOTTOM_LEFT * 4 + 0, TILE_BOTTOM_LEFT * 4 + 1,
+   TILE_BOTTOM_LEFT * 4 + 2, TILE_BOTTOM_LEFT * 4 + 3,
+   TILE_BOTTOM_LEFT * 4 + 0, TILE_BOTTOM_LEFT * 4 + 1,
+   TILE_BOTTOM_LEFT * 4 + 2, TILE_BOTTOM_LEFT * 4 + 3,
+   TILE_BOTTOM_LEFT * 4 + 0, TILE_BOTTOM_LEFT * 4 + 1,
+   TILE_BOTTOM_LEFT * 4 + 2, TILE_BOTTOM_LEFT * 4 + 3,
+   TILE_BOTTOM_LEFT * 4 + 0, TILE_BOTTOM_LEFT * 4 + 1,
+   TILE_BOTTOM_LEFT * 4 + 2, TILE_BOTTOM_LEFT * 4 + 3,
+};
+
+/*
+ * shufb pattern that broadcasts the TILE_TOP_LEFT lane to all four lanes:
+ * the lane's four byte indices, repeated once per output lane.
+ */
+static const qword tl_shuf = {
+   TILE_TOP_LEFT * 4 + 0, TILE_TOP_LEFT * 4 + 1,
+   TILE_TOP_LEFT * 4 + 2, TILE_TOP_LEFT * 4 + 3,
+   TILE_TOP_LEFT * 4 + 0, TILE_TOP_LEFT * 4 + 1,
+   TILE_TOP_LEFT * 4 + 2, TILE_TOP_LEFT * 4 + 3,
+   TILE_TOP_LEFT * 4 + 0, TILE_TOP_LEFT * 4 + 1,
+   TILE_TOP_LEFT * 4 + 2, TILE_TOP_LEFT * 4 + 3,
+   TILE_TOP_LEFT * 4 + 0, TILE_TOP_LEFT * 4 + 1,
+   TILE_TOP_LEFT * 4 + 2, TILE_TOP_LEFT * 4 + 3,
+};
+
+/**
+ * Horizontal derivative: the value selected by br_shuf minus the value
+ * selected by bl_shuf, written to all four lanes.
+ */
+static qword
+micro_ddx(qword src)
+{
+   const qword left = si_shufb(src, src, bl_shuf);
+   const qword right = si_shufb(src, src, br_shuf);
+
+   return si_fs(right, left);
+}
+
+/**
+ * Vertical derivative: the value selected by tl_shuf minus the value
+ * selected by bl_shuf, written to all four lanes.
+ */
+static qword
+micro_ddy(qword src)
+{
+   const qword bottom = si_shufb(src, src, bl_shuf);
+   const qword top = si_shufb(src, src, tl_shuf);
+
+   return si_fs(top, bottom);
+}
+
+/** Per-lane float division src0 / src1, computed with simdmath's _divf4(). */
+static INLINE qword
+micro_div(qword src0, qword src1)
+{
+   const vec_float4 numerator = (vec_float4) src0;
+   const vec_float4 denominator = (vec_float4) src1;
+
+   return (qword) _divf4(numerator, denominator);
+}
+
+/** Per-lane floor, computed with simdmath's _floorf4(). */
+static qword
+micro_flr(qword src)
+{
+   const vec_float4 value = (vec_float4) src;
+
+   return (qword) _floorf4(value);
+}
+
+/** Per-lane fractional part: src - floor(src). */
+static qword
+micro_frc(qword src)
+{
+   const qword whole = (qword) _floorf4((vec_float4) src);
+
+   return si_fs(src, whole);
+}
+
+/**
+ * Per-lane float compare src0 >= src1, built as (src0 == src1) OR
+ * (src0 > src1).  The result is a bit mask per lane, not a float.
+ */
+static INLINE qword
+micro_ge(qword src0, qword src1)
+{
+   const qword is_equal = si_fceq(src0, src1);
+   const qword is_greater = si_fcgt(src0, src1);
+
+   return si_or(is_equal, is_greater);
+}
+
+/** Per-lane base-2 logarithm, computed with simdmath's _log2f4(). */
+static qword
+micro_lg2(qword src)
+{
+   const vec_float4 value = (vec_float4) src;
+
+   return (qword) _log2f4(value);
+}
+
+/**
+ * Per-lane float compare src0 < src1, computed as NOT (src0 >= src1).
+ * The result is a bit mask per lane (all ones when true, zero when false).
+ */
+static INLINE qword
+micro_lt(qword src0, qword src1)
+{
+   const qword ge = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+
+   /* xori sign-extends its immediate: 0xff would flip only the low eight
+    * bits of each word, whereas -1 flips all 32 and gives a true complement.
+    */
+   return si_xori(ge, -1);
+}
+
+/** Per-lane float maximum: picks src0 where src0 > src1, otherwise src1. */
+static INLINE qword
+micro_max(qword src0, qword src1)
+{
+   const qword src0_is_greater = si_fcgt(src0, src1);
+
+   return si_selb(src1, src0, src0_is_greater);
+}
+
+/** Per-lane float minimum: picks src1 where src0 > src1, otherwise src0. */
+static INLINE qword
+micro_min(qword src0, qword src1)
+{
+   const qword src0_is_greater = si_fcgt(src0, src1);
+
+   return si_selb(src0, src1, src0_is_greater);
+}
+
+/** Per-lane negation: toggles bit 31 of every 32-bit word. */
+static qword
+micro_neg(qword src)
+{
+   const qword sign_bits = (qword) spu_splats(0x80000000);
+
+   return si_xor(src, sign_bits);
+}
+
+/** Forces bit 31 of every 32-bit word to one. */
+static qword
+micro_set_sign(qword src)
+{
+   const qword sign_bits = (qword) spu_splats(0x80000000);
+
+   return si_or(src, sign_bits);
+}
+
+/** Per-lane power src0 ^ src1, computed with simdmath's _powf4(). */
+static qword
+micro_pow(qword src0, qword src1)
+{
+   const vec_float4 base = (vec_float4) src0;
+   const vec_float4 exponent = (vec_float4) src1;
+
+   return (qword) _powf4(base, exponent);
+}
+
+/**
+ * Per-lane rounding, computed as floor(src + 0.5).
+ * _roundf4 might be usable instead, though its results may differ.
+ */
+static qword
+micro_rnd(qword src)
+{
+   const qword half = (qword) spu_splats(0.5f);
+   const qword biased = si_fa(src, half);
+
+   return (qword) _floorf4((vec_float4) biased);
+}
+
+/**
+ * Per-lane integer shift of src0 by src1, done with si_rotma on the
+ * negated count.
+ */
+static INLINE qword
+micro_ishr(qword src0, qword src1)
+{
+   /* si_sfi(x, 0) computes 0 - x. */
+   const qword negated_count = si_sfi(src1, 0);
+
+   return si_rotma(src0, negated_count);
+}
+
+/** Per-lane truncation, computed with simdmath's _truncf4(). */
+static qword
+micro_trunc(qword src)
+{
+   const vec_float4 value = (vec_float4) src;
+
+   return (qword) _truncf4(value);
+}
+
+/** Per-lane sine, computed with simdmath's _sinf4(). */
+static qword
+micro_sin(qword src)
+{
+   const vec_float4 angle = (vec_float4) src;
+
+   return (qword) _sinf4(angle);
+}
+
+/** Per-lane square root, computed with simdmath's _sqrtf4(). */
+static INLINE qword
+micro_sqrt(qword src)
+{
+   const vec_float4 value = (vec_float4) src;
+
+   return (qword) _sqrtf4(value);
+}
+
+/**
+ * Read one component of a register for all four quad lanes.
+ *
+ * \param mach     machine whose register files are read
+ * \param file     TGSI_FILE_x register file to read from
+ * \param swizzle  component to read; must be TGSI_SWIZZLE_X..W
+ * \param index    register index to use for each lane
+ * \param chan     receives the four fetched values
+ *
+ * An unsupported swizzle or file trips an ASSERT and leaves 'chan'
+ * untouched.
+ */
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   unsigned lane;
+
+   /* Only plain component swizzles are handled. */
+   switch( swizzle ) {
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
+      break;
+
+   default:
+      ASSERT( 0 );
+      return;
+   }
+
+   switch( file ) {
+   case TGSI_FILE_CONSTANT:
+      /* Constants are pulled in one float at a time through the dcache. */
+      for (lane = 0; lane < 4; lane++) {
+         const float *ptr = mach->Consts[index->i[lane]];
+         float tmp[4];
+
+         spu_dcache_fetch_unaligned((qword *) tmp,
+                                    (uintptr_t)(ptr + swizzle),
+                                    sizeof(float));
+
+         chan->f[lane] = tmp[0];
+      }
+      break;
+
+   case TGSI_FILE_INPUT:
+      for (lane = 0; lane < 4; lane++)
+         chan->u[lane] = mach->Inputs[index->i[lane]].xyzw[swizzle].u[lane];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      for (lane = 0; lane < 4; lane++)
+         chan->u[lane] = mach->Temps[index->i[lane]].xyzw[swizzle].u[lane];
+      break;
+
+   case TGSI_FILE_IMMEDIATE:
+      /* Check every lane's index before reading any of them. */
+      for (lane = 0; lane < 4; lane++)
+         ASSERT( index->i[lane] < (int) mach->ImmLimit );
+
+      for (lane = 0; lane < 4; lane++)
+         chan->f[lane] = mach->Imms[index->i[lane]][swizzle];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      for (lane = 0; lane < 4; lane++)
+         chan->u[lane] = mach->Addrs[index->i[lane]].xyzw[swizzle].u[lane];
+      break;
+
+   case TGSI_FILE_OUTPUT:
+      /* vertex/fragment output vars can be read too */
+      for (lane = 0; lane < 4; lane++)
+         chan->u[lane] = mach->Outputs[index->i[lane]].xyzw[swizzle].u[lane];
+      break;
+
+   default:
+      ASSERT( 0 );
+   }
+}
+
+/**
+ * Fetch one channel of a source operand for all four quad lanes.
+ *
+ * Builds the per-lane register index (static index, plus optional indirect
+ * and dimension offsets), reads the swizzled component, then applies the
+ * operand's sign mode and optional complement (1 - x).
+ *
+ * \param mach        machine to read from
+ * \param chan        receives the fetched values
+ * \param reg         source operand description
+ * \param chan_index  which channel of the operand to fetch
+ */
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel reg_index;
+   uint swz;
+   uint lane;
+
+   /* Every lane starts from the operand's static register index. */
+   for (lane = 0; lane < 4; lane++)
+      reg_index.i[lane] = reg->Register.Index;
+
+   if (reg->Register.Indirect) {
+      union spu_exec_channel base;
+      union spu_exec_channel offset;
+
+      for (lane = 0; lane < 4; lane++)
+         base.i[lane] = reg->Indirect.Index;
+
+      /* The per-lane offset comes from the indirect register. */
+      swz = tgsi_util_get_src_register_swizzle(&reg->Indirect, CHAN_X);
+      fetch_src_file_channel(mach, reg->Indirect.File, swz, &base, &offset);
+
+      reg_index.q = si_a(reg_index.q, offset.q);
+   }
+
+   if( reg->Register.Dimension ) {
+      /* Scale the index by a per-file stride before adding the
+       * dimension index.
+       */
+      switch( reg->Register.File ) {
+      case TGSI_FILE_INPUT:
+         reg_index.q = si_mpyi(reg_index.q, 17);
+         break;
+      case TGSI_FILE_CONSTANT:
+         reg_index.q = si_shli(reg_index.q, 12);
+         break;
+      default:
+         ASSERT( 0 );
+      }
+
+      for (lane = 0; lane < 4; lane++)
+         reg_index.i[lane] += reg->Dimension.Index;
+
+      if (reg->Dimension.Indirect) {
+         union spu_exec_channel base;
+         union spu_exec_channel offset;
+
+         for (lane = 0; lane < 4; lane++)
+            base.i[lane] = reg->DimIndirect.Index;
+
+         swz = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
+         fetch_src_file_channel(mach, reg->DimIndirect.File, swz,
+                                &base, &offset);
+
+         reg_index.q = si_a(reg_index.q, offset.q);
+      }
+   }
+
+   /* Read the requested component through the operand's swizzle. */
+   swz = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
+   fetch_src_file_channel(mach, reg->Register.File, swz, &reg_index, chan);
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+
+   case TGSI_UTIL_SIGN_CLEAR:
+      chan->q = micro_abs(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      chan->q = micro_set_sign(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      chan->q = micro_neg(chan->q);
+      break;
+   }
+
+   if (reg->RegisterExtMod.Complement) {
+      const qword one = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q;
+
+      chan->q = si_fs(one, chan->q);
+   }
+}
+
+/**
+ * Write one channel of a result to an instruction's destination register.
+ *
+ * The value is saturated as requested by the instruction and is then
+ * written only to the quad lanes enabled in mach->ExecMask (bit 0 = lane 0
+ * ... bit 3 = lane 3), so lanes on the "not taken" side of flow control
+ * keep their previous contents.
+ *
+ * \param mach        machine state; supplies register files and ExecMask
+ * \param chan        the four values to store
+ * \param reg         destination operand (file and index)
+ * \param inst        instruction being executed; supplies the saturate mode
+ * \param chan_index  destination channel to write
+ *
+ * TGSI_FILE_NULL destinations are silently ignored.  Unsupported files or
+ * saturate modes trip an ASSERT and store nothing.
+ */
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+   union spu_exec_channel value;
+   uint lane;
+
+   switch( reg->Register.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      /* Outputs are addressed relative to the current output base. */
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->Register.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->Register.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->Register.Index].xyzw[chan_index];
+      break;
+
+   default:
+      ASSERT( 0 );
+      return;
+   }
+
+   /* Compute the (possibly saturated) value for all four lanes first ... */
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      value = *chan;
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      value.q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      value.q = micro_min(value.q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      ASSERT( 0 );
+      return;
+
+   default:
+      ASSERT( 0 );
+      return;
+   }
+
+   /* ... then commit only the lanes that are currently executing.  The
+    * saturating path previously overwrote every lane regardless of ExecMask.
+    */
+   for (lane = 0; lane < 4; lane++) {
+      if (mach->ExecMask & (1u << lane))
+         dst->i[lane] = value.i[lane];
+   }
+}
+
+/*
+ * Fetch channel CHAN of source operand INDEX into VAL.
+ * Relies on variables named 'mach' and 'inst' being in scope at the call
+ * site.
+ */
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
+
+/*
+ * Store VAL to channel CHAN of destination operand INDEX.
+ * Relies on variables named 'mach' and 'inst' being in scope at the call
+ * site.
+ */
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL, which is predicated by a source register.
+ * A pixel is killed if any of the four (swizzled) source components is
+ * less than zero for that pixel.  Killed pixels are accumulated in the
+ * KILMASK utility temporary.
+ */
+static void
+exec_kil(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   union spu_exec_channel value;
+   uint tested = 0;  /* source components that were already examined */
+   uint killed = 0;  /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   uint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      /* unswizzle channel */
+      const uint component =
+         tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
+      const uint component_bit = 1 << component;
+      uint pixel;
+
+      /* A swizzle may name the same component twice; test it only once. */
+      if (!(tested & component_bit)) {
+         tested |= component_bit;
+
+         FETCH(&value, 0, chan);
+         for (pixel = 0; pixel < 4; pixel++) {
+            if (value.f[pixel] < 0.0f)
+               killed |= 1 << pixel;
+         }
+      }
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= killed;
+}
+
+/**
+ * Execute NVIDIA-style KIL which is predicated by a condition code.
+ * Kill fragment if the condition code is TRUE.
+ *
+ * Not implemented yet: the computed kill mask is always empty, so this
+ * never kills anything.
+ */
+static void
+exec_kilp(struct spu_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   /* TODO: build kilmask from CC mask */
+   const uint killed = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+
+   (void) inst;
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= killed;
+}
+
+/*
+ * Fetch texels for a quad using STR texture coordinates.
+ *
+ * The sampler fills four qwords, which are then transposed so that each of
+ * r, g, b and a receives one row of the transposed result.
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   qword samples[4];
+   qword transposed[4];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias,
+                        (float (*)[4]) samples);
+
+   _transpose_matrix4x4((vec_float4 *) transposed, (vec_float4 *) samples);
+
+   r->q = transposed[0];
+   g->q = transposed[1];
+   b->q = transposed[2];
+   a->q = transposed[3];
+}
+
+
+/**
+ * Execute a texture sampling instruction.
+ *
+ * Source 0 supplies the texture coordinates and source 1 selects the
+ * sampler unit.  The sampled colour is written to the enabled channels of
+ * destination 0.
+ *
+ * \param mach       machine state
+ * \param inst       the texture instruction
+ * \param biasLod    if true, lane 0 of the coordinate's W channel is used
+ *                   as the LOD bias
+ * \param projected  if true, the coordinates are divided by W first
+ *
+ * An unsupported texture target trips an ASSERT and stores nothing.
+ */
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod, boolean projected)
+{
+   const uint unit = inst->Src[1].Register.Index;
+   union spu_exec_channel r[4];
+   uint chan_index;
+   float lodBias = 0.0;
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      if (projected) {
+         FETCH(&r[1], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[1].q);
+      }
+
+      if (biasLod) {
+         /* The bias must be read from the register W was fetched into
+          * (r[1]); r[2] has not been written at this point.
+          */
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[1].f[0];
+      }
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      /* All of these targets are sampled with the full STR coordinate. */
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      if (projected) {
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   default:
+      /* Nothing was sampled, so do not store uninitialised registers. */
+      ASSERT (0);
+      return;
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+/**
+ * Constant (flat) interpolation: every lane of the quad receives the
+ * attribute's a0 coefficient for the given channel.
+ */
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float value = mach->InterpCoefs[attrib].a0[chan];
+   float *dst = mach->Inputs[attrib].xyzw[chan].f;
+   unsigned lane;
+
+   for( lane = 0; lane < QUAD_SIZE; lane++ )
+      dst[lane] = value;
+}
+
+/**
+ * Linear interpolation of one attribute channel across the quad.
+ *
+ * The plane equation a0 + dadx * x + dady * y is evaluated at the quad
+ * position (lane 0); the other lanes are offset by dadx (lane 1), dady
+ * (lane 2) and dadx + dady (lane 3).
+ */
+static void
+linear_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float quad_x = mach->QuadPos.xyzw[0].f[0];
+   const float quad_y = mach->QuadPos.xyzw[1].f[0];
+   const float dx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dy = mach->InterpCoefs[attrib].dady[chan];
+   const float base =
+      mach->InterpCoefs[attrib].a0[chan] + dx * quad_x + dy * quad_y;
+   float *dst = mach->Inputs[attrib].xyzw[chan].f;
+
+   dst[0] = base;
+   dst[1] = base + dx;
+   dst[2] = base + dy;
+   dst[3] = base + dx + dy;
+}
+
+/**
+ * Perspective-correct interpolation of one attribute channel.
+ *
+ * Evaluates the same plane equation as linear interpolation, then divides
+ * each lane by that lane's W taken from the quad position.
+ */
+static void
+perspective_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float quad_x = mach->QuadPos.xyzw[0].f[0];
+   const float quad_y = mach->QuadPos.xyzw[1].f[0];
+   const float dx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dy = mach->InterpCoefs[attrib].dady[chan];
+   const float base =
+      mach->InterpCoefs[attrib].a0[chan] + dx * quad_x + dy * quad_y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   float *dst = mach->Inputs[attrib].xyzw[chan].f;
+
+   /* divide by W here */
+   dst[0] = base / w[0];
+   dst[1] = (base + dx) / w[1];
+   dst[2] = (base + dy) / w[2];
+   dst[3] = (base + dx + dy) / w[3];
+}
+
+
+/**
+ * Signature shared by the constant/linear/perspective interpolators:
+ * fills mach->Inputs[attrib].xyzw[chan] for the current quad.
+ */
+typedef void (* interpolation_func)(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+/**
+ * Process a declaration token.
+ *
+ * Only fragment shader input declarations have any effect: for each
+ * declared register in [Range.First, Range.Last] and each channel enabled
+ * in the usage mask, the input is interpolated for the current quad with
+ * the declared interpolation mode.
+ *
+ * An unknown interpolation mode trips an ASSERT and interpolates nothing.
+ */
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   interpolation_func interp;
+   unsigned first, last, mask;
+   unsigned i, j;
+
+   if( mach->Processor != TGSI_PROCESSOR_FRAGMENT )
+      return;
+   if( decl->Declaration.File != TGSI_FILE_INPUT )
+      return;
+
+   switch( decl->Declaration.Interpolate ) {
+   case TGSI_INTERPOLATE_CONSTANT:
+      interp = constant_interpolation;
+      break;
+
+   case TGSI_INTERPOLATE_LINEAR:
+      interp = linear_interpolation;
+      break;
+
+   case TGSI_INTERPOLATE_PERSPECTIVE:
+      interp = perspective_interpolation;
+      break;
+
+   default:
+      /* Bail out: 'interp' would otherwise be called uninitialised when
+       * ASSERT is compiled out.
+       */
+      ASSERT( 0 );
+      return;
+   }
+
+   first = decl->Range.First;
+   last = decl->Range.Last;
+   mask = decl->Declaration.UsageMask;
+
+   if( mask == TGSI_WRITEMASK_XYZW ) {
+      /* Common case: every channel is used, no mask test needed. */
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            interp( mach, i, j );
+         }
+      }
+   }
+   else {
+      for( j = 0; j < NUM_CHANNELS; j++ ) {
+         if( mask & (1 << j) ) {
+            for( i = first; i <= last; i++ ) {
+               interp( mach, i, j );
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct spu_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+         r[0].q = si_cflts(r[0].q, 0);
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fm(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fa(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+      r[0].q = si_fm(r[0].q, r[1].q);
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_min(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_max(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fs(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LRP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DP2A:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_FRC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_frc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_FLR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_flr(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_rnd(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EX2:
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LG2:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_lg2(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POW:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = micro_pow(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_XPD:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      /* r2 = (r0 * r1) - (r3 * r5)
+       */
+      r[2].q = si_fm(r[3].q, r[5].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+      FETCH(&r[5], 0, CHAN_X);
+
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          r[0].q = micro_abs(r[0].q);
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      r[0].q = si_fa(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_cos(r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddx(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddy(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      exec_kil (mach, inst);
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sin(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = lod bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explicit partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explicit LOD */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      /* Texture lookup with projection */
+      /* src[0] = texcoord (src[0].w = projection) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         ASSERT(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         ASSERT(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         ASSERT(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         ASSERT(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          */
+         r[3].q = si_xor(r[3].q, r[3].q);
+         r[0].q = micro_lt(r[0].q, r[3].q);
+         r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         r[1].q = micro_cos(r[0].q);
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         r[1].q = micro_sin(r[0].q);
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      ASSERT( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         ASSERT(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      ASSERT(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ceil(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_csflt(r[0].q, 0);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_xorbi(r[0].q, 0xff);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_trunc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ISHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_and(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_or(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_xor(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      ASSERT (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_BGNLOOP:
+      /* push LoopMask and ContMasks */
+      ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* Restore ContMask, but don't pop */
+      ASSERT(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         ASSERT(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         ASSERT(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      ASSERT( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter: execute the declarations (fragment programs only),
+ * then fetch and execute instructions until the program counter becomes -1.
+ * \return bitmask of "alive" quad components (complement of the kill mask)
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   uint i;
+   int pc = 0;
+
+   mach->CondMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
+   ASSERT(mach->CondStackTop == 0);
+   ASSERT(mach->LoopStackTop == 0);
+   ASSERT(mach->ContStackTop == 0);
+   ASSERT(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+      mach->Primitives[0] = 0;
+   }
+
+   /* execute declarations (interpolants); index by i, as pc is still 0 here */
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+         PIPE_ALIGN_VAR(16)
+         union {
+            struct tgsi_full_declaration decl;
+            qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
+         } d;
+         unsigned ea = (unsigned) (mach->Declarations + i);
+
+         spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
+
+         exec_declaration( mach, &d.decl );
+      }
+   }
+
+   /* execute instructions, until pc is set to -1 (by END or RET from main) */
+   while (pc != -1) {
+      PIPE_ALIGN_VAR(16)
+      union {
+         struct tgsi_full_instruction inst;
+         qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
+      } cur;   /* local copy of the instruction at pc */
+      unsigned ea = (unsigned) (mach->Instructions + pc);
+
+      spu_dcache_fetch_unaligned(cur.buffer, ea, sizeof(cur.inst));
+      exec_instruction( mach, &cur.inst, &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..68f4479e53
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -0,0 +1,173 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+
+#include "spu_tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int, unsigned int or raw qword.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];   /**< per-element float view */
+   int      i[QUAD_SIZE];   /**< per-element signed int view */
+   unsigned u[QUAD_SIZE];   /**< per-element unsigned int view */
+   qword    q;              /**< the whole channel as a single qword */
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];   /**< one channel per component */
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];	/* presumably d(attrib)/dx per channel - TODO confirm */
+   float dady[NUM_CHANNELS];	/* presumably d(attrib)/dy per channel - TODO confirm */
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures; the get_samples() callback must be
+ * implemented by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_resource *texture;
+   /** Get samples for the QUAD_SIZE fragments of a quad (s/t/p in, rgba out) */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;   /* opaque here: only forward-declared */
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader
+ * (driven by spu_exec_machine_run()).
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   PIPE_ALIGN_VAR(16)
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1];
+
+   struct spu_exec_vector       *Addrs;
+
+   struct spu_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;   /* compared against TGSI_PROCESSOR_x */
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks (all reset to 0xf by spu_exec_machine_run) */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask (NOTE(review): also refreshed after Cont/Func changes - confirm) */
+
+   /** Condition mask stack (for nested conditionals); Top = next free slot */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops); Top = next free slot */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;   /* read via spu_dcache_fetch_unaligned() */
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;   /* read via spu_dcache_fetch_unaligned() */
+   uint NumDeclarations;
+};
+
+
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 0000000000..98919c43ff
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,173 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+#include "spu_texture.h"
+
+
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+   vector float v[4];   /**< filled in by the texture samplers in this file */
+};
+
+
+static vector float spu_cos(vector float x)
+{
+   const vector float cosines = _cos14_v(x);  /* exported to shaders as "spu_cos" */
+   return cosines;
+}
+
+static vector float spu_sin(vector float x)
+{
+   const vector float sines = _sin14_v(x);  /* exported to shaders as "spu_sin" */
+   return sines;
+}
+
+static vector float spu_pow(vector float x, vector float y)
+{
+   const vector float powers = _powf4(x, y);  /* exported to shaders as "spu_pow" */
+   return powers;
+}
+
+static vector float spu_exp2(vector float x)
+{
+   const vector float exps = _exp2f4(x);  /* exported to shaders as "spu_exp2" */
+   return exps;
+}
+
+static vector float spu_log2(vector float x)
+{
+   const vector float logs = _log2f4(x);  /* exported to shaders as "spu_log2" */
+   return logs;
+}
+
+
+static struct vec_4x4
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   /* 2D lookup through the per-unit sampler; r and q are accepted but unused */
+   struct vec_4x4 texels;
+   (void) r, (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, texels.v);
+   return texels;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   /* NOTE(review): goes through the 2D sampler and ignores r - confirm intent */
+   struct vec_4x4 texels;
+   (void) r, (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, texels.v);
+   return texels;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 texels;   /* four result vectors, returned by value */
+   (void) q;                /* q is accepted but not used */
+   sample_texture_cube(s, t, r, unit, texels.v);
+   return texels;
+}
+
+
+/**
+ * Append the function \p name at SPU address \p addr to the list of
+ * "exported" functions made available to the PPU-hosted code generator.
+ */
+static void
+export_func(struct cell_spu_function_info *spu_functions,
+            const char *name, void *addr)
+{
+   const uint n = spu_functions->num;
+   ASSERT(n < 16);             /* validate the slot before writing into it */
+   ASSERT(strlen(name) < 16);  /* name length limit; strcpy also writes a NUL */
+   strcpy(spu_functions->names[n], name);
+   spu_functions->addrs[n] = (uint) addr;
+   spu_functions->num = n + 1;
+}
+
+
+/**
+ * Report the addresses of the SPU-side shader helper functions to the
+ * PPU / main memory.  The PPU needs them so that it can generate shader
+ * code containing calls to these functions.
+ */
+void
+return_function_info(void)
+{
+   PIPE_ALIGN_VAR(16) struct cell_spu_function_info table;
+   const int dma_tag = TAG_MISC;
+
+   ASSERT(sizeof(table) == 256); /* must be multiple of 16 bytes */
+
+   table.num = 0;
+   export_func(&table, "spu_cos", &spu_cos);
+   export_func(&table, "spu_sin", &spu_sin);
+   export_func(&table, "spu_pow", &spu_pow);
+   export_func(&table, "spu_exp2", &spu_exp2);
+   export_func(&table, "spu_log2", &spu_log2);
+   export_func(&table, "spu_tex_2d", &spu_tex_2d);
+   export_func(&table, "spu_tex_3d", &spu_tex_3d);
+   export_func(&table, "spu_tex_cube", &spu_tex_cube);
+
+   /* Send the table to the PPU / main memory, then wait on the same tag */
+   mfc_put((void *) &table,                       /* src in local store */
+           (unsigned int) spu.init.spu_functions, /* dst in main memory */
+           sizeof(table),                         /* bytes */
+           dma_tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << dma_tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 0000000000..3adb6ae99f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
new file mode 100644
index 0000000000..97c86d194d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -0,0 +1,117 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/* main() for Cell SPU code */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_funcs.h"
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+//#include "spu_test.h"
+#include "cell/common.h"
+
+
+/*
+helpful headers:
+/usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
+/opt/cell/sdk/usr/include/libmisc.h
+*/
+
+struct spu_global spu;
+
+/** Flag every tile TILE_STATUS_DEFINED, then call invalidate_tex_cache(). */
+static void
+one_time_init(void)
+{
+   memset(&spu.ztile_status, TILE_STATUS_DEFINED, sizeof spu.ztile_status);
+   memset(&spu.ctile_status, TILE_STATUS_DEFINED, sizeof spu.ctile_status);
+   invalidate_tex_cache();
+}
+
+/* In some versions of the SDK the SPE main takes 'unsigned long' as a
+ * parameter.  In others it takes 'unsigned long long'.  Use a define to
+ * select between the two.
+ */
+#ifdef SPU_MAIN_PARAM_LONG_LONG
+typedef unsigned long long main_param_t;
+#else
+typedef unsigned long main_param_t;
+#endif
+
+/**
+ * SPE entrypoint: DMA the cell_init_info block at argp, then run command_loop().
+ */
+int
+main(main_param_t speid, main_param_t argp)
+{
+   int tag = 0;   /* DMA tag group used for the init-data transfer */
+
+   (void) speid;  /* otherwise unused when D_PRINTF compiles to nothing */
+
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
+
+   one_time_init();
+   spu_command_init();
+
+   /* get initialization data (D_PRINTF needs spu.init, so this goes first) */
+   mfc_get(&spu.init,  /* dest */
+           (unsigned int) argp, /* src */
+           sizeof(struct cell_init_info), /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask( 1 << tag );
+
+   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
+
+   if (spu.init.id == 0) {
+      return_function_info();
+   }
+
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc(spu.init.id);
+#endif
+
+   command_loop();
+
+   spu_command_close();
+
+   return 0;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
new file mode 100644
index 0000000000..a9d72f84d5
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -0,0 +1,269 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_MAIN_H
+#define SPU_MAIN_H
+
+
+#include <spu_mfcio.h>
+
+#include "cell/common.h"
+#include "draw/draw_vertex.h"
+#include "pipe/p_state.h"
+
+
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
+
+
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];   /**< 16-bit view; spans half the union */
+   uint   ui[TILE_SIZE][TILE_SIZE];   /**< 32-bit view, one uint per pixel */
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4]; /**< us[] as vectors */
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];   /**< ui[] as vectors */
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
+/** Function for sampling textures */
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+                                           vector float t,
+                                           uint unit, uint level, uint face,
+                                           vector float colors[4]);
+
+
+/** Function for performing per-fragment ops */
+typedef void (*spu_fragment_ops_func)(uint x, uint y,
+                                      tile_t *colorTile,
+                                      tile_t *depthStencilTile,
+                                      vector float fragZ,
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask);
+
+/** Function for running fragment program */
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+                                                         vector float *outputs,
+                                                         vector float *constants);
+
+
+PIPE_ALIGN_TYPE(16,
+struct spu_framebuffer
+{
+   void *color_start;              /**< addr of color surface in main memory */
+   void *depth_start;              /**< addr of depth surface in main memory */
+   enum pipe_format color_format;
+   enum pipe_format depth_format;
+   uint width;                     /**< width in pixels */
+   uint height;                    /**< height in pixels */
+   uint width_tiles;               /**< width in tiles */
+   uint height_tiles;              /**< height in tiles */
+
+   uint color_clear_value;
+   uint depth_clear_value;
+
+   uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
+});
+
+
+/** per-texture level info */
+PIPE_ALIGN_TYPE(16,
+struct spu_texture_level
+{
+   void *start;
+   ushort width;
+   ushort height;
+   ushort depth;
+   ushort tiles_per_row;
+   uint bytes_per_image;
+   /** texcoord scale factors */
+   vector float scale_s;
+   vector float scale_t;
+   vector float scale_r;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s;
+   vector signed int mask_t;
+   vector signed int mask_r;
+   /** texcoord clamp limits */
+   vector signed int max_s;
+   vector signed int max_t;
+   vector signed int max_r;
+});
+
+
+PIPE_ALIGN_TYPE(16,
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;
+   uint target;  /**< PIPE_TEXTURE_x */
+});
+
+
+/**
+ * All SPU global/context state will be in a singleton object of this type:
+ */
+PIPE_ALIGN_TYPE(16,
+struct spu_global
+{
+   /** One-time init/constant info */
+   struct cell_init_info init;
+
+   /*
+    * Current state
+    */
+   struct spu_framebuffer fb;
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
+   struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_rasterizer_state rasterizer;
+   struct spu_texture texture[PIPE_MAX_SAMPLERS];
+   struct vertex_info vertex_info;
+
+   /** Current color and Z tiles */
+   PIPE_ALIGN_VAR(16) tile_t ctile;
+   PIPE_ALIGN_VAR(16) tile_t ztile;
+
+   /** Read depth/stencil tiles? */
+   boolean read_depth_stencil;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status;
+   ubyte cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   PIPE_ALIGN_VAR(16) ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
+   PIPE_ALIGN_VAR(16) ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
+
+   /** Current fragment ops machine code, at 8-byte boundary */
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
+   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+   spu_fragment_ops_func fragment_ops[2];
+
+   /** Current fragment program machine code, at 8-byte boundary */
+   PIPE_ALIGN_VAR(8) uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program function */
+   spu_fragment_program_func fragment_program;
+
+   /** Current texture sampler function */
+   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
+
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
+
+});
+
+
+extern struct spu_global spu;
+
+
+
+/* DMA TAGS */
+
+#define TAG_SURFACE_CLEAR     10
+#define TAG_VERTEX_BUFFER     11
+#define TAG_READ_TILE_COLOR   12
+#define TAG_READ_TILE_Z       13
+#define TAG_WRITE_TILE_COLOR  14
+#define TAG_WRITE_TILE_Z      15
+#define TAG_INDEX_BUFFER      16
+#define TAG_BATCH_BUFFER      17
+#define TAG_MISC              18
+#define TAG_DCACHE0           20
+#define TAG_DCACHE1           21
+#define TAG_DCACHE2           22
+#define TAG_DCACHE3           23
+#define TAG_FENCE             24
+
+
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );   /* bit N set = include DMA tag N */
+   /* wait for completion of _any_ of the DMA tags selected by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );   /* bit N set = include DMA tag N */
+   /* wait for completion of _all_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
+
+
+static INLINE void
+memset16(ushort *d, ushort value, uint count)
+{
+   ushort *const end = d + count;   /* one past the last slot to fill */
+   while (d != end)
+      *d++ = value;
+}
+
+
+static INLINE void
+memset32(uint *d, uint value, uint count)
+{
+   uint *const end = d + count;     /* one past the last slot to fill */
+   while (d != end)
+      *d++ = value;
+}
+
+
+#endif /* SPU_MAIN_H */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
new file mode 100644
index 0000000000..3b9566042a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -0,0 +1,631 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \author Brian Paul
+ */
+
+
+#include <transpose_matrix4x4.h>
+#include "pipe/p_format.h"
+#include "spu_main.h"
+#include "spu_colorpack.h"
+#include "spu_per_fragment_op.h"
+
+
+#define LINEAR_QUAD_LAYOUT 1
+
+
+static INLINE vector float
+spu_min(vector float a, vector float b)
+{
+   /* per-lane mask: all ones where a > b, zero elsewhere */
+   const vector unsigned int a_is_greater = spu_cmpgt(a, b);
+   return spu_sel(a, b, a_is_greater);   /* mask set -> take b */
+}
+
+
+static INLINE vector float
+spu_max(vector float a, vector float b)
+{
+   /* per-lane mask: all ones where a > b, zero elsewhere */
+   const vector unsigned int a_is_greater = spu_cmpgt(a, b);
+   return spu_sel(b, a, a_is_greater);   /* mask set -> take a */
+}
+
+
+/**
+ * Fallback/debug per-fragment pipeline for one quad (four fragments),
+ * run after the shader: alpha test, Z test, blend, colormask, then a
+ * masked store into colorTile.  Stencil test and logicop are still stubs.
+ * x, y index the quad inside the tile; fragZ/R/G/B/A hold one lane per
+ * fragment; mask is the per-lane write mask (non-zero lane = live).
+ * Normally a PPU-generated routine is used instead of this function.
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragR,
+                          vector float fragG,
+                          vector float fragB,
+                          vector float fragA,
+                          vector unsigned int mask)
+{
+   vector float frag_aos[4];
+   unsigned int fbc0, fbc1, fbc2, fbc3;  /* framebuffer/tile colors */
+   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */
+
+   /*
+    * Do alpha test
+    */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragA, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_ALWAYS:
+         amask = spu_splats(0xffffffffU);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats( 0x0U);
+         break;
+      default:
+         ;
+      }
+
+      mask = spu_and(mask, amask);
+   }
+
+
+   /*
+    * Z and/or stencil testing...
+    */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* do stencil test */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z24_UNORM_S8_USCALED);
+
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
+                spu.fb.depth_format == PIPE_FORMAT_Z24X8_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifragZ > ifbZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_ALWAYS:
+            zmask = spu_splats(0xffffffffU);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats( 0x0U);
+            break;
+         default:
+            ;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+   }
+
+
+   /*
+    * If we'll need the current framebuffer/tile colors for blending
+    * or logicop or colormask, fetch them now.
+    */
+   if (spu.blend.rt[0].blend_enable ||
+       spu.blend.logicop_enable ||
+       spu.blend.rt[0].colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+      fbc0 = colorTile->ui[y][x*2+0];
+      fbc1 = colorTile->ui[y][x*2+1];
+      fbc2 = colorTile->ui[y][x*2+2];
+      fbc3 = colorTile->ui[y][x*2+3];
+#else
+      fbc0 = colorTile->ui[y+0][x+0];
+      fbc1 = colorTile->ui[y+0][x+1];
+      fbc2 = colorTile->ui[y+1][x+0];
+      fbc3 = colorTile->ui[y+1][x+1];
+#endif
+   }
+
+
+   /*
+    * Do blending
+    */
+   if (spu.blend.rt[0].blend_enable) {
+      /* blending terms, misc regs */
+      vector float term1r, term1g, term1b, term1a;
+      vector float term2r, term2g, term2b, term2a;
+      vector float one, tmp;
+
+      vector float fbRGBA[4];  /* current framebuffer colors */
+
+      /* convert framebuffer colors from packed int to vector float */
+      {
+         vector float temp[4]; /* float colors in AOS form */
+         switch (spu.fb.color_format) {
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            temp[0] = spu_unpack_B8G8R8A8(fbc0);
+            temp[1] = spu_unpack_B8G8R8A8(fbc1);
+            temp[2] = spu_unpack_B8G8R8A8(fbc2);
+            temp[3] = spu_unpack_B8G8R8A8(fbc3);
+            break;
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            temp[0] = spu_unpack_A8R8G8B8(fbc0);
+            temp[1] = spu_unpack_A8R8G8B8(fbc1);
+            temp[2] = spu_unpack_A8R8G8B8(fbc2);
+            temp[3] = spu_unpack_A8R8G8B8(fbc3);
+            break;
+         default:
+            ASSERT(0);
+         }
+         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
+      }
+
+      /*
+       * Compute Src RGB terms (fragment color * factor)
+       */
+      switch (spu.blend.rt[0].rgb_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1r = fragR;
+         term1g = fragG;
+         term1b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1r =
+         term1g =
+         term1b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1r = spu_mul(fragR, fragR);
+         term1g = spu_mul(fragG, fragG);
+         term1b = spu_mul(fragB, fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1r = spu_mul(fragR, fragA);
+         term1g = spu_mul(fragG, fragA);
+         term1b = spu_mul(fragB, fragA);
+         break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term1r = spu_mul(fragR, fbRGBA[0]);
+         term1g = spu_mul(fragG, fbRGBA[1]);
+         term1b = spu_mul(fragB, fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1r = spu_mul(fragR, fbRGBA[3]);
+         term1g = spu_mul(fragG, fbRGBA[3]);
+         term1b = spu_mul(fragB, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Src Alpha term (fragment alpha * factor).
+       * For alpha, each *_COLOR factor is the same as its *_ALPHA counterpart.
+       */
+      switch (spu.blend.rt[0].alpha_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR: /* fall-through */
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1a = spu_mul(fragA, fragA);
+         break;
+      case PIPE_BLENDFACTOR_DST_COLOR: /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1a = spu_mul(fragA, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR: /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1a = spu_mul(fragA, spu_splats(spu.blend_color.color[3]));
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest RGB terms (framebuffer color * factor)
+       */
+      switch (spu.blend.rt[0].rgb_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2r = fbRGBA[0];
+         term2g = fbRGBA[1];
+         term2b = fbRGBA[2];
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2r =
+         term2g =
+         term2b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2r = spu_mul(fbRGBA[0], fragR);
+         term2g = spu_mul(fbRGBA[1], fragG);
+         term2b = spu_mul(fbRGBA[2], fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fragA);
+         term2g = spu_mul(fbRGBA[1], fragA);
+         term2b = spu_mul(fbRGBA[2], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2r = spu_mul(fbRGBA[0], tmp);
+         term2g = spu_mul(fbRGBA[1], tmp);
+         term2b = spu_mul(fbRGBA[2], tmp);
+         break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+         break;
+       /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest Alpha term (framebuffer alpha * factor).
+       * For alpha, each *_COLOR factor is the same as its *_ALPHA counterpart.
+       */
+      switch (spu.blend.rt[0].alpha_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2a = fbRGBA[3];
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR: /* fall-through */
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2a = spu_mul(fbRGBA[3], tmp);
+         break;
+      case PIPE_BLENDFACTOR_DST_COLOR: /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR: /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest RGB terms
+       */
+      switch (spu.blend.rt[0].rgb_func) {
+      case PIPE_BLEND_ADD:
+         fragR = spu_add(term1r, term2r);
+         fragG = spu_add(term1g, term2g);
+         fragB = spu_add(term1b, term2b);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragR = spu_sub(term1r, term2r);
+         fragG = spu_sub(term1g, term2g);
+         fragB = spu_sub(term1b, term2b);
+         break;
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragR = spu_sub(term2r, term1r);
+         fragG = spu_sub(term2g, term1g);
+         fragB = spu_sub(term2b, term1b);
+         break;
+      case PIPE_BLEND_MIN:
+         fragR = spu_min(term1r, term2r);
+         fragG = spu_min(term1g, term2g);
+         fragB = spu_min(term1b, term2b);
+         break;
+      case PIPE_BLEND_MAX:
+         fragR = spu_max(term1r, term2r);
+         fragG = spu_max(term1g, term2g);
+         fragB = spu_max(term1b, term2b);
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest A term
+       */
+      switch (spu.blend.rt[0].alpha_func) {
+      case PIPE_BLEND_ADD:
+         fragA = spu_add(term1a, term2a);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragA = spu_sub(term1a, term2a);
+         break;
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragA = spu_sub(term2a, term1a);
+         break;
+      case PIPE_BLEND_MIN:
+         fragA = spu_min(term1a, term2a);
+         break;
+      case PIPE_BLEND_MAX:
+         fragA = spu_max(term1a, term2a);
+         break;
+      default:
+         ASSERT(0);
+      }
+   }
+
+
+   /*
+    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    */
+#if 0
+   /* original code */
+   {
+      vector float frag_soa[4];
+      frag_soa[0] = fragR;
+      frag_soa[1] = fragG;
+      frag_soa[2] = fragB;
+      frag_soa[3] = fragA;
+      _transpose_matrix4x4(frag_aos, frag_soa);
+   }
+#else
+   /* short-cut relying on function parameter layout: */
+   _transpose_matrix4x4(frag_aos, &fragR);
+   (void) fragG;
+   (void) fragB;
+#endif
+
+   /*
+    * Pack fragment float colors into 32-bit RGBA words.
+    */
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      break;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_fallback_fragment_ops\n");
+      ASSERT(0);
+   }
+
+
+   /*
+    * Do color masking
+    */
+   if (spu.blend.rt[0].colormask != 0xf) {
+      uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+      /* Form bitmask depending on color buffer format and colormask bits */
+      switch (spu.fb.color_format) {
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         if (spu.blend.rt[0].colormask & PIPE_MASK_R)
+            cmask |= 0x00ff0000; /* red */
+         if (spu.blend.rt[0].colormask & PIPE_MASK_G)
+            cmask |= 0x0000ff00; /* green */
+         if (spu.blend.rt[0].colormask & PIPE_MASK_B)
+            cmask |= 0x000000ff; /* blue */
+         if (spu.blend.rt[0].colormask & PIPE_MASK_A)
+            cmask |= 0xff000000; /* alpha */
+         break;
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         if (spu.blend.rt[0].colormask & PIPE_MASK_R)
+            cmask |= 0x0000ff00; /* red */
+         if (spu.blend.rt[0].colormask & PIPE_MASK_G)
+            cmask |= 0x00ff0000; /* green */
+         if (spu.blend.rt[0].colormask & PIPE_MASK_B)
+            cmask |= 0xff000000; /* blue */
+         if (spu.blend.rt[0].colormask & PIPE_MASK_A)
+            cmask |= 0x000000ff; /* alpha */
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Apply color mask to the 32-bit packed colors.
+       * if (cmask[i])
+       *    frag color[i] = frag color[i];
+       * else
+       *    frag color[i] = framebuffer color[i];
+       */
+      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
+   }
+
+
+   /*
+    * Do logic ops
+    */
+   if (spu.blend.logicop_enable) {
+      /* XXX to do */
+      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
+   }
+
+
+   /*
+    * If mask is non-zero, mark tile as dirty.
+    */
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   else {
+      /* write no fragments */
+      return;
+   }
+
+
+   /*
+    * Write new fragment/quad colors to the framebuffer/tile.
+    * Only write pixels where the corresponding mask word is set.
+    */
+#if LINEAR_QUAD_LAYOUT
+   /*
+    * Quad layout:
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|...
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y][x*2] = fragc0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y][x*2+1] = fragc1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y][x*2+2] = fragc2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y][x*2+3] = fragc3;
+#else
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|...
+    *  +--+--+
+    *  |p2|p3|...
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = fragc0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = fragc1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = fragc2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = fragc3;
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
new file mode 100644
index 0000000000..f817abf046
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -0,0 +1,44 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_PER_FRAGMENT_OP
+#define SPU_PER_FRAGMENT_OP
+
+
+/**
+ * Run the per-fragment operations on one quad of four fragments and store
+ * the surviving fragment colors in the color tile.
+ *
+ * \param x, y  position of the quad; used to index colorTile->ui[][]
+ * \param colorTile  color tile that receives the fragments whose mask
+ *                   word is non-zero
+ * \param depthStencilTile  depth/stencil tile -- presumably consulted by the
+ *                          Z/stencil tests; confirm against the implementation
+ * \param fragZ  fragment Z values, one per fragment of the quad
+ * \param fragRed, fragGreen, fragBlue, fragAlpha  fragment color channels,
+ *                   presumably one vector element per fragment (SOA)
+ * \param mask  per-fragment mask; if every word is zero nothing is written
+ *              and the color tile is not marked dirty
+ */
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask);
+
+
+#endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
new file mode 100644
index 0000000000..14987e3c3a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -0,0 +1,356 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_shuffle.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "cell/common.h"
+#include "util/u_memory.h"
+
+
+/**
+ * Given a rendering command's bounding box (in pixels) compute the
+ * location of the corresponding screen tile bounding box.
+ *
+ * \param render  render command supplying xmin/ymin/xmax/ymax (pixels)
+ * \param txmin  returns the first tile column covered by the box
+ * \param tymin  returns the first tile row covered by the box
+ * \param box_num_tiles  returns the total number of tiles in the box
+ * \param box_width_tiles  returns the width of the box, in tiles
+ *
+ * The maximum tile column/row are clamped to the framebuffer size in
+ * tiles; the minimum column/row are used as computed.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   /* pixel bounds -> tile indexes (integer division by the tile size) */
+   *txmin = (uint) render->xmin / TILE_SIZE;
+   *tymin = (uint) render->ymin / TILE_SIZE;
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   /* keep the max tile inside the framebuffer */
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
+   /* NOTE(review): *txmin / *tymin are not clamped.  If they could exceed
+    * the clamped txmax / tymax these unsigned subtractions would wrap --
+    * presumably the caller guarantees a box that starts inside the
+    * framebuffer; confirm.
+    */
+   *box_width_tiles = txmax - *txmin + 1;
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/**
+ * Return true if the tile at (tx,ty) is assigned to this SPU.
+ * Tiles are numbered in row-major order and that number, modulo the
+ * number of SPUs, selects the owning SPU.
+ */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   const uint tile_index = ty * spu.fb.width_tiles + tx;
+   const uint owner = tile_index % spu.init.num_spus;
+
+   return owner == spu.init.id;
+}
+
+
+/**
+ * Kick off the fetch of the color and Z tiles at (tx,ty) from main memory.
+ * A tile whose current status is CLEAR is not fetched, and the Z tile is
+ * only fetched when depth/stencil reads are enabled.  Each tile that is
+ * fetched has its current status changed to GETTING.
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   /* depth/stencil tile */
+   if (spu.read_depth_stencil && spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
+      get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_GETTING;
+   }
+
+   /* color tile */
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR)
+      return;
+
+   //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
+   get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+   spu.cur_ctile_status = TILE_STATUS_GETTING;
+}
+
+
+/**
+ * Kick off the write-back of the color and Z tiles at (tx,ty) to main
+ * memory.  Only DIRTY tiles are written.  A tile that is DIRTY, or that is
+ * still marked GETTING (fetched but never touched), ends up DEFINED.
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   /* depth/stencil tile */
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY ||
+       spu.cur_ztile_status == TILE_STATUS_GETTING) {
+      if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
+         /* modified: must go back to main memory */
+         //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
+         put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      }
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+   }
+
+   /* color tile */
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY ||
+       spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
+         /* modified: must go back to main memory */
+         //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
+         put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      }
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+   }
+}
+
+
+/**
+ * Block until the pending color tile 'put' has completed and, when
+ * depth/stencil reads are enabled, the Z tile 'put' as well.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+
+   if (!spu.read_depth_stencil)
+      return;
+
+   wait_on_mask(1 << TAG_WRITE_TILE_Z);
+}
+
+
+/**
+ * Render primitives
+ *
+ * The 16-bit indexes immediately follow the render command in the batch
+ * buffer.  Vertices either follow the indexes (inline, at the next 16-byte
+ * boundary) or are DMA'd from one of the vertex buffers in main memory.
+ * Every tile inside the command's bounding box that belongs to this SPU is
+ * fetched, has all the triangles drawn into it, and is written back.
+ *
+ * \param render  the render command (only PIPE_PRIM_TRIANGLES is accepted)
+ * \param pos_incr  returns value indicating how many 8-byte units to skip
+ *                  after this command in the batch buffer (the value
+ *                  includes the command itself)
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* we'll DMA into these buffers */
+   /* NOTE(review): nothing visible here checks that total_vertex_bytes
+    * fits in CELL_BUFFER_SIZE -- presumably guaranteed by the sender; confirm.
+    */
+   PIPE_ALIGN_VAR(16) ubyte vertex_data[CELL_BUFFER_SIZE];
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+   uint num_tiles;
+
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+            render->prim_type,
+            render->num_verts,
+            render->num_indexes,
+            render->inline_verts);
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   /* two bytes per index, padded to a multiple of 8 bytes */
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   /* size of command + indexes, in 8-byte units */
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
+
+
+   if (render->inline_verts) {
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      /* the inline vertex data must be skipped by the caller as well */
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+      /* dest is advanced by the same amount as src, so a vertex keeps the
+       * same offset in vertex_data[] that it has in the source buffer and
+       * the indexes can be used unmodified.
+       */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      /* block until the vertex data has arrived */
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   num_tiles = 0;
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      /* walk the box in row-major order */
+      const uint tx = txmin + i % box_width_tiles;
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))
+         continue;
+
+      num_tiles++;
+
+      /* load the saved per-tile status into the "current tile" state */
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
+
+      get_cz_tiles(tx, ty);
+
+      uint drawn = 0;
+
+      /* vertex size and vertex base address replicated across a vector,
+       * for the address computation below
+       */
+      const qword vertex_sizes = (qword)spu_splats(vertex_size);
+      const qword verticess = (qword)spu_splats((uint)vertices);
+
+      ASSERT_ALIGN16(&indexes[0]);
+
+      const uint num_indexes = render->num_indexes;
+
+      /* loop over tris
+       * &indexes[0] will be 16 byte aligned.  This loop is heavily unrolled
+       * avoiding variable rotates when extracting vertex indices.
+       * Each iteration consumes 24 indexes, i.e. up to 8 triangles.
+       */
+      for (j = 0; j < num_indexes; j += 24) {
+         /* Load three vectors, containing 24 ushort indices */
+         /* NOTE(review): all three quadwords are loaded even when fewer
+          * than 24 indexes remain, so this can read past the last index --
+          * presumably harmless because the data lives in the batch buffer;
+          * confirm.
+          */
+         const qword* lower_qword = (qword*)&indexes[j];
+         const qword indices0 = lower_qword[0];
+         const qword indices1 = lower_qword[1];
+         const qword indices2 = lower_qword[2];
+
+         /* stores three indices for each tri n in slots 0, 1 and 2 of vsn */
+         /* Straightforward rotates for these */
+         /* (byte shift = 2 * position of the tri's first index within its
+          * quadword)
+          */
+         qword vs0 = indices0;
+         qword vs1 = si_shlqbyi(indices0, 6);
+         qword vs3 = si_shlqbyi(indices1, 2);
+         qword vs4 = si_shlqbyi(indices1, 8);
+         qword vs6 = si_shlqbyi(indices2, 4);
+         qword vs7 = si_shlqbyi(indices2, 10);
+
+         /* For tri 2 and 5, the three indices are split across two machine
+          * words - rotate and combine */
+         /* tri 2: two indices from the end of indices0, one from the start
+          * of indices1 (halfword select mask 0x20 = halfword 2)
+          */
+         const qword tmp2a = si_shlqbyi(indices0, 12);
+         const qword tmp2b = si_rotqmbyi(indices1, 12|16);
+         qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(si_from_uint(0x20)));
+
+         /* tri 5: one index from the end of indices1, two from the start
+          * of indices2 (halfword select mask 0x60 = halfwords 1 and 2)
+          */
+         const qword tmp5a = si_shlqbyi(indices1, 14);
+         const qword tmp5b = si_rotqmbyi(indices2, 14|16);
+         qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(si_from_uint(0x60)));
+
+         /* unpack indices from halfword slots to word slots */
+         /* (halfwords A, B, C are zero-extended into words 0, 1, 2;
+          * word 3 is zeroed)
+          */
+         vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
+         vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
+         vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
+         vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
+         vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
+         vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
+         vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
+         vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));
+
+         /* Calculate address of vertex in vertices[] */
+         /* address = index * vertex_size + vertices (16-bit multiply-add) */
+         vs0 = si_mpya(vs0, vertex_sizes, verticess);
+         vs1 = si_mpya(vs1, vertex_sizes, verticess);
+         vs2 = si_mpya(vs2, vertex_sizes, verticess);
+         vs3 = si_mpya(vs3, vertex_sizes, verticess);
+         vs4 = si_mpya(vs4, vertex_sizes, verticess);
+         vs5 = si_mpya(vs5, vertex_sizes, verticess);
+         vs6 = si_mpya(vs6, vertex_sizes, verticess);
+         vs7 = si_mpya(vs7, vertex_sizes, verticess);
+
+         /* Select the appropriate call based on the number of vertices
+          * remaining */
+         /* Every case deliberately falls through to the next one, so
+          * entering at case N draws the N/3 remaining triangles.
+          * NOTE(review): within a group the triangles are drawn from the
+          * highest-numbered to the lowest (vs7 first, vs0 last) -- confirm
+          * that this reversed submission order is intended.
+          */
+         switch(num_indexes - j) {
+            default: drawn += tri_draw(vs7, tx, ty);
+            case 21: drawn += tri_draw(vs6, tx, ty);
+            case 18: drawn += tri_draw(vs5, tx, ty);
+            case 15: drawn += tri_draw(vs4, tx, ty);
+            case 12: drawn += tri_draw(vs3, tx, ty);
+            case 9:  drawn += tri_draw(vs2, tx, ty);
+            case 6:  drawn += tri_draw(vs1, tx, ty);
+            case 3:  drawn += tri_draw(vs0, tx, ty);
+         }
+      }
+
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      put_cz_tiles(tx, ty);
+
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      /* save the (possibly updated) status of this tile */
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
+   }
+
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER done (%u tiles hit)\n",
+            num_tiles);
+}
diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h
new file mode 100644
index 0000000000..493434f087
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "cell/common.h"
+
+/**
+ * Execute a RENDER command found in the batch buffer.
+ * \param render  the render command; its indexes (and, for inline vertices,
+ *                its vertex data) follow it in the batch buffer
+ * \param pos_incr  returns the size of the command plus its trailing data,
+ *                  in 8-byte units
+ */
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h
new file mode 100644
index 0000000000..74f2a0b6d2
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_shuffle.h
@@ -0,0 +1,186 @@
+#ifndef SPU_SHUFFLE_H
+#define SPU_SHUFFLE_H
+
+/*
+ * Generate shuffle patterns with minimal fuss.
+ *
+ * Based on ideas from 
+ * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf
+ *
+ * A-P indicates 0-15th position in first vector
+ * a-p indicates 0-15th position in second vector
+ *
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |          A|          B|          C|          D|
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |    A|    B|    C|    D|    E|    F|    G|    H|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ *
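+ * x or X produces a result byte of 0xff (encoded as control byte 0xc0)
+ * 8 produces a result byte of 0x80 (encoded as control byte 0xe0)
+ * 0 produces a result byte of 0x00 (encoded as control byte 0x80)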
+ *
+ * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector 
+ * unsigned char literal suitable for use with spu_shuffle().
+ *
+ * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector 
+ * literal suitable for use with si_shufb().
+ *
+ *
+ * For example :
+ * SHUFB4(A,A,A,A)
+ * expands to :
+ * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3})
+ * 
+ * SHUFFLE8(A,B,a,b,C,c,8,8)
+ * expands to :
+ * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,
+ *				 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0})
+ *
+ */
+
+#include <spu_intrinsics.h>
+
+/*
+ * Word (4-byte) granularity.
+ * Each SHUFFLE_PATTERN_4_<sel>__ is the four shuffle control bytes for one
+ * word slot: A-D pick a word of the first vector, a-d a word of the second
+ * vector, and X/x, 0 and 8 are the special control bytes 0xc0, 0x80 and
+ * 0xe0.
+ */
+#define SHUFFLE_PATTERN_4_A__  0x00, 0x01, 0x02, 0x03
+#define SHUFFLE_PATTERN_4_B__  0x04, 0x05, 0x06, 0x07
+#define SHUFFLE_PATTERN_4_C__  0x08, 0x09, 0x0a, 0x0b
+#define SHUFFLE_PATTERN_4_D__  0x0c, 0x0d, 0x0e, 0x0f
+#define SHUFFLE_PATTERN_4_a__  0x10, 0x11, 0x12, 0x13
+#define SHUFFLE_PATTERN_4_b__  0x14, 0x15, 0x16, 0x17
+#define SHUFFLE_PATTERN_4_c__  0x18, 0x19, 0x1a, 0x1b
+#define SHUFFLE_PATTERN_4_d__  0x1c, 0x1d, 0x1e, 0x1f
+#define SHUFFLE_PATTERN_4_X__  0xc0, 0xc0, 0xc0, 0xc0
+#define SHUFFLE_PATTERN_4_x__  0xc0, 0xc0, 0xc0, 0xc0
+#define SHUFFLE_PATTERN_4_0__  0x80, 0x80, 0x80, 0x80
+#define SHUFFLE_PATTERN_4_8__  0xe0, 0xe0, 0xe0, 0xe0
+
+/* Paste the four selectors into pattern names: 16 control bytes in all */
+#define SHUFFLE_VECTOR_4__(A, B, C, D) \
+   SHUFFLE_PATTERN_4_##A##__, \
+   SHUFFLE_PATTERN_4_##B##__, \
+   SHUFFLE_PATTERN_4_##C##__, \
+   SHUFFLE_PATTERN_4_##D##__
+
+/* Word shuffle pattern as a 'vector unsigned char' literal */
+#define SHUFFLE4(A, B, C, D) \
+   ((const vector unsigned char){ \
+      SHUFFLE_VECTOR_4__(A, B, C, D) \
+   })
+
+/* Word shuffle pattern as a 'qword' literal */
+#define SHUFB4(A, B, C, D) \
+   ((const qword){ \
+      SHUFFLE_VECTOR_4__(A, B, C, D) \
+   })
+
+
+/*
+ * Halfword (2-byte) granularity.
+ * Each SHUFFLE_PATTERN_8_<sel>__ is the two shuffle control bytes for one
+ * halfword slot: A-H pick a halfword of the first vector, a-h a halfword of
+ * the second vector, and X/x, 0 and 8 are the special control bytes 0xc0,
+ * 0x80 and 0xe0.
+ */
+#define SHUFFLE_PATTERN_8_A__  0x00, 0x01
+#define SHUFFLE_PATTERN_8_B__  0x02, 0x03
+#define SHUFFLE_PATTERN_8_C__  0x04, 0x05
+#define SHUFFLE_PATTERN_8_D__  0x06, 0x07
+#define SHUFFLE_PATTERN_8_E__  0x08, 0x09
+#define SHUFFLE_PATTERN_8_F__  0x0a, 0x0b
+#define SHUFFLE_PATTERN_8_G__  0x0c, 0x0d
+#define SHUFFLE_PATTERN_8_H__  0x0e, 0x0f
+#define SHUFFLE_PATTERN_8_a__  0x10, 0x11
+#define SHUFFLE_PATTERN_8_b__  0x12, 0x13
+#define SHUFFLE_PATTERN_8_c__  0x14, 0x15
+#define SHUFFLE_PATTERN_8_d__  0x16, 0x17
+#define SHUFFLE_PATTERN_8_e__  0x18, 0x19
+#define SHUFFLE_PATTERN_8_f__  0x1a, 0x1b
+#define SHUFFLE_PATTERN_8_g__  0x1c, 0x1d
+#define SHUFFLE_PATTERN_8_h__  0x1e, 0x1f
+#define SHUFFLE_PATTERN_8_X__  0xc0, 0xc0
+#define SHUFFLE_PATTERN_8_x__  0xc0, 0xc0
+#define SHUFFLE_PATTERN_8_0__  0x80, 0x80
+#define SHUFFLE_PATTERN_8_8__  0xe0, 0xe0
+
+
+/* Paste the eight selectors into pattern names: 16 control bytes in all */
+#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+   SHUFFLE_PATTERN_8_##A##__, \
+   SHUFFLE_PATTERN_8_##B##__, \
+   SHUFFLE_PATTERN_8_##C##__, \
+   SHUFFLE_PATTERN_8_##D##__, \
+   SHUFFLE_PATTERN_8_##E##__, \
+   SHUFFLE_PATTERN_8_##F##__, \
+   SHUFFLE_PATTERN_8_##G##__, \
+   SHUFFLE_PATTERN_8_##H##__
+
+/* Halfword shuffle pattern as a 'vector unsigned char' literal */
+#define SHUFFLE8(A, B, C, D, E, F, G, H) \
+   ((const vector unsigned char){ \
+      SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+   })
+
+/* Halfword shuffle pattern as a 'qword' literal */
+#define SHUFB8(A, B, C, D, E, F, G, H) \
+   ((const qword){ \
+      SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \
+   })
+
+
+/*
+ * Byte granularity.
+ * Each SHUFFLE_PATTERN_16_<sel>__ is the single shuffle control byte for
+ * one byte slot: A-P pick a byte of the first vector, a-p a byte of the
+ * second vector, and X/x, 0 and 8 are the special control bytes 0xc0, 0x80
+ * and 0xe0.
+ */
+#define SHUFFLE_PATTERN_16_A__  0x00
+#define SHUFFLE_PATTERN_16_B__  0x01
+#define SHUFFLE_PATTERN_16_C__  0x02
+#define SHUFFLE_PATTERN_16_D__  0x03
+#define SHUFFLE_PATTERN_16_E__  0x04
+#define SHUFFLE_PATTERN_16_F__  0x05
+#define SHUFFLE_PATTERN_16_G__  0x06
+#define SHUFFLE_PATTERN_16_H__  0x07
+#define SHUFFLE_PATTERN_16_I__  0x08
+#define SHUFFLE_PATTERN_16_J__  0x09
+#define SHUFFLE_PATTERN_16_K__  0x0a
+#define SHUFFLE_PATTERN_16_L__  0x0b
+#define SHUFFLE_PATTERN_16_M__  0x0c
+#define SHUFFLE_PATTERN_16_N__  0x0d
+#define SHUFFLE_PATTERN_16_O__  0x0e
+#define SHUFFLE_PATTERN_16_P__  0x0f
+#define SHUFFLE_PATTERN_16_a__  0x10
+#define SHUFFLE_PATTERN_16_b__  0x11
+#define SHUFFLE_PATTERN_16_c__  0x12
+#define SHUFFLE_PATTERN_16_d__  0x13
+#define SHUFFLE_PATTERN_16_e__  0x14
+#define SHUFFLE_PATTERN_16_f__  0x15
+#define SHUFFLE_PATTERN_16_g__  0x16
+#define SHUFFLE_PATTERN_16_h__  0x17
+#define SHUFFLE_PATTERN_16_i__  0x18
+#define SHUFFLE_PATTERN_16_j__  0x19
+#define SHUFFLE_PATTERN_16_k__  0x1a
+#define SHUFFLE_PATTERN_16_l__  0x1b
+#define SHUFFLE_PATTERN_16_m__  0x1c
+#define SHUFFLE_PATTERN_16_n__  0x1d
+#define SHUFFLE_PATTERN_16_o__  0x1e
+#define SHUFFLE_PATTERN_16_p__  0x1f
+#define SHUFFLE_PATTERN_16_X__  0xc0
+#define SHUFFLE_PATTERN_16_x__  0xc0
+#define SHUFFLE_PATTERN_16_0__  0x80
+#define SHUFFLE_PATTERN_16_8__  0xe0
+
+/* Paste the sixteen selectors into pattern names: 16 control bytes */
+#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   SHUFFLE_PATTERN_16_##A##__, \
+   SHUFFLE_PATTERN_16_##B##__, \
+   SHUFFLE_PATTERN_16_##C##__, \
+   SHUFFLE_PATTERN_16_##D##__, \
+   SHUFFLE_PATTERN_16_##E##__, \
+   SHUFFLE_PATTERN_16_##F##__, \
+   SHUFFLE_PATTERN_16_##G##__, \
+   SHUFFLE_PATTERN_16_##H##__, \
+   SHUFFLE_PATTERN_16_##I##__, \
+   SHUFFLE_PATTERN_16_##J##__, \
+   SHUFFLE_PATTERN_16_##K##__, \
+   SHUFFLE_PATTERN_16_##L##__, \
+   SHUFFLE_PATTERN_16_##M##__, \
+   SHUFFLE_PATTERN_16_##N##__, \
+   SHUFFLE_PATTERN_16_##O##__, \
+   SHUFFLE_PATTERN_16_##P##__
+
+/* Byte shuffle pattern as a 'vector unsigned char' literal */
+#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   ((const vector unsigned char){ \
+      SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   })
+
+/* Byte shuffle pattern as a 'qword' literal */
+#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   ((const qword){ \
+      SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+   })
+
+#endif
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..69784c8978
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -0,0 +1,641 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <math.h>
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_colorpack.h"
+#include "spu_dcache.h"
+
+
+/**
+ * Mark all tex cache entries as invalid.
+ *
+ * For every mipmap level of texture unit 0 the byte range occupied by the
+ * level (4 bytes per texel; six faces for cube maps, 'depth' slices for 3D
+ * textures) is handed to spu_dcache_mark_dirty().
+ */
+void
+invalidate_tex_cache(void)
+{
+   const uint unit = 0;
+   uint lvl;
+
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      const struct spu_texture_level *tlevel = &spu.texture[unit].level[lvl];
+      uint bytes = 4 * tlevel->width * tlevel->height;
+
+      if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= tlevel->depth;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+
+      spu_dcache_mark_dirty((unsigned) tlevel->start, bytes);
+   }
+}
+
+
+/**
+ * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ *
+ * \param tlevel  the texture mipmap level to fetch from
+ * \param face  image index; selects the image at face * bytes_per_image
+ * \param x, y  integer texel coordinates of the four texels
+ * \param texels  returns the four texels; each is fetched with a 4-byte
+ *                spu_dcache_fetch_unaligned() into its own vector
+ *
+ * The address computation treats the texture as tiles of 32x32 texels,
+ * 4 bytes per texel, each tile occupying sizeof(tile_t) bytes.
+ *
+ * NOTE: in the typical case of bilinear filtering, the four texels
+ * are in a 2x2 group so we could get by with just two dcache fetches
+ * (two side-by-side texels per fetch).  But when bilinear filtering
+ * wraps around a texture edge, we'll probably need code like we have
+ * now.
+ * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
+ * it's quite likely that the four pixels in a quad will need some of the
+ * same texels.  So look into doing texture fetches for four pixels at
+ * a time.
+ */
+static void
+get_four_texels(const struct spu_texture_level *tlevel, uint face,
+                vec_int4 x, vec_int4 y,
+                vec_uint4 *texels)
+{
+   unsigned texture_ea = (uintptr_t) tlevel->start;
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
+
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
+   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
+
+   /* byte offset of the tile = (tile_y * tiles_per_row + tile_x) * tile_size
+    * NOTE(review): si_mpya / si_mpy multiply 16-bit operands, so this
+    * presumably relies on the tile index and sizeof(tile_t) fitting in
+    * 16 bits -- confirm.
+    */
+   qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
+   tile_offset = si_mpy((qword) tile_offset, tile_size);
+
+   /* byte offset of the texel inside its tile = (offset_y * 32 + offset_x) * 4 */
+   qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
+   texel_offset = si_mpyui(texel_offset, 4);
+   
+   vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+   
+   /* step to the requested image (face) of this level */
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
+   /* one 4-byte fetch per texel */
+   spu_dcache_fetch_unaligned((qword *) & texels[0],
+                              texture_ea + spu_extract(offset, 0), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[1],
+                              texture_ea + spu_extract(offset, 1), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[2],
+                              texture_ea + spu_extract(offset, 2), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[3],
+                              texture_ea + spu_extract(offset, 3), 4);
+}
+
+
+/**
+ * Clamp each element of vec to the range [0, max].
+ * \param vec  values to clamp
+ * \param max  per-element upper bound
+ * \return  the clamped vector
+ */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   const vector signed int zero = spu_splats(0);
+   vector unsigned int is_positive, above_max;
+   vector signed int lower_clamped;
+
+   /* elements that are not > 0 become 0 */
+   is_positive = spu_cmpgt(vec, zero);
+   lower_clamped = spu_sel(zero, vec, is_positive);
+
+   /* elements that are > max become max */
+   above_max = spu_cmpgt(lower_clamped, max);
+   return spu_sel(lower_clamped, max, above_max);
+}
+
+
+
+/**
+ * Nearest-texel sampling for four pixels at once.
+ * \param s, t  texture coordinates of the four pixels
+ * \param unit  texture unit to sample
+ * \param level  mipmap level to sample
+ * \param face  image (face) index passed on to the texel fetch
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4])
+{
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vec_uint4 texels[4];
+
+   /* scale the coordinates by the level's scale factors, convert to int */
+   vector signed int texel_s = spu_convts(spu_mul(s, tlevel->scale_s), 0);
+   vector signed int texel_t = spu_convts(spu_mul(t, tlevel->scale_t), 0);
+
+   /* PIPE_TEX_WRAP_REPEAT (mask) followed by PIPE_TEX_WRAP_CLAMP (clamp) */
+   texel_s = spu_clamp(spu_and(texel_s, tlevel->mask_s), tlevel->max_s);
+   texel_t = spu_clamp(spu_and(texel_t, tlevel->mask_t), tlevel->max_t);
+
+   get_four_texels(tlevel, face, texel_s, texel_t, texels);
+
+   /* convert four packed A8R8G8B8 pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
+}
+
+
+/**
+ * Do bilinear texture sampling for four pixels.
+ *
+ * For each pixel the 2x2 texel footprint around the biased coordinate
+ * (coord * scale - 0.5) is fetched and blended with weights taken from
+ * the fractional part of that coordinate (10 bits of precision).
+ *
+ * \param s, t  texture coordinates of the four pixels
+ * \param unit  texture unit to sample
+ * \param level  mipmap level to sample
+ * \param face  image (face) index passed on to the texel fetch
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
+{
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   static const vector float neg_half = {-0.5f, -0.5f, -0.5f, -0.5f};
+   uint ch;
+
+   /* texel-space coordinates, biased by -0.5 */
+   vector float ss = spu_madd(s, tlevel->scale_s, neg_half);
+   vector float tt = spu_madd(t, tlevel->scale_t, neg_half);
+
+   /* Coordinates in 22.10 fixed point.  The low 10 bits are the blend
+    * weight and the remaining bits are the texel index.
+    */
+   const vector signed int iss1024 = spu_convts(spu_mul(ss, spu_splats(1024.0f)), 0);
+   const vector signed int itt1024 = spu_convts(spu_mul(tt, spu_splats(1024.0f)), 0);
+
+   /* Compute weighting factors in [0,1]:
+    * AND the fixed-point coordinate with 1023, convert back to float.
+    */
+   const vector float sWeights0 = spu_convtf(spu_and(iss1024, 1023), 10);
+   const vector float tWeights0 = spu_convtf(spu_and(itt1024, 1023), 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   const vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   const vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* Integer texel index = arithmetic shift right by 10 (i.e. floor).
+    * This must be derived from the same fixed-point value as the weight:
+    * a plain float->int conversion truncates toward zero, which for
+    * coordinates in (-0.5, 0) -- the left/top texture edge -- would pick
+    * texels 0 and 1 while the weight describes the blend between texels
+    * -1 and 0.  For non-negative coordinates both methods agree.
+    */
+   vector signed int is0 = spu_rlmaska(iss1024, -10);
+   vector signed int it0 = spu_rlmaska(itt1024, -10);
+
+   /* is + 1, it + 1 */
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
+
+   /* convert packed int texels to float colors:
+    * ftexels[4*corner + channel] holds one channel for the four pixels
+    */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
+
+   /* The corner weights are the same for every channel: compute them once */
+   const vector float wUL = spu_mul(sWeights1, tWeights1);
+   const vector float wUR = spu_mul(sWeights0, tWeights1);
+   const vector float wLL = spu_mul(sWeights1, tWeights0);
+   const vector float wLR = spu_mul(sWeights0, tWeights0);
+
+   /* blend the four corners: channel 0..3 = red, green, blue, alpha */
+   for (ch = 0; ch < 4; ch++) {
+      const vector float ul = spu_mul(ftexels[ch +  0], wUL);
+      const vector float ur = spu_mul(ftexels[ch +  4], wUR);
+      const vector float ll = spu_mul(ftexels[ch +  8], wLL);
+      const vector float lr = spu_mul(ftexels[ch + 12], wLR);
+
+      colors[ch] = spu_add(spu_add(ul, ur), spu_add(ll, lr));
+   }
+}
+
+
+
+/**
+ * Transpose a 4x4 matrix of 32-bit words.
+ * mIn[0..3] hold the rows; the columns are written to *mOut0..*mOut3.
+ * All four input rows are read before any output is stored.
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut0,
+          vector unsigned int *mOut1,
+          vector unsigned int *mOut2,
+          vector unsigned int *mOut3,
+          vector unsigned int *mIn)
+{
+   /* interleave words 0,1 of the first operand with words 0,1 of the second */
+   const vector unsigned char interleaveHi = ((vector unsigned char) {
+      0x00, 0x01, 0x02, 0x03,
+      0x10, 0x11, 0x12, 0x13,
+      0x04, 0x05, 0x06, 0x07,
+      0x14, 0x15, 0x16, 0x17});
+   /* interleave words 2,3 of the first operand with words 2,3 of the second */
+   const vector unsigned char interleaveLo = ((vector unsigned char) {
+      0x08, 0x09, 0x0A, 0x0B,
+      0x18, 0x19, 0x1A, 0x1B,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x1C, 0x1D, 0x1E, 0x1F});
+
+   /* the 16 matrix elements are named a..p in row-major order */
+   const vector unsigned int rowABCD = mIn[0];
+   const vector unsigned int rowEFGH = mIn[1];
+   const vector unsigned int rowIJKL = mIn[2];
+   const vector unsigned int rowMNOP = mIn[3];
+
+   /* first pass: pair up rows 0/2 and rows 1/3 */
+   const vector unsigned int aibj = spu_shuffle(rowABCD, rowIJKL, interleaveHi);
+   const vector unsigned int ckdl = spu_shuffle(rowABCD, rowIJKL, interleaveLo);
+   const vector unsigned int emfn = spu_shuffle(rowEFGH, rowMNOP, interleaveHi);
+   const vector unsigned int gohp = spu_shuffle(rowEFGH, rowMNOP, interleaveLo);
+
+   /* second pass: interleave the pairs again to form the columns */
+   *mOut0 = spu_shuffle(aibj, emfn, interleaveHi);   /* a e i m */
+   *mOut1 = spu_shuffle(aibj, emfn, interleaveLo);   /* b f j n */
+   *mOut2 = spu_shuffle(ckdl, gohp, interleaveHi);   /* c g k o */
+   *mOut3 = spu_shuffle(ckdl, gohp, interleaveLo);   /* d h l p */
+}
+
+
+/**
+ * Bilinear filtering, using int instead of float arithmetic for computing
+ * sample weights (7 fraction bits).  Results go to colors[0..3] = R,G,B,A.
+ */
+void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4])
+{
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   /* Scale texcoords by size of texture, and add half pixel bias */
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
+
+   /* convert float coords to fixed-pt coords with 7 fraction bits */
+   vector signed int is = spu_convts(ss, 7);  /* XXX really need floor() here */
+   vector signed int it = spu_convts(tt, 7);  /* XXX really need floor() here */
+
+   /* compute integer texel weights in [0, 127]; each pair sums to 127 */
+   vector signed int sWeights0 = spu_and(is, 127);
+   vector signed int tWeights0 = spu_and(it, 127);
+   vector signed int sWeights1 = spu_sub(127, sWeights0);
+   vector signed int tWeights1 = spu_sub(127, tWeights0);
+
+   /* texel coords: is0 = is / 128, it0 = it / 128 */
+   vector signed int is0 = spu_rlmask(is, -7);
+   vector signed int it0 = spu_rlmask(it, -7);
+
+   /* texel coords: is1 = is0 + 1, it1 = it0 + 1 */
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT (applied unconditionally via mask_s/mask_t) */
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP (applied unconditionally via max_s/max_t) */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
+
+   /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
+   {
+      static const unsigned char ZERO = 0x80;
+      int i;
+      for (i = 0; i < 16; i++) {
+         texels[i] = spu_shuffle(texels[i], texels[i],
+                                 ((vector unsigned char) {
+                                    ZERO, ZERO, ZERO, 1,
+                                    ZERO, ZERO, ZERO, 2,
+                                    ZERO, ZERO, ZERO, 3,
+                                    ZERO, ZERO, ZERO, 0}));
+      }
+   }
+
+   /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+   vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+      texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+   transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+   transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+   transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+   transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
+
+   /* compute weighted colors; the sums are scaled by 2^-22 on conversion */
+   vector unsigned int c0, c1, c2, c3, cSum;
+
+   /* red */
+   c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[0] = spu_convtf(cSum, 22);
+
+   /* green */
+   c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[1] = spu_convtf(cSum, 22);
+
+   /* blue */
+   c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[2] = spu_convtf(cSum, 22);
+
+   /* alpha */
+   c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[3] = spu_convtf(cSum, 22);
+}
+
+
+
+/**
+ * Compute the level-of-detail factor (lambda) for a quad of texcoords.
+ * Derivatives are finite differences against element 0 of s and t:
+ * element 1 supplies the d/dx step and element 2 the d/dy step.
+ * \return log2 of the approximated texel footprint (rho) at the base level
+ */
+static INLINE float
+compute_lambda_2d(uint unit, vector float s, vector float t)
+{
+   const uint baseLevel = 0;
+   const float width = spu.texture[unit].level[baseLevel].width;
+   /* t derivatives are scaled by the height, not the width */
+   const float height = spu.texture[unit].level[baseLevel].height;
+   const float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   const float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   const float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   const float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+   /* ideal value */
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+#else
+   /* approximation: half the sum of the absolute derivatives */
+   const float rho = (fabsf(dsdx) + fabsf(dsdy) +
+                      fabsf(dtdx) + fabsf(dtdy)) * 0.5f;
+#endif
+   return logf(rho) * 1.442695f; /* compute logbase2(rho) */
+}
+
+
+/**
+ * Linearly interpolate four color vectors in place:
+ * c0[i] = c0[i] + (c1[i] - c0[i]) * weight, for i = 0..3.
+ */
+static void
+blend_colors(vector float c0[4], const vector float c1[4], float weight)
+{
+   const vector float w = spu_splats(weight);
+   vector float delta[4];
+   uint i;
+
+   for (i = 0; i < 4; i++)
+      delta[i] = spu_sub(c1[i], c0[i]);
+   for (i = 0; i < 4; i++)
+      c0[i] = spu_madd(delta[i], w, c0[i]);
+}
+
+
+/**
+ * Texture sampling with level-of-detail selection.  One lod value is
+ * computed for the whole quad, biased and clamped by the sampler state,
+ * then used to pick the magnification sampler or one/two mipmap levels.
+ * \param s, t           texcoords for the four pixels of the quad
+ * \param unit           texture/sampler unit index
+ * \param level_ignored  unused; the level is derived from the lod
+ * \param face           passed through to the selected sampler
+ * \param colors         receives the four color vectors
+ */
+void
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level_ignored, uint face,
+                      vector float colors[4])
+{
+   const int lastLevel = (int) spu.texture[unit].max_level;
+   float lod;
+
+   (void) level_ignored;
+
+   /* a single lambda/lod is shared by all four pixels in the quad */
+   lod = compute_lambda_2d(unit, s, t);
+   lod += spu.sampler[unit].lod_bias;
+
+   /* clamp to [min_lod, max_lod] */
+   if (lod < spu.sampler[unit].min_lod)
+      lod = spu.sampler[unit].min_lod;
+   else if (lod > spu.sampler[unit].max_lod)
+      lod = spu.sampler[unit].max_lod;
+
+   if (lod <= 0.0f) {
+      /* magnify: sample level 0 */
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
+      return;
+   }
+
+   /* minify */
+   if (spu.sampler[unit].min_img_filter != PIPE_TEX_FILTER_LINEAR) {
+      /* sample the single nearest mipmap level */
+      int nearest = (int) (lod + 0.5f);
+      if (nearest > lastLevel)
+         nearest = lastLevel;
+      spu.min_sample_texture_2d[unit](s, t, unit, nearest, face, colors);
+   }
+   else {
+      /* sample two adjacent mipmap levels and interpolate.
+       * NOTE(review): this keys on min_img_filter; confirm that a separate
+       * mipmap filter setting was not intended. */
+      int lower = (int) lod;
+      if (lower > lastLevel)
+         lower = lastLevel;
+      spu.min_sample_texture_2d[unit](s, t, unit, lower, face, colors);
+      if (lower + 1 <= lastLevel) {
+         vector float upper[4];
+         spu.min_sample_texture_2d[unit](s, t, unit, lower + 1, face, upper);
+         blend_colors(colors, upper, lod - (float) lower);
+      }
+   }
+}
+
+
+/**
+ * Select the cube map face hit by direction (rx, ry, rz) and compute the
+ * 2D coords within that face, remapped from [-1,1] to [0,1].
+ * Ties between axes resolve in X, Y, Z order so that the divisor is always
+ * the largest magnitude.  A zero-length direction still divides by zero.
+ * XXX need a SIMD version of this
+ */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx >= ary && arx >= arz) {
+      ma = arx;
+      tc = -ry;
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+      }
+   }
+   else if (ary >= arx && ary >= arz) {
+      ma = ary;
+      sc = rx;
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         tc = rz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         tc = -rz;
+      }
+   }
+   else {
+      ma = arz;
+      tc = -ry;
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;
+   *newT = (tc / ma + 1.0F) * 0.5F;
+
+   return face;
+}
+
+
+
+/**
+ * Sample a cube map for a quad: (s, t, r) hold the x, y, z components of
+ * the four direction vectors.  Each direction is mapped to a face plus 2D
+ * coords, then sampled through spu.sample_texture_2d[unit] at level 0.
+ */
+void
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4])
+{
+   const uint level = 0;
+   uint faces[4], i;
+   float faceS[4], faceT[4];
+
+   /* Compute cube faces referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (i = 0; i < 4; i++) {
+      faces[i] = choose_cube_face(spu_extract(s, i), spu_extract(t, i),
+                                  spu_extract(r, i), &faceS[i], &faceT[i]);
+   }
+
+   if (faces[0] != faces[1] || faces[0] != faces[2] || faces[0] != faces[3]) {
+      /* BAD!  The texcoords straddle faces: sample each pixel separately,
+       * splatting its coords, and keep only that pixel's element.
+       */
+      for (i = 0; i < 4; i++) {
+         vector float texel[4];
+         uint chan;
+
+         spu.sample_texture_2d[unit](spu_splats(faceS[i]), spu_splats(faceT[i]),
+                                     unit, level, faces[i], texel);
+
+         for (chan = 0; chan < 4; chan++) {
+            colors[chan] = spu_insert(spu_extract(texel[chan], i),
+                                      colors[chan], i);
+         }
+      }
+   }
+   else {
+      /* GOOD!  All four texcoords refer to the same cube face */
+      s = (vector float) {faceS[0], faceS[1], faceS[2], faceS[3]};
+      t = (vector float) {faceT[0], faceT[1], faceT[2], faceT[3]};
+      spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..7b75b007b5
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -0,0 +1,67 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern void
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4]);
+
+
+extern void
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
+/* Bilinear filtering using integer arithmetic for the sample weights. */
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4]);
+
+/* Sampling with level-of-detail selection; the level argument is ignored. */
+extern void
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level, uint face,
+                      vector float colors[4]);
+
+/* Cube map sampling: (s, t, r) are the direction vector components. */
+extern void
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4]);
+
+
+#endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tgsi_exec.h b/src/gallium/drivers/cell/spu/spu_tgsi_exec.h
new file mode 100644
index 0000000000..6f2a3d30b9
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tgsi_exec.h
@@ -0,0 +1,158 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TGSI_EXEC_H
+#define SPU_TGSI_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+
+#define NUM_CHANNELS 4  /* R,G,B,A */
+#define QUAD_SIZE    4  /* 4 pixel/quad */
+
+
+
+#define TGSI_EXEC_NUM_TEMPS       128
+#define TGSI_EXEC_NUM_IMMEDIATES  256
+
+/*
+ * Locations of various utility registers (_IDX = Index, _CHAN = Channel)
+ */
+#define TGSI_EXEC_TEMP_00000000_IDX    (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_00000000_CHAN   0
+
+#define TGSI_EXEC_TEMP_7FFFFFFF_IDX    (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_7FFFFFFF_CHAN   1
+
+#define TGSI_EXEC_TEMP_80000000_IDX    (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_80000000_CHAN   2
+
+#define TGSI_EXEC_TEMP_FFFFFFFF_IDX    (TGSI_EXEC_NUM_TEMPS + 0)
+#define TGSI_EXEC_TEMP_FFFFFFFF_CHAN   3
+
+#define TGSI_EXEC_TEMP_ONE_IDX         (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_ONE_CHAN        0
+
+#define TGSI_EXEC_TEMP_TWO_IDX         (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_TWO_CHAN        1
+
+#define TGSI_EXEC_TEMP_128_IDX         (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_128_CHAN        2
+
+#define TGSI_EXEC_TEMP_MINUS_128_IDX   (TGSI_EXEC_NUM_TEMPS + 1)
+#define TGSI_EXEC_TEMP_MINUS_128_CHAN  3
+
+#define TGSI_EXEC_TEMP_KILMASK_IDX     (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_KILMASK_CHAN    0
+
+#define TGSI_EXEC_TEMP_OUTPUT_IDX      (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_OUTPUT_CHAN     1
+
+#define TGSI_EXEC_TEMP_PRIMITIVE_IDX   (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_PRIMITIVE_CHAN  2
+
+/* NVIDIA condition code (CC) vector
+ */
+#define TGSI_EXEC_CC_GT       0x01
+#define TGSI_EXEC_CC_EQ       0x02
+#define TGSI_EXEC_CC_LT       0x04
+#define TGSI_EXEC_CC_UN       0x08
+
+#define TGSI_EXEC_CC_X_MASK   0x000000ff
+#define TGSI_EXEC_CC_X_SHIFT  0
+#define TGSI_EXEC_CC_Y_MASK   0x0000ff00
+#define TGSI_EXEC_CC_Y_SHIFT  8
+#define TGSI_EXEC_CC_Z_MASK   0x00ff0000
+#define TGSI_EXEC_CC_Z_SHIFT  16
+#define TGSI_EXEC_CC_W_MASK   0xff000000
+#define TGSI_EXEC_CC_W_SHIFT  24
+
+#define TGSI_EXEC_TEMP_CC_IDX         (TGSI_EXEC_NUM_TEMPS + 2)
+#define TGSI_EXEC_TEMP_CC_CHAN         3
+
+#define TGSI_EXEC_TEMP_THREE_IDX      (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_TEMP_THREE_CHAN      0
+
+#define TGSI_EXEC_TEMP_HALF_IDX       (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_TEMP_HALF_CHAN       1
+
+/* execution mask, each value is either 0 or ~0 */
+#define TGSI_EXEC_MASK_IDX            (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_MASK_CHAN            2
+
+/* 4 register buffer for various purposes (indices +4 .. +7) */
+#define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
+#define TGSI_EXEC_NUM_TEMP_R        4
+
+#define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 8)
+#define TGSI_EXEC_NUM_ADDRS         1
+
+/* predicate register */
+#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_NUM_PREDS         1
+
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   10
+
+
+
+#define TGSI_EXEC_MAX_NESTING  32
+#define TGSI_EXEC_MAX_COND_NESTING  TGSI_EXEC_MAX_NESTING
+#define TGSI_EXEC_MAX_LOOP_NESTING  TGSI_EXEC_MAX_NESTING
+#define TGSI_EXEC_MAX_SWITCH_NESTING TGSI_EXEC_MAX_NESTING
+#define TGSI_EXEC_MAX_CALL_NESTING  TGSI_EXEC_MAX_NESTING
+
+/* The maximum number of input attributes per vertex. For 2D
+ * input register files, this is the stride between two 1D
+ * arrays.
+ */
+#define TGSI_EXEC_MAX_INPUT_ATTRIBS 17
+
+/* The maximum number of constant vectors per constant buffer.
+ */
+#define TGSI_EXEC_MAX_CONST_BUFFER  4096
+
+/* The maximum number of vertices per primitive */
+#define TGSI_MAX_PRIM_VERTICES 6
+
+/* The maximum number of primitives to be generated */
+#define TGSI_MAX_PRIMITIVES 64
+
+/* The maximum total number of vertices, multiplied by PIPE_MAX_ATTRIBS */
+#define TGSI_MAX_TOTAL_VERTICES (TGSI_MAX_PRIM_VERTICES * TGSI_MAX_PRIMITIVES * PIPE_MAX_ATTRIBS)
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_TGSI_EXEC_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
new file mode 100644
index 0000000000..6905015a48
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -0,0 +1,126 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#include "spu_tile.h"
+#include "spu_main.h"
+
+
+/**
+ * Start a DMA transfer of one tile of color or Z values from main memory
+ * into SPU local memory.  No wait on the tag is performed here.
+ * \param tx, ty  tile position, in tiles
+ * \param tile    destination tile in local memory (16-byte aligned)
+ * \param tag     DMA tag passed to mfc_get()
+ * \param zBuf    non-zero to fetch from the depth buffer, else color
+ */
+void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
+{
+   const uint tileIndex = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerPixel = zBuf ? spu.fb.zsize : 4;
+   const uint tileBytes = TILE_SIZE * TILE_SIZE * bytesPerPixel;
+   const ubyte *base = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+   const ubyte *src = base + tileIndex * tileBytes;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+
+   mfc_get(tile->ui,              /* dest in local memory */
+           (unsigned int) src,    /* src in main memory */
+           tileBytes, tag,
+           0 /* tid */, 0 /* rid */);
+}
+
+
+/**
+ * Start a DMA transfer of one tile of color or Z values from SPU local
+ * memory to main memory.  No wait on the tag is performed here.
+ * \param tx, ty  tile position, in tiles
+ * \param tile    source tile in local memory (16-byte aligned)
+ * \param tag     DMA tag passed to mfc_put()
+ * \param zBuf    non-zero to store to the depth buffer, else color
+ */
+void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
+{
+   const uint tileIndex = ty * spu.fb.width_tiles + tx;
+   const uint bytesPerPixel = zBuf ? spu.fb.zsize : 4;
+   const uint tileBytes = TILE_SIZE * TILE_SIZE * bytesPerPixel;
+   ubyte *base = zBuf ? spu.fb.depth_start : spu.fb.color_start;
+   ubyte *dst = base + tileIndex * tileBytes;
+
+   ASSERT(tx < spu.fb.width_tiles);
+   ASSERT(ty < spu.fb.height_tiles);
+   ASSERT_ALIGN16(tile);
+
+   mfc_put((void *) tile->ui,     /* src in local memory */
+           (unsigned int) dst,    /* dst in main memory */
+           tileBytes,
+           tag,
+           0 /* tid */, 0 /* rid */);
+}
+
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled tiles
+ * back to the main framebuffer (surface 0 = color, otherwise Z).
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+      /* this SPU takes tiles id, id + num_spus, id + 2 * num_spus, ... */
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+      /* write back the Z tile filled just above, not the color tile */
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
new file mode 100644
index 0000000000..7bfb52be8f
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -0,0 +1,75 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TILE_H
+#define SPU_TILE_H
+
+
+#include <libmisc.h>
+#include <spu_mfcio.h>
+#include "spu_main.h"
+#include "cell/common.h"
+
+
+/* Issue the DMA get of one color (zBuf == 0) or Z tile into *tile. */
+extern void
+get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
+/* Issue the DMA put of *tile to the color (zBuf == 0) or Z buffer. */
+extern void
+put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
+/* Write cleared tiles of the color (index 0) or Z surface to memory. */
+extern void
+really_clear_tiles(uint surfaceIndex);
+
+
+/** Fill a color tile (TILE_SIZE^2 uints) with spu.fb.color_clear_value. */
+static INLINE void
+clear_c_tile(tile_t *ctile)
+{
+   uint *pixels = (uint *) ctile->ui;
+   memset32(pixels, spu.fb.color_clear_value, TILE_SIZE * TILE_SIZE);
+}
+
+
+/** Fill a Z tile with spu.fb.depth_clear_value, as 16- or 32-bit values. */
+static INLINE void
+clear_z_tile(tile_t *ztile)
+{
+   const uint count = TILE_SIZE * TILE_SIZE;
+
+   if (spu.fb.zsize != 2) {
+      /* any other non-zero size is filled as 32-bit values */
+      ASSERT(spu.fb.zsize != 0);
+      memset32((uint *) ztile->ui, spu.fb.depth_clear_value, count);
+   }
+   else {
+      memset16((ushort *) ztile->us, spu.fb.depth_clear_value, count);
+   }
+}
+
+
+#endif /* SPU_TILE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
new file mode 100644
index 0000000000..efeebca27b
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -0,0 +1,843 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Triangle rendering within a tile.
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "util/u_math.h"
+#include "spu_colorpack.h"
+#include "spu_main.h"
+#include "spu_shuffle.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_tri.h"
+
+
+/**
+ * Masks are uint[4] vectors with each element being 0 or 0xffffffff.
+ * In this file one element corresponds to one pixel of a 2x2 quad
+ * (see the mask computation in flush_spans()).
+ */
+typedef vector unsigned int mask_t;
+
+
+
+/**
+ * Simplified types taken from other parts of Gallium
+ *
+ * Only the attribute array is declared.  It is declared with one element
+ * but is indexed by attribute slot throughout this file; data[0] is read
+ * as the vertex position (x, y, z, w).
+ */
+struct vertex_header {
+   vector float data[1];   /**< one 4-float vector per attribute slot */
+};
+
+
+
+/* XXX fix this */
+/* Cheap stand-in for ceilf(): add just under 1.0 and truncate through an
+ * int cast.  Because the cast truncates toward zero this is not a true
+ * ceiling for every input (e.g. negative non-integers, or values whose
+ * fractional part is below 0.00001).
+ */
+#undef CEILF
+#define CEILF(X) ((float) (int) ((X) + 0.99999f))
+
+
+/* Index of each pixel within a 2x2 quad ... */
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+/* ... and the matching single-bit coverage flags (bit n = quad pixel n). */
+#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
+#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
+#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
+#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
+#define MASK_ALL          0xf
+
+
+/* Vector element indices used by the splat helpers and by eval_inputs() */
+#define CHAN0 0
+#define CHAN1 1
+#define CHAN2 2
+#define CHAN3 3
+
+
+/* Set to 1 to compile in the vertex dump code (print_vertex) */
+#define DEBUG_VERTS 0
+
+/**
+ * Triangle edge info
+ *
+ * dx/dy overlay the first two elements of 'ds', so an edge delta can be
+ * written with a single vector subtract of two vertex positions (see
+ * setup_sort_vertices()) and then read back as scalars.
+ */
+struct edge {
+   union {
+      struct {
+         float dx;	/**< X(v1) - X(v0), used only during setup */
+         float dy;	/**< Y(v1) - Y(v0), used only during setup */
+      };
+      vec_float4 ds;    /**< vector accessor for dx and dy */
+   };
+   float dxdy;		/**< dx/dy */
+   float sx, sy;	/**< first sample point coord */
+   int lines;		/**< number of lines on this edge */
+};
+
+
+/**
+ * Plane-equation coefficients for one 4-component attribute.
+ * eval_inputs() evaluates each component as a0 + x * dadx + y * dady.
+ */
+struct interp_coef
+{
+   vector float a0;     /**< constant term */
+   vector float dadx;   /**< change per unit step in X */
+   vector float dady;   /**< change per unit step in Y */
+};
+
+
+/**
+ * Triangle setup info (derived from draw_stage).
+ * Also used for line drawing (taking some liberties).
+ */
+struct setup_stage {
+
+   /* Vertices are just an array of floats making up each attribute in
+    * turn.  Currently fixed at 4 floats, but should change in time.
+    * Codegen will help cope with this.
+    */
+   union {
+      struct {
+         /* vmin/vmid/vmax are filled in Y-sorted order by
+          * setup_sort_vertices(); vprovoke supplies constant attributes.
+          */
+         const struct vertex_header *vmin;
+         const struct vertex_header *vmid;
+         const struct vertex_header *vmax;
+         const struct vertex_header *vprovoke;
+      };
+      /* All four pointers viewed as one qword so they can be written with
+       * a single shuffle.  NOTE(review): relies on pointers being 32 bits
+       * wide so that four of them fill exactly one qword -- confirm.
+       */
+      qword vertex_headers;
+   };
+
+   struct edge ebot;   /**< edge vmin -> vmid */
+   struct edge etop;   /**< edge vmid -> vmax */
+   struct edge emaj;   /**< edge vmin -> vmax */
+
+   float oneOverArea;  /* XXX maybe make into vector? */
+
+   uint facing;        /**< CELL_FACING_FRONT (0) or CELL_FACING_BACK (1) */
+
+   uint tx, ty;  /**< position of current tile (x, y) */
+
+   /* Clip rectangle in pixels; tri_draw() sets it to the bounds of the
+    * current tile.  'cliprect' is the same four ints as one qword.
+    */
+   union {
+      struct {
+         int cliprect_minx;
+         int cliprect_miny;
+         int cliprect_maxx;
+         int cliprect_maxy;
+      };
+      qword cliprect;
+   };
+
+   /* Interpolation coefficients, indexed by attribute slot */
+   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+
+   /* Pending pair of scanlines (one row of 2x2 quads) awaiting flush_spans() */
+   struct {
+      vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
+      int y;             /**< even Y of the quad row being accumulated */
+      unsigned y_flags;  /**< bit 0: even line recorded, bit 1: odd line recorded */
+      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
+   } span;
+};
+
+
+/* The single setup context used by every function in this file */
+static struct setup_stage setup;
+
+
+/** Replicate element CHAN0 (x) of v into all four elements. */
+static INLINE vector float
+splatx(vector float v)
+{
+   const float x = spu_extract(v, CHAN0);
+   return spu_splats(x);
+}
+
+/** Replicate element CHAN1 (y) of v into all four elements. */
+static INLINE vector float
+splaty(vector float v)
+{
+   const float y = spu_extract(v, CHAN1);
+   return spu_splats(y);
+}
+
+/** Replicate element CHAN2 (z) of v into all four elements. */
+static INLINE vector float
+splatz(vector float v)
+{
+   const float z = spu_extract(v, CHAN2);
+   return spu_splats(z);
+}
+
+/** Replicate element CHAN3 (w) of v into all four elements. */
+static INLINE vector float
+splatw(vector float v)
+{
+   const float w = spu_extract(v, CHAN3);
+   return spu_splats(w);
+}
+
+
+/**
+ * Setup fragment shader inputs by evaluating triangle's vertex
+ * attribute coefficient info.
+ * \param x  quad x pos
+ * \param y  quad y pos
+ * \param fragZ  returns quad Z values
+ * \param fragInputs  returns fragment program inputs: four vectors (one per
+ *                    channel) for every attribute slot after the position
+ * Note: this code could be incorporated into the fragment program
+ * itself to avoid the loop and switch.
+ */
+static void
+eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
+{
+   /* per-pixel offsets of the four quad pixels relative to (x, y) */
+   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
+   static const vector float deltaY = (const vector float) {0, 0, 1, 1};
+
+   const uint posSlot = 0;
+   const vector float pos = setup.coef[posSlot].a0;
+   const vector float dposdx = setup.coef[posSlot].dadx;
+   const vector float dposdy = setup.coef[posSlot].dady;
+   const vector float fragX = spu_splats(x) + deltaX;
+   const vector float fragY = spu_splats(y) + deltaY;
+   vector float fragW, wInv;
+   uint attr;
+
+   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
+   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
+   wInv = spu_re(fragW);  /* 1 / w */
+
+   /* Loop over fragment program inputs.  Slot 0 is the position, so the
+    * inputs are slots 1 .. num_attribs-1.  setup_tri_coefficients() only
+    * fills coef[0 .. num_attribs-1]; iterating num_attribs times from
+    * slot 1 would read one attribute/coefficient past the valid range and
+    * write one extra group of four vectors into fragInputs.
+    */
+   for (attr = 1; attr < spu.vertex_info.num_attribs; attr++) {
+      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;
+
+      /* constant term */
+      vector float a0 = setup.coef[attr].a0;
+      vector float r0 = splatx(a0);
+      vector float r1 = splaty(a0);
+      vector float r2 = splatz(a0);
+      vector float r3 = splatw(a0);
+
+      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
+         /* linear term */
+         vector float dadx = setup.coef[attr].dadx;
+         vector float dady = setup.coef[attr].dady;
+         /* Use SPU intrinsics here to get slightly better code.
+          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
+          */
+         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
+         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
+         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
+         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
+         if (interp == INTERP_PERSPECTIVE) {
+            /* perspective term */
+            r0 *= wInv;
+            r1 *= wInv;
+            r2 *= wInv;
+            r3 *= wInv;
+         }
+      }
+      /* one output vector per channel, each holding the four quad pixels */
+      fragInputs[CHAN0] = r0;
+      fragInputs[CHAN1] = r1;
+      fragInputs[CHAN2] = r2;
+      fragInputs[CHAN3] = r3;
+      fragInputs += 4;
+   }
+}
+
+
+/**
+ * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
+ * (The body below does test the mask and returns without doing any work
+ * when no element is set.)
+ * \param x     X of the quad's top-left pixel, in window coordinates
+ * \param y     Y of the quad's top-left pixel, in window coordinates
+ * \param mask  per-pixel coverage; an element is non-zero if that pixel is live
+ */
+static INLINE void
+emit_quad( int x, int y, mask_t mask)
+{
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
+      /* position of the quad relative to the clip rectangle origin */
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
+      {
+         /*
+          * Run fragment shader, execute per-fragment ops, update fb/tile.
+          */
+         /* NOTE(review): inputs[] has room for 4 attributes of 4 vectors
+          * each; eval_inputs() performs no bounds check against this size.
+          */
+         vector float inputs[4*4], outputs[2*4];
+         vector unsigned int kill_mask;
+         vector float fragZ;
+
+         eval_inputs((float) x, (float) y, &fragZ, inputs);
+
+         ASSERT(spu.fragment_program);
+         ASSERT(spu.fragment_ops);
+
+         /* Execute the current fragment program */
+         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+         /* drop the pixels the fragment program flagged in kill_mask */
+         mask = spu_andc(mask, kill_mask);
+
+         /* Execute per-fragment/quad operations, including:
+          * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
+          */
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ,
+                          outputs[0*4+0],
+                          outputs[0*4+1],
+                          outputs[0*4+2],
+                          outputs[0*4+3],
+                          mask);
+      }
+   }
+}
+
+
+/**
+ * Round an X or Y coordinate down to the even coordinate of the 2x2
+ * block/quad that contains it (clears the lowest bit).
+ */
+static INLINE int
+block(int x)
+{
+   const int odd_bit = 1;
+
+   return x & ~odd_bit;
+}
+
+
+/**
+ * Render a horizontal span of quads
+ *
+ * Emits the quads covering the (up to two) scanlines buffered in
+ * setup.span, first making sure the color tile -- and the Z tile when
+ * depth/stencil is read -- holds usable data, then resets the span state.
+ * Returns immediately if no line was recorded since the last flush.
+ */
+static void
+flush_spans(void)
+{
+   int minleft, maxright;
+
+   /* span.quad layout: {left row0, left row1, right row0, right row1} */
+   const int l0 = spu_extract(setup.span.quad, 0);
+   const int l1 = spu_extract(setup.span.quad, 1);
+   const int r0 = spu_extract(setup.span.quad, 2);
+   const int r1 = spu_extract(setup.span.quad, 3);
+
+   switch (setup.span.y_flags) {
+   case 0x3:
+      /* both odd and even lines written (both quad rows) */
+      minleft = MIN2(l0, l1);
+      maxright = MAX2(r0, r1);
+      break;
+
+   case 0x1:
+      /* only even line written (quad top row) */
+      minleft = l0;
+      maxright = r0;
+      break;
+
+   case 0x2:
+      /* only odd line written (quad bottom row) */
+      minleft = l1;
+      maxright = r1;
+      break;
+
+   default:
+      /* nothing buffered */
+      return;
+   }
+
+   /* OK, we're very likely to need the tile data now.
+    * clear or finish waiting if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.read_depth_stencil) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
+   }
+
+   /* XXX this loop could be moved into the above switch cases... */
+   
+   /* Setup for mask calculation */
+   /* Rearrange the span bounds so that each element lines up with one quad
+    * pixel: {l0,l0,l1,l1} and {r0,r0,r1,r1}.
+    */
+   const vec_int4 quad_LlRr = setup.span.quad;
+   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
+   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
+   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
+
+   const vec_int4 twos = spu_splats(2);
+
+   /* X coordinate of each quad pixel, starting at the first quad column */
+   const int x = block(minleft);
+   vec_int4 xs = {x, x+1, x, x+1};
+
+   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
+      /**
+       * Computes mask to indicate which pixels in the 2x2 quad are actually
+       * inside the triangle's bounds.
+       */
+      
+      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
+      /* (x >= l) is computed as NOT(l > x); nand of a value with itself is NOT */
+      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
+      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
+      
+      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
+      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
+
+      /* Combine results to create mask */
+      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
+
+      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
+   }
+
+   /* reset the span accumulator for the next quad row */
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   /* Zero right elements */
+   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
+}
+
+
+#if DEBUG_VERTS
+/**
+ * Debug helper: dump every attribute of a vertex to stderr.
+ * Only compiled when DEBUG_VERTS is non-zero.
+ * \param v  vertex to print
+ */
+static void
+print_vertex(const struct vertex_header *v)
+{
+   uint i;
+   /* %p requires a pointer to void */
+   fprintf(stderr, "  Vertex: (%p)\n", (const void *) v);
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      /* i is unsigned, so it is printed with %u rather than %d */
+      fprintf(stderr, "    %u: %f %f %f %f\n",  i, 
+              spu_extract(v->data[i], 0),
+              spu_extract(v->data[i], 1),
+              spu_extract(v->data[i], 2),
+              spu_extract(v->data[i], 3));
+   }
+}
+#endif
+
+/* Slot-wise minimum of two vec_float4s passed as qwords.
+ * i.e. return[n] = min(q0[n],q1[n]);
+ */
+static qword
+minfq(qword q0, qword q1)
+{
+   /* all ones in every slot where q0 > q1, i.e. where q1 is the smaller */
+   const qword pick_q1 = si_fcgt(q0, q1);
+
+   return si_selb(q0, q1, pick_q1);
+}
+
+/* Slot-wise minimum of three vec_float4s passed as qwords.
+ * i.e. return[n] = min(q0[n],q1[n],q2[n]);
+ */
+static qword
+min3fq(qword q0, qword q1, qword q2)
+{
+   const qword min01 = minfq(q0, q1);
+
+   return minfq(min01, q2);
+}
+
+/* Slot-wise maximum of two vec_float4s passed as qwords.
+ * i.e. return[n] = max(q0[n],q1[n]);
+ */
+static qword
+maxfq(qword q0, qword q1)
+{
+   /* all ones in every slot where q0 > q1, i.e. where q0 is the larger */
+   const qword pick_q0 = si_fcgt(q0, q1);
+
+   return si_selb(q1, q0, pick_q0);
+}
+
+/* Slot-wise maximum of three vec_float4s passed as qwords.
+ * i.e. return[n] = max(q0[n],q1[n],q2[n]);
+ */
+static qword
+max3fq(qword q0, qword q1, qword q2)
+{
+   const qword max01 = maxfq(q0, q1);
+
+   return maxfq(max01, q2);
+}
+
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \param vs  qword whose first three word slots hold the three
+ *            vertex_header pointers of the triangle
+ * \return  FALSE if tri is totally outside tile, or if no ordering of the
+ *          vertices can be determined (all three Y values equal, or a NaN
+ *          Y value); TRUE otherwise
+ */
+static boolean
+setup_sort_vertices(const qword vs)
+{
+   float area, sign;
+
+#if DEBUG_VERTS
+   if (spu.init.id==0) {
+      /* unpack the three vertex pointers from the first three slots of vs */
+      const struct vertex_header *v0 = (const struct vertex_header*)si_to_ptr(vs);
+      const struct vertex_header *v1 = (const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4));
+      const struct vertex_header *v2 = (const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8));
+
+      fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
+      print_vertex(v0);
+      print_vertex(v1);
+      print_vertex(v2);
+   }
+#endif
+
+   {
+      /* Load the float values for various processing... */
+      const qword f0 = (qword)(((const struct vertex_header*)si_to_ptr(vs))->data[0]);
+      const qword f1 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0]);
+      const qword f2 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0]);
+
+      /* Check if triangle is completely outside the tile bounds
+       * Find the min and max x and y positions of the three points */
+      const qword minf = min3fq(f0, f1, f2);
+      const qword maxf = max3fq(f0, f1, f2);
+
+      /* Compare min and max against cliprect vals */
+      const qword maxsmins = si_shufb(maxf, minf, SHUFB4(A,B,a,b));
+      const qword outside = si_fcgt(maxsmins, si_csflt(setup.cliprect, 0));
+
+      /* Use a little magic to work out if the tri is visible or not:
+       * visible means max > clip min in x and y (bits 11..) and
+       * NOT min > clip max in x and y (bits ..00), i.e. exactly 0xc.
+       */
+      if(si_to_uint(si_xori(si_gb(outside), 0xc))) return FALSE;
+
+      /* determine bottom to top order of vertices */
+      /* A table of shuffle patterns for putting vertex_header pointers into
+         correct order.  Quite magical. */
+      const qword sort_order_patterns[] = {
+         SHUFB4(A,B,C,C),
+         SHUFB4(C,A,B,C),
+         SHUFB4(A,C,B,C),
+         SHUFB4(B,C,A,C),
+         SHUFB4(B,A,C,C),
+         SHUFB4(C,B,A,C) };
+
+      /* Collate y values into two vectors for comparison.
+         Using only one shuffle constant! ;) */
+      const qword y_02_ = si_shufb(f0, f2, SHUFB4(0,B,b,C));
+      const qword y_10_ = si_shufb(f1, f0, SHUFB4(0,B,b,C));
+      const qword y_012 = si_shufb(y_02_, f1, SHUFB4(0,B,b,C));
+      const qword y_120 = si_shufb(y_10_, f2, SHUFB4(0,B,b,C));
+
+      /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
+      const qword compare = si_fcgt(y_012, y_120);
+      /* Compress the result of the comparison into 4 bits */
+      const qword gather = si_gb(compare);
+      /* Subtract one to attain the index into the LUT.  Magical. */
+      const unsigned int index = si_to_uint(gather) - 1;
+
+      /* If none of the comparisons is true (y0 == y1 == y2, or NaNs) the
+       * gathered value is 0 and index wraps to UINT_MAX.  Such a triangle
+       * has no height and covers nothing, so reject it instead of reading
+       * outside the pattern table.
+       */
+      if (index >= sizeof(sort_order_patterns) / sizeof(sort_order_patterns[0]))
+         return FALSE;
+
+      /* Load the appropriate pattern and construct the desired vector. */
+      setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]);
+
+      /* Using the result of the comparison, set sign.
+         Very magical. */
+      sign = ((si_to_uint(si_cntb(gather)) == 2) ? 1.0f : -1.0f);
+   }
+
+   /* edge deltas; only the x/y elements (dx, dy) are used afterwards */
+   setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
+   setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
+   setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
+
+   /*
+    * Compute triangle's area.  Use 1/area to compute partial
+    * derivatives of attributes later.
+    */
+   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
+
+   setup.oneOverArea = 1.0f / area;
+
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
+    */
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
+   setup.facing = (area * sign > 0.0f)
+      ^ (!spu.rasterizer.front_ccw);
+
+   return TRUE;
+}
+
+
+/**
+ * Set up a constant-valued coefficient (GL_FLAT shading).
+ * The value comes from the provoking vertex, vprovoke->data[slot].
+ * It is stored in setup.coef[slot].a0 and both gradients are zeroed.
+ * \param slot  which attribute slot 
+ */
+static INLINE void
+const_coeff4(uint slot)
+{
+   const vector float zero = (vector float) {0.0, 0.0, 0.0, 0.0};
+   struct interp_coef *coef = &setup.coef[slot];
+
+   coef->dadx = zero;
+   coef->dady = zero;
+   coef->a0 = setup.vprovoke->data[slot];
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated attribute,
+ * for all four vector components at once.
+ * Reads the attribute at vmin/vmid/vmax plus the edge deltas and
+ * oneOverArea prepared by setup_sort_vertices(); writes setup.coef[slot].
+ * \param slot  which attribute slot
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   /* position of vmin, offset by half a pixel, splatted for vector math */
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   /* attribute deltas along the bottom and major edges */
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   /* un-normalized gradients; scaled by 1/area below */
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
+                         
+   /* back the value at vmin out to the origin: a0 = v - (dadx*x + dady*y) */
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ * NOTE(review): the code multiplies by the 4th position component as-is,
+ * so that component presumably already holds 1/w -- confirm.
+ * \param slot  which attribute slot
+ */
+static void
+tri_persp_coeff4(uint slot)
+{
+   /* position of vmin, offset by half a pixel, splatted for vector math */
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   /* 4th position component of each vertex, splatted */
+   const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+   const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+   const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
+
+   vector float vmin_d = setup.vmin->data[slot];
+   vector float vmid_d = setup.vmid->data[slot];
+   vector float vmax_d = setup.vmax->data[slot];
+
+   /* pre-scale the attribute at each vertex */
+   vmin_d = spu_mul(vmin_d, vmin_w);
+   vmid_d = spu_mul(vmid_d, vmid_w);
+   vmax_d = spu_mul(vmax_d, vmax_w);
+
+   /* from here on this is the same plane setup as tri_linear_coeff4() */
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
+                         
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
+/**
+ * Fill in setup.coef[] (a0, dadx, dady) for every attribute slot,
+ * dispatching on the slot's interpolation mode.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
+ */
+static void
+setup_tri_coefficients(void)
+{
+   uint slot;
+
+   for (slot = 0; slot < spu.vertex_info.num_attribs; slot++) {
+      const enum interp_mode mode = spu.vertex_info.attrib[slot].interp_mode;
+
+      if (mode == INTERP_NONE) {
+         /* nothing to set up for this slot */
+      }
+      else if (mode == INTERP_CONSTANT) {
+         const_coeff4(slot);
+      }
+      else if (mode == INTERP_POS || mode == INTERP_LINEAR) {
+         /* position is interpolated linearly as well */
+         tri_linear_coeff4(slot);
+      }
+      else if (mode == INTERP_PERSPECTIVE) {
+         tri_persp_coeff4(slot);
+      }
+      else {
+         /* unknown interpolation mode */
+         ASSERT(0);
+      }
+   }
+}
+
+
+/**
+ * Compute the per-edge rasterization state (first sample row sy, number
+ * of lines, slope dxdy and starting X sx) for the major, top and bottom
+ * edges, from the sorted vertices and the dx/dy deltas already stored in
+ * each edge.
+ */
+static void
+setup_tri_edges(void)
+{
+   /* X is biased by +0.5 and Y by -0.5 before sampling */
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
+
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
+
+   /* NOTE(review): each dxdy below divides by the edge's dy with no check
+    * for dy == 0 (a horizontal edge) -- confirm this is tolerated because
+    * such an edge ends up with zero lines.
+    */
+
+   /* major edge: vmin -> vmax */
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
+
+   /* top edge: vmid -> vmax */
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
+
+   /* bottom edge: vmin -> vmid */
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
+}
+
+
+/**
+ * Render the upper or lower half of a triangle.
+ * Scissoring/cliprect is applied here too.
+ * \param eleft   edge bounding the half on the left
+ * \param eright  edge bounding the half on the right
+ * \param lines   number of scanlines in this half
+ * Both edges are advanced by 'lines' on return so that the shared major
+ * edge can be continued by the next call.
+ * NOTE(review): callers pass struct edge's 'lines' (an int) for the
+ * unsigned 'lines' parameter; a negative count would wrap -- confirm it
+ * cannot occur.
+ */
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
+{
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
+   int y, start_y, finish_y;
+   int sy = (int)eleft->sy;
+
+   ASSERT((int)eleft->sy == (int) eright->sy);
+
+   /* clip top/bottom */
+   start_y = sy;
+   finish_y = sy + lines;
+
+   if (start_y < miny)
+      start_y = miny;
+
+   if (finish_y > maxy)
+      finish_y = maxy;
+
+   /* make the clipped range relative to the edges' starting row */
+   start_y -= sy;
+   finish_y -= sy;
+
+   /*
+   printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
+   */
+
+   for (y = start_y; y < finish_y; y++) {
+
+      /* avoid accumulating adds as floats don't have the precision to
+       * accurately iterate large triangle edges that way.  luckily we
+       * can just multiply these days.
+       *
+       * this is all drowned out by the attribute interpolation anyway.
+       */
+      int left = (int)(eleft->sx + y * eleft->dxdy);
+      int right = (int)(eright->sx + y * eright->dxdy);
+
+      /* clip left/right */
+      if (left < minx)
+         left = minx;
+      if (right > maxx)
+         right = maxx;
+
+      if (left < right) {
+         int _y = sy + y;
+         /* starting a new quad row: emit whatever is buffered first */
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
+         }
+
+         /* 0 for the even (top) line of the quad row, 1 for the odd line */
+         int offset = _y&1;
+         vec_int4 quad_LlRr = {left, left, right, right};
+         /* Store left and right in 0 or 1 row of quad based on offset */
+         setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
+         setup.span.y_flags |= 1<<offset;
+      }
+   }
+
+
+   /* save the values so that emaj can be restarted:
+    */
+   eleft->sx += lines * eleft->dxdy;
+   eright->sx += lines * eright->dxdy;
+   eleft->sy += lines;
+   eright->sy += lines;
+}
+
+
+/**
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
+ * \param vs  qword holding the three vertex_header pointers of the triangle
+ * \param tx  tile X, in tile units
+ * \param ty  tile Y, in tile units
+ * \return  FALSE if setup_sort_vertices() rejected the triangle (totally
+ *          clipped against the tile), TRUE if it was rasterized
+ */
+boolean
+tri_draw(const qword vs,
+         uint tx, uint ty)
+{
+   setup.tx = tx;
+   setup.ty = ty;
+
+   /* set clipping bounds to tile bounds */
+   /* cliprect = {tx*TILE_SIZE, ty*TILE_SIZE, min x + TILE_SIZE, min y + TILE_SIZE} */
+   const qword clipbase = (qword)((vec_uint4){tx, ty});
+   const qword clipmin = si_mpyui(clipbase, TILE_SIZE);
+   const qword clipmax = si_ai(clipmin, TILE_SIZE);
+   setup.cliprect = si_shufb(clipmin, clipmax, SHUFB4(A,B,a,b));
+
+   if(!setup_sort_vertices(vs)) {
+      return FALSE; /* totally clipped */
+   }
+
+   setup_tri_coefficients();
+   setup_tri_edges();
+
+   /* start with an empty span accumulator */
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   /* Zero right elements */
+   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
+
+   /* the sign of the area tells which side the major edge is on */
+   if (setup.oneOverArea < 0.0) {
+      /* emaj on left */
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
+   }
+   else {
+      /* emaj on right */
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
+   }
+
+   /* emit the last buffered quad row */
+   flush_spans();
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
new file mode 100644
index 0000000000..82e3b19ad7
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_TRI_H
+#define SPU_TRI_H
+
+
+/**
+ * Rasterize one triangle into the tile at tile coordinates (tx, ty).
+ * \param vs  qword holding the triangle's three vertex_header pointers
+ * \return  FALSE if the triangle was rejected before rasterization,
+ *          TRUE otherwise
+ */
+extern boolean
+tri_draw(const qword vs, uint tx, uint ty);
+
+
+#endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
new file mode 100644
index 0000000000..24057e29e3
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -0,0 +1,77 @@
+
+#include "cell/common.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "tgsi/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "tgsi/tgsi_util.h"
+
+/**
+ * Return the swizzle selected for one component of a source register.
+ * \param reg        source register to inspect
+ * \param component  0..3, selecting SwizzleX/Y/Z/W respectively
+ * \return  the swizzle field; 0 (after asserting) for any other component
+ */
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   unsigned swizzle = 0;
+
+   if( component == 0 ) {
+      swizzle = reg->SwizzleX;
+   }
+   else if( component == 1 ) {
+      swizzle = reg->SwizzleY;
+   }
+   else if( component == 2 ) {
+      swizzle = reg->SwizzleZ;
+   }
+   else if( component == 3 ) {
+      swizzle = reg->SwizzleW;
+   }
+   else {
+      /* component out of range */
+      ASSERT( 0 );
+   }
+
+   return swizzle;
+}
+
+
+/**
+ * Return the swizzle selected for one component of a full source
+ * register, by delegating to tgsi_util_get_src_register_swizzle() on its
+ * embedded 'Register' member.
+ * \param reg        full source register to inspect
+ * \param component  0..3 (X, Y, Z, W)
+ */
+unsigned
+tgsi_util_get_full_src_register_swizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   /* 'Register' is an embedded struct (it is accessed as
+    * reg->Register.Negate elsewhere in this file), while the callee takes
+    * a pointer, so its address must be passed.
+    */
+   return tgsi_util_get_src_register_swizzle(
+      &reg->Register,
+      component );
+}
+
+
+/**
+ * Work out how the sign of a full source register's value is affected by
+ * its modifiers.
+ * \param reg        full source register to inspect
+ * \param component  not used by this implementation
+ * \return  TGSI_UTIL_SIGN_SET or TGSI_UTIL_SIGN_CLEAR when the absolute
+ *          value is taken; otherwise TGSI_UTIL_SIGN_TOGGLE or
+ *          TGSI_UTIL_SIGN_KEEP
+ */
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned negate;
+
+   if( reg->RegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+      return reg->RegisterExtMod.Negate
+         ? TGSI_UTIL_SIGN_SET
+         : TGSI_UTIL_SIGN_CLEAR;
+   }
+
+   /* Accumulate the two negations (register and extended modifier). */
+   negate = reg->Register.Negate;
+   if( reg->RegisterExtMod.Negate ) {
+      negate = !negate;
+   }
+
+   return negate ? TGSI_UTIL_SIGN_TOGGLE : TGSI_UTIL_SIGN_KEEP;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..087963960d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,146 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+#include "spu_main.h"
+#include "spu_dcache.h"
+/* Signature used to call the routine at vertex_fetch.code + code_offset[attr]. */
+typedef void (*spu_fetch_func)(qword *out, const qword *in,
+			       const qword *shuffle_data);
+
+/* Shuffle patterns handed to every per-attribute fetch routine as its third argument. */
+PIPE_ALIGN_VAR(16) static const qword
+fetch_shuffle_data[5] = {
+   /* Shuffle used by CVT_64_FLOAT
+    */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+   },
+
+   /* Shuffle used by CVT_8_USCALED and CVT_8_SSCALED
+    */
+   {
+      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
+      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
+   },
+   
+   /* Shuffle used by CVT_16_USCALED and CVT_16_SSCALED
+    */
+   {
+      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
+      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
+   },
+   
+   /* High value shuffle used by trans4x4.
+    */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17
+   },
+
+   /* Low value shuffle used by trans4x4.
+    */
+   {
+      0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+      0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F
+   }
+};
+
+
+/**
+ * Fetch all vertex attributes for 'count' (at most four) vertices.
+ */
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   ASSERT(count <= 4);
+
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = (spu_fetch_func)
+	  (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]);
+      unsigned i;
+      unsigned idx;
+      const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
+      const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
+      PIPE_ALIGN_VAR(16) qword in[2 * 4];
+
+      /* 'in' holds two quadwords per vertex; a larger entry would overrun
+       * it and underflow the memset length below. */
+      ASSERT(quads_per_entry <= 2);
+
+      /* Fetch this attribute for each of the 'count' vertices.
+       */
+      idx = 0;
+      for (i = 0; i < count; i++) {
+         const uint64_t addr = src + (elts[i] * pitch);
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", (unsigned long long) addr);
+#endif
+         spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         idx += quads_per_entry;
+      }
+
+      /* Be nice and zero out any missing vertices.
+       */
+      (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword));
+
+      /* Convert all 4 vertices to vectors of float.
+       */
+      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data);
+   }
+}
+
+/* Install generic_vertex_fetch as this context's fetch function. */
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..3e9804bf8e
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,245 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include <spu_mfcio.h>
+
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "draw/draw_private.h"
+#include "draw/draw_context.h"
+#include "cell/common.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "spu_main.h"
+
+/* Byte size of (2 + PIPE_MAX_SHADER_OUTPUTS) four-float vectors; sizes the staging buffer. */
+#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
+
+/* Clip mask bits set by compute_clipmask() for the six hardwired planes. */
+#define CLIP_RIGHT_BIT 0x01
+#define CLIP_LEFT_BIT 0x02
+#define CLIP_TOP_BIT 0x04
+#define CLIP_BOTTOM_BIT 0x08
+#define CLIP_FAR_BIT 0x10
+#define CLIP_NEAR_BIT 0x20
+
+/* Four-component dot product of a and b, summed in index order. */
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   float sum = a[0] * b[0];
+   sum += a[1] * b[1];
+   sum += a[2] * b[2];
+   return sum + a[3] * b[3];
+}
+
+/* Clip mask for one position: bits 0..5 fixed planes, bit i for plane[i], 6 <= i < nr. */
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   const float cx = clip[0], cy = clip[1], cz = clip[2], cw = clip[3];
+   unsigned bits = 0;
+   unsigned p = 6;
+
+   /* Fixed planes: flag each one whose signed distance is negative. */
+   bits |= (-cx + cw < 0) ? CLIP_RIGHT_BIT : 0;
+   bits |= ( cx + cw < 0) ? CLIP_LEFT_BIT : 0;
+   bits |= (-cy + cw < 0) ? CLIP_TOP_BIT : 0;
+   bits |= ( cy + cw < 0) ? CLIP_BOTTOM_BIT : 0;
+   bits |= (-cz + cw < 0) ? CLIP_FAR_BIT : 0;
+   bits |= ( cz + cw < 0) ? CLIP_NEAR_BIT : 0;
+
+   /* Additional planes start at index 6 and use bit p of the mask. */
+   while (p < nr) {
+      if (dot4(clip, plane[p]) < 0)
+         bits |= (1<<p);
+      p++;
+   }
+   return bits;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param draw  vertex shader context (machine, viewport, clip planes)
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of addresses of the four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   const uint64_t *vOut)
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   PIPE_ALIGN_VAR(16) struct spu_exec_vector inputs[PIPE_MAX_ATTRIBS];
+   PIPE_ALIGN_VAR(16) struct spu_exec_vector outputs[PIPE_MAX_ATTRIBS];
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   ASSERT(count <= 4);
+
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = inputs;
+   machine->Outputs = outputs;
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+      PIPE_ALIGN_VAR(16)
+      unsigned char buffer[sizeof(struct vertex_header)
+          + MAX_VERTEX_SIZE];
+      struct vertex_header *const tmpOut =
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
+					   draw->nr_planes);
+      tmpOut->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      /* Let the put finish before 'buffer' is reused or leaves scope. */
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   } /* loop over vertices */
+}
+
+/* Staging area for shader immediates; 32 spare bytes cover source misalignment and round-up. */
+PIPE_ALIGN_VAR(16) unsigned char
+immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32];
+
+/* Bind shader 'vs' to 'draw': copy its immediates in and point the machine at its code. */
+void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs)
+{
+   const unsigned immediate_addr = vs->immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->num_immediates)
+		 + (immediate_addr & 0x0f));
+   /* The get starts at the 16-byte boundary at or below immediate_addr, so
+    * the size above also counts the leading (immediate_addr & 0x0f) bytes. */
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->instructions;
+   draw->machine.NumInstructions = vs->num_instructions;
+
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->declarations;
+   draw->machine.NumDeclarations = vs->num_declarations;
+
+   draw->num_vs_outputs = vs->num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine,
+			 PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/,
+			 PIPE_SHADER_VERTEX);
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   /* Skip the alignment padding at the front of the staging buffer. */
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->num_immediates);
+}
+
+/* Shade vs->num_elts vertices in batches of up to four using the bound shader. */
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i;
+
+   /* draw->plane has a fixed number of rows; the copy must not run past it. */
+   ASSERT(vs->nr_planes <= sizeof(draw->plane) / sizeof(draw->plane[0]));
+   (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes);
+   draw->nr_planes = vs->nr_planes;
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..4c74f5e74d
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,66 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "cell/common.h"
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+/* Type of spu_vs_context::vertex_fetch.fetch_func, called by spu_vertex_fetch(). */
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+/* State used to fetch, shade and emit vertices. */
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;
+   /* Per-attribute fetch parameters; arrays are indexed by attribute number. */
+   struct {
+      uint64_t src_ptr[PIPE_MAX_ATTRIBS];
+      unsigned pitch[PIPE_MAX_ATTRIBS];
+      unsigned size[PIPE_MAX_ATTRIBS];
+      unsigned code_offset[PIPE_MAX_ATTRIBS];
+      unsigned nr_attrs;
+      boolean dirty;
+      /* fetch_func is the entry point; 'code' is the base that code_offset[] is added to. */
+      spu_full_fetch_func fetch_func;
+      void *code;
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   struct spu_exec_machine machine;
+   const float (*constants)[4];
+   /* Number of shader outputs; set from the bound shader's num_outputs. */
+   unsigned num_vs_outputs;
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+/* Run the context's fetch function, refreshing it first when the fetch state is dirty. */
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+				    struct spu_exec_machine *machine,
+				    const unsigned *elts,
+				    unsigned count)
+{
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+   
+   (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs);
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */
diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile
new file mode 100644
index 0000000000..dfb7f5dcf6
--- /dev/null
+++ b/src/gallium/drivers/failover/Makefile
@@ -0,0 +1,11 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = failover
+# Driver sources; the SConscript in this directory lists the same files.
+C_SOURCES = \
+	fo_state.c \
+	fo_state_emit.c \
+	fo_context.c 
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/failover/SConscript b/src/gallium/drivers/failover/SConscript
new file mode 100644
index 0000000000..f8e9b1b491
--- /dev/null
+++ b/src/gallium/drivers/failover/SConscript
@@ -0,0 +1,13 @@
+Import('*')
+
+env = env.Clone()
+# Failover driver sources; same list as C_SOURCES in the Makefile.
+failover = env.ConvenienceLibrary(
+	target = 'failover',
+	source = [
+		'fo_state.c',
+		'fo_state_emit.c',
+		'fo_context.c',
+	])
+
+Export('failover')
diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c
new file mode 100644
index 0000000000..9c9c1bdc45
--- /dev/null
+++ b/src/gallium/drivers/failover/fo_context.c
@@ -0,0 +1,177 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "pipe/p_context.h"
+
+#include "fo_context.h"
+#include "fo_winsys.h"
+
+
+/* Destroy hook: frees only the wrapper; the hw and sw contexts are not destroyed here. */
+static void failover_destroy( struct pipe_context *pipe )
+{
+   struct failover_context *failover = failover_context( pipe );
+
+   FREE( failover );
+}
+
+/* Select the software path and set dirty; the draw path then flushes hw and emits state. */
+void failover_fail_over( struct failover_context *failover )
+{
+   failover->dirty = TRUE;
+   failover->mode = FO_SW;
+}
+
+/* Draw through the hardware context, then through the software context if mode is FO_SW. */
+static void failover_draw_elements( struct pipe_context *pipe,
+                                    struct pipe_resource *indexResource,
+                                    unsigned indexSize,
+                                    int indexBias,
+                                    unsigned prim, 
+                                    unsigned start, 
+                                    unsigned count)
+{
+   struct failover_context *failover = failover_context( pipe );
+
+   /* If there has been any statechange since last time, try hardware
+    * rendering again:
+    */
+   if (failover->dirty) {
+      failover->mode = FO_HW;
+   }
+
+   /* Try hardware:
+    */
+   if (failover->mode == FO_HW) {
+      failover->hw->draw_elements( failover->hw, 
+                                   indexResource, 
+                                   indexSize, 
+                                   indexBias,
+                                   prim, 
+                                   start, 
+                                   count );
+   }
+
+   /* Possibly try software:
+    */
+   if (failover->mode == FO_SW) {
+      /* dirty set here means mode flipped to FO_SW during the hardware call above. */
+      if (failover->dirty) {
+         failover->hw->flush( failover->hw, ~0, NULL );
+	 failover_state_emit( failover );
+      }
+
+      failover->sw->draw_elements( failover->sw, 
+				   indexResource, 
+				   indexSize, 
+				   indexBias,
+				   prim, 
+				   start, 
+				   count );
+
+      /* Be ready to switch back to hardware rendering without an
+       * intervening flush.  Unlikely to be much performance impact to
+       * this:
+       */
+      failover->sw->flush( failover->sw, ~0, NULL );
+   }
+}
+
+/* Non-indexed draw: forwards to failover_draw_elements() with no index resource. */
+static void failover_draw_arrays( struct pipe_context *pipe,
+				     unsigned prim, unsigned start, unsigned count)
+{
+   failover_draw_elements(pipe, NULL, 0, 0, prim, start, count);
+}
+
+/* Ask the context selected by the current mode whether it references 'resource'. */
+static unsigned int
+failover_is_resource_referenced( struct pipe_context *_pipe,
+				 struct pipe_resource *resource,
+				 unsigned face, unsigned level)
+{
+   struct failover_context *failover = failover_context( _pipe );
+   if (failover->mode == FO_HW)
+      return failover->hw->is_resource_referenced(failover->hw, resource, face, level);
+   return failover->sw->is_resource_referenced(failover->sw, resource, face, level);
+}
+/* Create a failover context wrapping 'hw' and 'sw'; returns NULL if allocation fails. */
+struct pipe_context *failover_create( struct pipe_context *hw,
+				      struct pipe_context *sw )
+{
+   struct failover_context *failover = CALLOC_STRUCT(failover_context);
+   if (failover == NULL)
+      return NULL;
+
+   failover->hw = hw;
+   failover->sw = sw;
+   failover->pipe.winsys = hw->winsys;
+   failover->pipe.screen = hw->screen;
+   failover->pipe.destroy = failover_destroy;
+#if 0
+   failover->pipe.is_format_supported = hw->is_format_supported;
+   failover->pipe.get_name = hw->get_name;
+   failover->pipe.get_vendor = hw->get_vendor;
+   failover->pipe.get_param = hw->get_param;
+   failover->pipe.get_paramf = hw->get_paramf;
+#endif
+   /* NOTE(review): hw entry points below are installed unwrapped -- confirm hw accepts this context. */
+   failover->pipe.draw_arrays = failover_draw_arrays;
+   failover->pipe.draw_elements = failover_draw_elements;
+   failover->pipe.clear = hw->clear;
+   failover->pipe.clear_render_target = hw->clear_render_target;
+   failover->pipe.clear_depth_stencil = hw->clear_depth_stencil;
+
+   /* No software occlusion fallback (or other optional functionality)
+    * at this point - if the hardware doesn't support it, don't
+    * advertise it to the application.
+    */
+   failover->pipe.begin_query = hw->begin_query;
+   failover->pipe.end_query = hw->end_query;
+
+   failover_init_state_functions( failover );
+
+   failover->pipe.resource_copy_region = hw->resource_copy_region;
+
+#if 0
+   failover->pipe.texture_create = hw->texture_create;
+   failover->pipe.texture_destroy = hw->texture_destroy;
+   failover->pipe.get_tex_surface = hw->get_tex_surface;
+   failover->pipe.texture_update = hw->texture_update;
+#endif
+
+   failover->pipe.flush = hw->flush;
+   failover->pipe.is_resource_referenced = failover_is_resource_referenced;
+
+   failover->dirty = 0;
+   /* mode is not assigned here; presumably CALLOC_STRUCT leaves it 0 (FO_HW) -- confirm. */
+   return &failover->pipe;
+}
+
diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h
new file mode 100644
index 0000000000..9d3e0d0dba
--- /dev/null
+++ b/src/gallium/drivers/failover/fo_context.h
@@ -0,0 +1,142 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef FO_CONTEXT_H
+#define FO_CONTEXT_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+
+
+/* State-change flags, one bit each (0x1000 is unassigned); presumably OR-ed into 'dirty' -- confirm. */
+#define FO_NEW_VIEWPORT        0x1
+#define FO_NEW_RASTERIZER      0x2
+#define FO_NEW_FRAGMENT_SHADER 0x4
+#define FO_NEW_BLEND           0x8
+#define FO_NEW_CLIP            0x10
+#define FO_NEW_SCISSOR         0x20
+#define FO_NEW_STIPPLE         0x40
+#define FO_NEW_FRAMEBUFFER     0x80
+#define FO_NEW_ALPHA_TEST      0x100
+#define FO_NEW_DEPTH_STENCIL   0x200
+#define FO_NEW_SAMPLER         0x400
+#define FO_NEW_SAMPLER_VIEW    0x800
+#define FO_NEW_VERTEX          0x2000
+#define FO_NEW_VERTEX_SHADER   0x4000
+#define FO_NEW_BLEND_COLOR     0x8000
+#define FO_NEW_STENCIL_REF     0x10000
+#define FO_NEW_CLEAR_COLOR     0x20000
+#define FO_NEW_VERTEX_BUFFER   0x40000
+#define FO_NEW_VERTEX_ELEMENT  0x80000
+#define FO_NEW_SAMPLE_MASK     0x100000
+
+
+/* Values of failover_context::mode: which wrapped context draws. */
+#define FO_HW 0
+#define FO_SW 1
+/* One state object held as two opaque pointers, one per backend (sw and hw). */
+struct fo_state {
+   void *sw_state;
+   void *hw_state;
+};
+/* Sampler view wrapper: 'base' plus one view pointer per backend (sw and hw). */
+struct fo_sampler_view {
+   struct pipe_sampler_view base;
+   struct pipe_sampler_view *sw;
+   struct pipe_sampler_view *hw;
+};
+/* Context that wraps a hardware and a software pipe_context and chooses between them per draw. */
+struct failover_context {
+   struct pipe_context pipe;  /**< base class */
+
+
+   /* The most recent drawing state as set by the driver:
+    */
+   const struct fo_state     *blend;
+   const struct fo_state     *sampler[PIPE_MAX_SAMPLERS];
+   const struct fo_state     *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
+   const struct fo_state     *depth_stencil;
+   const struct fo_state     *rasterizer;
+   const struct fo_state     *fragment_shader;
+   const struct fo_state     *vertex_shader;
+   const struct fo_state     *vertex_elements;
+
+   struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_clip_state clip;
+   unsigned sample_mask;
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
+
+   uint num_vertex_buffers;
+
+   void *sw_sampler_state[PIPE_MAX_SAMPLERS];
+   void *hw_sampler_state[PIPE_MAX_SAMPLERS];
+   void *sw_vertex_sampler_state[PIPE_MAX_VERTEX_SAMPLERS];
+   void *hw_vertex_sampler_state[PIPE_MAX_VERTEX_SAMPLERS];
+
+   struct fo_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+   struct fo_sampler_view *vertex_sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned num_fragment_sampler_views;
+   unsigned num_vertex_sampler_views;
+   /* Tested as a boolean by the draw path; presumably holds FO_NEW_* bits -- confirm. */
+   unsigned dirty;
+
+   unsigned num_samplers;
+   unsigned num_vertex_samplers;
+   /* mode is FO_HW or FO_SW; hw and sw are the two wrapped contexts. */
+   unsigned mode;
+   struct pipe_context *hw;
+   struct pipe_context *sw;
+};
+
+
+
+void failover_init_state_functions( struct failover_context *failover );
+void failover_state_emit( struct failover_context *failover );
+/* Downcast to failover_context; 'pipe' is the first member of that struct. */
+static INLINE struct failover_context *
+failover_context( struct pipe_context *pipe )
+{
+   return (struct failover_context *)pipe;
+}
+
+/* Internal functions
+ */
+void
+failover_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             struct pipe_resource *resource);
+
+
+#endif /* FO_CONTEXT_H */
diff --git a/src/gallium/drivers/failover/fo_state.c b/src/gallium/drivers/failover/fo_state.c
new file mode 100644
index 0000000000..12e42379f9
--- /dev/null
+++ b/src/gallium/drivers/failover/fo_state.c
@@ -0,0 +1,641 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+#include "fo_context.h"
+
+
+/* This looks like a lot of work at the moment - we're keeping a
+ * duplicate copy of the state up-to-date.  
+ *
+ * This can change in two ways:
+ * - With constant state objects we would only need to save a pointer,
+ *     not the whole object.
+ * - By adding a callback in the state tracker to re-emit state.  The
+ *     state tracker knows the current state already and can re-emit it 
+ *     without additional complexity.
+ *
+ * This works as a proof-of-concept, but a final version will have
+ * lower overheads.
+ */
+
+
+
+static void *   /* returns the fo_state wrapper, or NULL if MALLOC fails */
+failover_create_blend_state( struct pipe_context *pipe,
+                             const struct pipe_blend_state *blend )
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* only touch the wrapper when the allocation succeeded */
+      state->sw_state = failover->sw->create_blend_state(failover->sw, blend);
+      state->hw_state = failover->hw->create_blend_state(failover->hw, blend);
+   }
+   return state;
+}
+
+/* Remember the blend CSO, flag it dirty, bind its halves on sw then hw. */
+static void
+failover_bind_blend_state( struct pipe_context *pipe, void *blend )
+{
+   struct fo_state *cso = blend;
+   struct failover_context *fo = failover_context(pipe);
+   fo->dirty |= FO_NEW_BLEND;
+   fo->blend = cso;
+   fo->sw->bind_blend_state(fo->sw, cso->sw_state);
+   fo->hw->bind_blend_state(fo->hw, cso->hw_state);
+}
+
+/* Destroy both halves of a blend CSO (sw first), then free the wrapper. */
+static void
+failover_delete_blend_state( struct pipe_context *pipe, void *blend )
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = blend;
+
+   fo->sw->delete_blend_state(fo->sw, cso->sw_state);
+   fo->hw->delete_blend_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+static void
+failover_set_blend_color( struct pipe_context *pipe,
+                          const struct pipe_blend_color *blend_color )
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Keep a private copy so failover_state_emit() can replay it later. */
+   fo->dirty |= FO_NEW_BLEND_COLOR;
+   fo->blend_color = *blend_color;
+   fo->sw->set_blend_color(fo->sw, blend_color);
+   fo->hw->set_blend_color(fo->hw, blend_color);
+}
+
+static void
+failover_set_stencil_ref( struct pipe_context *pipe,
+                          const struct pipe_stencil_ref *stencil_ref )
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Snapshot the value and flag it, then forward the caller's pointer. */
+   fo->dirty |= FO_NEW_STENCIL_REF;
+   fo->stencil_ref = *stencil_ref;
+   fo->sw->set_stencil_ref(fo->sw, stencil_ref);
+   fo->hw->set_stencil_ref(fo->hw, stencil_ref);
+}
+
+static void
+failover_set_clip_state( struct pipe_context *pipe,
+                         const struct pipe_clip_state *clip )
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Copy the clip state by value, mark it dirty, pass it to sw then hw. */
+   fo->dirty |= FO_NEW_CLIP;
+   fo->clip = *clip;
+   fo->sw->set_clip_state(fo->sw, clip);
+   fo->hw->set_clip_state(fo->hw, clip);
+}
+
+/* Store the sample mask, mark it dirty and hand it to sw, then hw. */
+static void
+failover_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+   struct failover_context *fo = failover_context(pipe);
+
+   fo->dirty |= FO_NEW_SAMPLE_MASK;
+   fo->sample_mask = sample_mask;
+
+   fo->sw->set_sample_mask(fo->sw, sample_mask);
+   fo->hw->set_sample_mask(fo->hw, sample_mask);
+}
+
+
+static void *   /* fo_state wrapper, or NULL if MALLOC fails */
+failover_create_depth_stencil_state(struct pipe_context *pipe,
+                              const struct pipe_depth_stencil_alpha_state *templ)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* skip the backend calls when there is no wrapper to fill */
+      state->sw_state = failover->sw->create_depth_stencil_alpha_state(failover->sw, templ);
+      state->hw_state = failover->hw->create_depth_stencil_alpha_state(failover->hw, templ);
+   }
+   return state;
+}
+
+/* Record the depth/stencil/alpha CSO and bind its halves on sw, then hw. */
+static void
+failover_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   struct fo_state *cso = depth_stencil;
+   struct failover_context *fo = failover_context(pipe);
+   fo->dirty |= FO_NEW_DEPTH_STENCIL;
+   fo->depth_stencil = cso;
+   fo->sw->bind_depth_stencil_alpha_state(fo->sw, cso->sw_state);
+   fo->hw->bind_depth_stencil_alpha_state(fo->hw, cso->hw_state);
+}
+
+/* Delete the sw and hw depth/stencil/alpha objects, then the wrapper. */
+static void
+failover_delete_depth_stencil_state(struct pipe_context *pipe, void *ds)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = ds;
+
+   fo->sw->delete_depth_stencil_alpha_state(fo->sw, cso->sw_state);
+   fo->hw->delete_depth_stencil_alpha_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+static void
+failover_set_framebuffer_state(struct pipe_context *pipe,
+                               const struct pipe_framebuffer_state *framebuffer)
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* NOTE(review): plain struct copy; no surface references are taken here. */
+   fo->dirty |= FO_NEW_FRAMEBUFFER;
+   fo->framebuffer = *framebuffer;
+   fo->sw->set_framebuffer_state(fo->sw, framebuffer);
+   fo->hw->set_framebuffer_state(fo->hw, framebuffer);
+}
+
+
+static void *   /* fo_state wrapper for a fragment shader, or NULL if MALLOC fails */
+failover_create_fs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* create the sw and hw shaders only when the wrapper exists */
+      state->sw_state = failover->sw->create_fs_state(failover->sw, templ);
+      state->hw_state = failover->hw->create_fs_state(failover->hw, templ);
+   }
+   return state;
+}
+
+static void   /* make fs current, flag it dirty, bind on sw then hw */
+failover_bind_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct fo_state *cso = fs;
+   struct failover_context *fo = failover_context(pipe);
+   fo->dirty |= FO_NEW_FRAGMENT_SHADER;
+   fo->fragment_shader = cso;
+   fo->sw->bind_fs_state(fo->sw, cso->sw_state);
+   fo->hw->bind_fs_state(fo->hw, cso->hw_state);
+}
+
+/* Delete the sw and hw fragment shaders, then free their wrapper. */
+static void
+failover_delete_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = fs;
+
+   fo->sw->delete_fs_state(fo->sw, cso->sw_state);
+   fo->hw->delete_fs_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+static void *   /* fo_state wrapper for a vertex shader, or NULL if MALLOC fails */
+failover_create_vs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* create the sw and hw shaders only when the wrapper exists */
+      state->sw_state = failover->sw->create_vs_state(failover->sw, templ);
+      state->hw_state = failover->hw->create_vs_state(failover->hw, templ);
+   }
+   return state;
+}
+
+/* Make vs the current vertex shader and bind its halves on sw, then hw. */
+static void
+failover_bind_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = vs;
+
+   fo->dirty |= FO_NEW_VERTEX_SHADER;
+   fo->vertex_shader = cso;
+   fo->sw->bind_vs_state(fo->sw, cso->sw_state);
+   fo->hw->bind_vs_state(fo->hw, cso->hw_state);
+}
+
+/* Delete the sw and hw vertex shaders, then free their wrapper. */
+static void
+failover_delete_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = vs;
+
+   fo->sw->delete_vs_state(fo->sw, cso->sw_state);
+   fo->hw->delete_vs_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+
+
+static void *   /* fo_state wrapper for vertex elements, or NULL if MALLOC fails */
+failover_create_vertex_elements_state( struct pipe_context *pipe,
+                                       unsigned count,
+                                       const struct pipe_vertex_element *velems )
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* no wrapper, no backend objects */
+      state->sw_state = failover->sw->create_vertex_elements_state(failover->sw, count, velems);
+      state->hw_state = failover->hw->create_vertex_elements_state(failover->hw, count, velems);
+   }
+   return state;
+}
+
+static void
+failover_bind_vertex_elements_state(struct pipe_context *pipe,
+                                    void *velems )
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = (struct fo_state*)velems;
+   /* Each backend must get its own CSO, never the fo_state wrapper itself. */
+   failover->vertex_elements = state;
+   failover->dirty |= FO_NEW_VERTEX_ELEMENT;
+   failover->sw->bind_vertex_elements_state( failover->sw, state->sw_state );
+   failover->hw->bind_vertex_elements_state( failover->hw, state->hw_state );
+}
+
+/* Delete the sw and hw vertex-element objects, then free their wrapper. */
+static void
+failover_delete_vertex_elements_state( struct pipe_context *pipe, void *velems )
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = velems;
+
+   fo->sw->delete_vertex_elements_state(fo->sw, cso->sw_state);
+   fo->hw->delete_vertex_elements_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+static void
+failover_set_polygon_stipple( struct pipe_context *pipe,
+                              const struct pipe_poly_stipple *stipple )
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Copy the pattern by value, flag it, forward the original to sw and hw. */
+   fo->dirty |= FO_NEW_STIPPLE;
+   fo->poly_stipple = *stipple;
+   fo->sw->set_polygon_stipple(fo->sw, stipple);
+   fo->hw->set_polygon_stipple(fo->hw, stipple);
+}
+
+
+static void *   /* fo_state wrapper for a rasterizer CSO, or NULL if MALLOC fails */
+failover_create_rasterizer_state(struct pipe_context *pipe,
+                                 const struct pipe_rasterizer_state *templ)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* fill the wrapper only when it was actually allocated */
+      state->sw_state = failover->sw->create_rasterizer_state(failover->sw, templ);
+      state->hw_state = failover->hw->create_rasterizer_state(failover->hw, templ);
+   }
+   return state;
+}
+
+/* Record the rasterizer CSO, flag it dirty, bind its halves on sw then hw. */
+static void
+failover_bind_rasterizer_state(struct pipe_context *pipe, void *raster)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = raster;
+
+   fo->dirty |= FO_NEW_RASTERIZER;
+   fo->rasterizer = cso;
+   fo->sw->bind_rasterizer_state(fo->sw, cso->sw_state);
+   fo->hw->bind_rasterizer_state(fo->hw, cso->hw_state);
+}
+
+/* Delete the sw and hw rasterizer objects, then free their wrapper. */
+static void
+failover_delete_rasterizer_state(struct pipe_context *pipe, void *raster)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = raster;
+
+   fo->sw->delete_rasterizer_state(fo->sw, cso->sw_state);
+   fo->hw->delete_rasterizer_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+
+static void
+failover_set_scissor_state( struct pipe_context *pipe,
+                            const struct pipe_scissor_state *scissor )
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Copy the rectangle by value, flag it, forward the original to sw and hw. */
+   fo->dirty |= FO_NEW_SCISSOR;
+   fo->scissor = *scissor;
+   fo->sw->set_scissor_state(fo->sw, scissor);
+   fo->hw->set_scissor_state(fo->hw, scissor);
+}
+
+
+static void *   /* fo_state wrapper for a sampler CSO, or NULL if MALLOC fails */
+failover_create_sampler_state(struct pipe_context *pipe,
+                              const struct pipe_sampler_state *templ)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state *state = MALLOC(sizeof(struct fo_state));
+   if (state) {   /* fill the wrapper only when it was actually allocated */
+      state->sw_state = failover->sw->create_sampler_state(failover->sw, templ);
+      state->hw_state = failover->hw->create_sampler_state(failover->hw, templ);
+   }
+   return state;
+}
+
+static void
+failover_bind_fragment_sampler_states(struct pipe_context *pipe,
+                                      unsigned num, void **sampler)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_state **state = (struct fo_state **)sampler; /* array of wrapper pointers */
+   uint i;
+   assert(num <= PIPE_MAX_SAMPLERS);
+   /* Check for no-op. NOTE(review): failover->sampler is not written here. */
+   if (num == failover->num_samplers &&
+       !memcmp(failover->sampler, sampler, num * sizeof(void *)))
+      return;
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      const struct fo_state *cso = i < num ? state[i] : NULL; /* NULL: empty slot */
+      failover->sw_sampler_state[i] = cso ? cso->sw_state : NULL;
+      failover->hw_sampler_state[i] = cso ? cso->hw_state : NULL;
+   }
+   failover->dirty |= FO_NEW_SAMPLER;
+   failover->num_samplers = num;
+   failover->sw->bind_fragment_sampler_states(failover->sw, num,
+                                              failover->sw_sampler_state);
+   failover->hw->bind_fragment_sampler_states(failover->hw, num,
+                                              failover->hw_sampler_state);
+}
+
+static void
+failover_bind_vertex_sampler_states(struct pipe_context *pipe,
+                                    unsigned num_samplers,
+                                    void **samplers)
+{
+   struct failover_context *failover = failover_context(pipe);
+   /* samplers[] is an array of num_samplers pointers to fo_state wrappers. */
+   struct fo_state **state = (struct fo_state **)samplers;
+   uint i;
+   assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+   /* Check for no-op. NOTE(review): failover->vertex_samplers is not written here. */
+   if (num_samplers == failover->num_vertex_samplers &&
+       !memcmp(failover->vertex_samplers, samplers, num_samplers * sizeof(void *))) {
+      return;
+   }
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      const struct fo_state *cso = i < num_samplers ? state[i] : NULL; /* NULL: empty slot */
+      failover->sw_vertex_sampler_state[i] = cso ? cso->sw_state : NULL;
+      failover->hw_vertex_sampler_state[i] = cso ? cso->hw_state : NULL;
+   }
+   failover->dirty |= FO_NEW_SAMPLER;
+   failover->num_vertex_samplers = num_samplers;
+   failover->sw->bind_vertex_sampler_states(failover->sw,
+                                            num_samplers,
+                                            failover->sw_vertex_sampler_state);
+   failover->hw->bind_vertex_sampler_states(failover->hw,
+                                            num_samplers,
+                                            failover->hw_vertex_sampler_state);
+}
+
+static void
+failover_delete_sampler_state(struct pipe_context *pipe, void *sampler)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_state *cso = sampler;
+   /* Delete the sw object, then the hw object, then the wrapper itself. */
+   fo->sw->delete_sampler_state(fo->sw, cso->sw_state);
+   fo->hw->delete_sampler_state(fo->hw, cso->hw_state);
+   cso->hw_state = NULL;
+   cso->sw_state = NULL;
+   FREE(cso);
+}
+
+
+static struct pipe_sampler_view *   /* NULL when the wrapper cannot be allocated */
+failover_create_sampler_view(struct pipe_context *pipe,
+                             struct pipe_resource *texture,
+                             const struct pipe_sampler_view *templ)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct fo_sampler_view *view = MALLOC(sizeof(struct fo_sampler_view));
+   if (!view)
+      return NULL;
+   view->sw = failover->sw->create_sampler_view(failover->sw, texture, templ);
+   view->hw = failover->hw->create_sampler_view(failover->hw, texture, templ);
+   /* Start from the template, then fix up the fields this wrapper owns. */
+   view->base = *templ;
+   view->base.reference.count = 1;
+   view->base.texture = NULL;   /* drop the pointer copied from templ first */
+   pipe_resource_reference(&view->base.texture, texture);
+   view->base.context = pipe;
+   return &view->base;
+}
+
+static void
+failover_sampler_view_destroy(struct pipe_context *pipe,
+                              struct pipe_sampler_view *view)
+{
+   struct failover_context *fo = failover_context(pipe);
+   struct fo_sampler_view *wrapper = (struct fo_sampler_view *)view;
+   /* Tear down the two backend views first, sw before hw. */
+   fo->sw->sampler_view_destroy(fo->sw, wrapper->sw);
+   fo->hw->sampler_view_destroy(fo->hw, wrapper->hw);
+   /* Then release the texture reference held by the wrapper and free it. */
+   pipe_resource_reference(&wrapper->base.texture, NULL);
+   FREE(wrapper);
+}
+
+static void
+failover_set_fragment_sampler_views(struct pipe_context *pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct pipe_sampler_view *hw_views[PIPE_MAX_SAMPLERS];
+   uint i;
+   /* Take references on the wrappers; only the hw halves are bound here. */
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == failover->num_fragment_sampler_views &&
+       !memcmp(failover->fragment_sampler_views, views, num * sizeof(struct pipe_sampler_view *)))
+      return;
+   for (i = 0; i < num; i++) {
+      struct fo_sampler_view *fo_view = (struct fo_sampler_view *)views[i];
+      /* An empty (NULL) slot stays NULL instead of being dereferenced. */
+      pipe_sampler_view_reference((struct pipe_sampler_view **)&failover->fragment_sampler_views[i], views[i]);
+      hw_views[i] = fo_view ? fo_view->hw : NULL;
+   }
+   for (i = num; i < failover->num_fragment_sampler_views; i++)
+      pipe_sampler_view_reference((struct pipe_sampler_view **)&failover->fragment_sampler_views[i], NULL);
+   failover->dirty |= FO_NEW_SAMPLER_VIEW;
+   failover->num_fragment_sampler_views = num;
+   failover->hw->set_fragment_sampler_views(failover->hw, num, hw_views);
+}
+
+
+static void
+failover_set_vertex_sampler_views(struct pipe_context *pipe,
+                                  unsigned num,
+                                  struct pipe_sampler_view **views)
+{
+   struct failover_context *failover = failover_context(pipe);
+   struct pipe_sampler_view *hw_views[PIPE_MAX_VERTEX_SAMPLERS];
+   uint i;
+   /* Take references on the wrappers; only the hw halves are bound here. */
+   assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == failover->num_vertex_sampler_views &&
+       !memcmp(failover->vertex_sampler_views, views, num * sizeof(struct pipe_sampler_view *))) {
+      return;
+   }
+   for (i = 0; i < num; i++) {
+      struct fo_sampler_view *fo_view = (struct fo_sampler_view *)views[i];
+      /* An empty (NULL) slot stays NULL instead of being dereferenced. */
+      pipe_sampler_view_reference((struct pipe_sampler_view **)&failover->vertex_sampler_views[i], views[i]);
+      hw_views[i] = fo_view ? fo_view->hw : NULL;
+   }
+   for (i = num; i < failover->num_vertex_sampler_views; i++)
+      pipe_sampler_view_reference((struct pipe_sampler_view **)&failover->vertex_sampler_views[i], NULL);
+   failover->dirty |= FO_NEW_SAMPLER_VIEW;
+   failover->num_vertex_sampler_views = num;
+   failover->hw->set_vertex_sampler_views(failover->hw, num, hw_views);
+}
+
+
+static void
+failover_set_viewport_state( struct pipe_context *pipe,
+                             const struct pipe_viewport_state *viewport )
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Copy the viewport by value, flag it, forward the original to sw and hw. */
+   fo->dirty |= FO_NEW_VIEWPORT;
+   fo->viewport = *viewport;
+   fo->sw->set_viewport_state(fo->sw, viewport);
+   fo->hw->set_viewport_state(fo->hw, viewport);
+}
+
+
+static void
+failover_set_vertex_buffers(struct pipe_context *pipe,
+                            unsigned count,
+                            const struct pipe_vertex_buffer *vertex_buffers)
+{
+   struct failover_context *failover = failover_context(pipe);
+   assert(count <= PIPE_MAX_ATTRIBS);   /* vertex_buffers[] has PIPE_MAX_ATTRIBS slots */
+   memcpy(failover->vertex_buffers, vertex_buffers,
+          count * sizeof(vertex_buffers[0]));
+   failover->dirty |= FO_NEW_VERTEX_BUFFER;
+   failover->num_vertex_buffers = count;
+   failover->sw->set_vertex_buffers( failover->sw, count, vertex_buffers );
+   failover->hw->set_vertex_buffers( failover->hw, count, vertex_buffers );
+}
+
+
+void
+failover_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             struct pipe_resource *res)
+{
+   struct failover_context *fo = failover_context(pipe);
+   /* Sanity checks: a known shader stage, and constant slot 0 only. */
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+   /* Nothing is cached here; the buffer goes straight to sw, then hw. */
+   fo->sw->set_constant_buffer(fo->sw, shader, index, res);
+   fo->hw->set_constant_buffer(fo->hw, shader, index, res);
+}
+
+
+void
+failover_init_state_functions( struct failover_context *failover )
+{
+   failover->pipe.create_blend_state = failover_create_blend_state;   /* CSO hooks, grouped per object */
+   failover->pipe.bind_blend_state = failover_bind_blend_state;
+   failover->pipe.delete_blend_state = failover_delete_blend_state;
+   failover->pipe.create_depth_stencil_alpha_state = failover_create_depth_stencil_state;
+   failover->pipe.bind_depth_stencil_alpha_state = failover_bind_depth_stencil_state;
+   failover->pipe.delete_depth_stencil_alpha_state = failover_delete_depth_stencil_state;
+   failover->pipe.create_rasterizer_state = failover_create_rasterizer_state;
+   failover->pipe.bind_rasterizer_state = failover_bind_rasterizer_state;
+   failover->pipe.delete_rasterizer_state = failover_delete_rasterizer_state;
+   failover->pipe.create_fs_state = failover_create_fs_state;
+   failover->pipe.bind_fs_state = failover_bind_fs_state;
+   failover->pipe.delete_fs_state = failover_delete_fs_state;
+   failover->pipe.create_vs_state = failover_create_vs_state;
+   failover->pipe.bind_vs_state = failover_bind_vs_state;
+   failover->pipe.delete_vs_state = failover_delete_vs_state;
+   failover->pipe.create_vertex_elements_state = failover_create_vertex_elements_state;
+   failover->pipe.bind_vertex_elements_state = failover_bind_vertex_elements_state;
+   failover->pipe.delete_vertex_elements_state = failover_delete_vertex_elements_state;
+   failover->pipe.create_sampler_state = failover_create_sampler_state;
+   failover->pipe.bind_fragment_sampler_states = failover_bind_fragment_sampler_states;
+   failover->pipe.bind_vertex_sampler_states = failover_bind_vertex_sampler_states;
+   failover->pipe.delete_sampler_state = failover_delete_sampler_state;
+   /* Sampler views, then the set_* entry points that take values directly. */
+   failover->pipe.create_sampler_view = failover_create_sampler_view;
+   failover->pipe.sampler_view_destroy = failover_sampler_view_destroy;
+   failover->pipe.set_fragment_sampler_views = failover_set_fragment_sampler_views;
+   failover->pipe.set_vertex_sampler_views = failover_set_vertex_sampler_views;
+   failover->pipe.set_blend_color = failover_set_blend_color;
+   failover->pipe.set_stencil_ref = failover_set_stencil_ref;
+   failover->pipe.set_sample_mask = failover_set_sample_mask;
+   failover->pipe.set_clip_state = failover_set_clip_state;
+   failover->pipe.set_scissor_state = failover_set_scissor_state;
+   failover->pipe.set_viewport_state = failover_set_viewport_state;
+   failover->pipe.set_polygon_stipple = failover_set_polygon_stipple;
+   failover->pipe.set_framebuffer_state = failover_set_framebuffer_state;
+   failover->pipe.set_vertex_buffers = failover_set_vertex_buffers;
+   failover->pipe.set_constant_buffer = failover_set_constant_buffer;
+}
diff --git a/src/gallium/drivers/failover/fo_state_emit.c b/src/gallium/drivers/failover/fo_state_emit.c
new file mode 100644
index 0000000000..147f23269c
--- /dev/null
+++ b/src/gallium/drivers/failover/fo_state_emit.c
@@ -0,0 +1,139 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "fo_context.h"
+
+/* This looks like a lot of work at the moment - we're keeping a
+ * duplicate copy of the state up-to-date.  
+ *
+ * This can change in two ways:
+ * - With constant state objects we would only need to save a pointer,
+ *     not the whole object.
+ * - By adding a callback in the state tracker to re-emit state.  The
+ *     state tracker knows the current state already and can re-emit it 
+ *     without additional complexity.
+ *
+ * This works as a proof-of-concept, but a final version will have
+ * lower overheads.
+ */
+
+
+/* Bring the software pipe up to date with current state.
+ * 
+ * With constant state objects we would probably just send all state
+ * to both rasterizers all the time???
+ */
+void
+failover_state_emit( struct failover_context *failover )
+{
+   if (failover->dirty & FO_NEW_BLEND)
+      failover->sw->bind_blend_state( failover->sw,
+                                      failover->blend->sw_state );
+
+   if (failover->dirty & FO_NEW_BLEND_COLOR)
+      failover->sw->set_blend_color( failover->sw, &failover->blend_color );
+
+   if (failover->dirty & FO_NEW_CLIP)
+      failover->sw->set_clip_state( failover->sw, &failover->clip );
+
+   if (failover->dirty & FO_NEW_SAMPLE_MASK)
+      failover->sw->set_sample_mask( failover->sw, failover->sample_mask );
+
+   if (failover->dirty & FO_NEW_DEPTH_STENCIL)
+      failover->sw->bind_depth_stencil_alpha_state( failover->sw,
+                                                    failover->depth_stencil->sw_state );
+
+   if (failover->dirty & FO_NEW_STENCIL_REF)
+      failover->sw->set_stencil_ref( failover->sw, &failover->stencil_ref );
+
+   if (failover->dirty & FO_NEW_FRAMEBUFFER)
+      failover->sw->set_framebuffer_state( failover->sw, &failover->framebuffer );
+
+   if (failover->dirty & FO_NEW_FRAGMENT_SHADER)
+      failover->sw->bind_fs_state( failover->sw,
+                                   failover->fragment_shader->sw_state );
+
+   if (failover->dirty & FO_NEW_VERTEX_SHADER)
+      failover->sw->bind_vs_state( failover->sw,
+                                   failover->vertex_shader->sw_state );
+
+   if (failover->dirty & FO_NEW_VERTEX_ELEMENT)
+      failover->sw->bind_vertex_elements_state( failover->sw,
+                                                failover->vertex_elements->sw_state );
+
+   if (failover->dirty & FO_NEW_STIPPLE)
+      failover->sw->set_polygon_stipple( failover->sw, &failover->poly_stipple );
+
+   if (failover->dirty & FO_NEW_RASTERIZER)
+      failover->sw->bind_rasterizer_state( failover->sw,
+                                           failover->rasterizer->sw_state );
+
+   if (failover->dirty & FO_NEW_SCISSOR)
+      failover->sw->set_scissor_state( failover->sw, &failover->scissor );
+
+   if (failover->dirty & FO_NEW_VIEWPORT)
+      failover->sw->set_viewport_state( failover->sw, &failover->viewport );
+
+   if (failover->dirty & FO_NEW_SAMPLER) {
+      failover->sw->bind_fragment_sampler_states( failover->sw, failover->num_samplers,
+                                                  failover->sw_sampler_state );
+      failover->sw->bind_vertex_sampler_states(failover->sw,
+                                               failover->num_vertex_samplers,
+                                               failover->sw_vertex_sampler_state);
+   }
+
+   if (failover->dirty & FO_NEW_SAMPLER_VIEW) {
+      struct pipe_sampler_view *fragment_views[PIPE_MAX_SAMPLERS];
+      struct pipe_sampler_view *vertex_views[PIPE_MAX_VERTEX_SAMPLERS];
+      uint i;
+      /* A slot without a view is forwarded as NULL, never dereferenced. */
+      for (i = 0; i < failover->num_fragment_sampler_views; i++)
+         fragment_views[i] = failover->fragment_sampler_views[i]
+            ? failover->fragment_sampler_views[i]->sw : NULL;
+      failover->sw->set_fragment_sampler_views(failover->sw,
+                                               failover->num_fragment_sampler_views,
+                                               fragment_views);
+
+      for (i = 0; i < failover->num_vertex_sampler_views; i++)
+         vertex_views[i] = failover->vertex_sampler_views[i]
+            ? failover->vertex_sampler_views[i]->sw : NULL;
+      failover->sw->set_vertex_sampler_views(failover->sw,
+                                             failover->num_vertex_sampler_views,
+                                             vertex_views);
+   }
+
+   if (failover->dirty & FO_NEW_VERTEX_BUFFER) {
+      failover->sw->set_vertex_buffers( failover->sw,
+                                        failover->num_vertex_buffers,
+                                        failover->vertex_buffers );
+   }
+
+   failover->dirty = 0;   /* every flagged group has been replayed to sw */
+}
diff --git a/src/gallium/drivers/failover/fo_winsys.h b/src/gallium/drivers/failover/fo_winsys.h
new file mode 100644
index 0000000000..533122b69d
--- /dev/null
+++ b/src/gallium/drivers/failover/fo_winsys.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef FO_WINSYS_H
+#define FO_WINSYS_H
+
+
+/* This is the interface that failover requires any window system
+ * hosting it to implement.  This is the only include file in failover
+ * which is public.
+ */
+
+
+struct pipe_context;
+struct failover_context;
+
+
+struct pipe_context *failover_create( struct pipe_context *hw,
+				      struct pipe_context *sw );
+
+
+void failover_fail_over( struct failover_context *failover );
+
+#endif /* FO_WINSYS_H */
diff --git a/src/gallium/drivers/i915/Makefile b/src/gallium/drivers/i915/Makefile
new file mode 100644
index 0000000000..2cefe70850
--- /dev/null
+++ b/src/gallium/drivers/i915/Makefile
@@ -0,0 +1,29 @@
+# Makefile for the i915 Gallium driver.
+# TOP locates the directory that holds configs/current, which is included
+# below for the active build configuration.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+# Library name; presumably consumed by Makefile.template -- confirm there.
+LIBNAME = i915
+
+# C sources that make up the driver.
+C_SOURCES = \
+	i915_blit.c \
+	i915_clear.c \
+	i915_flush.c \
+	i915_context.c \
+	i915_debug.c \
+	i915_debug_fp.c \
+	i915_state.c \
+	i915_state_immediate.c \
+	i915_state_dynamic.c \
+	i915_state_derived.c \
+	i915_state_emit.c \
+	i915_state_sampler.c \
+	i915_screen.c \
+	i915_prim_emit.c \
+	i915_prim_vbuf.c \
+	i915_resource.c \
+	i915_resource_texture.c \
+	i915_resource_buffer.c \
+	i915_fpc_emit.c \
+	i915_fpc_translate.c \
+	i915_surface.c 
+
+# No rules are defined here; they are expected to come from the shared
+# template included below.
+include ../../Makefile.template
diff --git a/src/gallium/drivers/i915/SConscript b/src/gallium/drivers/i915/SConscript
new file mode 100644
index 0000000000..d6e7a8dbd3
--- /dev/null
+++ b/src/gallium/drivers/i915/SConscript
@@ -0,0 +1,35 @@
+# SCons build description for the i915 Gallium driver.
+Import('*')
+
+# Work on a private copy of the construction environment.
+env = env.Clone()
+
+# The driver is not built with MSVC: warn and stop processing this script.
+# print() with a single parenthesised argument is valid in both Python 2
+# and Python 3, whereas the bare print statement is a syntax error in 3.
+if msvc:
+	print('warning: not building i915g')
+	Return()
+
+i915 = env.ConvenienceLibrary(
+	target = 'i915',
+	source = [
+		'i915_blit.c',
+		'i915_resource_buffer.c',
+		'i915_clear.c',
+		'i915_context.c',
+		'i915_debug.c',
+		'i915_debug_fp.c',
+		'i915_flush.c',
+		'i915_fpc_emit.c',
+		'i915_fpc_translate.c',
+		'i915_prim_emit.c',
+		'i915_prim_vbuf.c',
+		'i915_screen.c',
+		'i915_state.c',
+		'i915_state_derived.c',
+		'i915_state_dynamic.c',
+		'i915_state_emit.c',
+		'i915_state_immediate.c',
+		'i915_state_sampler.c',
+		'i915_surface.c',
+		'i915_resource.c',
+		'i915_resource_texture.c',
+	])
+
+# Make the library node available to the scripts that import it.
+Export('i915')
diff --git a/src/gallium/drivers/i915/i915_batch.h b/src/gallium/drivers/i915/i915_batch.h
new file mode 100644
index 0000000000..f0086695d1
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_batch.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_BATCH_H
+#define I915_BATCH_H
+
+#include "i915_batchbuffer.h"
+
+/*
+ * Batchbuffer convenience macros.  Each of them expands to an expression
+ * on i915->batch, so a variable named i915 must be in scope at the point
+ * of use.
+ */
+
+/* True when the batch still has room for `dwords` dwords and `relocs`
+ * relocations.  This is only a check; nothing is reserved.
+ */
+#define BEGIN_BATCH(dwords, relocs) \
+   (i915_winsys_batchbuffer_check(i915->batch, dwords, relocs))
+
+/* Append one dword to the batch. */
+#define OUT_BATCH(dword) \
+   i915_winsys_batchbuffer_dword(i915->batch, dword)
+
+/* Emit a relocation for `buf` through the winsys. */
+#define OUT_RELOC(buf, usage, offset) \
+   i915_winsys_batchbuffer_reloc(i915->batch, buf, usage, offset)
+
+/* Flush the batch through the winsys, then mark every hardware state
+ * group dirty by setting all bits of i915->hardware_dirty.
+ */
+#define FLUSH_BATCH(fence) do {                 \
+   i915_winsys_batchbuffer_flush(i915->batch, fence); \
+   i915->hardware_dirty = ~0;                   \
+} while (0)
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_batchbuffer.h b/src/gallium/drivers/i915/i915_batchbuffer.h
new file mode 100644
index 0000000000..27ccaa6b1f
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_batchbuffer.h
@@ -0,0 +1,87 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef I915_BATCHBUFFER_H
+#define I915_BATCHBUFFER_H
+
+#include "i915_winsys.h"
+
+/**
+ * Test whether the batch can still take a given amount of data.
+ *
+ * \param batch   batchbuffer to inspect
+ * \param dwords  number of 4-byte words the caller wants to emit
+ * \param relocs  number of relocations the caller wants to emit
+ * \return non-zero when both the dwords and the relocations fit
+ */
+static INLINE boolean
+i915_winsys_batchbuffer_check(struct i915_winsys_batchbuffer *batch,
+                              size_t dwords,
+                              size_t relocs)
+{
+   /* Not enough bytes left between the write pointer and the end. */
+   if (dwords * 4 > batch->size - (batch->ptr - batch->map))
+      return FALSE;
+
+   return relocs <= (batch->max_relocs - batch->relocs);
+}
+
+/**
+ * Return the number of bytes still free in the batch, i.e. its size minus
+ * the distance between the start of the mapping and the write pointer.
+ */
+static INLINE size_t
+i915_winsys_batchbuffer_space(struct i915_winsys_batchbuffer *batch)
+{
+   const size_t used = batch->ptr - batch->map;
+
+   return batch->size - used;
+}
+
+/**
+ * Append a single dword at the write pointer and advance it by four bytes.
+ *
+ * When fewer than four bytes are free the dword is dropped without any
+ * error indication.
+ */
+static INLINE void
+i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch,
+                              unsigned dword)
+{
+   if (i915_winsys_batchbuffer_space(batch) >= 4) {
+      unsigned *slot = (unsigned *)batch->ptr;
+
+      *slot = dword;
+      batch->ptr += 4;
+   }
+}
+
+/**
+ * Append a block of bytes to the batch and advance the write pointer.
+ *
+ * \param batch  batchbuffer to append to
+ * \param data   source bytes to copy into the batch
+ * \param size   number of bytes to copy
+ *
+ * When fewer than `size` bytes are free nothing is written and no error
+ * is reported.
+ */
+static INLINE void
+i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch,
+                              void *data,
+                              size_t size)
+{
+   if (i915_winsys_batchbuffer_space(batch) < size)
+      return;
+
+   /* memcpy takes (destination, source, n): the batch is the destination.
+    * The arguments used to be swapped, which overwrote the caller's data
+    * with stale batch contents and left the batch itself unwritten.
+    */
+   memcpy(batch->ptr, data, size);
+   batch->ptr += size;
+}
+
+/**
+ * Emit a relocation for `buffer` into the batch.
+ *
+ * Forwards all arguments to the batchbuffer_reloc hook of the batch's
+ * winsys (batch->iws) and returns whatever that hook returns.
+ */
+static INLINE int
+i915_winsys_batchbuffer_reloc(struct i915_winsys_batchbuffer *batch,
+                              struct i915_winsys_buffer *buffer,
+                              enum i915_winsys_buffer_usage usage,
+                              size_t offset)
+{
+   return batch->iws->batchbuffer_reloc(batch, buffer, usage, offset);
+}
+
+/**
+ * Flush the batch by calling the batchbuffer_flush hook of its winsys.
+ *
+ * `fence` is passed through unchanged; callers in this driver pass NULL
+ * when they do not need a fence back.
+ */
+static INLINE void
+i915_winsys_batchbuffer_flush(struct i915_winsys_batchbuffer *batch,
+                              struct pipe_fence_handle **fence)
+{
+   batch->iws->batchbuffer_flush(batch, fence);
+}
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_blit.c b/src/gallium/drivers/i915/i915_blit.c
new file mode 100644
index 0000000000..c5b5979bf9
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_blit.c
@@ -0,0 +1,150 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "i915_blit.h"
+#include "i915_reg.h"
+#include "i915_batch.h"
+#include "i915_debug.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+/**
+ * Emit an XY_COLOR_BLT fill of a rectangle and flush the batch.
+ *
+ * \param i915        context whose batch receives the commands
+ * \param cpp         bytes per pixel; values outside 1..4 make this a no-op
+ * \param rgba_mask   extra command bits OR'ed in for 4 bytes per pixel
+ * \param dst_pitch   destination pitch, placed in the low 16 bits of BR13
+ * \param dst_buffer  destination buffer (emitted as a relocation)
+ * \param dst_offset  offset passed along with the relocation
+ * \param x,y         top-left corner of the rectangle
+ * \param w,h         size of the rectangle
+ * \param color       fill value emitted as the last dword
+ */
+void
+i915_fill_blit(struct i915_context *i915,
+               unsigned cpp,
+               unsigned rgba_mask,
+               unsigned short dst_pitch,
+               struct i915_winsys_buffer *dst_buffer,
+               unsigned dst_offset,
+               short x, short y,
+               short w, short h,
+               unsigned color)
+{
+   unsigned br13, blt_cmd;
+
+   I915_DBG(i915,
+      "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+      __FUNCTION__,
+      dst_buffer, dst_pitch, dst_offset, x, y, w, h);
+
+   /* Only 1 to 4 bytes per pixel are handled. */
+   if (cpp < 1 || cpp > 4)
+      return;
+
+   /* Settings shared by every supported pixel size. */
+   br13 = (((int) dst_pitch) & 0xffff) | (0xF0 << 16) | (1 << 24);
+   blt_cmd = XY_COLOR_BLT_CMD;
+
+   /* 4 bytes per pixel sets one more BR13 bit and applies the mask. */
+   if (cpp == 4) {
+      br13 |= (1 << 25);
+      blt_cmd |= rgba_mask;
+   }
+
+   /* Make room by flushing if the six dwords and one reloc do not fit. */
+   if (!BEGIN_BATCH(6, 1)) {
+      FLUSH_BATCH(NULL);
+      assert(BEGIN_BATCH(6, 1));
+   }
+
+   OUT_BATCH(blt_cmd);
+   OUT_BATCH(br13);
+   OUT_BATCH((y << 16) | x);
+   OUT_BATCH(((y + h) << 16) | (x + w));
+   OUT_RELOC(dst_buffer, I915_USAGE_2D_TARGET, dst_offset);
+   OUT_BATCH(color);
+
+   FLUSH_BATCH(NULL);
+}
+
+/**
+ * Emit an XY_SRC_COPY_BLT copy of a rectangle between two buffers and
+ * flush the batch.
+ *
+ * \param i915        context whose batch receives the commands
+ * \param cpp         bytes per pixel; values outside 1..4 make this a no-op
+ * \param src_pitch   source pitch (must be non-zero)
+ * \param src_buffer  source buffer (emitted as a relocation)
+ * \param src_offset  offset passed along with the source relocation
+ * \param dst_pitch   destination pitch (must be non-zero)
+ * \param dst_buffer  destination buffer (emitted as a relocation)
+ * \param dst_offset  offset passed along with the destination relocation
+ * \param src_x,src_y top-left corner in the source
+ * \param dst_x,dst_y top-left corner in the destination
+ * \param w,h         size of the rectangle; a negative size is a no-op
+ */
+void
+i915_copy_blit(struct i915_context *i915,
+               unsigned cpp,
+               unsigned short src_pitch,
+               struct i915_winsys_buffer *src_buffer,
+               unsigned src_offset,
+               unsigned short dst_pitch,
+               struct i915_winsys_buffer *dst_buffer,
+               unsigned dst_offset,
+               short src_x, short src_y,
+               short dst_x, short dst_y,
+               short w, short h)
+{
+   unsigned blt_cmd, br13;
+   const int dst_x2 = dst_x + w;
+   const int dst_y2 = dst_y + h;
+
+   I915_DBG(i915,
+      "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+      __FUNCTION__,
+      src_buffer, src_pitch, src_offset, src_x, src_y,
+      dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
+
+   /* Only 1 to 4 bytes per pixel are handled. */
+   if (cpp < 1 || cpp > 4)
+      return;
+
+   /* Settings shared by every supported pixel size. */
+   br13 = (((int) dst_pitch) & 0xffff) | (0xCC << 16) | (1 << 24);
+   blt_cmd = XY_SRC_COPY_BLT_CMD;
+
+   /* 4 bytes per pixel sets one more BR13 bit and enables alpha + RGB
+    * writes.
+    */
+   if (cpp == 4) {
+      br13 |= (1 << 25);
+      blt_cmd |= XY_SRC_COPY_BLT_WRITE_ALPHA | XY_SRC_COPY_BLT_WRITE_RGB;
+   }
+
+   /* Nothing to copy when the width or height is negative. */
+   if (dst_y2 < dst_y || dst_x2 < dst_x)
+      return;
+
+   /* Hardware can handle negative pitches but loses the ability to do
+    * proper overlapping blits in that case.  We don't really have a
+    * need for either at this stage.
+    */
+   assert (dst_pitch > 0 && src_pitch > 0);
+
+   /* Make room by flushing if eight dwords and two relocs do not fit. */
+   if (!BEGIN_BATCH(8, 2)) {
+      FLUSH_BATCH(NULL);
+      assert(BEGIN_BATCH(8, 2));
+   }
+
+   OUT_BATCH(blt_cmd);
+   OUT_BATCH(br13);
+   OUT_BATCH((dst_y << 16) | dst_x);
+   OUT_BATCH((dst_y2 << 16) | dst_x2);
+   OUT_RELOC(dst_buffer, I915_USAGE_2D_TARGET, dst_offset);
+   OUT_BATCH((src_y << 16) | src_x);
+   OUT_BATCH(((int) src_pitch & 0xffff));
+   OUT_RELOC(src_buffer, I915_USAGE_2D_SOURCE, src_offset);
+
+   FLUSH_BATCH(NULL);
+}
diff --git a/src/gallium/drivers/i915/i915_blit.h b/src/gallium/drivers/i915/i915_blit.h
new file mode 100644
index 0000000000..d82d2f258e
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_blit.h
@@ -0,0 +1,55 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_BLIT_H
+#define I915_BLIT_H
+
+#include "i915_context.h"
+
+extern void i915_copy_blit(struct i915_context *i915,
+                           unsigned cpp,
+                           unsigned short src_pitch,
+                           struct i915_winsys_buffer *src_buffer,
+                           unsigned src_offset,
+                           unsigned short dst_pitch,
+                           struct i915_winsys_buffer *dst_buffer,
+                           unsigned dst_offset,
+                           short srcx, short srcy,
+                           short dstx, short dsty,
+                           short w, short h);
+
+extern void i915_fill_blit(struct i915_context *i915,
+                           unsigned cpp,
+                           unsigned rgba_mask,
+                           unsigned short dst_pitch,
+                           struct i915_winsys_buffer *dst_buffer,
+                           unsigned dst_offset,
+                           short x, short y,
+                           short w, short h, unsigned color);
+
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_clear.c b/src/gallium/drivers/i915/i915_clear.c
new file mode 100644
index 0000000000..6d824a507a
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_clear.c
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:
+ *    Brian Paul
+ */
+
+
+#include "util/u_clear.h"
+#include "i915_context.h"
+
+
+/**
+ * pipe_context::clear hook.
+ *
+ * Hands the request to util_clear() together with the framebuffer that
+ * is currently bound in the i915 context.
+ * No masking, no scissor (clear entire buffer).
+ */
+void
+i915_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+           double depth, unsigned stencil)
+{
+   struct i915_context *i915 = i915_context(pipe);
+
+   util_clear(pipe, &i915->framebuffer, buffers, rgba, depth, stencil);
+}
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
new file mode 100644
index 0000000000..2af9bdac95
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -0,0 +1,202 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "i915_context.h"
+#include "i915_state.h"
+#include "i915_screen.h"
+#include "i915_surface.h"
+#include "i915_batch.h"
+#include "i915_resource.h"
+
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "pipe/p_screen.h"
+
+
+/*
+ * Draw functions
+ */
+
+
+/**
+ * Draw primitives through the draw module.
+ *
+ * Derived state is refreshed if anything is dirty, the vertex buffers,
+ * the optional index buffer and the vertex shader constants are handed
+ * to the draw module, the draw is issued, and the buffer mappings are
+ * cleared again.
+ *
+ * \param pipe         the i915 context
+ * \param indexBuffer  index buffer, or NULL for a non-indexed draw
+ * \param indexSize    size of one index, passed on to the draw module
+ * \param indexBias    bias passed on to the draw module
+ * \param min_index    lowest index the draw may reference
+ * \param max_index    highest index the draw may reference
+ * \param prim         primitive type
+ * \param start        first vertex/index
+ * \param count        number of vertices/indices
+ */
+static void
+i915_draw_range_elements(struct pipe_context *pipe,
+                         struct pipe_resource *indexBuffer,
+                         unsigned indexSize,
+                         int indexBias,
+                         unsigned min_index,
+                         unsigned max_index,
+                         unsigned prim, unsigned start, unsigned count)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   struct draw_context *draw = i915->draw;
+   const size_t vs_const_bytes =
+      i915->current.num_user_constants[PIPE_SHADER_VERTEX] * 4 * sizeof(float);
+   unsigned vb;
+
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   /* Hand the data pointer of every bound vertex buffer to draw. */
+   for (vb = 0; vb < i915->num_vertex_buffers; vb++) {
+      draw_set_mapped_vertex_buffer(draw, vb,
+                                    i915_buffer(i915->vertex_buffer[vb].buffer)->data);
+   }
+
+   /* Same for the index buffer; without one, clear the element buffer. */
+   if (!indexBuffer) {
+      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+   } else {
+      draw_set_mapped_element_buffer_range(draw, indexSize, indexBias,
+                                           min_index,
+                                           max_index,
+                                           i915_buffer(indexBuffer)->data);
+   }
+
+   /* Vertex shader constants: four floats per user constant. */
+   draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
+                                   i915->current.constants[PIPE_SHADER_VERTEX],
+                                   vs_const_bytes);
+
+   /* Issue the draw. */
+   draw_arrays(draw, prim, start, count);
+
+   /* Drop the vertex and index buffer mappings again. */
+   for (vb = 0; vb < i915->num_vertex_buffers; vb++) {
+      draw_set_mapped_vertex_buffer(draw, vb, NULL);
+   }
+
+   if (indexBuffer) {
+      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+   }
+}
+
+/**
+ * Indexed draw without a caller-supplied index range.
+ *
+ * Forwards to i915_draw_range_elements() with the widest possible range
+ * (0 .. 0xffffffff).
+ */
+static void
+i915_draw_elements(struct pipe_context *pipe,
+                   struct pipe_resource *indexBuffer,
+                   unsigned indexSize, int indexBias,
+                   unsigned prim, unsigned start, unsigned count)
+{
+   i915_draw_range_elements(pipe, indexBuffer,
+                            indexSize, indexBias,
+                            0, 0xffffffff,
+                            prim, start, count);
+}
+
+/**
+ * Non-indexed draw: forwards to i915_draw_elements() with no index
+ * buffer and zero index size and bias.
+ */
+static void
+i915_draw_arrays(struct pipe_context *pipe,
+                 unsigned prim, unsigned start, unsigned count)
+{
+   i915_draw_elements(pipe, NULL, 0, 0, prim, start, count);
+}
+
+
+
+
+/*
+ * Generic context functions
+ */
+
+
+/**
+ * pipe_context::destroy hook.
+ *
+ * Destroys the draw module, destroys the batchbuffer if one exists,
+ * releases every framebuffer surface reference and finally frees the
+ * context itself.
+ */
+static void i915_destroy(struct pipe_context *pipe)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   struct pipe_framebuffer_state *fb = &i915->framebuffer;
+   int buf;
+
+   draw_destroy(i915->draw);
+
+   if (i915->batch) {
+      i915->iws->batchbuffer_destroy(i915->batch);
+   }
+
+   /* Drop the colour buffer references, then depth/stencil. */
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++)
+      pipe_surface_reference(&fb->cbufs[buf], NULL);
+
+   pipe_surface_reference(&fb->zsbuf, NULL);
+
+   FREE(i915);
+}
+
+/**
+ * Create an i915 rendering context for a screen.
+ *
+ * \param screen  screen the context belongs to; its winsys pointer is
+ *                copied into the context
+ * \param priv    opaque pointer stored in pipe_context::priv
+ * \return the new context (as its pipe_context base), or NULL when the
+ *         context could not be allocated or the draw module could not
+ *         be created
+ */
+struct pipe_context *
+i915_create_context(struct pipe_screen *screen, void *priv)
+{
+   struct i915_context *i915;
+
+   i915 = CALLOC_STRUCT(i915_context);
+   if (i915 == NULL)
+      return NULL;
+
+   i915->iws = i915_screen(screen)->iws;
+   i915->base.winsys = NULL;
+   i915->base.screen = screen;
+   i915->base.priv = priv;
+
+   i915->base.destroy = i915_destroy;
+
+   i915->base.clear = i915_clear;
+
+   i915->base.draw_arrays = i915_draw_arrays;
+   i915->base.draw_elements = i915_draw_elements;
+   i915->base.draw_range_elements = i915_draw_range_elements;
+
+   /*
+    * Create drawing context and plug our rendering stage into it.
+    */
+   i915->draw = draw_create(&i915->base);
+   assert(i915->draw);
+   if (i915->draw == NULL) {
+      /* assert() vanishes in release builds; fail the same way the
+       * allocation above does instead of dereferencing NULL below.
+       */
+      FREE(i915);
+      return NULL;
+   }
+
+   /* The vbuf stage is the default; I915_NO_VBUF selects the plain
+    * render stage instead.
+    */
+   if (!debug_get_bool_option("I915_NO_VBUF", FALSE)) {
+      draw_set_rasterize_stage(i915->draw, i915_draw_vbuf_stage(i915));
+   } else {
+      draw_set_rasterize_stage(i915->draw, i915_draw_render_stage(i915));
+   }
+
+   i915_init_surface_functions(i915);
+   i915_init_state_functions(i915);
+   i915_init_flush_functions(i915);
+   i915_init_resource_functions(i915);
+
+   draw_install_aaline_stage(i915->draw, &i915->base);
+   draw_install_aapoint_stage(i915->draw, &i915->base);
+
+   /* Start with everything dirty so all state is recomputed and emitted. */
+   i915->dirty = ~0;
+   i915->hardware_dirty = ~0;
+
+   /* Batch stream debugging is a bit hacked up at the moment:
+    */
+   i915->batch = i915->iws->batchbuffer_create(i915->iws);
+
+   return &i915->base;
+}
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
new file mode 100644
index 0000000000..acc0ffe037
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -0,0 +1,335 @@
+ /**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_CONTEXT_H
+#define I915_CONTEXT_H
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "draw/draw_vertex.h"
+
+#include "tgsi/tgsi_scan.h"
+
+
+struct i915_winsys;
+struct i915_winsys_buffer;
+struct i915_winsys_batchbuffer;
+
+
+#define I915_TEX_UNITS 8
+
+#define I915_DYNAMIC_MODES4       0
+#define I915_DYNAMIC_DEPTHSCALE_0 1 /* just the header */
+#define I915_DYNAMIC_DEPTHSCALE_1 2 
+#define I915_DYNAMIC_IAB          3
+#define I915_DYNAMIC_BC_0         4 /* just the header */
+#define I915_DYNAMIC_BC_1         5
+#define I915_DYNAMIC_BFO_0        6 
+#define I915_DYNAMIC_BFO_1        7
+#define I915_DYNAMIC_STP_0        8 
+#define I915_DYNAMIC_STP_1        9 
+#define I915_DYNAMIC_SC_ENA_0     10 
+#define I915_DYNAMIC_SC_RECT_0    11 
+#define I915_DYNAMIC_SC_RECT_1    12 
+#define I915_DYNAMIC_SC_RECT_2    13 
+#define I915_MAX_DYNAMIC          14
+
+
+#define I915_IMMEDIATE_S0         0
+#define I915_IMMEDIATE_S1         1
+#define I915_IMMEDIATE_S2         2
+#define I915_IMMEDIATE_S3         3
+#define I915_IMMEDIATE_S4         4
+#define I915_IMMEDIATE_S5         5
+#define I915_IMMEDIATE_S6         6
+#define I915_IMMEDIATE_S7         7
+#define I915_MAX_IMMEDIATE        8
+
+/* These must match the order of LI0_STATE_* bits, as they will be used
+ * to generate hardware packets:
+ */
+#define I915_CACHE_STATIC         0 
+#define I915_CACHE_DYNAMIC        1 /* handled specially */
+#define I915_CACHE_SAMPLER        2
+#define I915_CACHE_MAP            3
+#define I915_CACHE_PROGRAM        4
+#define I915_CACHE_CONSTANTS      5
+#define I915_MAX_CACHE            6
+
+#define I915_MAX_CONSTANT  32
+
+
+/** See constant_flags[] below */
+#define I915_CONSTFLAG_USER 0x1f
+
+
+/**
+ * Subclass of pipe_shader_state: the generic shader state plus the
+ * translated program and the constants that go with it.
+ */
+struct i915_fragment_shader
+{
+   struct pipe_shader_state state;
+
+   struct tgsi_shader_info info;
+
+   /* translated program and its length (unit not visible here --
+    * presumably dwords, confirm in the translator) */
+   uint *program;
+   uint program_len;
+
+   /**
+    * constants introduced during translation.
+    * These are placed at the end of the constant buffer and grow toward
+    * the beginning (eg: slot 31, 30 29, ...)
+    * User-provided constants start at 0.
+    * This allows both types of constants to co-exist (until there's too many)
+    * and doesn't require regenerating/changing the fragment program to
+    * shuffle constants around.
+    */
+   uint num_constants;
+   float constants[I915_MAX_CONSTANT][4];
+
+   /**
+    * Status of each constant
+    * if I915_CONSTFLAG_USER (this comment used to say
+    * I915_CONSTFLAG_PARAM, a name not defined in this header), the value
+    * must be taken from the corresponding slot of the user's constant
+    * buffer. (set by pipe->set_constant_buffer())
+    * Else, the bitmask indicates which components are occupied by immediates.
+    */
+   ubyte constant_flags[I915_MAX_CONSTANT];
+};
+
+
+struct i915_cache_context;
+
+/* Used to calculate differences between state emitted to hardware and
+ * current driver-calculated state.
+ */
+struct i915_state 
+{
+   /* one word per I915_IMMEDIATE_* slot */
+   unsigned immediate[I915_MAX_IMMEDIATE];
+   /* one word per I915_DYNAMIC_* slot */
+   unsigned dynamic[I915_MAX_DYNAMIC];
+
+   /* constant values per shader type, four floats per constant */
+   float constants[PIPE_SHADER_TYPES][I915_MAX_CONSTANT][4];
+   /** number of constants passed in through a constant buffer */
+   uint num_user_constants[PIPE_SHADER_TYPES];
+
+   /* texture sampler state */
+   unsigned sampler[I915_TEX_UNITS][3];
+   unsigned sampler_enable_flags;
+   unsigned sampler_enable_nr;
+
+   /* texture image buffers */
+   unsigned texbuffer[I915_TEX_UNITS][2];
+
+   /** Describes the current hardware vertex layout */
+   struct vertex_info vertex_info;
+
+   unsigned id;			/* track lost context events */
+};
+
+/* Blend state object.
+ * NOTE(review): the field names suggest precomputed hardware words
+ * (IAB, MODES4, immediate state S5/S6) -- confirm in i915_state.c.
+ */
+struct i915_blend_state {
+   unsigned iab;
+   unsigned modes4;
+   unsigned LIS5;
+   unsigned LIS6;
+};
+
+/* Depth/stencil state object.
+ * NOTE(review): the field names suggest precomputed hardware words
+ * (MODES4, the two BFO words, immediate state S5/S6) -- confirm in
+ * i915_state.c.
+ */
+struct i915_depth_stencil_state {
+   unsigned stencil_modes4;
+   unsigned bfo[2];
+   unsigned stencil_LIS5;
+   unsigned depth_LIS6;
+};
+
+/* Rasterizer state object.  Keeps a pointer to the pipe_rasterizer_state
+ * template next to the driver's own values.
+ * NOTE(review): LIS4/LIS7, sc and ds look like precomputed hardware words
+ * (immediate state, scissor enable, depth scale) -- confirm in
+ * i915_state.c.
+ */
+struct i915_rasterizer_state {
+   unsigned light_twoside : 1;
+   unsigned st;
+   enum interp_mode color_interp;
+
+   unsigned LIS4;
+   unsigned LIS7;
+   unsigned sc[1];
+
+   const struct pipe_rasterizer_state *templ;
+
+   /* each entry can be read either as a float or as its raw bits */
+   union { float f; unsigned u; } ds[2];
+};
+
+/* Sampler state object: three state words, a pointer to the
+ * pipe_sampler_state template, and min/max LOD values.
+ * NOTE(review): the encoding of minlod/maxlod is not visible here --
+ * confirm in i915_state.c.
+ */
+struct i915_sampler_state {
+   unsigned state[3];
+   const struct pipe_sampler_state *templ;
+   unsigned minlod;
+   unsigned maxlod;
+};
+
+/* Vertex element state object: an array of vertex elements with room for
+ * PIPE_MAX_ATTRIBS entries, plus a count (presumably the number of
+ * entries in use -- confirm at the creation site).
+ */
+struct i915_velems_state {
+   unsigned count;
+   struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
+};
+
+
+/**
+ * The i915 driver context.
+ *
+ * `base` is the first member, so the pipe_context pointer returned by
+ * i915_create_context() can be cast back with i915_context().
+ */
+struct i915_context
+{
+   struct pipe_context base;
+
+   /* winsys taken from the screen at creation time */
+   struct i915_winsys *iws;
+
+   /* draw module instance used by the draw entry points */
+   struct draw_context *draw;
+
+   /* The most recent drawing state as set by the driver:
+    */
+   const struct i915_blend_state           *blend;
+   const struct i915_sampler_state         *sampler[PIPE_MAX_SAMPLERS];
+   const struct i915_depth_stencil_state   *depth_stencil;
+   const struct i915_rasterizer_state      *rasterizer;
+
+   struct i915_fragment_shader *fs;
+
+   struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_clip_state clip;
+   /* XXX unneeded */
+   struct pipe_resource *constants[PIPE_SHADER_TYPES];
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+
+   /* bitmask of I915_NEW_* flags; derived state is updated before a draw
+    * whenever it is non-zero */
+   unsigned dirty;
+
+   unsigned num_samplers;
+   unsigned num_fragment_sampler_views;
+   unsigned num_vertex_buffers;
+
+   /* command batch targeted by the BEGIN_BATCH/OUT_BATCH macros */
+   struct i915_winsys_batchbuffer *batch;
+
+   /** Vertex buffer */
+   struct i915_winsys_buffer *vbo;
+   size_t vbo_offset;
+   unsigned vbo_flushed;
+
+   struct i915_state current;
+   /* bitmask of I915_HW_* flags; set to all ones after every batch flush */
+   unsigned hardware_dirty;
+   
+   unsigned debug;
+};
+
+/* A flag for each state_tracker state object:
+ */
+#define I915_NEW_VIEWPORT      0x1
+#define I915_NEW_RASTERIZER    0x2
+#define I915_NEW_FS            0x4
+#define I915_NEW_BLEND         0x8
+#define I915_NEW_CLIP          0x10
+#define I915_NEW_SCISSOR       0x20
+#define I915_NEW_STIPPLE       0x40
+#define I915_NEW_FRAMEBUFFER   0x80
+#define I915_NEW_ALPHA_TEST    0x100
+#define I915_NEW_DEPTH_STENCIL 0x200
+#define I915_NEW_SAMPLER       0x400
+#define I915_NEW_SAMPLER_VIEW  0x800
+#define I915_NEW_CONSTANTS     0x1000
+#define I915_NEW_VBO           0x2000
+#define I915_NEW_VS            0x4000
+
+
+/* Driver's internally generated state flags:
+ */
+#define I915_NEW_VERTEX_FORMAT    0x10000
+
+
+/* Dirty flags for hardware emit
+ */
+#define I915_HW_STATIC            (1<<I915_CACHE_STATIC)
+#define I915_HW_DYNAMIC           (1<<I915_CACHE_DYNAMIC)
+#define I915_HW_SAMPLER           (1<<I915_CACHE_SAMPLER)
+#define I915_HW_MAP               (1<<I915_CACHE_MAP)
+#define I915_HW_PROGRAM           (1<<I915_CACHE_PROGRAM)
+#define I915_HW_CONSTANTS         (1<<I915_CACHE_CONSTANTS)
+#define I915_HW_IMMEDIATE         (1<<(I915_MAX_CACHE+0))
+#define I915_HW_INVARIENT         (1<<(I915_MAX_CACHE+1))
+
+
+/***********************************************************************
+ * i915_prim_emit.c: 
+ */
+struct draw_stage *i915_draw_render_stage( struct i915_context *i915 );
+
+
+/***********************************************************************
+ * i915_prim_vbuf.c: 
+ */
+struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 );
+
+
+/***********************************************************************
+ * i915_state_emit.c: 
+ */
+void i915_emit_hardware_state(struct i915_context *i915 );
+
+
+
+/***********************************************************************
+ * i915_clear.c: 
+ */
+void i915_clear( struct pipe_context *pipe, unsigned buffers, const float *rgba,
+                 double depth, unsigned stencil);
+
+
+/***********************************************************************
+ * 
+ */
+void i915_init_state_functions( struct i915_context *i915 );
+void i915_init_flush_functions( struct i915_context *i915 );
+void i915_init_string_functions( struct i915_context *i915 );
+
+
+/************************************************************************
+ * i915_context.c
+ */
+struct pipe_context *i915_create_context(struct pipe_screen *screen,
+					 void *priv);
+
+
+
+
+/***********************************************************************
+ * Inline conversion functions.  These are better-typed than the
+ * macros used previously:
+ */
+
+/* Cast a pipe_context pointer to the i915_context that contains it.
+ * This is a plain pointer cast with no checking; it relies on `base`
+ * being the first member of struct i915_context.
+ */
+static INLINE struct i915_context *
+i915_context( struct pipe_context *pipe )
+{
+   return (struct i915_context *)pipe;
+}
+
+
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_debug.c b/src/gallium/drivers/i915/i915_debug.c
new file mode 100644
index 0000000000..663fac3055
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_debug.c
@@ -0,0 +1,898 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "i915_reg.h"
+#include "i915_context.h"
+#include "i915_debug.h"
+#include "i915_batch.h"
+#include "util/u_debug.h"
+
+
+/**
+ * printf-style output helper used by the decoders in this file.
+ *
+ * The formatted text is handed to debug_vprintf().  The stream argument
+ * is accepted so that all call sites look alike, but it is not consulted.
+ */
+static void
+PRINTF( struct debug_stream *stream, const char *fmt, ... )
+{
+   va_list ap;
+
+   (void) stream;
+
+   va_start( ap, fmt );
+   debug_vprintf( fmt, ap );
+   va_end( ap );
+}
+
+
+/**
+ * Generic packet dumper: print the packet name followed by each of its
+ * dwords in hex, then advance the stream past the packet.
+ *
+ * \param stream  decode cursor; stream->offset is advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords
+ * \return TRUE on success; FALSE (after asserting) when len is zero
+ */
+static boolean debug( struct debug_stream *stream, const char *name, unsigned len )
+{
+   unsigned i;
+   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset);
+
+   if (len == 0) {
+      /* Report the dword at the current offset.  stream->ptr[0] would
+       * only be the first byte of the whole buffer, which says nothing
+       * about the packet that failed to decode.
+       */
+      PRINTF(stream, "Error - zero length packet (0x%08x)\n", ptr[0]);
+      assert(0);
+      return FALSE;
+   }
+
+   if (stream->print_addresses)
+      PRINTF(stream, "%08x:  ", stream->offset);
+
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   for (i = 0; i < len; i++)
+      PRINTF(stream, "\t0x%08x\n",  ptr[i]);
+   PRINTF(stream, "\n");
+
+   stream->offset += len * sizeof(unsigned);
+
+   return TRUE;
+}
+
+
+/**
+ * Map the primitive-type field of a 3DPRIMITIVE header dword (selected
+ * with PRIM3D_MASK) to a printable name; "????" for unrecognised values.
+ */
+static const char *get_prim_name( unsigned val )
+{
+   const char *name;
+
+   switch (val & PRIM3D_MASK) {
+   case PRIM3D_TRILIST:        name = "TRILIST";        break;
+   case PRIM3D_TRISTRIP:       name = "TRISTRIP";       break;
+   case PRIM3D_TRISTRIP_RVRSE: name = "TRISTRIP_RVRSE"; break;
+   case PRIM3D_TRIFAN:         name = "TRIFAN";         break;
+   case PRIM3D_POLY:           name = "POLY";           break;
+   case PRIM3D_LINELIST:       name = "LINELIST";       break;
+   case PRIM3D_LINESTRIP:      name = "LINESTRIP";      break;
+   case PRIM3D_RECTLIST:       name = "RECTLIST";       break;
+   case PRIM3D_POINTLIST:      name = "POINTLIST";      break;
+   case PRIM3D_DIB:            name = "DIB";            break;
+   case PRIM3D_CLEAR_RECT:     name = "CLEAR_RECT";     break;
+   case PRIM3D_ZONE_INIT:      name = "ZONE_INIT";      break;
+   default:                    name = "????";           break;
+   }
+
+   return name;
+}
+
+/**
+ * Dump a 3DPRIMITIVE packet of known length.
+ *
+ * The header dword is printed in hex together with the primitive name;
+ * every following dword is printed in hex and, when dump_floats is set,
+ * also reinterpreted as a float.
+ *
+ * \param stream       decode cursor; advanced by len dwords
+ * \param name         label printed for the packet
+ * \param dump_floats  non-zero to also show payload dwords as floats
+ * \param len          packet length in dwords, header included
+ * \return always TRUE
+ */
+static boolean debug_prim( struct debug_stream *stream, const char *name,
+                           boolean dump_floats,
+                           unsigned len )
+{
+   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset);
+   const char *prim = get_prim_name( ptr[0] );
+   unsigned i;
+
+   PRINTF(stream, "%s %s (%d dwords):\n", name, prim, len);
+   PRINTF(stream, "\t0x%08x\n",  ptr[0]);
+   for (i = 1; i < len; i++) {
+      if (dump_floats) {
+         /* Reinterpret the bits through a union: reading an unsigned
+          * object through a float lvalue ("*(float *)&ptr[i]") violates
+          * the effective-type rules and may be miscompiled.
+          */
+         union { unsigned u; float f; } bits;
+
+         bits.u = ptr[i];
+         PRINTF(stream, "\t0x%08x // %f\n",  bits.u, bits.f);
+      }
+      else
+         PRINTF(stream, "\t0x%08x\n",  ptr[i]);
+   }
+
+
+   PRINTF(stream, "\n");
+
+   stream->offset += len * sizeof(unsigned);
+
+   return TRUE;
+}
+   
+
+
+
+/**
+ * Dump a pixel-shader program packet by handing its dwords to
+ * i915_disassemble_program(), then advance the stream past the packet.
+ *
+ * \param stream  decode cursor; advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords
+ * \return TRUE on success; FALSE (after asserting) when len is zero
+ */
+static boolean debug_program( struct debug_stream *stream, const char *name, unsigned len )
+{
+   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset);
+
+   if (len == 0) {
+      /* Show the dword at the current offset, not the first byte of the
+       * whole buffer (which is what stream->ptr[0] would be).
+       */
+      PRINTF(stream, "Error - zero length packet (0x%08x)\n", ptr[0]);
+      assert(0);
+      return FALSE;
+   }
+
+   if (stream->print_addresses)
+      PRINTF(stream, "%08x:  ", stream->offset);
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   i915_disassemble_program( stream, ptr, len );
+
+   stream->offset += len * sizeof(unsigned);
+   return TRUE;
+}
+
+
+/**
+ * Dump a chaining packet and redirect the stream to its target.
+ *
+ * After printing the packet's dwords, stream->offset is replaced by the
+ * second dword with its two low bits cleared.  A note is printed saying
+ * whether the jump goes backwards or forwards relative to the offset
+ * that would have followed this packet.
+ *
+ * \return always TRUE
+ */
+static boolean debug_chain( struct debug_stream *stream, const char *name, unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   /* Offset decoding would have continued at without the jump. */
+   const unsigned fallthrough = stream->offset + len * sizeof(unsigned);
+   unsigned n;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   for (n = 0; n < len; n++)
+      PRINTF(stream, "\t0x%08x\n", dwords[n]);
+
+   stream->offset = dwords[1] & ~0x3;
+
+   if (stream->offset >= fallthrough) {
+      PRINTF(stream, "\n... skipping from 0x%x --> 0x%x ...\n\n",
+             fallthrough, stream->offset );
+   }
+   else {
+      PRINTF(stream, "\n... skipping backwards from 0x%x --> 0x%x ...\n\n",
+             fallthrough, stream->offset );
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Dump an indexed primitive whose index list is terminated by 0xffff
+ * rather than described by a length in the header.
+ *
+ * The 16-bit indices following the header dword are scanned until the
+ * 0xffff terminator; the packet length in dwords is derived from that
+ * count and the packet is printed in hex.  The scan itself is not
+ * bounded by any buffer end.
+ *
+ * \return always TRUE
+ */
+static boolean debug_variable_length_prim( struct debug_stream *stream )
+{
+   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset);
+   const char *prim = get_prim_name( ptr[0] );
+   const ushort *indices = (const ushort *)(ptr + 1);
+   unsigned count = 0;
+   unsigned len;
+   unsigned n;
+
+   while (indices[count] != 0xffff)
+      count++;
+
+   /* One header dword, plus the indices and their terminator packed two
+    * per dword and rounded up.
+    */
+   len = 1 + (count + 2) / 2;
+
+   PRINTF(stream, "3DPRIM, %s variable length %d indicies (%d dwords):\n", prim, count, len);
+   for (n = 0; n < len; n++)
+      PRINTF(stream, "\t0x%08x\n", ptr[n]);
+   PRINTF(stream, "\n");
+
+   stream->offset += len * sizeof(unsigned);
+   return TRUE;
+}
+
+
+/**
+ * Print a labelled bit-field of a dword.
+ *
+ * The label is built printf-style from fmt and the variadic arguments
+ * and is followed by ": 0x<value>", where value is bits hi..lo
+ * (inclusive) of dw shifted down to bit 0.
+ *
+ * \param stream  passed through to PRINTF
+ * \param dw      dword containing the field
+ * \param hi      most significant bit of the field (0..31)
+ * \param lo      least significant bit of the field
+ * \param fmt     printf-style format for the label
+ */
+static void
+BITS(
+   struct debug_stream  *stream,
+   unsigned             dw,
+   unsigned             hi,
+   unsigned             lo,
+   const char           *fmt,
+                        ... )
+{
+   va_list  args;
+   /* Use a 32-bit constant: with "~0UL" the shift is done in 64 bits
+    * where unsigned long is 64 bits wide, and the truncated result is
+    * all ones for every hi, so nothing above the field gets masked off.
+    */
+   unsigned himask = 0xffffffffU >> (31 - (hi));
+
+   PRINTF(stream, "\t\t ");
+
+   va_start( args, fmt );
+   debug_vprintf( fmt, args );
+   va_end( args );
+
+   PRINTF(stream, ": 0x%x\n", ((dw) & himask) >> (lo));
+}
+
+/* MBZ( dw, hi, lo ): "must be zero" check on a range of bits of dw.
+ *
+ * With DEBUG defined this asserts on a masked copy of dw; otherwise it
+ * expands to an empty statement.
+ *
+ * NOTE(review): dw is shifted right by lo *before* being masked with
+ * himask = (1 << hi) - 1 and ~lomask = ~((1 << lo) - 1), both of which
+ * are expressed in unshifted bit positions.  As written, the bits of dw
+ * that are actually tested are [2*lo .. hi+lo-1], not [lo .. hi].
+ * Confirm the intended range, and audit the callers, before tightening.
+ *
+ * The macro declares locals named x, lomask and himask; do not pass an
+ * expression that refers to variables with those names.
+ */
+#ifdef DEBUG
+#define MBZ( dw, hi, lo) do {							\
+   unsigned x = (dw) >> (lo);				\
+   unsigned lomask = (1 << (lo)) - 1;			\
+   unsigned himask;					\
+   himask = (1UL << (hi)) - 1;				\
+   assert ((x & himask & ~lomask) == 0);	\
+} while (0)
+#else
+#define MBZ( dw, hi, lo) do {							\
+} while (0)
+#endif
+
+/**
+ * Print a printf-style label on its own line, but only when the given
+ * bit of dw is set.  Nothing at all is printed when the bit is clear.
+ *
+ * \param stream  passed through to PRINTF
+ * \param dw      dword to test
+ * \param bit     index of the bit to test
+ * \param fmt     printf-style format for the label
+ */
+static void
+FLAG( struct debug_stream *stream, unsigned dw, unsigned bit,
+      const char *fmt, ... )
+{
+   va_list ap;
+
+   if (!((dw >> bit) & 1))
+      return;
+
+   PRINTF(stream, "\t\t ");
+
+   va_start( ap, fmt );
+   debug_vprintf( fmt, ap );
+   va_end( ap );
+
+   PRINTF(stream, "\n");
+}
+
+/**
+ * Decode a load-state-immediate packet.
+ *
+ * Bits 4..11 of the header dword say which LIS dwords follow, in
+ * ascending order.  LIS0 through LIS6 are understood here; each one
+ * present is printed raw and then broken down into its fields.
+ *
+ * \param stream  decode cursor; advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords; asserted to equal the number
+ *                of dwords actually consumed
+ * \return always TRUE
+ */
+static boolean debug_load_immediate( struct debug_stream *stream,
+                                     const char *name,
+                                     unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   const unsigned present = (dwords[0] >> 4) & 0xff;
+   unsigned pos = 0;
+
+   PRINTF(stream, "%s (%d dwords, flags: %x):\n", name, len, present);
+   PRINTF(stream, "\t0x%08x\n", dwords[pos++]);
+
+   if (present & (1<<0)) {
+      const unsigned lis0 = dwords[pos++];
+
+      PRINTF(stream, "\t  LIS0: 0x%08x\n", lis0);
+      PRINTF(stream, "\t vb address: 0x%08x\n", (lis0 & ~0x3));
+      BITS(stream, lis0, 0, 0, "vb invalidate disable");
+   }
+
+   if (present & (1<<1)) {
+      const unsigned lis1 = dwords[pos++];
+
+      PRINTF(stream, "\t  LIS1: 0x%08x\n", lis1);
+      BITS(stream, lis1, 29, 24, "vb dword width");
+      BITS(stream, lis1, 21, 16, "vb dword pitch");
+      BITS(stream, lis1, 15, 0, "vb max index");
+   }
+
+   if (present & (1<<2)) {
+      const unsigned lis2 = dwords[pos++];
+      int unit;
+
+      PRINTF(stream, "\t  LIS2: 0x%08x\n", lis2);
+      /* One 4-bit entry per texture coordinate set; 0xf is skipped. */
+      for (unit = 0; unit < 8; unit++) {
+         const unsigned tc = (lis2 >> (unit * 4)) & 0xf;
+
+         if (tc == 0xf)
+            continue;
+         BITS(stream, tc, 3, 0, "tex coord %d", unit);
+      }
+   }
+
+   if (present & (1<<3)) {
+      PRINTF(stream, "\t  LIS3: 0x%08x\n", dwords[pos++]);
+   }
+
+   if (present & (1<<4)) {
+      const unsigned lis4 = dwords[pos++];
+
+      PRINTF(stream, "\t  LIS4: 0x%08x\n", lis4);
+      BITS(stream, lis4, 31, 23, "point width");
+      BITS(stream, lis4, 22, 19, "line width");
+      FLAG(stream, lis4, 18, "alpha flatshade");
+      FLAG(stream, lis4, 17, "fog flatshade");
+      FLAG(stream, lis4, 16, "spec flatshade");
+      FLAG(stream, lis4, 15, "rgb flatshade");
+      BITS(stream, lis4, 14, 13, "cull mode");
+      FLAG(stream, lis4, 12, "vfmt: point width");
+      FLAG(stream, lis4, 11, "vfmt: specular/fog");
+      FLAG(stream, lis4, 10, "vfmt: rgba");
+      FLAG(stream, lis4, 9, "vfmt: depth offset");
+      BITS(stream, lis4, 8, 6, "vfmt: position (2==xyzw)");
+      FLAG(stream, lis4, 5, "force dflt diffuse");
+      FLAG(stream, lis4, 4, "force dflt specular");
+      FLAG(stream, lis4, 3, "local depth offset enable");
+      FLAG(stream, lis4, 2, "vfmt: fp32 fog coord");
+      FLAG(stream, lis4, 1, "sprite point");
+      FLAG(stream, lis4, 0, "antialiasing");
+   }
+
+   if (present & (1<<5)) {
+      const unsigned lis5 = dwords[pos++];
+
+      PRINTF(stream, "\t  LIS5: 0x%08x\n", lis5);
+      BITS(stream, lis5, 31, 28, "rgba write disables");
+      FLAG(stream, lis5, 27,     "force dflt point width");
+      FLAG(stream, lis5, 26,     "last pixel enable");
+      FLAG(stream, lis5, 25,     "global z offset enable");
+      FLAG(stream, lis5, 24,     "fog enable");
+      BITS(stream, lis5, 23, 16, "stencil ref");
+      BITS(stream, lis5, 15, 13, "stencil test");
+      BITS(stream, lis5, 12, 10, "stencil fail op");
+      BITS(stream, lis5, 9, 7,   "stencil pass z fail op");
+      BITS(stream, lis5, 6, 4,   "stencil pass z pass op");
+      FLAG(stream, lis5, 3,      "stencil write enable");
+      FLAG(stream, lis5, 2,      "stencil test enable");
+      FLAG(stream, lis5, 1,      "color dither enable");
+      FLAG(stream, lis5, 0,      "logiop enable");
+   }
+
+   if (present & (1<<6)) {
+      const unsigned lis6 = dwords[pos++];
+
+      PRINTF(stream, "\t  LIS6: 0x%08x\n", lis6);
+      FLAG(stream, lis6, 31,      "alpha test enable");
+      BITS(stream, lis6, 30, 28,  "alpha func");
+      BITS(stream, lis6, 27, 20,  "alpha ref");
+      FLAG(stream, lis6, 19,      "depth test enable");
+      BITS(stream, lis6, 18, 16,  "depth func");
+      FLAG(stream, lis6, 15,      "blend enable");
+      BITS(stream, lis6, 14, 12,  "blend func");
+      BITS(stream, lis6, 11, 8,   "blend src factor");
+      BITS(stream, lis6, 7,  4,   "blend dst factor");
+      FLAG(stream, lis6, 3,       "depth write enable");
+      FLAG(stream, lis6, 2,       "color write enable");
+      BITS(stream, lis6, 1,  0,   "provoking vertex");
+   }
+
+   PRINTF(stream, "\n");
+
+   assert(pos == len);
+
+   stream->offset += len * sizeof(unsigned);
+
+   return TRUE;
+}
+ 
+
+
+/**
+ * Decode a load-indirect packet.
+ *
+ * Bits 8..13 of the header select which indirect-state entries follow.
+ * Every entry starts with a dword printed as "address | low two bits";
+ * all kinds except DYNAMIC carry a second dword, printed raw.  When no
+ * selector bit is set a single dword is consumed and printed as DUMMY.
+ *
+ * \param stream  decode cursor; advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords; asserted to equal the number
+ *                of dwords actually consumed
+ * \return always TRUE
+ */
+static boolean debug_load_indirect( struct debug_stream *stream,
+                                    const char *name,
+                                    unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   const unsigned present = (dwords[0] >> 8) & 0x3f;
+   unsigned pos = 0;
+   unsigned bit;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", dwords[pos++]);
+
+   for (bit = 0; bit < 6; bit++) {
+      const char *first_fmt = "";
+      boolean has_second = TRUE;
+
+      if (!(present & (1<<bit)))
+         continue;
+
+      switch (1<<(8+bit)) {
+      case LI0_STATE_STATIC_INDIRECT:
+         first_fmt = "        STATIC: 0x%08x | %x\n";
+         break;
+      case LI0_STATE_DYNAMIC_INDIRECT:
+         first_fmt = "       DYNAMIC: 0x%08x | %x\n";
+         has_second = FALSE;
+         break;
+      case LI0_STATE_SAMPLER:
+         first_fmt = "       SAMPLER: 0x%08x | %x\n";
+         break;
+      case LI0_STATE_MAP:
+         first_fmt = "           MAP: 0x%08x | %x\n";
+         break;
+      case LI0_STATE_PROGRAM:
+         first_fmt = "       PROGRAM: 0x%08x | %x\n";
+         break;
+      case LI0_STATE_CONSTANTS:
+         first_fmt = "     CONSTANTS: 0x%08x | %x\n";
+         break;
+      default:
+         /* Unknown selector: nothing is printed or consumed. */
+         assert(0);
+         continue;
+      }
+
+      PRINTF(stream, first_fmt, dwords[pos]&~3, dwords[pos]&3);
+      pos++;
+
+      if (has_second)
+         PRINTF(stream, "                0x%08x\n", dwords[pos++]);
+   }
+
+   if (present == 0) {
+      PRINTF(stream, "\t  DUMMY: 0x%08x\n", dwords[pos++]);
+   }
+
+   PRINTF(stream, "\n");
+
+   assert(pos == len);
+
+   stream->offset += len * sizeof(unsigned);
+
+   return TRUE;
+}
+ 	
+/* Print a BR13 dword: raw value, then clipping flag, color depth,
+ * raster op and destination pitch.
+ */
+static void BR13( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x\n", val);
+
+   FLAG(stream, val, 30,     "clipping enable");
+   BITS(stream, val, 25, 24, "color depth (3==32bpp)");
+   BITS(stream, val, 23, 16, "raster op");
+   BITS(stream, val, 15, 0,  "dest pitch");
+}
+
+
+/* Print a BR22 dword: raw value, then destination y1 (high half) and
+ * x1 (low half).
+ */
+static void BR22( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x\n", val);
+
+   BITS(stream, val, 31, 16, "dest y1");
+   BITS(stream, val, 15, 0,  "dest x1");
+}
+
+/* Print a BR23 dword: raw value, then destination y2 (high half) and
+ * x2 (low half).
+ */
+static void BR23( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x\n", val);
+
+   BITS(stream, val, 31, 16, "dest y2");
+   BITS(stream, val, 15, 0,  "dest x2");
+}
+
+/* Print a BR09 dword, annotated as the destination address. */
+static void BR09( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x -- dest address\n", val);
+}
+
+/* Print a BR26 dword: raw value, then source y1 (high half) and x1
+ * (low half).
+ */
+static void BR26( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x\n", val);
+
+   BITS(stream, val, 31, 16, "src y1");
+   BITS(stream, val, 15, 0,  "src x1");
+}
+
+/* Print a BR11 dword: raw value, then the source pitch in its low half. */
+static void BR11( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x\n", val);
+
+   BITS(stream, val, 15, 0,  "src pitch");
+}
+
+/* Print a BR12 dword, annotated as the source address. */
+static void BR12( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x -- src address\n", val);
+}
+
+/* Print a BR16 dword, annotated as the color value. */
+static void BR16( struct debug_stream *stream, unsigned val )
+{
+   PRINTF(stream, "\t0x%08x -- color\n", val);
+}
+   
+/**
+ * Decode a source-copy blit packet: the header dword followed by BR13,
+ * BR22, BR23, BR09, BR26, BR11 and BR12, in that order.
+ *
+ * Eight dwords are always decoded; len is used for the stream advance
+ * and is asserted to be eight.
+ *
+ * \return always TRUE
+ */
+static boolean debug_copy_blit( struct debug_stream *stream,
+                                const char *name,
+                                unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   const unsigned consumed = 8;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", dwords[0]);
+
+   BR13(stream, dwords[1]);
+   BR22(stream, dwords[2]);
+   BR23(stream, dwords[3]);
+   BR09(stream, dwords[4]);
+   BR26(stream, dwords[5]);
+   BR11(stream, dwords[6]);
+   BR12(stream, dwords[7]);
+
+   stream->offset += len * sizeof(unsigned);
+   assert(consumed == len);
+   return TRUE;
+}
+
+/**
+ * Decode a color-fill blit packet: the header dword followed by BR13,
+ * BR22, BR23, BR09 and BR16, in that order.
+ *
+ * Six dwords are always decoded; len is used for the stream advance and
+ * is asserted to be six.
+ *
+ * \return always TRUE
+ */
+static boolean debug_color_blit( struct debug_stream *stream,
+                                 const char *name,
+                                 unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   const unsigned consumed = 6;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", dwords[0]);
+
+   BR13(stream, dwords[1]);
+   BR22(stream, dwords[2]);
+   BR23(stream, dwords[3]);
+   BR09(stream, dwords[4]);
+   BR16(stream, dwords[5]);
+
+   stream->offset += len * sizeof(unsigned);
+   assert(consumed == len);
+   return TRUE;
+}
+
+/**
+ * Decode a single-dword MODES4 packet: logic-op function plus the
+ * stencil test/write masks and their modify-enable flags.
+ *
+ * One dword is always decoded; len is used for the stream advance and
+ * is asserted to be one.
+ *
+ * \return always TRUE
+ */
+static boolean debug_modes4( struct debug_stream *stream,
+                             const char *name,
+                             unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   const unsigned modes4 = dwords[0];
+   const unsigned consumed = 1;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", modes4);
+
+   BITS(stream, modes4, 21, 18, "logicop func");
+   FLAG(stream, modes4, 17, "stencil test mask modify-enable");
+   FLAG(stream, modes4, 16, "stencil write mask modify-enable");
+   BITS(stream, modes4, 15, 8, "stencil test mask");
+   BITS(stream, modes4, 7, 0,  "stencil write mask");
+
+   stream->offset += len * sizeof(unsigned);
+   assert(consumed == len);
+   return TRUE;
+}
+
+/**
+ * Decode a map-state packet.
+ *
+ * Layout as decoded here: header dword, a dword whose low 16 bits are
+ * the map mask, then groups of three dwords (TMn.0, TMn.1, TMn.2) until
+ * len dwords have been consumed.
+ *
+ * \param stream  decode cursor; advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords; asserted to equal the number
+ *                of dwords actually consumed
+ * \return always TRUE
+ */
+static boolean debug_map_state( struct debug_stream *stream,
+                                const char *name,
+                                unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   unsigned pos;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", dwords[0]);
+
+   PRINTF(stream, "\t0x%08x\n", dwords[1]);
+   BITS(stream, dwords[1], 15, 0,   "map mask");
+
+   for (pos = 2; pos < len; pos += 3) {
+      {
+         const unsigned tm0 = dwords[pos];
+
+         PRINTF(stream, "\t  TMn.0: 0x%08x\n", tm0);
+         PRINTF(stream, "\t map address: 0x%08x\n", (tm0 & ~0x3));
+         FLAG(stream, tm0, 1, "vertical line stride");
+         FLAG(stream, tm0, 0, "vertical line stride offset");
+      }
+      {
+         const unsigned tm1 = dwords[pos + 1];
+
+         PRINTF(stream, "\t  TMn.1: 0x%08x\n", tm1);
+         BITS(stream, tm1, 31, 21, "height");
+         BITS(stream, tm1, 20, 10, "width");
+         BITS(stream, tm1, 9, 7, "surface format");
+         BITS(stream, tm1, 6, 3, "texel format");
+         FLAG(stream, tm1, 2, "use fence regs");
+         FLAG(stream, tm1, 1, "tiled surface");
+         FLAG(stream, tm1, 0, "tile walk ymajor");
+      }
+      {
+         const unsigned tm2 = dwords[pos + 2];
+
+         PRINTF(stream, "\t  TMn.2: 0x%08x\n", tm2);
+         BITS(stream, tm2, 31, 21, "dword pitch");
+         BITS(stream, tm2, 20, 15, "cube face enables");
+         BITS(stream, tm2, 14, 9, "max lod");
+         FLAG(stream, tm2, 8,     "mip layout right");
+         BITS(stream, tm2, 7, 0, "depth");
+      }
+   }
+
+   stream->offset += len * sizeof(unsigned);
+   assert(pos == len);
+   return TRUE;
+}
+
+/**
+ * Decode a sampler-state packet.
+ *
+ * Layout as decoded here: header dword, a dword whose low 16 bits are
+ * the sampler mask, then groups of three dwords (TSn.0, TSn.1, TSn.2)
+ * until len dwords have been consumed.
+ *
+ * \param stream  decode cursor; advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords; asserted to equal the number
+ *                of dwords actually consumed
+ * \return always TRUE
+ */
+static boolean debug_sampler_state( struct debug_stream *stream,
+                                    const char *name,
+                                    unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   unsigned pos;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", dwords[0]);
+
+   PRINTF(stream, "\t0x%08x\n", dwords[1]);
+   BITS(stream, dwords[1], 15, 0,   "sampler mask");
+
+   for (pos = 2; pos < len; pos += 3) {
+      {
+         const unsigned ts0 = dwords[pos];
+
+         PRINTF(stream, "\t  TSn.0: 0x%08x\n", ts0);
+         FLAG(stream, ts0, 31, "reverse gamma");
+         FLAG(stream, ts0, 30, "planar to packed");
+         FLAG(stream, ts0, 29, "yuv->rgb");
+         BITS(stream, ts0, 28, 27, "chromakey index");
+         BITS(stream, ts0, 26, 22, "base mip level");
+         BITS(stream, ts0, 21, 20, "mip mode filter");
+         BITS(stream, ts0, 19, 17, "mag mode filter");
+         BITS(stream, ts0, 16, 14, "min mode filter");
+         BITS(stream, ts0, 13, 5,  "lod bias (s4.4)");
+         FLAG(stream, ts0, 4,      "shadow enable");
+         FLAG(stream, ts0, 3,      "max-aniso-4");
+         BITS(stream, ts0, 2, 0,   "shadow func");
+      }
+      {
+         const unsigned ts1 = dwords[pos + 1];
+
+         PRINTF(stream, "\t  TSn.1: 0x%08x\n", ts1);
+         BITS(stream, ts1, 31, 24, "min lod");
+         MBZ( ts1, 23, 18 );
+         FLAG(stream, ts1, 17,     "kill pixel enable");
+         FLAG(stream, ts1, 16,     "keyed tex filter mode");
+         FLAG(stream, ts1, 15,     "chromakey enable");
+         BITS(stream, ts1, 14, 12, "tcx wrap mode");
+         BITS(stream, ts1, 11, 9,  "tcy wrap mode");
+         BITS(stream, ts1, 8,  6,  "tcz wrap mode");
+         FLAG(stream, ts1, 5,      "normalized coords");
+         BITS(stream, ts1, 4,  1,  "map (surface) index");
+         FLAG(stream, ts1, 0,      "EAST deinterlacer enable");
+      }
+      {
+         PRINTF(stream, "\t  TSn.2: 0x%08x  (default color)\n", dwords[pos + 2]);
+      }
+   }
+
+   stream->offset += len * sizeof(unsigned);
+   assert(pos == len);
+   return TRUE;
+}
+
+/**
+ * Decode a destination-buffer-variables packet: the header dword
+ * followed by one dword of flags and small fields.
+ *
+ * \param stream  decode cursor; advanced by len dwords
+ * \param name    label printed for the packet
+ * \param len     packet length in dwords; asserted to equal the two
+ *                dwords decoded here
+ * \return always TRUE
+ */
+static boolean debug_dest_vars( struct debug_stream *stream,
+				  const char *name,
+				  unsigned len )
+{
+   unsigned *ptr = (unsigned *)(stream->ptr + stream->offset);
+   unsigned j = 0;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n",  ptr[j++]);
+
+   {
+      PRINTF(stream, "\t0x%08x\n",  ptr[j]);
+      FLAG(stream, ptr[j], 31,     "early classic ztest");
+      FLAG(stream, ptr[j], 30,     "opengl tex default color");
+      FLAG(stream, ptr[j], 29,     "bypass iz");
+      FLAG(stream, ptr[j], 28,     "lod preclamp");
+      BITS(stream, ptr[j], 27, 26, "dither pattern");
+      FLAG(stream, ptr[j], 25,     "linear gamma blend");
+      FLAG(stream, ptr[j], 24,     "debug dither");
+      BITS(stream, ptr[j], 23, 20, "dstorg x");
+      BITS(stream, ptr[j], 19, 16, "dstorg y");
+      MBZ (ptr[j], 15, 15 );
+      BITS(stream, ptr[j], 14, 12, "422 write select");
+      BITS(stream, ptr[j], 11, 8,  "cbuf format");
+      BITS(stream, ptr[j], 3, 2,   "zbuf format");
+      FLAG(stream, ptr[j], 1,      "vert line stride");
+      /* The stride offset is the bit below the stride flag (bit 0), as
+       * in the map-state decoder; testing bit 1 again just repeated the
+       * previous flag under a different label.
+       */
+      FLAG(stream, ptr[j], 0,      "vert line stride offset");
+      j++;
+   }
+
+   stream->offset += len * sizeof(unsigned);
+   assert(j == len);
+   return TRUE;
+}
+
+/**
+ * Decode a buffer-info packet: header dword, one dword of buffer id,
+ * tiling flags and pitch, then the buffer base address.
+ *
+ * Three dwords are always decoded; len is used for the stream advance
+ * and is asserted to be three.
+ *
+ * \return always TRUE
+ */
+static boolean debug_buf_info( struct debug_stream *stream,
+                               const char *name,
+                               unsigned len )
+{
+   const unsigned *dwords = (unsigned *)(stream->ptr + stream->offset);
+   const unsigned info = dwords[1];
+   const unsigned consumed = 3;
+
+   PRINTF(stream, "%s (%d dwords):\n", name, len);
+   PRINTF(stream, "\t0x%08x\n", dwords[0]);
+
+   PRINTF(stream, "\t0x%08x\n", info);
+   BITS(stream, info, 28, 28, "aux buffer id");
+   BITS(stream, info, 27, 24, "buffer id (7=depth, 3=back)");
+   FLAG(stream, info, 23,     "use fence regs");
+   FLAG(stream, info, 22,     "tiled surface");
+   FLAG(stream, info, 21,     "tile walk ymajor");
+   MBZ (info, 20, 14);
+   BITS(stream, info, 13, 2,  "dword pitch");
+   MBZ (info, 2,  0);
+
+   PRINTF(stream, "\t0x%08x -- buffer base address\n", dwords[2]);
+
+   stream->offset += len * sizeof(unsigned);
+   assert(consumed == len);
+   return TRUE;
+}
+
+/**
+ * Decode the single packet at the stream's current offset.
+ *
+ * The top three bits of the header dword select the command class;
+ * further opcode bits select the specific decoder and, for variable
+ * length packets, low header bits give the length.
+ *
+ * \return the decoder's result; FALSE after MI_BATCH_BUFFER_END has
+ *         been printed and for unrecognised packets (which also assert)
+ */
+static boolean i915_debug_packet( struct debug_stream *stream )
+{
+   const unsigned cmd = *(unsigned *)(stream->ptr + stream->offset);
+   const unsigned client = (cmd >> 29) & 0x7;
+   /* Lengths of packets whose header carries "dwords - 2" in its low bits. */
+   const unsigned len8 = (cmd & 0xff) + 2;
+   const unsigned len16 = (cmd & 0xffff) + 2;
+
+   if (client == 0x0) {
+      switch ((cmd >> 23) & 0x3f) {
+      case 0x0:
+         return debug(stream, "MI_NOOP", 1);
+      case 0x3:
+         return debug(stream, "MI_WAIT_FOR_EVENT", 1);
+      case 0x4:
+         return debug(stream, "MI_FLUSH", 1);
+      case 0xA:
+         /* Printed like any other packet, but terminates the dump. */
+         debug(stream, "MI_BATCH_BUFFER_END", 1);
+         return FALSE;
+      case 0x22:
+         return debug(stream, "MI_LOAD_REGISTER_IMM", 3);
+      case 0x31:
+         return debug_chain(stream, "MI_BATCH_BUFFER_START", 2);
+      default:
+         (void) debug(stream, "UNKNOWN 0x0 case!", 1);
+         assert(0);
+         return FALSE;
+      }
+   }
+
+   if (client == 0x1) {
+      (void) debug(stream, "UNKNOWN 0x1 case!", 1);
+      assert(0);
+      return FALSE;
+   }
+
+   if (client == 0x2) {
+      switch ((cmd >> 22) & 0xff) {
+      case 0x50:
+         return debug_color_blit(stream, "XY_COLOR_BLT", len8);
+      case 0x53:
+         return debug_copy_blit(stream, "XY_SRC_COPY_BLT", len8);
+      default:
+         return debug(stream, "blit command", len8);
+      }
+   }
+
+   if (client != 0x3) {
+      assert(0);
+      return FALSE;
+   }
+
+   switch ((cmd >> 24) & 0x1f) {
+   case 0x6:
+      return debug(stream, "3DSTATE_ANTI_ALIASING", 1);
+   case 0x7:
+      return debug(stream, "3DSTATE_RASTERIZATION_RULES", 1);
+   case 0x8:
+      return debug(stream, "3DSTATE_BACKFACE_STENCIL_OPS", 2);
+   case 0x9:
+      return debug(stream, "3DSTATE_BACKFACE_STENCIL_MASKS", 1);
+   case 0xb:
+      return debug(stream, "3DSTATE_INDEPENDENT_ALPHA_BLEND", 1);
+   case 0xc:
+      return debug(stream, "3DSTATE_MODES5", 1);
+   case 0xd:
+      return debug_modes4(stream, "3DSTATE_MODES4", 1);
+   case 0x15:
+      return debug(stream, "3DSTATE_FOG_COLOR", 1);
+   case 0x16:
+      return debug(stream, "3DSTATE_COORD_SET_BINDINGS", 1);
+
+   case 0x1c:
+      /* 3DState16NP */
+      switch ((cmd >> 19) & 0x1f) {
+      case 0x10:
+         return debug(stream, "3DSTATE_SCISSOR_ENABLE", 1);
+      case 0x11:
+         return debug(stream, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE", 1);
+      default:
+         (void) debug(stream, "UNKNOWN 0x1c case!", 1);
+         assert(0);
+         return FALSE;
+      }
+
+   case 0x1d:
+      /* 3DStateMW */
+      switch ((cmd >> 16) & 0xff) {
+      case 0x0:
+         return debug_map_state(stream, "3DSTATE_MAP_STATE", (cmd & 0x1f) + 2);
+      case 0x1:
+         return debug_sampler_state(stream, "3DSTATE_SAMPLER_STATE", (cmd & 0x1f) + 2);
+      case 0x4:
+         return debug_load_immediate(stream, "3DSTATE_LOAD_STATE_IMMEDIATE", (cmd & 0xf) + 2);
+      case 0x5:
+         return debug_program(stream, "3DSTATE_PIXEL_SHADER_PROGRAM", (cmd & 0x1ff) + 2);
+      case 0x6:
+         return debug(stream, "3DSTATE_PIXEL_SHADER_CONSTANTS", len8);
+      case 0x7:
+         return debug_load_indirect(stream, "3DSTATE_LOAD_INDIRECT", len8);
+      case 0x80:
+         return debug(stream, "3DSTATE_DRAWING_RECTANGLE", len16);
+      case 0x81:
+         return debug(stream, "3DSTATE_SCISSOR_RECTANGLE", len16);
+      case 0x83:
+         return debug(stream, "3DSTATE_SPAN_STIPPLE", len16);
+      case 0x85:
+         return debug_dest_vars(stream, "3DSTATE_DEST_BUFFER_VARS", len16);
+      case 0x88:
+         return debug(stream, "3DSTATE_CONSTANT_BLEND_COLOR", len16);
+      case 0x89:
+         return debug(stream, "3DSTATE_FOG_MODE", len16);
+      case 0x8e:
+         return debug_buf_info(stream, "3DSTATE_BUFFER_INFO", len16);
+      case 0x97:
+         return debug(stream, "3DSTATE_DEPTH_OFFSET_SCALE", len16);
+      case 0x98:
+         return debug(stream, "3DSTATE_DEFAULT_Z", len16);
+      case 0x99:
+         return debug(stream, "3DSTATE_DEFAULT_DIFFUSE", len16);
+      case 0x9a:
+         return debug(stream, "3DSTATE_DEFAULT_SPECULAR", len16);
+      case 0x9c:
+         return debug(stream, "3DSTATE_CLEAR_PARAMETERS", len16);
+      default:
+         assert(0);
+         return FALSE;
+      }
+
+   case 0x1e:
+      if (cmd & (1 << 23))
+         return debug(stream, "???", (cmd & 0xffff) + 1);
+      return debug(stream, "", 1);
+
+   case 0x1f:
+      if ((cmd & (1 << 23)) == 0)
+         return debug_prim(stream, "3DPRIM (inline)", 1, (cmd & 0x1ffff) + 2);
+
+      if (cmd & (1 << 17)) {
+         /* A zero count means the index list is 0xffff-terminated. */
+         if ((cmd & 0xffff) == 0)
+            return debug_variable_length_prim(stream);
+
+         return debug_prim(stream, "3DPRIM (indexed)", 0, (((cmd & 0xffff) + 1) / 2) + 1);
+      }
+
+      return debug_prim(stream, "3DPRIM  (indirect sequential)", 0, 2);
+
+   default:
+      /* A length of zero makes debug() report the error and fail. */
+      return debug(stream, "", 0);
+   }
+}
+
+
+
+/**
+ * Print a decoded listing of a batch buffer.
+ *
+ * The range from batch->map up to batch->ptr is decoded packet by
+ * packet until the range is exhausted or a packet decoder returns
+ * FALSE.  If either pointer is NULL only a placeholder line is printed.
+ *
+ * \param batch  batch buffer to dump
+ */
+void
+i915_dump_batchbuffer( struct i915_winsys_batchbuffer *batch )
+{
+   struct debug_stream stream;
+   unsigned *start = (unsigned*)batch->map;
+   unsigned *end = (unsigned*)batch->ptr;
+   unsigned long bytes;
+
+   if (!start || !end) {
+      debug_printf( "\n\nBATCH: ???\n");
+      return;
+   }
+
+   /* Only take the pointer difference once both ends are known to be
+    * valid.
+    */
+   bytes = (unsigned long) (end - start) * 4;
+
+   stream.offset = 0;
+   stream.ptr = (char *)start;
+   stream.end = (char *)end;   /* was left indeterminate */
+   stream.print_addresses = 0;
+
+   debug_printf( "\n\nBATCH: (%d)\n", (int)bytes / 4);
+
+   while (stream.offset < bytes)
+   {
+      if (!i915_debug_packet( &stream ))
+	 break;
+
+      /* stream.offset is unsigned, so only the upper bound is checked. */
+      assert(stream.offset <= bytes);
+   }
+
+   debug_printf( "END-BATCH\n\n\n");
+}
+
+
diff --git a/src/gallium/drivers/i915/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h
new file mode 100644
index 0000000000..67b8d9c2f6
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_debug.h
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef I915_DEBUG_H
+#define I915_DEBUG_H
+
+#include <stdarg.h>
+
+struct i915_context;
+
+/* Cursor over a command buffer that is being decoded and printed. */
+struct debug_stream 
+{
+   unsigned offset;		/* current gtt offset; the packet decoders advance it as they consume dwords */
+   char *ptr;		/* pointer to gtt offset zero */
+   char *end;		/* NOTE(review): presumably the end of the decodable range; the previous comment was a copy of ptr's -- confirm */
+   unsigned print_addresses;	/* when non-zero, some decoders print the offset in front of the packet */
+};
+
+
+/* Internal functions
+ */
+void i915_disassemble_program(struct debug_stream *stream, 
+			      const unsigned *program, unsigned sz);
+
+void i915_print_ureg(const char *msg, unsigned ureg);
+
+
+#define DEBUG_BATCH	 0x1
+#define DEBUG_BLIT       0x2
+#define DEBUG_BUFFER     0x4
+#define DEBUG_CONSTANTS  0x8
+#define DEBUG_CONTEXT    0x10
+#define DEBUG_DRAW	 0x20
+#define DEBUG_DYNAMIC	 0x40
+#define DEBUG_FLUSH      0x80
+#define DEBUG_MAP	 0x100
+#define DEBUG_PROGRAM	 0x200
+#define DEBUG_REGIONS    0x400
+#define DEBUG_SAMPLER	 0x800
+#define DEBUG_STATIC	 0x1000
+#define DEBUG_SURFACE    0x2000
+#define DEBUG_WINSYS     0x4000
+
+#include "pipe/p_compiler.h"
+
+#if defined(DEBUG) && defined(FILE_DEBUG_FLAG)
+
+#include "util/u_simple_screen.h"
+
+/* Debug-build variant: forward the printf-style message to
+ * debug_vprintf(), but only when the context's debug mask has this
+ * file's FILE_DEBUG_FLAG bit set.
+ */
+static INLINE void
+I915_DBG( struct i915_context *i915, const char *fmt, ... )
+{
+   va_list ap;
+
+   if (!(i915->debug & FILE_DEBUG_FLAG))
+      return;
+
+   va_start( ap, fmt );
+   debug_vprintf( fmt, ap );
+   va_end( ap );
+}
+
+#else
+
+/* Variant used when DEBUG or FILE_DEBUG_FLAG is not defined: accepts
+ * the same arguments and does nothing with them.
+ */
+static INLINE void
+I915_DBG( struct i915_context *i915, const char *fmt, ... )
+{
+   /* Reference the parameters so they do not count as unused. */
+   (void) fmt;
+   (void) i915;
+}
+
+#endif
+
+
+struct i915_winsys_batchbuffer;
+
+void i915_dump_batchbuffer( struct i915_winsys_batchbuffer *i915 );
+
+void i915_debug_init( struct i915_context *i915 );
+
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_debug_fp.c b/src/gallium/drivers/i915/i915_debug_fp.c
new file mode 100644
index 0000000000..f41c51f299
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_debug_fp.c
@@ -0,0 +1,362 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "i915_reg.h"
+#include "i915_debug.h"
+#include "util/u_debug.h"
+
+
+/* printf-style output helper for the disassembler.  The stream argument
+ * is not used: the text always goes through debug_vprintf(). */
+static void
+PRINTF(struct debug_stream *stream, const char *fmt, ...)
+{
+   va_list ap;
+
+   (void) stream;
+   va_start(ap, fmt);
+   debug_vprintf(fmt, ap);
+   va_end(ap);
+}
+
+
+static const char *opcodes[0x20] = {	/* mnemonic for each value of the 5-bit opcode field */
+   "NOP",		/* 0x00 */
+   "ADD",		/* 0x01 */
+   "MOV",		/* 0x02 */
+   "MUL",		/* 0x03 */
+   "MAD",		/* 0x04 */
+   "DP2ADD",		/* 0x05 */
+   "DP3",		/* 0x06 */
+   "DP4",		/* 0x07 */
+   "FRC",		/* 0x08 */
+   "RCP",		/* 0x09 */
+   "RSQ",		/* 0x0a */
+   "EXP",		/* 0x0b */
+   "LOG",		/* 0x0c */
+   "CMP",		/* 0x0d */
+   "MIN",		/* 0x0e */
+   "MAX",		/* 0x0f */
+   "FLR",		/* 0x10 */
+   "MOD",		/* 0x11 */
+   "TRC",		/* 0x12 */
+   "SGE",		/* 0x13 */
+   "SLT",		/* 0x14 */
+   "TEXLD",		/* 0x15 */
+   "TEXLDP",		/* 0x16 */
+   "TEXLDB",		/* 0x17 */
+   "TEXKILL",		/* 0x18 */
+   "DCL",		/* 0x19 */
+   "0x1a",		/* 0x1a..0x1f have no mnemonic; they print as their hex value */
+   "0x1b",
+   "0x1c",
+   "0x1d",
+   "0x1e",
+   "0x1f",
+};
+
+
+static const int args[0x20] = {	/* source-operand count per opcode, same indexing as opcodes[] */
+   0,                           /* 0 nop */
+   2,                           /* 1 add */
+   1,                           /* 2 mov */
+   2,                           /* 3 mul */
+   3,                           /* 4 mad */
+   3,                           /* 5 dp2add */
+   2,                           /* 6 dp3 */
+   2,                           /* 7 dp4 */
+   1,                           /* 8 frc */
+   1,                           /* 9 rcp */
+   1,                           /* a rsq */
+   1,                           /* b exp */
+   1,                           /* c log */
+   3,                           /* d cmp */
+   2,                           /* e min */
+   2,                           /* f max */
+   1,                           /* 10 flr */
+   1,                           /* 11 mod */
+   1,                           /* 12 trc */
+   2,                           /* 13 sge */
+   2,                           /* 14 slt */
+   1,                           /* 15 texld */
+   1,                           /* 16 texldp */
+   1,                           /* 17 texldb */
+   1,                           /* 18 texkill */
+   0,                           /* 19 dcl */
+   0,                           /* 1a (no mnemonic) */
+   0,                           /* 1b (no mnemonic) */
+   0,                           /* 1c (no mnemonic) */
+   0,                           /* 1d (no mnemonic) */
+   0,                           /* 1e (no mnemonic) */
+   0,                           /* 1f (no mnemonic) */
+};
+
+
+static const char *regname[0x8] = {	/* printable prefix per register type, indexed by the masked type field */
+   "R",
+   "T",
+   "CONST",
+   "S",
+   "OC",
+   "OD",
+   "U",
+   "UNKNOWN",
+};
+
+/* Print a register operand by type and number.  T registers and register
+ * 0 of the OC/OD types get symbolic names; everything else is printed as
+ * "<TYPE>[<nr>]" using the regname[] table.
+ */
+static void
+print_reg_type_nr(struct debug_stream *stream, unsigned type, unsigned nr)
+{
+   if (type == REG_TYPE_T) {
+      /* Every T register has a name, so this branch always returns. */
+      if (nr == T_DIFFUSE) {
+         PRINTF(stream, "T_DIFFUSE");
+      }
+      else if (nr == T_SPECULAR) {
+         PRINTF(stream, "T_SPECULAR");
+      }
+      else if (nr == T_FOG_W) {
+         PRINTF(stream, "T_FOG_W");
+      }
+      else {
+         PRINTF(stream, "T_TEX%d", nr);
+      }
+      return;
+   }
+
+   /* Only register 0 of the output types has a short name. */
+   if (type == REG_TYPE_OC && nr == 0) {
+      PRINTF(stream, "oC");
+      return;
+   }
+   if (type == REG_TYPE_OD && nr == 0) {
+      PRINTF(stream, "oD");
+      return;
+   }
+
+   /* Generic form. */
+   PRINTF(stream, "%s[%d]", regname[type], nr);
+}
+
+#define REG_SWIZZLE_MASK 0x7777	/* low 3 bits of each nibble: channel select */
+#define REG_NEGATE_MASK 0x8888	/* top bit of each nibble: negate */
+
+#define REG_SWIZZLE_XYZW ((SRC_X << A2_SRC2_CHANNEL_X_SHIFT) |	\
+		      (SRC_Y << A2_SRC2_CHANNEL_Y_SHIFT) |	\
+		      (SRC_Z << A2_SRC2_CHANNEL_Z_SHIFT) |	\
+		      (SRC_W << A2_SRC2_CHANNEL_W_SHIFT))
+
+
+/* Print the ".<swizzle>" suffix of a source operand: one character per
+ * 4-bit channel field, most significant field first, each preceded by
+ * "-" when its negate bit is set.
+ */
+static void
+print_reg_neg_swizzle(struct debug_stream *stream, unsigned reg)
+{
+   int shift;
+
+   /* A plain .xyzw selection with nothing negated is the default and
+    * is not printed at all. */
+   if ((reg & REG_NEGATE_MASK) == 0 &&
+       (reg & REG_SWIZZLE_MASK) == REG_SWIZZLE_XYZW)
+      return;
+
+   PRINTF(stream, ".");
+
+   /* Bits 15..12 first, bits 3..0 last. */
+   for (shift = 12; shift >= 0; shift -= 4) {
+      const unsigned field = reg >> shift;
+      const char *name;
+
+      /* Bit 3 of the field negates the channel. */
+      if (field & 0x8)
+         PRINTF(stream, "-");
+
+      /* Bits 2..0 select the source channel or a constant. */
+      switch (field & 0x7) {
+      case 0:  name = "x"; break;
+      case 1:  name = "y"; break;
+      case 2:  name = "z"; break;
+      case 3:  name = "w"; break;
+      case 4:  name = "0"; break;
+      case 5:  name = "1"; break;
+      default: name = "?"; break;
+      }
+
+      PRINTF(stream, "%s", name);
+   }
+}
+
+
+/* Print a source operand (A2_SRC2 bit layout): register, then swizzle. */
+static void
+print_src_reg(struct debug_stream *stream, unsigned dword)
+{
+   print_reg_type_nr(stream, (dword >> A2_SRC2_TYPE_SHIFT) & REG_TYPE_MASK,
+                     (dword >> A2_SRC2_NR_SHIFT) & REG_NR_MASK);
+   print_reg_neg_swizzle(stream, dword);
+}
+
+/* Print a destination register, followed by a ".xyzw"-style writemask
+ * unless all four channels are enabled. */
+static void
+print_dest_reg(struct debug_stream *stream, unsigned dword)
+{
+   print_reg_type_nr(stream, (dword >> A0_DEST_TYPE_SHIFT) & REG_TYPE_MASK,
+                     (dword >> A0_DEST_NR_SHIFT) & REG_NR_MASK);
+   if ((dword & A0_DEST_CHANNEL_ALL) != A0_DEST_CHANNEL_ALL) {
+      PRINTF(stream, ".");
+      if (dword & A0_DEST_CHANNEL_X)
+         PRINTF(stream, "x");
+      if (dword & A0_DEST_CHANNEL_Y)
+         PRINTF(stream, "y");
+      if (dword & A0_DEST_CHANNEL_Z)
+         PRINTF(stream, "z");
+      if (dword & A0_DEST_CHANNEL_W)
+         PRINTF(stream, "w");
+   }
+}
+
+/* Rebuild an operand passed as two dwords in the layout print_src_reg() reads. */
+#define GET_SRC0_REG(r0, r1) (((r0)<<14)|((r1)>>A1_SRC0_CHANNEL_W_SHIFT))
+#define GET_SRC1_REG(r0, r1) (((r0)<<8)|((r1)>>A2_SRC1_CHANNEL_W_SHIFT))
+#define GET_SRC2_REG(r)      (r)
+
+/* Print one arithmetic instruction (three dwords at program[]) as
+ * "<dest> = [SATURATE ]<OP> <src0>[, <src1>[, <src2>]]".  opcode is the
+ * opcode field already shifted down.  Only the args[opcode] sources the
+ * instruction actually takes are printed, so NOP shows no operands.
+ */
+static void
+print_arith_op(struct debug_stream *stream,
+	       unsigned opcode, const unsigned * program)
+{
+   const int nsrc = args[opcode];
+
+   if (opcode != A0_NOP) {
+      print_dest_reg(stream, program[0]);
+      if (program[0] & A0_DEST_SATURATE)
+         PRINTF(stream, " = SATURATE ");
+      else
+         PRINTF(stream, " = ");
+   }
+
+   PRINTF(stream, "%s ", opcodes[opcode]);
+
+   if (nsrc >= 1)
+      print_src_reg(stream, GET_SRC0_REG(program[0], program[1]));
+   if (nsrc >= 2) {
+      PRINTF(stream, ", ");
+      print_src_reg(stream, GET_SRC1_REG(program[1], program[2]));
+   }
+   if (nsrc >= 3) {
+      PRINTF(stream, ", ");
+      print_src_reg(stream, GET_SRC2_REG(program[2]));
+   }
+   PRINTF(stream, "\n");
+}
+
+/* Print a texture-load instruction as "<dest> = <OP> S[<n>],<coord>". */
+static void
+print_tex_op(struct debug_stream *stream,
+	     unsigned opcode, const unsigned * program)
+{
+   const unsigned coord_type =
+      (program[1] >> T1_ADDRESS_REG_TYPE_SHIFT) & REG_TYPE_MASK;
+   const unsigned coord_nr =
+      (program[1] >> T1_ADDRESS_REG_NR_SHIFT) & REG_NR_MASK;
+
+   /* Force the full writemask so that no channel suffix is printed. */
+   print_dest_reg(stream, program[0] | A0_DEST_CHANNEL_ALL);
+   PRINTF(stream, " = ");
+   PRINTF(stream, "%s ", opcodes[opcode]);
+   PRINTF(stream, "S[%d],", program[0] & T0_SAMPLER_NR_MASK);
+   print_reg_type_nr(stream, coord_type, coord_nr);
+   PRINTF(stream, "\n");
+}
+
+/* Print a TEXKILL instruction; only its coordinate register is shown. */
+static void
+print_texkil_op(struct debug_stream *stream,
+                unsigned opcode, const unsigned * program)
+{
+   const unsigned addr = program[1];
+   PRINTF(stream, "TEXKIL ");
+   print_reg_type_nr(stream,
+                     (addr >> T1_ADDRESS_REG_TYPE_SHIFT) & REG_TYPE_MASK,
+                     (addr >> T1_ADDRESS_REG_NR_SHIFT) & REG_NR_MASK);
+   PRINTF(stream, "\n");
+}
+
+/* Print a DCL: mnemonic, then the declared register without a mask suffix. */
+static void
+print_dcl_op(struct debug_stream *stream,
+	     unsigned opcode, const unsigned * program)
+{
+   PRINTF(stream, "%s ", opcodes[opcode]);
+   print_dest_reg(stream, program[0] | A0_DEST_CHANNEL_ALL);
+   PRINTF(stream, "\n");
+}
+
+/* Dump a fragment program: one header dword, then 3-dword instructions. */
+void
+i915_disassemble_program(struct debug_stream *stream,
+			 const unsigned * program, unsigned sz)
+{
+   unsigned i;
+
+   PRINTF(stream, "\t\tBEGIN\n");
+
+   /* The low 9 bits of the header are expected to equal sz - 2. */
+   assert((program[0] & 0x1ff) + 2 == sz);
+
+   /* Only decode whole instructions (i + 3 <= sz) so that a truncated
+    * program can never be read past program[sz - 1]. */
+   for (i = 1, program++; i + 3 <= sz; i += 3, program += 3) {
+      unsigned opcode = program[0] & (0x1f << 24);
+      PRINTF(stream, "\t\t");
+      if ((int) opcode >= A0_NOP && opcode <= A0_SLT)
+         print_arith_op(stream, opcode >> 24, program);
+      else if (opcode >= T0_TEXLD && opcode < T0_TEXKILL)
+         print_tex_op(stream, opcode >> 24, program);
+      else if (opcode == T0_TEXKILL)
+         print_texkil_op(stream, opcode >> 24, program);
+      else if (opcode == D0_DCL)
+         print_dcl_op(stream, opcode >> 24, program);
+      else
+         PRINTF(stream, "Unknown opcode 0x%x\n", opcode);
+   }
+
+   PRINTF(stream, "\t\tEND\n\n");
+}
+
+
diff --git a/src/gallium/drivers/i915/i915_flush.c b/src/gallium/drivers/i915/i915_flush.c
new file mode 100644
index 0000000000..1582168eba
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_flush.c
@@ -0,0 +1,86 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_defines.h"
+#include "draw/draw_context.h"
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_batch.h"
+
+/* Flush callback (see i915_init_flush_functions): draw module, then batch. */
+static void i915_flush( struct pipe_context *pipe,
+                        unsigned flags,
+                        struct pipe_fence_handle **fence )
+{
+   struct i915_context *i915 = i915_context(pipe);
+
+   draw_flush(i915->draw);
+
+#if 0
+   /* Do we need to emit an MI_FLUSH command to flush the hardware
+    * caches?
+    */
+   if (flags & (PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE)) {
+      unsigned flush = MI_FLUSH;
+      
+      if (!(flags & PIPE_FLUSH_RENDER_CACHE))
+	 flush |= INHIBIT_FLUSH_RENDER_CACHE;
+
+      if (flags & PIPE_FLUSH_TEXTURE_CACHE)
+	 flush |= FLUSH_MAP_CACHE;
+
+      if (!BEGIN_BATCH(1, 0)) {
+	 FLUSH_BATCH(NULL);
+	 assert(BEGIN_BATCH(1, 0));
+      }
+      OUT_BATCH( flush );
+   }
+#endif
+
+#if 0
+   if (i915->batch->map == i915->batch->ptr) {
+      return;
+   }
+#endif
+
+   /* Flush pending commands to hardware.  flags is not consulted here: the
+    * MI_FLUSH emission that would use it is compiled out above. */
+   FLUSH_BATCH(fence);
+   i915->vbo_flushed = 1;
+}
+
+
+/* Install i915_flush as the context's flush callback. */
+void i915_init_flush_functions( struct i915_context *i915 )
+{
+   i915->base.flush = i915_flush;
+}
diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
new file mode 100644
index 0000000000..2f0f99d046
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_fpc.h
@@ -0,0 +1,207 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef I915_FPC_H
+#define I915_FPC_H
+
+
+#include "i915_context.h"
+#include "i915_reg.h"
+
+
+
+#define I915_PROGRAM_SIZE 192
+
+
+
+/**
+ * Program translation state
+ */
+struct i915_fp_compile {
+   struct i915_fragment_shader *shader;  /* the shader we're compiling */
+
+   boolean used_constants[I915_MAX_CONSTANT];
+
+   /** maps TGSI immediate index to constant slot */
+   uint num_immediates;
+   uint immediates_map[I915_MAX_CONSTANT];
+   float immediates[I915_MAX_CONSTANT][4];
+
+   boolean first_instruction;
+
+   uint declarations[I915_PROGRAM_SIZE];  /**< DCL dwords, written through decl */
+   uint program[I915_PROGRAM_SIZE];       /**< instruction dwords, written through csr */
+
+   uint *csr;            /**< Cursor, points into program. */
+
+   uint *decl;           /**< Cursor, points into declarations. */
+
+   uint decl_s;          /**< bitmask of S regs i915_emit_decl() has already declared */
+   uint decl_t;          /**< bitmask of T regs i915_emit_decl() has already declared */
+
+   uint temp_flag;       /**< Tracks temporary regs which are in use */
+   uint utemp_flag;      /**< Tracks TYPE_U temporary regs which are in use */
+
+   uint nr_tex_indirect; /**< bumped by i915_emit_texld() when the coord is not a T reg */
+   uint nr_tex_insn;     /**< texture instructions emitted by i915_emit_texld() */
+   uint nr_alu_insn;     /**< instructions emitted by i915_emit_arith() */
+   uint nr_decl_insn;    /**< DCLs emitted by i915_emit_decl() */
+
+   boolean error;      /**< Set if i915_program_error() is called */
+   uint wpos_tex;
+   uint NumNativeInstructions;
+   uint NumNativeAluInstructions;
+   uint NumNativeTexInstructions;
+   uint NumNativeTexIndirections;
+};
+
+
+/* Having zero and one in here makes the definition of swizzle a lot
+ * easier.
+ */
+#define UREG_TYPE_SHIFT               29
+#define UREG_NR_SHIFT                 24
+#define UREG_CHANNEL_X_NEGATE_SHIFT   23
+#define UREG_CHANNEL_X_SHIFT          20
+#define UREG_CHANNEL_Y_NEGATE_SHIFT   19
+#define UREG_CHANNEL_Y_SHIFT          16
+#define UREG_CHANNEL_Z_NEGATE_SHIFT   15
+#define UREG_CHANNEL_Z_SHIFT          12
+#define UREG_CHANNEL_W_NEGATE_SHIFT   11
+#define UREG_CHANNEL_W_SHIFT          8
+#define UREG_CHANNEL_ZERO_NEGATE_MBZ  5
+#define UREG_CHANNEL_ZERO_SHIFT       4
+#define UREG_CHANNEL_ONE_NEGATE_MBZ   1
+#define UREG_CHANNEL_ONE_SHIFT        0
+
+#define UREG_BAD          0xffffffff    /* not a valid ureg */
+
+#define X    SRC_X
+#define Y    SRC_Y
+#define Z    SRC_Z
+#define W    SRC_W
+#define ZERO SRC_ZERO
+#define ONE  SRC_ONE
+
+/* Construct a ureg:
+ */
+#define UREG( type, nr ) (((type)<< UREG_TYPE_SHIFT) |		\
+			  ((nr)  << UREG_NR_SHIFT) |		\
+			  (X     << UREG_CHANNEL_X_SHIFT) |	\
+			  (Y     << UREG_CHANNEL_Y_SHIFT) |	\
+			  (Z     << UREG_CHANNEL_Z_SHIFT) |	\
+			  (W     << UREG_CHANNEL_W_SHIFT) |	\
+			  (ZERO  << UREG_CHANNEL_ZERO_SHIFT) |	\
+			  (ONE   << UREG_CHANNEL_ONE_SHIFT))
+/* Pick channel's 4-bit field into bits 20..23; move such a field to a channel. */
+#define GET_CHANNEL_SRC( reg, channel ) (((reg)<<((channel)*4)) & (0xf<<20))
+#define CHANNEL_SRC( src, channel ) ((src)>>((channel)*4))
+
+#define GET_UREG_TYPE(reg) (((reg)>>UREG_TYPE_SHIFT)&REG_TYPE_MASK)
+#define GET_UREG_NR(reg)   (((reg)>>UREG_NR_SHIFT)&REG_NR_MASK)
+
+
+
+#define UREG_XYZW_CHANNEL_MASK 0x00ffff00
+
+/* Re-select the x/y/z/w channels of a ureg; each argument is a SRC_*
+ * selector (X..W, ZERO or ONE).  ZERO and ONE live in the low byte of
+ * every ureg, so all six selectors resolve with the same shift-and-mask.
+ */
+static INLINE int
+swizzle(int reg, uint x, uint y, uint z, uint w)
+{
+   const uint r = (uint) reg;   /* unsigned: left-shifting a negative int is undefined */
+   assert(x <= SRC_ONE && y <= SRC_ONE && z <= SRC_ONE && w <= SRC_ONE);
+   return (int) ((r & ~UREG_XYZW_CHANNEL_MASK) |
+                 CHANNEL_SRC(GET_CHANNEL_SRC(r, x), 0) |
+                 CHANNEL_SRC(GET_CHANNEL_SRC(r, y), 1) |
+                 CHANNEL_SRC(GET_CHANNEL_SRC(r, z), 2) |
+                 CHANNEL_SRC(GET_CHANNEL_SRC(r, w), 3));
+}
+
+
+
+/***********************************************************************
+ * Public interface for the compiler
+ */
+extern void
+i915_translate_fragment_program( struct i915_context *i915,
+                                 struct i915_fragment_shader *fs);
+
+
+
+extern uint i915_get_temp(struct i915_fp_compile *p);
+extern uint i915_get_utemp(struct i915_fp_compile *p);
+extern void i915_release_utemps(struct i915_fp_compile *p);
+
+
+extern uint i915_emit_texld(struct i915_fp_compile *p,
+                              uint dest,
+                              uint destmask,
+                              uint sampler, uint coord, uint op);
+
+extern uint i915_emit_arith(struct i915_fp_compile *p,
+                              uint op,
+                              uint dest,
+                              uint mask,
+                              uint saturate,
+                              uint src0, uint src1, uint src2);
+
+extern uint i915_emit_decl(struct i915_fp_compile *p,
+                             uint type, uint nr, uint d0_flags);
+
+
+extern uint i915_emit_const1f(struct i915_fp_compile *p, float c0);
+
+extern uint i915_emit_const2f(struct i915_fp_compile *p,
+                                float c0, float c1);
+
+extern uint i915_emit_const4fv(struct i915_fp_compile *p,
+                                 const float * c);
+
+extern uint i915_emit_const4f(struct i915_fp_compile *p,
+                                float c0, float c1,
+                                float c2, float c3);
+
+
+/*======================================================================
+ * i915_debug_fp.c  (keep in sync with the prototype in i915_debug.h)
+ */
+struct debug_stream;
+extern void i915_disassemble_program(struct debug_stream *stream,
+                                     const uint * program, uint sz);
+/*======================================================================
+ * i915_fpc_translate.c
+ */
+
+extern void
+i915_program_error(struct i915_fp_compile *p, const char *msg, ...);
+
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_fpc_emit.c b/src/gallium/drivers/i915/i915_fpc_emit.c
new file mode 100644
index 0000000000..76c24d2b2f
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_fpc_emit.c
@@ -0,0 +1,375 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "i915_reg.h"
+#include "i915_context.h"
+#include "i915_fpc.h"
+#include "util/u_math.h"
+
+
+#define A0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT)
+#define D0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT)
+#define T0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT)
+#define A0_SRC0( reg ) (((reg)&UREG_MASK)>>UREG_A0_SRC0_SHIFT_LEFT)
+#define A1_SRC0( reg ) (((reg)&UREG_MASK)<<UREG_A1_SRC0_SHIFT_RIGHT)
+#define A1_SRC1( reg ) (((reg)&UREG_MASK)>>UREG_A1_SRC1_SHIFT_LEFT)
+#define A2_SRC1( reg ) (((reg)&UREG_MASK)<<UREG_A2_SRC1_SHIFT_RIGHT)
+#define A2_SRC2( reg ) (((reg)&UREG_MASK)>>UREG_A2_SRC2_SHIFT_LEFT)
+
+/* These are special, and don't have swizzle/negate bits.
+ */
+#define T0_SAMPLER( reg )     (GET_UREG_NR(reg)<<T0_SAMPLER_NR_SHIFT)
+#define T1_ADDRESS_REG( reg ) ((GET_UREG_NR(reg)<<T1_ADDRESS_REG_NR_SHIFT) | \
+			       (GET_UREG_TYPE(reg)<<T1_ADDRESS_REG_TYPE_SHIFT))
+
+
+/* Macros for translating UREG's into the various register fields used
+ * by the I915 programmable unit.
+ */
+#define UREG_A0_DEST_SHIFT_LEFT  (UREG_TYPE_SHIFT - A0_DEST_TYPE_SHIFT)
+#define UREG_A0_SRC0_SHIFT_LEFT  (UREG_TYPE_SHIFT - A0_SRC0_TYPE_SHIFT)
+#define UREG_A1_SRC0_SHIFT_RIGHT (A1_SRC0_CHANNEL_W_SHIFT - UREG_CHANNEL_W_SHIFT)
+#define UREG_A1_SRC1_SHIFT_LEFT  (UREG_TYPE_SHIFT - A1_SRC1_TYPE_SHIFT)
+#define UREG_A2_SRC1_SHIFT_RIGHT (A2_SRC1_CHANNEL_W_SHIFT - UREG_CHANNEL_W_SHIFT)
+#define UREG_A2_SRC2_SHIFT_LEFT  (UREG_TYPE_SHIFT - A2_SRC2_TYPE_SHIFT)
+
+#define UREG_MASK         0xffffff00
+#define UREG_TYPE_NR_MASK ((REG_TYPE_MASK << UREG_TYPE_SHIFT) | \
+  			   (REG_NR_MASK << UREG_NR_SHIFT))
+
+
+uint
+i915_get_temp(struct i915_fp_compile *p)
+{
+   int bit = ffs(~p->temp_flag);
+   if (!bit) {
+      i915_program_error(p, "i915_get_temp: out of temporaries\n");
+      return 0;
+   }
+
+   p->temp_flag |= 1 << (bit - 1);
+   return bit - 1;
+}
+
+/* Return temporary `reg` (an index from i915_get_temp) to the free pool. */
+static void
+i915_release_temp(struct i915_fp_compile *p, int reg)
+{
+   p->temp_flag &= ~(1 << reg);
+}
+
+
+/**
+ * Get unpreserved temporary, a temp whose value is not preserved between
+ * PS program phases.  Returns a REG_TYPE_U ureg, or 0 after reporting a
+ * program error when none is free.
+ */
+uint
+i915_get_utemp(struct i915_fp_compile * p)
+{
+   const int slot = ffs(~p->utemp_flag) - 1;
+   if (slot < 0) {
+      i915_program_error(p, "i915_get_utemp: out of temporaries\n");
+      return 0;
+   }
+   p->utemp_flag |= 1 << slot;
+   return UREG(REG_TYPE_U, slot);
+}
+
+void
+i915_release_utemps(struct i915_fp_compile *p)
+{
+   p->utemp_flag = ~0x7;   /* bits 0..2 clear: i915_get_utemp() can hand out utemps 0, 1 and 2 again */
+}
+
+/* Emit a DCL for a T or S register the first time it is requested and
+ * return the register's ureg; other types are returned without a DCL. */
+uint
+i915_emit_decl(struct i915_fp_compile *p,
+               uint type, uint nr, uint d0_flags)
+{
+   const uint reg = UREG(type, nr);
+   uint *declared;
+
+   if (type == REG_TYPE_T)
+      declared = &p->decl_t;
+   else if (type == REG_TYPE_S)
+      declared = &p->decl_s;
+   else
+      return reg;
+
+   /* One bit per register number records what has been declared. */
+   if (*declared & (1 << nr))
+      return reg;
+   *declared |= (1 << nr);
+
+   p->decl[0] = (D0_DCL | D0_DEST(reg) | d0_flags);
+   p->decl[1] = D1_MBZ;
+   p->decl[2] = D2_MBZ;
+   p->decl += 3;
+
+   p->nr_decl_insn++;
+   return reg;
+}
+/* Append one 3-dword arithmetic instruction at p->csr; returns dest as a plain type/nr ureg. */
+uint
+i915_emit_arith(struct i915_fp_compile * p,
+                uint op,
+                uint dest,
+                uint mask,
+                uint saturate, uint src0, uint src1, uint src2)
+{
+   uint c[3];           /* indices (0..2) of the sources that are CONST registers */
+   uint nr_const = 0;   /* number of valid entries in c[] */
+
+   assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST);
+   dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest));   /* reset dest to the default xyzw swizzle */
+   assert(dest);
+
+   if (GET_UREG_TYPE(src0) == REG_TYPE_CONST)
+      c[nr_const++] = 0;
+   if (GET_UREG_TYPE(src1) == REG_TYPE_CONST)
+      c[nr_const++] = 1;
+   if (GET_UREG_TYPE(src2) == REG_TYPE_CONST)
+      c[nr_const++] = 2;
+
+   /* Sources may only name one distinct CONST register number per
+    * instruction here (NOTE(review): presumably a hardware limit --
+    * confirm).  Any further constant is first MOVed into a utemp by a
+    * recursive call.  Currently shouldn't be possible to run out of
+    * utemps, but keep an eye on this. */
+   if (nr_const > 1) {
+      uint s[3], first, i, old_utemp_flag;
+
+      s[0] = src0;
+      s[1] = src1;
+      s[2] = src2;
+      old_utemp_flag = p->utemp_flag;
+
+      first = GET_UREG_NR(s[c[0]]);   /* the constant register number that stays in place */
+      for (i = 1; i < nr_const; i++) {
+         if (GET_UREG_NR(s[c[i]]) != first) {
+            uint tmp = i915_get_utemp(p);
+
+            i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0,
+                            s[c[i]], 0, 0);
+            s[c[i]] = tmp;
+         }
+      }
+
+      src0 = s[0];
+      src1 = s[1];
+      src2 = s[2];
+      p->utemp_flag = old_utemp_flag;   /* restore: frees the utemps taken above */
+   }
+
+   *(p->csr++) = (op | A0_DEST(dest) | mask | saturate | A0_SRC0(src0));
+   *(p->csr++) = (A1_SRC0(src0) | A1_SRC1(src1));
+   *(p->csr++) = (A2_SRC1(src1) | A2_SRC2(src2));
+
+   p->nr_alu_insn++;
+   return dest;
+}
+
+
+/**
+ * Emit a texture load or texkill instruction; returns \p dest unchanged.
+ * \param dest  the dest i915 register
+ * \param destmask  the dest register writemask
+ * \param sampler  the i915 sampler register
+ * \param coord  the i915 source texcoord operand
+ * \param opcode  the instruction opcode
+ */
+uint i915_emit_texld( struct i915_fp_compile *p,
+			uint dest,
+			uint destmask,
+			uint sampler,
+			uint coord,
+			uint opcode )
+{
+   const uint k = UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord));
+   int temp = -1;
+
+   if (coord != k) {
+      /* texcoord is swizzled or negated.  Need to allocate a new temporary
+       * register; a utemp (unpreserved temp) won't do.
+       */
+      uint tempReg;
+
+      temp = i915_get_temp(p);           /* get temp reg index */
+      tempReg = UREG(REG_TYPE_R, temp);  /* make i915 register */
+
+      i915_emit_arith( p, A0_MOV,
+                       tempReg, A0_DEST_CHANNEL_ALL, /* dest reg, writemask */
+                       0,                            /* saturate */
+                       coord, 0, 0 );                /* src0, src1, src2 */
+
+      /* new src texcoord is tempReg */
+      coord = tempReg;
+   }
+
+   /* No saturate handling here.  NOTE(review): the reason was cut off in
+    * the original comment ("as we only support ...") -- restore it. */
+   if (destmask != A0_DEST_CHANNEL_ALL) {
+      /* partial writemask: load all channels into a utemp, then MOV into dest */
+      uint tmp = i915_get_utemp(p);
+      i915_emit_texld( p, tmp, A0_DEST_CHANNEL_ALL, sampler, coord, opcode );
+      i915_emit_arith( p, A0_MOV, dest, destmask, 0, tmp, 0, 0 );
+      /* XXX release utemp here? */
+   }
+   else {
+      assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST);
+      assert(dest == UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)));
+
+      /* a coord that is not a T register is counted in nr_tex_indirect */
+      if (GET_UREG_TYPE(coord) != REG_TYPE_T) {
+	 p->nr_tex_indirect++;
+      }
+
+      *(p->csr++) = (opcode | 
+		     T0_DEST( dest ) |
+		     T0_SAMPLER( sampler ));
+
+      *(p->csr++) = T1_ADDRESS_REG( coord );
+      *(p->csr++) = T2_MBZ;
+
+      p->nr_tex_insn++;
+   }
+
+   if (temp >= 0)
+      i915_release_temp(p, temp);   /* only set when the coord was copied above */
+
+   return dest;
+}
+
+/* Return a ureg whose x channel reads the scalar c0. */
+uint
+i915_emit_const1f(struct i915_fp_compile * p, float c0)
+{
+   struct i915_fragment_shader *fs = p->shader;
+   unsigned slot, chan;
+
+   if (c0 == 0.0)   /* ZERO selector: no constant slot needed */
+      return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO);
+   if (c0 == 1.0)   /* ONE selector: no constant slot needed */
+      return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE);
+
+   for (slot = 0; slot < I915_MAX_CONSTANT; slot++) {
+      if (fs->constant_flags[slot] == I915_CONSTFLAG_USER)
+         continue;
+      for (chan = 0; chan < 4; chan++) {
+         /* Usable if free, or if it already holds this exact value. */
+         if ((fs->constant_flags[slot] & (1 << chan)) &&
+             fs->constants[slot][chan] != c0)
+            continue;
+         fs->constants[slot][chan] = c0;
+         fs->constant_flags[slot] |= 1 << chan;
+         if (fs->num_constants < slot + 1)
+            fs->num_constants = slot + 1;
+         return swizzle(UREG(REG_TYPE_CONST, slot), chan, ZERO, ZERO, ONE);
+      }
+   }
+   i915_program_error(p, "i915_emit_const1f: out of constants\n");
+   return 0;
+}
+
+/* Return a ureg whose x and y channels read c0 and c1. */
+uint
+i915_emit_const2f(struct i915_fp_compile * p, float c0, float c1)
+{
+   struct i915_fragment_shader *fs = p->shader;
+   unsigned slot, chan;
+
+   if (c0 == 0.0)   /* a 0 or 1 comes from the swizzle; store only the other value */
+      return swizzle(i915_emit_const1f(p, c1), ZERO, X, Z, W);
+   if (c0 == 1.0)
+      return swizzle(i915_emit_const1f(p, c1), ONE, X, Z, W);
+   if (c1 == 0.0)
+      return swizzle(i915_emit_const1f(p, c0), X, ZERO, Z, W);
+   if (c1 == 1.0)
+      return swizzle(i915_emit_const1f(p, c0), X, ONE, Z, W);
+
+   for (slot = 0; slot < I915_MAX_CONSTANT; slot++) {
+      if (fs->constant_flags[slot] == I915_CONSTFLAG_USER ||
+          fs->constant_flags[slot] == 0xf)
+         continue;   /* flagged as user constant, or all four channels taken */
+      for (chan = 0; chan < 3; chan++) {
+         if (fs->constant_flags[slot] & (3 << chan))
+            continue;   /* need two adjacent free channels */
+         fs->constants[slot][chan + 0] = c0;
+         fs->constants[slot][chan + 1] = c1;
+         fs->constant_flags[slot] |= 3 << chan;
+         if (fs->num_constants < slot + 1)
+            fs->num_constants = slot + 1;
+         return swizzle(UREG(REG_TYPE_CONST, slot), chan, chan + 1, ZERO, ONE);
+      }
+   }
+
+   i915_program_error(p, "i915_emit_const2f: out of constants\n");
+   return 0;
+}
+
+
+/* Scan the constant slots in order and return, as a CONST ureg, the first
+ * one that either already holds exactly (c0, c1, c2, c3) in all four
+ * channels or is completely unused (in which case it is filled in). */
+uint
+i915_emit_const4f(struct i915_fp_compile * p,
+                  float c0, float c1, float c2, float c3)
+{
+   struct i915_fragment_shader *fs = p->shader;
+   unsigned slot;
+
+   for (slot = 0; slot < I915_MAX_CONSTANT; slot++) {
+      if (fs->constant_flags[slot] == 0) {
+         /* Empty slot: claim all four channels. */
+         fs->constants[slot][0] = c0;
+         fs->constants[slot][1] = c1;
+         fs->constants[slot][2] = c2;
+         fs->constants[slot][3] = c3;
+         fs->constant_flags[slot] = 0xf;
+         if (fs->num_constants < slot + 1)
+            fs->num_constants = slot + 1;
+         return UREG(REG_TYPE_CONST, slot);
+      }
+      if (fs->constant_flags[slot] != 0xf)
+         continue;
+      if (fs->constants[slot][0] == c0 && fs->constants[slot][1] == c1 &&
+          fs->constants[slot][2] == c2 && fs->constants[slot][3] == c3)
+         return UREG(REG_TYPE_CONST, slot);
+   }
+
+   i915_program_error(p, "i915_emit_const4f: out of constants\n");
+   return 0;
+}
+
+/* Array form of i915_emit_const4f(): reads c[0] through c[3]. */
+uint
+i915_emit_const4fv(struct i915_fp_compile * p, const float * c)
+{
+   return i915_emit_const4f(p, c[0], c[1], c[2], c[3]);
+}
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
new file mode 100644
index 0000000000..25c53210be
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -0,0 +1,1182 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdarg.h>
+
+#include "i915_reg.h"
+#include "i915_context.h"
+#include "i915_fpc.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "draw/draw_vertex.h"
+
+
+/**
+ * Simple pass-through fragment shader (diffuse color -> output color), used
+ * when we don't have a real shader (or it fails to compile for some reason).
+ */
+static unsigned passthrough[] = 
+{
+   _3DSTATE_PIXEL_SHADER_PROGRAM | ((2*3)-1), /* 2 insns * 3 dwords; total dwords - 2 */
+
+   /* instruction 1: declare the input color (T_DIFFUSE, all channels)
+    */
+   (D0_DCL | 
+    (REG_TYPE_T << D0_TYPE_SHIFT) | 
+    (T_DIFFUSE << D0_NR_SHIFT) | 
+    D0_CHANNEL_ALL),
+   0,
+   0,
+
+   /* instruction 2: move it to the output color (REG_TYPE_OC)
+    */
+   (A0_MOV | 
+    (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | 
+    A0_DEST_CHANNEL_ALL | 
+    (REG_TYPE_T << A0_SRC0_TYPE_SHIFT) |
+    (T_DIFFUSE << A0_SRC0_NR_SHIFT)),
+   0x01230000,			/* .xyzw */
+   0
+};
+
+
+/* Taylor coefficients of sin(x) for x, x^3, x^5, x^7: 1, -1/3!, 1/5!, -1/7! */
+static const float sin_constants[4] = {
+   1.0f, -1.0f / 6.0f,
+   1.0f / 120.0f,
+   -1.0f / 5040.0f
+};
+
+/* Taylor coefficients of cos(x) for 1, x^2, x^4, x^6: 1, -1/2!, 1/4!, -1/6! */
+static const float cos_constants[4] = {
+   1.0f, -1.0f / 2.0f,
+   1.0f / 24.0f,
+   -1.0f / 720.0f
+};
+
+
+
+/**
+ * Toggle the negate bit of each ureg channel whose argument has bit 0 set.
+ */
+static INLINE int
+negate(int reg, int x, int y, int z, int w)
+{
+   const int flip_x = (x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT;
+   const int flip_y = (y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT;
+   const int flip_z = (z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT;
+   const int flip_w = (w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT;
+   return reg ^ (flip_x | flip_y | flip_z | flip_w);
+}
+
+
+/**
+ * In the event of a translation failure, install a copy of the simple
+ * color pass-through program.  If the copy cannot be allocated the shader
+ * is left with a NULL program and a length of zero.
+ */
+static void
+i915_use_passthrough_shader(struct i915_fragment_shader *fs)
+{
+   fs->program = (uint *) MALLOC(sizeof(passthrough));
+   fs->program_len = fs->program ? Elements(passthrough) : 0;
+   if (fs->program)
+      memcpy(fs->program, passthrough, sizeof(passthrough));
+   fs->num_constants = 0;
+}
+
+/** Report a compile error (printf-style) and mark the compile as failed. */
+void
+i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
+{
+   char text[1024];
+   va_list ap;
+
+   /* Expand the message first; the output is bounded by sizeof(text). */
+   va_start(ap, msg);
+   util_vsnprintf(text, sizeof(text), msg, ap);
+   va_end(ap);
+   debug_printf("i915_program_error: ");
+   debug_printf("%s", text);
+   debug_printf("\n");
+   p->error = 1;   /* latched; i915_fini_compile() checks it */
+}
+
+
+
+/**
+ * Build the ureg used to read a TGSI source operand: choose the hardware
+ * register for the operand's register file, then apply the operand's
+ * swizzle and negate bit.  Unsupported operands flag a program error
+ * and yield 0.
+ */
+static uint
+src_vector(struct i915_fp_compile *p,
+           const struct tgsi_full_src_register *source)
+{
+   const int neg = source->Register.Negate;
+   uint index = source->Register.Index;
+   uint ureg = 0;
+   uint semantic, semantic_idx;
+
+   switch (source->Register.File) {
+   case TGSI_FILE_TEMPORARY:
+      if (source->Register.Index >= I915_MAX_TEMPORARY) {
+         i915_program_error(p, "Exceeded max temporary reg");
+         return 0;
+      }
+      ureg = UREG(REG_TYPE_R, index);
+      break;
+
+   case TGSI_FILE_INPUT:
+      /* XXX: Packing COL1 and FOGC into a single attribute is fine for
+       * texenv programs, but breaks real fragment programs that expect
+       * these attributes to be a full 4 components wide.  A texcoord
+       * could carry them if necessary, though not in the general case.
+       *
+       * A texture coordinate is also used to pass wpos when possible.
+       */
+
+      semantic = p->shader->info.input_semantic_name[index];
+      semantic_idx = p->shader->info.input_semantic_index[index];
+
+      switch (semantic) {
+      case TGSI_SEMANTIC_POSITION:
+         /* The wpos declaration (REG_TYPE_T at p->wpos_tex) is currently
+          * disabled, so the operand stays ureg 0 and only gets the
+          * swizzle/negate below.
+          */
+         debug_printf("SKIP SEM POS\n");
+         break;
+
+      case TGSI_SEMANTIC_COLOR:
+         if (semantic_idx != 0) {
+            /* secondary color: only XYZ is declared, W reads as one */
+            assert(semantic_idx == 1);
+            ureg = i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ);
+            ureg = swizzle(ureg, X, Y, Z, ONE);
+         }
+         else {
+            ureg = i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL);
+         }
+         break;
+
+      case TGSI_SEMANTIC_FOG:
+         /* only the W channel is declared, then replicated everywhere */
+         ureg = i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W);
+         ureg = swizzle(ureg, W, W, W, W);
+         break;
+
+      case TGSI_SEMANTIC_GENERIC:
+         /* usually a texcoord */
+         ureg = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + semantic_idx,
+                               D0_CHANNEL_ALL);
+         break;
+
+      default:
+         i915_program_error(p, "Bad source->Index");
+         return 0;
+      }
+      break;
+
+   case TGSI_FILE_IMMEDIATE:
+      /* immediates were copied into constant slots; remap the index */
+      assert(index < p->num_immediates);
+      index = p->immediates_map[index];
+      /* fall-through */
+   case TGSI_FILE_CONSTANT:
+      ureg = UREG(REG_TYPE_CONST, index);
+      break;
+
+   default:
+      i915_program_error(p, "Bad source->File");
+      return 0;
+   }
+
+   ureg = swizzle(ureg,
+                  source->Register.SwizzleX,
+                  source->Register.SwizzleY,
+                  source->Register.SwizzleZ,
+                  source->Register.SwizzleW);
+
+   /* Register.Negate is a single flag, so it toggles the negate bit of
+    * every channel.  Register.Absolute is not handled: no abs() is
+    * applied (an assertion on it used to be disabled so that
+    * arbfplight.c could run).
+    */
+   return negate(ureg, neg, neg, neg, neg);
+}
+
+
+/**
+ * Map a TGSI destination register to the ureg to be written.
+ */
+static uint
+get_result_vector(struct i915_fp_compile *p,
+                  const struct tgsi_full_dst_register *dest)
+{
+   const uint file = dest->Register.File;
+
+   if (file == TGSI_FILE_TEMPORARY)
+      return UREG(REG_TYPE_R, dest->Register.Index);
+
+   if (file == TGSI_FILE_OUTPUT) {
+      /* outputs are selected by semantic, not by register index */
+      switch (p->shader->info.output_semantic_name[dest->Register.Index]) {
+      case TGSI_SEMANTIC_COLOR:
+         return UREG(REG_TYPE_OC, 0);
+      case TGSI_SEMANTIC_POSITION:
+         return UREG(REG_TYPE_OD, 0);
+      default:
+         i915_program_error(p, "Bad inst->DstReg.Index/semantics");
+         return 0;
+      }
+   }
+
+   i915_program_error(p, "Bad inst->DstReg.File");
+   return 0;
+}
+
+
+/**
+ * Derive the A0_DEST_* bits for an instruction's first destination:
+ * the saturate flag plus one channel-enable bit per writemask bit.
+ * \param inst  the TGSI instruction being translated
+ * \return the combined A0_DEST_* flag word
+ */
+static uint
+get_result_flags(const struct tgsi_full_instruction *inst)
+{
+   const uint mask = inst->Dst[0].Register.WriteMask;
+   uint flags;
+
+   /* start from the saturate bit ... */
+   flags = (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+      ? A0_DEST_SATURATE : 0x0;
+
+   /* ... then enable one destination channel per writemask bit */
+   flags |= (mask & TGSI_WRITEMASK_X) ? A0_DEST_CHANNEL_X : 0x0;
+   flags |= (mask & TGSI_WRITEMASK_Y) ? A0_DEST_CHANNEL_Y : 0x0;
+   flags |= (mask & TGSI_WRITEMASK_Z) ? A0_DEST_CHANNEL_Z : 0x0;
+   flags |= (mask & TGSI_WRITEMASK_W) ? A0_DEST_CHANNEL_W : 0x0;
+
+   return flags;
+}
+
+
+/**
+ * Convert a TGSI_TEXTURE_x target into a D0_SAMPLE_TYPE_x token.  1D, 2D
+ * and RECT targets, shadow variants included, are all sampled as 2D;
+ * unknown targets flag a program error and yield 0.
+ */
+static uint
+translate_tex_src_target(struct i915_fp_compile *p, uint tex)
+{
+   uint sample_type = 0;
+
+   switch (tex) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOWRECT:
+      sample_type = D0_SAMPLE_TYPE_2D;
+      break;
+   case TGSI_TEXTURE_3D:
+      sample_type = D0_SAMPLE_TYPE_VOLUME;
+      break;
+   case TGSI_TEXTURE_CUBE:
+      sample_type = D0_SAMPLE_TYPE_CUBE;
+      break;
+
+   default:
+      i915_program_error(p, "TexSrc type");
+      break;
+   }
+
+   return sample_type;
+}
+
+
+/**
+ * Emit a texture instruction: declare the sampler named by Src[1], build
+ * the coordinate from Src[0] and issue the given T0_* opcode into Dst[0].
+ */
+static void
+emit_tex(struct i915_fp_compile *p,
+         const struct tgsi_full_instruction *inst,
+         uint opcode)
+{
+   /* The sampler is declared first, then whatever input declaration the
+    * coordinate operand needs. */
+   const uint sample_type
+      = translate_tex_src_target(p, inst->Texture.Texture);
+   const uint sampler
+      = i915_emit_decl(p, REG_TYPE_S, inst->Src[1].Register.Index, sample_type);
+   const uint coord = src_vector(p, &inst->Src[0]);
+   const uint dest = get_result_vector(p, &inst->Dst[0]);
+   const uint flags = get_result_flags(inst);
+
+   i915_emit_texld(p, dest, flags, sampler, coord, opcode);
+}
+
+
+/**
+ * Emit one arithmetic instruction whose operands map 1:1 onto the TGSI
+ * sources; operand slots beyond numArgs are passed as 0.
+ * \param opcode  the i915 opcode
+ * \param numArgs  the number of input/src arguments (at most 3)
+ */
+static void
+emit_simple_arith(struct i915_fp_compile *p,
+                  const struct tgsi_full_instruction *inst,
+                  uint opcode, uint numArgs)
+{
+   uint args[3] = { 0, 0, 0 };
+   uint i;
+
+   assert(numArgs <= 3);
+
+   /* sources are translated in order, before the destination */
+   for (i = 0; i < 3 && i < numArgs; i++)
+      args[i] = src_vector(p, &inst->Src[i]);
+
+   i915_emit_arith(p,
+                   opcode,
+                   get_result_vector(p, &inst->Dst[0]),
+                   get_result_flags(inst), 0,
+                   args[0], args[1], args[2]);
+}
+
+
+/** Same as emit_simple_arith(), with the first two sources exchanged. */
+static void
+emit_simple_arith_swap2(struct i915_fp_compile *p,
+                        const struct tgsi_full_instruction *inst,
+                        uint opcode, uint numArgs)
+{
+   /* work on a private copy so the caller's instruction is untouched */
+   struct tgsi_full_instruction swapped = *inst;
+
+   assert(numArgs == 2);
+
+   /* exchange operands 0 and 1 in the copy */
+   swapped.Src[0] = inst->Src[1];
+   swapped.Src[1] = inst->Src[0];
+
+   emit_simple_arith(p, &swapped, opcode, numArgs);
+}
+
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+/*
+ * Translate one TGSI instruction into one or more i915 instructions and
+ * release the utemps used along the way; unknown opcodes flag an error.
+ *
+ * Possible concerns:
+ * SIN, COS -- could use another taylor step?
+ * LIT      -- results seem a little different to sw mesa
+ * LOG      -- different to mesa on negative numbers, but this is conformant.
+ */
+static void
+i915_translate_instruction(struct i915_fp_compile *p,
+                           const struct tgsi_full_instruction *inst)
+{
+   uint writemask;
+   uint src0, src1, src2, flags;
+   uint tmp = 0;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ABS:      /* abs(x) = max(x, -x) */
+      src0 = src_vector(p, &inst->Src[0]);
+      i915_emit_arith(p,
+                      A0_MAX,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      src0, negate(src0, 1, 1, 1, 1), 0);
+      break;
+
+   case TGSI_OPCODE_ADD:
+      emit_simple_arith(p, inst, A0_ADD, 2);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+      src2 = src_vector(p, &inst->Src[2]);
+      i915_emit_arith(p, A0_CMP,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst),
+                      0, src0, src2, src1);   /* NOTE: order of src2, src1 */
+      break;
+
+   case TGSI_OPCODE_COS:
+      src0 = src_vector(p, &inst->Src[0]);
+      tmp = i915_get_utemp(p);
+      /* scale by 1/(2*pi), apply A0_MOD, scale back by 2*pi, all in tmp.x */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_X, 0,
+                      src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0);
+
+      i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0);
+
+      /* By choosing different taylor constants, could get rid of this mul:
+       */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_X, 0,
+                      tmp, i915_emit_const1f(p, (float) (M_PI * 2.0)), 0);
+
+      /*
+       * t0.xy  = MUL t0.xx11, t0.x111   ; x^2, x
+       * t0.xyz = MUL t0.xyx1, t0.xx11   ; x^4, x^3, x^2
+       * t0.xyz = MUL t0.xxz1, t0.z111   ; x^6, x^4, x^2
+       * result = DP4 t0.1zyx, cos_constants
+       */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_XY, 0,
+                      swizzle(tmp, X, X, ONE, ONE),
+                      swizzle(tmp, X, ONE, ONE, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_XYZ, 0,
+                      swizzle(tmp, X, Y, X, ONE),
+                      swizzle(tmp, X, X, ONE, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_XYZ, 0,
+                      swizzle(tmp, X, X, Z, ONE),
+                      swizzle(tmp, Z, ONE, ONE, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_DP4,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(tmp, ONE, Z, Y, X),
+                      i915_emit_const4fv(p, cos_constants), 0);
+      break;
+
+   case TGSI_OPCODE_DP3:
+      emit_simple_arith(p, inst, A0_DP3, 2);
+      break;
+
+   case TGSI_OPCODE_DP4:
+      emit_simple_arith(p, inst, A0_DP4, 2);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+
+      i915_emit_arith(p,
+                      A0_DP4,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, X, Y, Z, ONE), src1, 0);
+      break;
+
+   case TGSI_OPCODE_DST:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+
+      /* result[0] = 1    * 1;
+       * result[1] = a[1] * b[1];
+       * result[2] = a[2] * 1;
+       * result[3] = 1    * b[3];
+       */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, ONE, Y, Z, ONE),
+                      swizzle(src1, ONE, Y, ONE, W), 0);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_EX2:
+      src0 = src_vector(p, &inst->Src[0]);
+
+      i915_emit_arith(p,
+                      A0_EXP,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+
+   case TGSI_OPCODE_FLR:
+      emit_simple_arith(p, inst, A0_FLR, 1);
+      break;
+
+   case TGSI_OPCODE_FRC:
+      emit_simple_arith(p, inst, A0_FRC, 1);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* kill if src[0].x < 0 || src[0].y < 0 ... */
+      src0 = src_vector(p, &inst->Src[0]);
+      tmp = i915_get_utemp(p);
+
+      i915_emit_texld(p,
+                      tmp,                   /* dest reg: a dummy reg */
+                      A0_DEST_CHANNEL_ALL,   /* dest writemask */
+                      0,                     /* sampler */
+                      src0,                  /* coord*/
+                      T0_TEXKILL);           /* opcode */
+      break;
+
+   case TGSI_OPCODE_KILP:
+      assert(0); /* not tested yet */
+      break;     /* NOTE(review): emits nothing when asserts are compiled out */
+
+   case TGSI_OPCODE_LG2:
+      src0 = src_vector(p, &inst->Src[0]);
+
+      i915_emit_arith(p,
+                      A0_LOG,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+
+   case TGSI_OPCODE_LIT:
+      src0 = src_vector(p, &inst->Src[0]);
+      tmp = i915_get_utemp(p);
+
+      /* tmp = max( a.xyzw, a.00zw )
+       * XXX: Clamp tmp.w to -128..128
+       * tmp.y = log(tmp.y)
+       * tmp.y = tmp.w * tmp.y
+       * tmp.y = exp(tmp.y)
+       * result = cmp (a.11-x1, a.1x01, a.1xy1 )
+       */
+      i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0,
+                      src0, swizzle(src0, ZERO, ZERO, Z, W), 0);
+
+      i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_Y, 0,
+                      swizzle(tmp, Y, Y, Y, Y), 0, 0);
+
+      i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_Y, 0,
+                      swizzle(tmp, ZERO, Y, ZERO, ZERO),
+                      swizzle(tmp, ZERO, W, ZERO, ZERO), 0);
+
+      i915_emit_arith(p, A0_EXP, tmp, A0_DEST_CHANNEL_Y, 0,
+                      swizzle(tmp, Y, Y, Y, Y), 0, 0);
+
+      i915_emit_arith(p, A0_CMP,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      negate(swizzle(tmp, ONE, ONE, X, ONE), 0, 0, 1, 0),
+                      swizzle(tmp, ONE, X, ZERO, ONE),
+                      swizzle(tmp, ONE, X, Y, ONE));
+
+      break;
+
+   case TGSI_OPCODE_LRP:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+      src2 = src_vector(p, &inst->Src[2]);
+      flags = get_result_flags(inst);
+      tmp = i915_get_utemp(p);
+
+      /* b*a + c*(1-a)
+       *
+       * b*a + c - ca 
+       *
+       * tmp = b*a + c, 
+       * result = (-c)*a + tmp 
+       */
+      i915_emit_arith(p, A0_MAD, tmp,
+                      flags & A0_DEST_CHANNEL_ALL, 0, src1, src0, src2);
+
+      i915_emit_arith(p, A0_MAD,
+                      get_result_vector(p, &inst->Dst[0]),
+                      flags, 0, negate(src2, 1, 1, 1, 1), src0, tmp);
+      break;
+
+   case TGSI_OPCODE_MAD:
+      emit_simple_arith(p, inst, A0_MAD, 3);
+      break;
+
+   case TGSI_OPCODE_MAX:
+      emit_simple_arith(p, inst, A0_MAX, 2);
+      break;
+
+   case TGSI_OPCODE_MIN:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+      tmp = i915_get_utemp(p);
+      flags = get_result_flags(inst);
+
+      i915_emit_arith(p,
+                      A0_MAX,
+                      tmp, flags & A0_DEST_CHANNEL_ALL, 0,
+                      negate(src0, 1, 1, 1, 1),
+                      negate(src1, 1, 1, 1, 1), 0);
+
+      i915_emit_arith(p,
+                      A0_MOV,
+                      get_result_vector(p, &inst->Dst[0]),
+                      flags, 0, negate(tmp, 1, 1, 1, 1), 0, 0);
+      break;
+
+   case TGSI_OPCODE_MOV:
+      emit_simple_arith(p, inst, A0_MOV, 1);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      emit_simple_arith(p, inst, A0_MUL, 2);
+      break;
+
+   case TGSI_OPCODE_POW:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+      tmp = i915_get_utemp(p);
+      flags = get_result_flags(inst);
+
+      /* XXX: masking on intermediate values, here and elsewhere.
+       */
+      i915_emit_arith(p,
+                      A0_LOG,
+                      tmp, A0_DEST_CHANNEL_X, 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+
+      i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0);
+
+      i915_emit_arith(p,
+                      A0_EXP,
+                      get_result_vector(p, &inst->Dst[0]),
+                      flags, 0, swizzle(tmp, X, X, X, X), 0, 0);
+      break;
+
+   case TGSI_OPCODE_RET:
+      /* XXX: no-op? */
+      break;
+
+   case TGSI_OPCODE_RCP:
+      src0 = src_vector(p, &inst->Src[0]);
+
+      i915_emit_arith(p,
+                      A0_RCP,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+
+   case TGSI_OPCODE_RSQ:
+      src0 = src_vector(p, &inst->Src[0]);
+
+      i915_emit_arith(p,
+                      A0_RSQ,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, X, X, X, X), 0, 0);
+      break;
+
+   case TGSI_OPCODE_SCS:
+      src0 = src_vector(p, &inst->Src[0]);
+      tmp = i915_get_utemp(p);
+
+      /*
+       * t0.xy = MUL x.xx11, x.x111      ; x^2, x
+       * t0 = MUL t0.xyxy, t0.xx11       ; x^4, x^3, x^2, x
+       * t1 = MUL t0.xyyw, t0.yz11       ; x^7, x^5, x^3, x
+       * scs.y = DP4 t1.wzyx, sin_constants
+       * t0.xyz = MUL t0.xxz1, t0.z111   ; x^6, x^4, x^2
+       * scs.x = DP4 t0.1zyx, cos_constants
+       */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_XY, 0,
+                      swizzle(src0, X, X, ONE, ONE),
+                      swizzle(src0, X, ONE, ONE, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_ALL, 0,
+                      swizzle(tmp, X, Y, X, Y),
+                      swizzle(tmp, X, X, ONE, ONE), 0);
+
+      writemask = inst->Dst[0].Register.WriteMask;
+
+      if (writemask & TGSI_WRITEMASK_Y) {
+         uint tmp1;
+
+         if (writemask & TGSI_WRITEMASK_X)
+            tmp1 = i915_get_utemp(p);   /* keep tmp intact for the cosine below */
+         else
+            tmp1 = tmp;
+
+         i915_emit_arith(p,
+                         A0_MUL,
+                         tmp1, A0_DEST_CHANNEL_ALL, 0,
+                         swizzle(tmp, X, Y, Y, W),
+                         swizzle(tmp, Y, Z, ONE, ONE), 0);   /* .yz11: x^4 * x^3 = x^7 */
+
+         i915_emit_arith(p,
+                         A0_DP4,
+                         get_result_vector(p, &inst->Dst[0]),
+                         A0_DEST_CHANNEL_Y, 0,
+                         swizzle(tmp1, W, Z, Y, X),
+                         i915_emit_const4fv(p, sin_constants), 0);
+      }
+
+      if (writemask & TGSI_WRITEMASK_X) {
+         i915_emit_arith(p,
+                         A0_MUL,
+                         tmp, A0_DEST_CHANNEL_XYZ, 0,
+                         swizzle(tmp, X, X, Z, ONE),
+                         swizzle(tmp, Z, ONE, ONE, ONE), 0);
+
+         i915_emit_arith(p,
+                         A0_DP4,
+                         get_result_vector(p, &inst->Dst[0]),
+                         A0_DEST_CHANNEL_X, 0,
+                         swizzle(tmp, ONE, Z, Y, X),
+                         i915_emit_const4fv(p, cos_constants), 0);
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+      emit_simple_arith(p, inst, A0_SGE, 2);
+      break;
+
+   case TGSI_OPCODE_SLE:
+      /* like SGE, but swap reg0, reg1 */
+      emit_simple_arith_swap2(p, inst, A0_SGE, 2);
+      break;
+
+   case TGSI_OPCODE_SIN:
+      src0 = src_vector(p, &inst->Src[0]);
+      tmp = i915_get_utemp(p);
+      /* scale by 1/(2*pi), apply A0_MOD, scale back by 2*pi, all in tmp.x */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_X, 0,
+                      src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0);
+
+      i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0);
+
+      /* By choosing different taylor constants, could get rid of this mul:
+       */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_X, 0,
+                      tmp, i915_emit_const1f(p, (float) (M_PI * 2.0)), 0);
+
+      /*
+       * t0.xy = MUL t0.xx11, t0.x111    ; x^2, x
+       * t0 = MUL t0.xyxy, t0.xx11       ; x^4, x^3, x^2, x
+       * t0 = MUL t0.xyyw, t0.yz11       ; x^7, x^5, x^3, x
+       * result = DP4 t0.wzyx, sin_constants
+       */
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_XY, 0,
+                      swizzle(tmp, X, X, ONE, ONE),
+                      swizzle(tmp, X, ONE, ONE, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_ALL, 0,
+                      swizzle(tmp, X, Y, X, Y),
+                      swizzle(tmp, X, X, ONE, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_ALL, 0,
+                      swizzle(tmp, X, Y, Y, W),
+                      swizzle(tmp, Y, Z, ONE, ONE), 0);   /* .yz11: x^4 * x^3 = x^7 */
+
+      i915_emit_arith(p,
+                      A0_DP4,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(tmp, W, Z, Y, X),
+                      i915_emit_const4fv(p, sin_constants), 0);
+      break;
+
+   case TGSI_OPCODE_SLT:
+      emit_simple_arith(p, inst, A0_SLT, 2);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      /* like SLT, but swap reg0, reg1 */
+      emit_simple_arith_swap2(p, inst, A0_SLT, 2);
+      break;
+
+   case TGSI_OPCODE_SUB:
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+
+      i915_emit_arith(p,
+                      A0_ADD,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      src0, negate(src1, 1, 1, 1, 1), 0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      emit_tex(p, inst, T0_TEXLD);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      emit_tex(p, inst, T0_TEXLDB);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      emit_tex(p, inst, T0_TEXLDP);
+      break;
+
+   case TGSI_OPCODE_XPD:
+      /* Cross product:
+       *      result.x = src0.y * src1.z - src0.z * src1.y;
+       *      result.y = src0.z * src1.x - src0.x * src1.z;
+       *      result.z = src0.x * src1.y - src0.y * src1.x;
+       *      result.w = undef;
+       */
+      src0 = src_vector(p, &inst->Src[0]);
+      src1 = src_vector(p, &inst->Src[1]);
+      tmp = i915_get_utemp(p);
+
+      i915_emit_arith(p,
+                      A0_MUL,
+                      tmp, A0_DEST_CHANNEL_ALL, 0,
+                      swizzle(src0, Z, X, Y, ONE),
+                      swizzle(src1, Y, Z, X, ONE), 0);
+
+      i915_emit_arith(p,
+                      A0_MAD,
+                      get_result_vector(p, &inst->Dst[0]),
+                      get_result_flags(inst), 0,
+                      swizzle(src0, Y, Z, X, ONE),
+                      swizzle(src1, Z, X, Y, ONE),
+                      negate(tmp, 1, 1, 1, 0));
+      break;
+
+   default:
+      i915_program_error(p, "bad opcode %d", inst->Instruction.Opcode);
+      p->error = 1;   /* redundant: i915_program_error() already set it */
+      return;
+   }
+
+   i915_release_utemps(p);
+}
+
+
+/**
+ * Translate TGSI fragment shader into i915 hardware instructions.
+ * Out-of-range declarations, too many immediates and exhausted constant
+ * slots raise a program error rather than overrunning fixed-size tables.
+ * \param p  the translation state
+ * \param tokens  the TGSI token array
+ */
+static void
+i915_translate_instructions(struct i915_fp_compile *p,
+                            const struct tgsi_token *tokens)
+{
+   struct i915_fragment_shader *ifs = p->shader;
+   struct tgsi_parse_context parse;
+   const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
+   const struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate;
+   uint i, j;
+
+   tgsi_parse_init( &parse, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (decl->Declaration.File == TGSI_FILE_CONSTANT) {
+            for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+               if (i >= I915_MAX_CONSTANT) {
+                  i915_program_error(p, "Exceeded max constants");
+                  break;
+               }
+               assert(ifs->constant_flags[i] == 0x0);
+               ifs->constant_flags[i] = I915_CONSTFLAG_USER;
+               ifs->num_constants = MAX2(ifs->num_constants, i + 1);
+            }
+         }
+         else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) {
+            for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+               if (i >= I915_MAX_TEMPORARY) {
+                  i915_program_error(p, "Exceeded max temporary reg");
+                  break;
+               }
+               /* XXX just use shader->info->file_mask[TGSI_FILE_TEMPORARY] */
+               p->temp_flag |= (1 << i); /* mark temp as used */
+            }
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         assert( imm->Immediate.NrTokens <= 4 + 1 );
+         if (p->num_immediates >= Elements(p->immediates)) {
+            i915_program_error(p, "Exceeded max immediates");
+            break;
+         }
+         /* copy at most four components, even if asserts are disabled */
+         for (j = 0; j < imm->Immediate.NrTokens - 1 && j < 4; j++)
+            p->immediates[p->num_immediates][j] = imm->u[j].Float;
+         p->num_immediates++;
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (p->first_instruction) {
+            /* resolve location of immediates */
+            for (i = 0; i < p->num_immediates; i++) {
+               /* find constant slot for this immediate */
+               for (j = 0; j < I915_MAX_CONSTANT; j++) {
+                  if (ifs->constant_flags[j] == 0x0) {
+                     memcpy(ifs->constants[j], p->immediates[i],
+                            4 * sizeof(float));
+                     ifs->constant_flags[j] = 0xf;  /* all four comps used */
+                     p->immediates_map[i] = j;
+                     ifs->num_constants = MAX2(ifs->num_constants, j + 1);
+                     break;
+                  }
+               }
+               /* no free slot left: immediates_map[i] would stay unset */
+               if (j == I915_MAX_CONSTANT)
+                  i915_program_error(p, "out of constants for immediates");
+            }
+            p->first_instruction = FALSE;
+         }
+
+         i915_translate_instruction(p, &parse.FullToken.FullInstruction);
+         break;
+
+      default:
+         assert( 0 );
+      }
+   } /* while */
+
+   tgsi_parse_free (&parse);
+}
+
+/** Allocate a zeroed compile context for ifs and reset ifs's constant table. */
+static struct i915_fp_compile *
+i915_init_compile(struct i915_context *i915,
+                  struct i915_fragment_shader *ifs)
+{
+   /* NOTE(review): the CALLOC_STRUCT result is used without a NULL check */
+   struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile);
+   p->shader = ifs;
+
+   /* Start with an empty constant table; slots are claimed during
+    * translation by declared user constants, immediates and constants
+    * emitted by the compiler.
+    */
+   ifs->num_constants = 0;
+   memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags));
+
+   p->first_instruction = TRUE;
+
+   p->nr_tex_indirect = 1;      /* correct? */
+   p->nr_tex_insn = 0;
+   p->nr_alu_insn = 0;
+   p->nr_decl_insn = 0;
+
+   p->csr = p->program;
+   p->decl = p->declarations;
+   p->decl_s = 0;
+   p->decl_t = 0;
+   /* every bit at or above I915_MAX_TEMPORARY starts out "in use"; built
+    * by complementing a mask because left-shifting ~0 (negative) is UB */
+   p->temp_flag = ~((1 << I915_MAX_TEMPORARY) - 1);
+   p->utemp_flag = ~0x7;
+
+   p->wpos_tex = -1;
+
+   /* initialize the first program word */
+   *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM;
+
+   return p;
+}
+
+
+/**
+ * Finish a compile: check the instruction counts against the hardware
+ * limits, hand either the generated program or the pass-through fallback
+ * to the shader, and destroy the compilation context.
+ * \param i915  the context (unused here)
+ * \param p  the compile context; freed before returning
+ */
+static void
+i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p)
+{
+   struct i915_fragment_shader *ifs = p->shader;
+   const unsigned long decl_size
+      = (unsigned long) (p->decl - p->declarations);
+   const unsigned long program_size
+      = (unsigned long) (p->csr - p->program);
+   const unsigned long total_size = decl_size + program_size;
+
+   /* Check the final instruction counts against the hardware limits. */
+   if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT)
+      i915_program_error(p, "Exceeded max nr indirect texture lookups");
+   if (p->nr_tex_insn > I915_MAX_TEX_INSN)
+      i915_program_error(p, "Exceeded max TEX instructions");
+   if (p->nr_alu_insn > I915_MAX_ALU_INSN)
+      i915_program_error(p, "Exceeded max ALU instructions");
+   if (p->nr_decl_insn > I915_MAX_DECL_INSN)
+      i915_program_error(p, "Exceeded max DECL instructions");
+
+   if (!p->error) {
+      p->NumNativeInstructions
+         = p->nr_alu_insn + p->nr_tex_insn + p->nr_decl_insn;
+      p->NumNativeAluInstructions = p->nr_alu_insn;
+      p->NumNativeTexInstructions = p->nr_tex_insn;
+      p->NumNativeTexIndirections = p->nr_tex_indirect;
+
+      /* declarations[0] is the first program word; OR in the length,
+       * which is the total dword count minus 2.
+       */
+      p->declarations[0] |= total_size - 2;
+
+      /* The shader gets its own copy: declarations first, then code. */
+      assert(!ifs->program);
+      ifs->program = (uint *) MALLOC(total_size * sizeof(uint));
+      if (ifs->program) {
+         ifs->program_len = total_size;
+         memcpy(ifs->program, p->declarations, decl_size * sizeof(uint));
+         memcpy(ifs->program + decl_size, p->program,
+                program_size * sizeof(uint));
+      }
+   }
+   else {
+      /* translation failed: report no native instructions and fall back */
+      p->NumNativeInstructions = 0;
+      p->NumNativeAluInstructions = 0;
+      p->NumNativeTexInstructions = 0;
+      p->NumNativeTexIndirections = 0;
+
+      i915_use_passthrough_shader(ifs);
+   }
+
+   /* the compile context is not needed any more */
+   FREE(p);
+}
+
+
+/**
+ * Find an unused texture coordinate slot to use for fragment WPOS.
+ *
+ * The intended behaviour (kept in the first, disabled "#if 0" branch) is
+ * to store the slot index in p->wpos_tex, or -1 if no unused texcoord
+ * slot is found.  That search is currently compiled out: the live branch
+ * only tests whether input 0 carries the POSITION semantic and then does
+ * nothing, so p->wpos_tex is left untouched by this function.
+ */
+static void
+i915_find_wpos_space(struct i915_fp_compile *p)
+{
+#if 0
+   const uint inputs
+      = p->shader->inputs_read | (1 << TGSI_ATTRIB_POS); /*XXX hack*/
+   uint i;
+
+   p->wpos_tex = -1;
+
+   if (inputs & (1 << TGSI_ATTRIB_POS)) {
+      for (i = 0; i < I915_TEX_UNITS; i++) {
+	 if ((inputs & (1 << (TGSI_ATTRIB_TEX0 + i))) == 0) {
+	    p->wpos_tex = i;
+	    return;
+	 }
+      }
+
+      i915_program_error(p, "No free texcoord for wpos value");
+   }
+#else
+   if (p->shader->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
+      /* frag shader using the fragment position input */
+#if 0
+      assert(0);
+#endif
+   }
+#endif
+}
+
+
+
+
+/**
+ * Depth writes are not intercepted while instructions are emitted.
+ * Instead, when the shader writes a position/depth output, one extra
+ * MOV is appended at the end of the program that copies the Z channel
+ * of the depth output register into its W channel.
+ */
+static void
+i915_fixup_depth_write(struct i915_fp_compile *p)
+{
+   uint od;
+
+   /* XXX assuming pos/depth is always in output[0] */
+   if (p->shader->info.output_semantic_name[0] != TGSI_SEMANTIC_POSITION)
+      return;
+
+   od = UREG(REG_TYPE_OD, 0);
+
+   /* MOV od.w, od.xyzz -- no saturate, src1 and src2 unused */
+   i915_emit_arith(p, A0_MOV, od, A0_DEST_CHANNEL_W, 0,
+                   swizzle(od, X, Y, Z, Z),
+                   0, 0);
+}
+
+
+/**
+ * Translate the TGSI tokens of \p fs into an i915 fragment program.
+ *
+ * Runs the compile stages in order: create the compile context, look
+ * for a WPOS slot, translate the instructions, append the depth-write
+ * fixup and finally copy the result into \p fs.
+ */
+void
+i915_translate_fragment_program( struct i915_context *i915,
+                                 struct i915_fragment_shader *fs)
+{
+   struct i915_fp_compile *compile;
+
+   compile = i915_init_compile(i915, fs);
+
+   i915_find_wpos_space(compile);
+
+#if 0
+   tgsi_dump(fs->state.tokens, 0);
+#endif
+
+   i915_translate_instructions(compile, fs->state.tokens);
+   i915_fixup_depth_write(compile);
+
+   /* Also frees the compile context. */
+   i915_fini_compile(i915, compile);
+}
diff --git a/src/gallium/drivers/i915/i915_prim_emit.c b/src/gallium/drivers/i915/i915_prim_emit.c
new file mode 100644
index 0000000000..dd997e2cf4
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_prim_emit.c
@@ -0,0 +1,226 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "draw/draw_pipe.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_state.h"
+#include "i915_batch.h"
+
+
+
+/**
+ * Final draw-pipeline stage that emits primitives straight to the
+ * hardware.  There is no vertex buffer support and no fast path.
+ */
+struct setup_stage {
+   /** Base class; must stay the first member so the struct can be cast. */
+   struct draw_stage stage;
+
+   /** Context the primitives are emitted into. */
+   struct i915_context *i915;
+};
+
+
+
+/**
+ * Downcast a generic draw_stage pointer to the setup_stage it is
+ * embedded in.
+ */
+static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
+{
+   struct setup_stage *setup = (struct setup_stage *) stage;
+
+   return setup;
+}
+
+
+/**
+ * Write one vertex to the batch buffer.
+ *
+ * The vertices come from the 'draw' module and start with a 1-dword
+ * header and a 4-dword clip position, which are not emitted.  For each
+ * attribute of the current vertex layout the source data is selected
+ * through src_index and written in the format requested by its emit
+ * mode: 1-4 raw floats, or four components packed into one dword of
+ * unsigned bytes (optionally with components 0 and 2 swapped for BGRA).
+ */
+static INLINE void
+emit_hw_vertex( struct i915_context *i915,
+                const struct vertex_header *vertex)
+{
+   const struct vertex_info *vinfo = &i915->current.vertex_info;
+   uint attr;
+   uint dwords = 0;  /* running total, only used for the sanity check */
+
+   assert(!i915->dirty);
+
+   for (attr = 0; attr < vinfo->num_attribs; attr++) {
+      const float *src = vertex->data[vinfo->attrib[attr].src_index];
+      uint nr_floats = 0;   /* raw floats to emit for this attribute */
+      uint c;
+
+      switch (vinfo->attrib[attr].emit) {
+      case EMIT_1F:
+         nr_floats = 1;
+         break;
+      case EMIT_2F:
+         nr_floats = 2;
+         break;
+      case EMIT_3F:
+         nr_floats = 3;
+         break;
+      case EMIT_4F:
+         nr_floats = 4;
+         break;
+      case EMIT_4UB:
+         OUT_BATCH( pack_ub4(float_to_ubyte( src[0] ),
+                             float_to_ubyte( src[1] ),
+                             float_to_ubyte( src[2] ),
+                             float_to_ubyte( src[3] )) );
+         dwords += 1;
+         break;
+      case EMIT_4UB_BGRA:
+         OUT_BATCH( pack_ub4(float_to_ubyte( src[2] ),
+                             float_to_ubyte( src[1] ),
+                             float_to_ubyte( src[0] ),
+                             float_to_ubyte( src[3] )) );
+         dwords += 1;
+         break;
+      default:
+         assert(0);
+      }
+
+      /* Float formats: one dword per component, in order. */
+      for (c = 0; c < nr_floats; c++) {
+         OUT_BATCH( fui(src[c]) );
+      }
+      dwords += nr_floats;
+   }
+
+   assert(dwords == vinfo->size);
+}
+
+
+
+/**
+ * Emit a single primitive of \p nr vertices inline in the batch buffer.
+ *
+ * Pending state is validated and emitted first.  If the batch has no
+ * room, it is flushed, state is re-emitted and the reservation is tried
+ * once more; a second failure drops the primitive.
+ *
+ * \param hwprim  PRIM3D_* value OR'ed into the 3DPRIMITIVE header
+ * \param nr      number of vertices taken from prim->v[]
+ */
+static INLINE void
+emit_prim( struct draw_stage *stage,
+           struct prim_header *prim,
+           unsigned hwprim,
+           unsigned nr )
+{
+   struct i915_context *i915 = setup_stage(stage)->i915;
+   unsigned vertex_bytes;
+   unsigned batch_dwords;
+   unsigned v;
+
+   if (i915->dirty)
+      i915_update_derived( i915 );
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state( i915 );
+
+   /* The vertex layout is only valid after the validation above. */
+   vertex_bytes = i915->current.vertex_info.size * 4;
+   assert(vertex_bytes >= 12); /* never smaller than 12 bytes */
+
+   /* One header dword plus the vertex data. */
+   batch_dwords = 1 + nr * vertex_bytes / 4;
+
+   if (!BEGIN_BATCH( batch_dwords, 0 )) {
+      FLUSH_BATCH(NULL);
+
+      /* A flush discards emitted state, so emit it again. */
+      i915_update_derived( i915 );
+      i915_emit_hardware_state( i915 );
+
+      if (!BEGIN_BATCH( batch_dwords, 0 )) {
+         assert(0);
+         return;
+      }
+   }
+
+   /* Every primitive gets its own 3DPRIMITIVE command. */
+   OUT_BATCH(_3DPRIMITIVE |
+             hwprim |
+             ((4 + vertex_bytes * nr)/4 - 2));
+
+   for (v = 0; v < nr; v++) {
+      emit_hw_vertex(i915, prim->v[v]);
+   }
+}
+
+
+/** Triangle hook: emit the triangle as a three-vertex TRILIST primitive. */
+static void
+setup_tri( struct draw_stage *stage, struct prim_header *prim )
+{
+   const unsigned nr_verts = 3;
+
+   emit_prim(stage, prim, PRIM3D_TRILIST, nr_verts);
+}
+
+
+/** Line hook: emit the line as a two-vertex LINELIST primitive. */
+static void
+setup_line(struct draw_stage *stage, struct prim_header *prim)
+{
+   const unsigned nr_verts = 2;
+
+   emit_prim(stage, prim, PRIM3D_LINELIST, nr_verts);
+}
+
+
+/** Point hook: emit the point as a one-vertex POINTLIST primitive. */
+static void
+setup_point(struct draw_stage *stage, struct prim_header *prim)
+{
+   const unsigned nr_verts = 1;
+
+   emit_prim(stage, prim, PRIM3D_POINTLIST, nr_verts);
+}
+
+
+/** Flush hook: intentionally empty, this stage buffers nothing itself. */
+static void
+setup_flush( struct draw_stage *stage, unsigned flags )
+{
+   (void) stage;
+   (void) flags;
+}
+
+/** Stipple-counter reset hook: intentionally empty. */
+static void
+reset_stipple_counter( struct draw_stage *stage )
+{
+   (void) stage;
+}
+
+/** Destroy hook: release the memory of the stage object. */
+static void
+render_destroy( struct draw_stage *stage )
+{
+   FREE(stage);
+}
+
+
+/**
+ * Create a new primitive setup/render stage.  This gets plugged into
+ * the 'draw' module's pipeline.
+ *
+ * \param i915  context the stage emits primitives into
+ * \return the new stage, or NULL if the allocation failed
+ */
+struct draw_stage *i915_draw_render_stage( struct i915_context *i915 )
+{
+   struct setup_stage *setup = CALLOC_STRUCT(setup_stage);
+
+   /* Report allocation failure instead of dereferencing NULL below. */
+   if (!setup)
+      return NULL;
+
+   setup->i915 = i915;
+   setup->stage.draw = i915->draw;
+   setup->stage.point = setup_point;
+   setup->stage.line = setup_line;
+   setup->stage.tri = setup_tri;
+   setup->stage.flush = setup_flush;
+   setup->stage.reset_stipple_counter = reset_stipple_counter;
+   setup->stage.destroy = render_destroy;
+
+   return &setup->stage;
+}
diff --git a/src/gallium/drivers/i915/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c
new file mode 100644
index 0000000000..f8665acbe1
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_prim_vbuf.c
@@ -0,0 +1,695 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * Build post-transformation, post-clipping vertex buffers and element
+ * lists by hooking into the end of the primitive pipeline and
+ * manipulating the vertex_id field in the vertex headers.
+ *
+ * XXX: work in progress 
+ * 
+ * \author José Fonseca <jrfonseca@tungstengraphics.com>
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_debug.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_fifo.h"
+
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_batch.h"
+#include "i915_state.h"
+
+
+#undef VBUF_USE_FIFO
+#undef VBUF_MAP_BUFFER
+
+/**
+ * Primitive renderer for i915.
+ *
+ * Implements the vbuf_render interface of the 'draw' module: vertices
+ * are placed in a winsys buffer and drawn with indirect 3DPRIMITIVE
+ * commands.
+ */
+struct i915_vbuf_render {
+   struct vbuf_render base;
+
+   struct i915_context *i915;
+
+   /** Vertex size in bytes */
+   size_t vertex_size;
+
+   /** Software primitive */
+   unsigned prim;
+
+   /** Hardware primitive */
+   unsigned hwprim;
+
+   /**
+    * PIPE_PRIM_* type that has to be emulated by generating an index
+    * list (line loops, quads, quad strips), or 0 if hwprim can be drawn
+    * directly.
+    */
+   unsigned fallback;
+
+   /* Stuff for the vbo */
+   struct i915_winsys_buffer *vbo;
+   size_t vbo_size; /**< current size of allocated buffer */
+   size_t vbo_alloc_size; /**< minimum buffer size to allocate */
+   size_t vbo_offset; /**< byte offset in vbo of the current allocation */
+   void *vbo_ptr; /**< CPU pointer vertices are written through */
+   size_t vbo_max_used; /**< bytes used since the last release */
+
+#ifndef VBUF_MAP_BUFFER
+   /* vbo_ptr is a malloc'ed staging area that gets uploaded on unmap */
+   size_t map_used_start; /**< first byte of the last uploaded range */
+   size_t map_used_end; /**< one past the last uploaded byte */
+   size_t map_size; /**< allocated size of vbo_ptr */
+#endif
+
+#ifdef VBUF_USE_FIFO
+   /* Stuff for the pool */
+   struct util_fifo *pool_fifo;
+   unsigned pool_used;
+   unsigned pool_buffer_size;
+   boolean pool_not_used;
+#endif
+};
+
+
+/**
+ * Downcast a vbuf_render pointer to the i915_vbuf_render it is the base
+ * of.  The pointer must not be NULL.
+ */
+static INLINE struct i915_vbuf_render *
+i915_vbuf_render(struct vbuf_render *render)
+{
+   struct i915_vbuf_render *i915_render;
+
+   assert(render);
+   i915_render = (struct i915_vbuf_render *) render;
+   return i915_render;
+}
+
+/**
+ * Return the hardware vertex layout of the context, updating derived
+ * state first if anything is dirty.
+ */
+static const struct vertex_info *
+i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
+{
+   struct i915_context *i915 = i915_vbuf_render(render)->i915;
+
+   /* The layout is derived state; refresh it before handing it out. */
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   return &i915->current.vertex_info;
+}
+
+/**
+ * Check whether \p size more bytes fit into the current vertex buffer.
+ *
+ * \return FALSE if the buffer is too small behind the current offset or
+ *         the context has its vbo_flushed flag set, TRUE otherwise
+ */
+static boolean
+i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
+{
+   const struct i915_context *i915 = i915_render->i915;
+   const size_t needed = size + i915_render->vbo_offset;
+
+   if (i915->vbo_flushed || i915_render->vbo_size < needed)
+      return FALSE;
+
+   return TRUE;
+}
+
+/**
+ * Replace the current vertex buffer with a new one that holds at least
+ * \p size bytes (and never less than vbo_alloc_size).
+ *
+ * The previous buffer is destroyed, or returned to the pool when the
+ * FIFO pool is compiled in and the buffer came from it.  The write
+ * offset and the context's vbo_flushed flag are reset.  Without
+ * VBUF_MAP_BUFFER the malloc'ed staging area is grown to match.
+ */
+static void
+i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+   struct i915_winsys *iws = i915->iws;
+
+   /* Dispose of the old buffer first. */
+   if (i915_render->vbo) {
+#ifdef VBUF_USE_FIFO
+      if (i915_render->pool_not_used)
+         iws->buffer_destroy(iws, i915_render->vbo);
+      else
+         u_fifo_add(i915_render->pool_fifo, i915_render->vbo);
+      i915_render->vbo = NULL;
+#else
+      iws->buffer_destroy(iws, i915_render->vbo);
+#endif
+   }
+
+   i915->vbo_flushed = 0;
+
+   i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
+   i915_render->vbo_offset = 0;
+
+#ifndef VBUF_MAP_BUFFER
+   /* The staging area only ever grows; its old contents are discarded. */
+   if (i915_render->vbo_size > i915_render->map_size) {
+      i915_render->map_size = i915_render->vbo_size;
+      FREE(i915_render->vbo_ptr);
+      i915_render->vbo_ptr = MALLOC(i915_render->map_size);
+   }
+#endif
+
+#ifdef VBUF_USE_FIFO
+   /* Only buffers of exactly the pool size are taken from the pool. */
+   if (i915_render->vbo_size != i915_render->pool_buffer_size) {
+      i915_render->pool_not_used = TRUE;
+      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
+            I915_NEW_VERTEX);
+   } else {
+      i915_render->pool_not_used = FALSE;
+
+      if (i915_render->pool_used >= 2) {
+         FLUSH_BATCH(NULL);
+         i915->vbo_flushed = 0;
+         i915_render->pool_used = 0;
+      }
+      u_fifo_pop(i915_render->pool_fifo, (void**)&i915_render->vbo);
+   }
+#else
+   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size,
+                                         64, I915_NEW_VERTEX);
+#endif
+}
+
+/**
+ * Reserve room for \p nr_vertices vertices of \p vertex_size bytes.
+ *
+ * A new buffer is allocated when the current one cannot take the
+ * request.  The buffer and the offset of the allocation are published
+ * in the context and I915_NEW_VBO is flagged dirty.
+ *
+ * \return TRUE if a vertex buffer is available, FALSE otherwise
+ */
+static boolean
+i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
+                                   ushort vertex_size,
+                                   ushort nr_vertices)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   const size_t bytes_needed = (size_t)vertex_size * (size_t)nr_vertices;
+   boolean have_room;
+
+   /* FIXME: handle failure */
+   assert(!i915->vbo);
+
+   have_room = i915_vbuf_render_reserve(i915_render, bytes_needed);
+   if (!have_room) {
+#ifdef VBUF_USE_FIFO
+      /* in case we flushed, reset the number of pool buffers used */
+      if (i915->vbo_flushed)
+         i915_render->pool_used = 0;
+#endif
+      i915_vbuf_render_new_buf(i915_render, bytes_needed);
+   }
+
+   i915_render->vertex_size = vertex_size;
+
+   /* Publish the allocation to the context. */
+   i915->vbo = i915_render->vbo;
+   i915->vbo_offset = i915_render->vbo_offset;
+   i915->dirty |= I915_NEW_VBO;
+
+   return i915_render->vbo ? TRUE : FALSE;
+}
+
+/**
+ * Return the CPU address the vertices of the current allocation are
+ * written to.
+ *
+ * With VBUF_MAP_BUFFER the winsys buffer is mapped and the address at
+ * the current offset is returned; otherwise the start of the staging
+ * area is returned.
+ */
+static void *
+i915_vbuf_render_map_vertices(struct vbuf_render *render)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   struct i915_winsys *iws = i915->iws;
+   unsigned char *vertices;
+
+   if (i915->vbo_flushed)
+      debug_printf("%s bad vbo flush occured stalling on hw\n", __FUNCTION__);
+
+#ifdef VBUF_MAP_BUFFER
+   i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
+   vertices = (unsigned char *)i915_render->vbo_ptr + i915_render->vbo_offset;
+#else
+   (void)iws;
+   vertices = (unsigned char *)i915_render->vbo_ptr;
+#endif
+
+   return vertices;
+}
+
+/**
+ * Finish writing vertices \p min_index .. \p max_index.
+ *
+ * The high-water mark of used bytes is updated.  With VBUF_MAP_BUFFER
+ * the buffer is unmapped; otherwise the byte range covering the given
+ * vertices is uploaded from the staging area into the vertex buffer at
+ * the current offset.
+ */
+static void
+i915_vbuf_render_unmap_vertices(struct vbuf_render *render,
+                                ushort min_index,
+                                ushort max_index)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   struct i915_winsys *iws = i915->iws;
+   const size_t used_end = i915_render->vertex_size * (max_index + 1);
+
+   i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used, used_end);
+
+#ifdef VBUF_MAP_BUFFER
+   iws->buffer_unmap(iws, i915_render->vbo);
+#else
+   i915_render->map_used_start = i915_render->vertex_size * min_index;
+   i915_render->map_used_end = used_end;
+
+   /* Upload only the bytes that belong to the touched vertices. */
+   iws->buffer_write(iws, i915_render->vbo,
+                     i915_render->map_used_start + i915_render->vbo_offset,
+                     i915_render->map_used_end - i915_render->map_used_start,
+                     (unsigned char *)i915_render->vbo_ptr
+                        + i915_render->map_used_start);
+#endif
+}
+
+/**
+ * Select the primitive type for subsequent draws.
+ *
+ * Each supported PIPE_PRIM_* type is mapped to a PRIM3D_* hardware
+ * primitive.  Line loops, quads and quad strips additionally record
+ * themselves as the fallback type, which makes the draw functions
+ * generate an index list for them.
+ *
+ * \return TRUE if \p prim is supported, FALSE otherwise (hwprim and
+ *         fallback are then left unchanged)
+ */
+static boolean
+i915_vbuf_render_set_primitive(struct vbuf_render *render,
+                               unsigned prim)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   unsigned hwprim;
+   unsigned fallback = 0;   /* most types need no emulation */
+
+   i915_render->prim = prim;
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      hwprim = PRIM3D_POINTLIST;
+      break;
+   case PIPE_PRIM_LINES:
+      hwprim = PRIM3D_LINELIST;
+      break;
+   case PIPE_PRIM_LINE_LOOP:
+      hwprim = PRIM3D_LINELIST;
+      fallback = PIPE_PRIM_LINE_LOOP;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+      hwprim = PRIM3D_LINESTRIP;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      hwprim = PRIM3D_TRILIST;
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      hwprim = PRIM3D_TRISTRIP;
+      break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      hwprim = PRIM3D_TRIFAN;
+      break;
+   case PIPE_PRIM_QUADS:
+      hwprim = PRIM3D_TRILIST;
+      fallback = PIPE_PRIM_QUADS;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      hwprim = PRIM3D_TRILIST;
+      fallback = PIPE_PRIM_QUAD_STRIP;
+      break;
+   case PIPE_PRIM_POLYGON:
+      hwprim = PRIM3D_POLY;
+      break;
+   default:
+      /* FIXME: Actually, can handle a lot more just fine... */
+      return FALSE;
+   }
+
+   i915_render->hwprim = hwprim;
+   i915_render->fallback = fallback;
+   return TRUE;
+}
+
+/**
+ * Used for fallbacks in draw_arrays: write an index list for the
+ * sequential vertices start .. start + nr - 1 to the batch buffer.
+ * Two 16-bit indices are packed into each dword, the first one in the
+ * low half.
+ *
+ * \param start  first vertex
+ * \param nr     number of vertices
+ * \param type   0 for a plain sequential list, otherwise the PIPE_PRIM_*
+ *               type to convert: line loops become line segments, quads
+ *               and quad strips become two triangles per quad
+ *
+ * The number of indices written matches draw_arrays_calc_nr_indices().
+ */
+static void
+draw_arrays_generate_indices(struct vbuf_render *render,
+                             unsigned start, uint nr,
+                             unsigned type)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   unsigned i;
+   unsigned end = start + nr;
+   switch(type) {
+   case 0:
+      for (i = start; i+1 < end; i += 2)
+         OUT_BATCH((i+0) | (i+1) << 16);
+      /* odd count: the last index sits alone in its dword */
+      if (i < end)
+         OUT_BATCH(i);
+      break;
+   case PIPE_PRIM_LINE_LOOP:
+      if (nr >= 2) {
+         /* One segment per pair of consecutive vertices (i-1, i).  This
+          * used to emit (i, i), a degenerate segment.
+          */
+         for (i = start + 1; i < end; i++)
+            OUT_BATCH((i-1) | (i+0) << 16);
+         /* Closing segment from the last vertex (end - 1) back to the
+          * first; i equals end here.
+          */
+         OUT_BATCH((i-1) | ( start) << 16);
+      }
+      break;
+   case PIPE_PRIM_QUADS:
+      /* quad (0,1,2,3) -> triangles (0,1,3) and (1,2,3) */
+      for (i = start; i + 3 < end; i += 4) {
+         OUT_BATCH((i+0) | (i+1) << 16);
+         OUT_BATCH((i+3) | (i+1) << 16);
+         OUT_BATCH((i+2) | (i+3) << 16);
+      }
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      /* strip quad (0,1,2,3) -> triangles (0,1,3) and (2,0,3) */
+      for (i = start; i + 3 < end; i += 2) {
+         OUT_BATCH((i+0) | (i+1) << 16);
+         OUT_BATCH((i+3) | (i+2) << 16);
+         OUT_BATCH((i+0) | (i+3) << 16);
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+/**
+ * Number of indices the index list for \p nr sequential vertices has.
+ *
+ * \param nr    number of vertices
+ * \param type  0 for a plain list, otherwise the PIPE_PRIM_* fallback
+ *              type being converted
+ * \return the index count, or 0 if there are too few vertices to form
+ *         a single primitive
+ */
+static unsigned
+draw_arrays_calc_nr_indices(uint nr, unsigned type)
+{
+   switch (type) {
+   case 0:
+      return nr;
+   case PIPE_PRIM_LINE_LOOP:
+      if (nr >= 2)
+         return nr * 2;
+      else
+         return 0;
+   case PIPE_PRIM_QUADS:
+      return (nr / 4) * 6;
+   case PIPE_PRIM_QUAD_STRIP:
+      /* A strip needs 4 vertices for its first quad.  The guard also
+       * keeps the unsigned nr - 2 from wrapping around for nr < 2.
+       */
+      if (nr >= 4)
+         return ((nr - 2) / 2) * 6;
+      else
+         return 0;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+/**
+ * Draw \p nr sequential vertices of a primitive type that needs the
+ * fallback path, by emitting an indexed 3DPRIMITIVE with a generated
+ * index list.
+ *
+ * If the batch has no room it is flushed, state is re-emitted and the
+ * context's vbo_flushed flag is set before one more attempt.
+ */
+static void
+draw_arrays_fallback(struct vbuf_render *render,
+                     unsigned start,
+                     uint nr)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   unsigned nr_indices;
+   unsigned batch_dwords;
+
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state(i915);
+
+   nr_indices = draw_arrays_calc_nr_indices(nr, i915_render->fallback);
+   if (nr_indices == 0)
+      return;
+
+   /* Header dword plus two indices per dword, rounded up. */
+   batch_dwords = 1 + (nr_indices + 1)/2;
+
+   if (!BEGIN_BATCH(batch_dwords, 1)) {
+      FLUSH_BATCH(NULL);
+
+      /* A flush discards emitted state, so emit it again. */
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
+      i915->vbo_flushed = 1;
+
+      if (!BEGIN_BATCH(batch_dwords, 1)) {
+         assert(0);
+         return;
+      }
+   }
+
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             i915_render->hwprim |
+             PRIM_INDIRECT_ELTS |
+             nr_indices);
+
+   draw_arrays_generate_indices(render, start, nr, i915_render->fallback);
+}
+
+/**
+ * Draw \p nr sequential vertices beginning at vertex \p start.
+ *
+ * Primitive types with a fallback are passed to draw_arrays_fallback().
+ * All others are drawn with a two-dword sequential indirect
+ * 3DPRIMITIVE.  If the batch has no room it is flushed, state is
+ * re-emitted and the context's vbo_flushed flag is set before one more
+ * attempt.
+ */
+static void
+i915_vbuf_render_draw_arrays(struct vbuf_render *render,
+                             unsigned start,
+                             uint nr)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+
+   if (i915_render->fallback) {
+      draw_arrays_fallback(render, start, nr);
+      return;
+   }
+
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state(i915);
+
+   if (!BEGIN_BATCH(2, 0)) {
+      FLUSH_BATCH(NULL);
+
+      /* A flush discards emitted state, so emit it again. */
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
+      i915->vbo_flushed = 1;
+
+      if (!BEGIN_BATCH(2, 0)) {
+         assert(0);
+         return;
+      }
+   }
+
+   /* dword 0: command with the vertex count */
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             PRIM_INDIRECT_SEQUENTIAL |
+             i915_render->hwprim |
+             nr);
+   /* dword 1: beginning vertex index */
+   OUT_BATCH(start);
+}
+
+/**
+ * Write an index list to the batch buffer, for both the normal and the
+ * fallback path.  Two 16-bit indices are packed into each dword, the
+ * first one in the low half.
+ *
+ * \param indices     source indices
+ * \param nr_indices  number of entries in \p indices
+ * \param type        0 copies the indices unchanged; a PIPE_PRIM_* type
+ *                    converts line loops into line segments and quads or
+ *                    quad strips into two triangles per quad
+ */
+static void
+draw_generate_indices(struct vbuf_render *render,
+                      const ushort *indices,
+                      uint nr_indices,
+                      unsigned type)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   unsigned n;
+
+   switch (type) {
+   case 0:
+      for (n = 0; n + 1 < nr_indices; n += 2) {
+         OUT_BATCH(indices[n] | indices[n+1] << 16);
+      }
+      /* odd count: the last index sits alone in its dword */
+      if (n < nr_indices) {
+         OUT_BATCH(indices[n]);
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      if (nr_indices >= 2) {
+         for (n = 1; n < nr_indices; n++) {
+            OUT_BATCH(indices[n-1] | indices[n] << 16);
+         }
+         /* closing segment: last vertex back to the first */
+         OUT_BATCH(indices[n-1] | indices[0] << 16);
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      /* quad (0,1,2,3) -> triangles (0,1,3) and (1,2,3) */
+      for (n = 0; n + 3 < nr_indices; n += 4) {
+         OUT_BATCH(indices[n+0] | indices[n+1] << 16);
+         OUT_BATCH(indices[n+3] | indices[n+1] << 16);
+         OUT_BATCH(indices[n+2] | indices[n+3] << 16);
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      /* strip quad (0,1,2,3) -> triangles (0,1,3) and (2,0,3) */
+      for (n = 0; n + 3 < nr_indices; n += 2) {
+         OUT_BATCH(indices[n+0] | indices[n+1] << 16);
+         OUT_BATCH(indices[n+3] | indices[n+2] << 16);
+         OUT_BATCH(indices[n+0] | indices[n+3] << 16);
+      }
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+}
+
+/**
+ * Number of indices that are emitted for \p nr_indices source indices.
+ *
+ * \param nr_indices  number of source indices
+ * \param type        0 for a plain copy, otherwise the PIPE_PRIM_*
+ *                    fallback type being converted
+ * \return the index count, or 0 if there are too few indices to form a
+ *         single primitive
+ */
+static unsigned
+draw_calc_nr_indices(uint nr_indices, unsigned type)
+{
+   switch (type) {
+   case 0:
+      return nr_indices;
+   case PIPE_PRIM_LINE_LOOP:
+      if (nr_indices >= 2)
+         return nr_indices * 2;
+      else
+         return 0;
+   case PIPE_PRIM_QUADS:
+      return (nr_indices / 4) * 6;
+   case PIPE_PRIM_QUAD_STRIP:
+      /* A strip needs 4 indices for its first quad.  The guard also
+       * keeps the unsigned nr_indices - 2 from wrapping around for
+       * nr_indices < 2.
+       */
+      if (nr_indices >= 4)
+         return ((nr_indices - 2) / 2) * 6;
+      else
+         return 0;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+/**
+ * Draw an indexed primitive.
+ *
+ * The number of indices to emit is computed first, as the fallback
+ * types expand the list.  An indexed 3DPRIMITIVE is then written,
+ * followed by the (possibly converted) indices.  If the batch has no
+ * room it is flushed, state is re-emitted and the context's vbo_flushed
+ * flag is set before one more attempt.
+ */
+static void
+i915_vbuf_render_draw_elements(struct vbuf_render *render,
+                               const ushort *indices,
+                               uint nr_indices)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+   const unsigned nr_src_indices = nr_indices;
+   unsigned nr_out_indices;
+   unsigned batch_dwords;
+
+   nr_out_indices = draw_calc_nr_indices(nr_src_indices,
+                                         i915_render->fallback);
+   if (nr_out_indices == 0)
+      return;
+
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state(i915);
+
+   /* Header dword plus two indices per dword, rounded up. */
+   batch_dwords = 1 + (nr_out_indices + 1)/2;
+
+   if (!BEGIN_BATCH(batch_dwords, 1)) {
+      FLUSH_BATCH(NULL);
+
+      /* A flush discards emitted state, so emit it again. */
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
+      i915->vbo_flushed = 1;
+
+      if (!BEGIN_BATCH(batch_dwords, 1)) {
+         assert(0);
+         return;
+      }
+   }
+
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             i915_render->hwprim |
+             PRIM_INDIRECT_ELTS |
+             nr_out_indices);
+
+   draw_generate_indices(render, indices, nr_src_indices,
+                         i915_render->fallback);
+}
+
+/**
+ * End the current vertex allocation.
+ *
+ * The buffer is detached from the context and I915_NEW_VBO is flagged
+ * dirty.  The write offset advances past the bytes that were used, so
+ * the next allocation starts behind them.
+ */
+static void
+i915_vbuf_render_release_vertices(struct vbuf_render *render)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
+
+   assert(i915->vbo);
+
+   /* Detach the buffer from the context. */
+   i915->vbo = NULL;
+   i915->dirty |= I915_NEW_VBO;
+
+   /* Skip what was used and restart the high-water mark. */
+   i915_render->vbo_offset += i915_render->vbo_max_used;
+   i915_render->vbo_max_used = 0;
+}
+
+static void
+i915_vbuf_render_destroy(struct vbuf_render *render)
+{
+   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   FREE(i915_render);
+}
+
+/**
+ * Create a new primitive render.
+ *
+ * Fills in the vbuf_render callbacks and limits and resets the vertex
+ * buffer bookkeeping.  No vertex buffer is allocated here; that happens
+ * on the first allocate_vertices call.
+ *
+ * \return the base of the new renderer, or NULL if allocation failed
+ */
+static struct vbuf_render *
+i915_vbuf_render_create(struct i915_context *i915)
+{
+   struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
+   struct i915_winsys *iws = i915->iws;
+   int i;
+
+   /* The caller checks for NULL, so report allocation failure. */
+   if (!i915_render)
+      return NULL;
+
+   i915_render->i915 = i915;
+
+   i915_render->base.max_vertex_buffer_bytes = 16*4096;
+
+   /* NOTE: it must be such that state and vertices indices fit in a single 
+    * batch buffer.
+    */
+   i915_render->base.max_indices = 16*1024;
+
+   i915_render->base.get_vertex_info = i915_vbuf_render_get_vertex_info;
+   i915_render->base.allocate_vertices = i915_vbuf_render_allocate_vertices;
+   i915_render->base.map_vertices = i915_vbuf_render_map_vertices;
+   i915_render->base.unmap_vertices = i915_vbuf_render_unmap_vertices;
+   i915_render->base.set_primitive = i915_vbuf_render_set_primitive;
+   i915_render->base.draw_elements = i915_vbuf_render_draw_elements;
+   i915_render->base.draw_arrays = i915_vbuf_render_draw_arrays;
+   i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
+   i915_render->base.destroy = i915_vbuf_render_destroy;
+
+#ifndef VBUF_MAP_BUFFER
+   i915_render->map_size = 0;
+   i915_render->map_used_start = 0;
+   i915_render->map_used_end = 0;
+#endif
+
+   i915_render->vbo = NULL;
+   i915_render->vbo_ptr = NULL;
+   i915_render->vbo_size = 0;
+   i915_render->vbo_offset = 0;
+   /* Allocate buffers four times as large as one vertex buffer. */
+   i915_render->vbo_alloc_size = i915_render->base.max_vertex_buffer_bytes * 4;
+
+   /* The guard is VBUF_USE_FIFO, the macro that also guards the pool
+    * fields and their users.  It used to be VBUF_USE_POOL, so the FIFO
+    * was never created when the pool was enabled.
+    */
+#ifdef VBUF_USE_FIFO
+   i915_render->pool_used = 0;
+   i915_render->pool_buffer_size = i915_render->vbo_alloc_size;
+   i915_render->pool_fifo = u_fifo_create(6);
+   for (i = 0; i < 6; i++)
+      u_fifo_add(i915_render->pool_fifo,
+                 iws->buffer_create(iws, i915_render->pool_buffer_size, 64,
+                                    I915_NEW_VERTEX));
+#else
+   (void)i;
+   (void)iws;
+#endif
+
+   return &i915_render->base;
+}
+
+/**
+ * Create a new primitive vbuf/render stage.
+ *
+ * A renderer is created and wrapped in a vbuf stage of the draw module.
+ * On success the renderer is also set as the draw module's render.
+ *
+ * \return the new stage, or NULL if the renderer or the stage could not
+ *         be created
+ */
+struct draw_stage *i915_draw_vbuf_stage(struct i915_context *i915)
+{
+   struct vbuf_render *render = i915_vbuf_render_create(i915);
+   struct draw_stage *stage;
+
+   if (render == NULL)
+      return NULL;
+
+   stage = draw_vbuf_stage(i915->draw, render);
+   if (stage != NULL) {
+      /** TODO JB: this shouldn't be here */
+      draw_set_render(i915->draw, render);
+      return stage;
+   }
+
+   /* No stage took ownership of the renderer, so release it here. */
+   render->destroy(render);
+   return NULL;
+}
diff --git a/src/gallium/drivers/i915/i915_reg.h b/src/gallium/drivers/i915/i915_reg.h
new file mode 100644
index 0000000000..04620fec68
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_reg.h
@@ -0,0 +1,978 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef I915_REG_H
+#define I915_REG_H
+
+
+#define I915_SET_FIELD( var, mask, value ) ((var) &= ~(mask), (var) |= (value)) /* clear the mask bits in var, then OR in value */
+
+#define CMD_3D (0x3<<29)
+
+#define PRIM3D_INLINE		(CMD_3D | (0x1f<<24))
+#define PRIM3D_TRILIST		(0x0<<18)
+#define PRIM3D_TRISTRIP 	(0x1<<18)
+#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18)
+#define PRIM3D_TRIFAN		(0x3<<18)
+#define PRIM3D_POLY		(0x4<<18)
+#define PRIM3D_LINELIST 	(0x5<<18)
+#define PRIM3D_LINESTRIP	(0x6<<18)
+#define PRIM3D_RECTLIST 	(0x7<<18)
+#define PRIM3D_POINTLIST	(0x8<<18)
+#define PRIM3D_DIB		(0x9<<18)
+#define PRIM3D_CLEAR_RECT	(0xa<<18)
+#define PRIM3D_ZONE_INIT	(0xd<<18)
+#define PRIM3D_MASK		(0x1f<<18)
+
+/* p137 */
+#define _3DSTATE_AA_CMD			(CMD_3D | (0x06<<24))
+#define AA_LINE_ECAAR_WIDTH_ENABLE	(1<<16)
+#define AA_LINE_ECAAR_WIDTH_0_5 	0
+#define AA_LINE_ECAAR_WIDTH_1_0		(1<<14)
+#define AA_LINE_ECAAR_WIDTH_2_0 	(2<<14)
+#define AA_LINE_ECAAR_WIDTH_4_0 	(3<<14)
+#define AA_LINE_REGION_WIDTH_ENABLE	(1<<8)
+#define AA_LINE_REGION_WIDTH_0_5	0
+#define AA_LINE_REGION_WIDTH_1_0	(1<<6)
+#define AA_LINE_REGION_WIDTH_2_0	(2<<6)
+#define AA_LINE_REGION_WIDTH_4_0	(3<<6)
+
+/* 3DSTATE_BACKFACE_STENCIL_OPS, p138*/
+#define _3DSTATE_BACKFACE_STENCIL_OPS    (CMD_3D | (0x8<<24))
+#define BFO_ENABLE_STENCIL_REF          (1<<23)
+#define BFO_STENCIL_REF_SHIFT           15
+#define BFO_STENCIL_REF_MASK            (0xff<<15)
+#define BFO_ENABLE_STENCIL_FUNCS        (1<<14)
+#define BFO_STENCIL_TEST_SHIFT          11
+#define BFO_STENCIL_TEST_MASK           (0x7<<11)
+#define BFO_STENCIL_FAIL_SHIFT          8
+#define BFO_STENCIL_FAIL_MASK           (0x7<<8)
+#define BFO_STENCIL_PASS_Z_FAIL_SHIFT   5
+#define BFO_STENCIL_PASS_Z_FAIL_MASK    (0x7<<5)
+#define BFO_STENCIL_PASS_Z_PASS_SHIFT   2
+#define BFO_STENCIL_PASS_Z_PASS_MASK    (0x7<<2)
+#define BFO_ENABLE_STENCIL_TWO_SIDE     (1<<1)
+#define BFO_STENCIL_TWO_SIDE            (1<<0)
+
+
+/* 3DSTATE_BACKFACE_STENCIL_MASKS, p140 */
+#define _3DSTATE_BACKFACE_STENCIL_MASKS    (CMD_3D | (0x9<<24))
+#define BFM_ENABLE_STENCIL_TEST_MASK      (1<<17)
+#define BFM_ENABLE_STENCIL_WRITE_MASK     (1<<16)
+#define BFM_STENCIL_TEST_MASK_SHIFT       8
+#define BFM_STENCIL_TEST_MASK_MASK        (0xff<<8)
+#define BFM_STENCIL_WRITE_MASK_SHIFT      0
+#define BFM_STENCIL_WRITE_MASK_MASK       (0xff<<0)
+
+
+
+/* 3DSTATE_BIN_CONTROL p141 */
+
+/* p143 */
+#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
+/* Dword 1 */
+#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
+#define BUF_3D_ID_DEPTH 	(0x7<<24)
+#define BUF_3D_USE_FENCE	(1<<23)
+#define BUF_3D_TILED_SURFACE	(1<<22)
+#define BUF_3D_TILE_WALK_X	0
+#define BUF_3D_TILE_WALK_Y	(1<<21)
+#define BUF_3D_PITCH(x)         (((x)/4)<<2)
+/* Dword 2 */
+#define BUF_3D_ADDR(x)		((x) & ~0x3)
+
+
+/* 3DSTATE_CHROMA_KEY */
+
+/* 3DSTATE_CLEAR_PARAMETERS, p150 */
+#define _3DSTATE_CLEAR_PARAMETERS	(CMD_3D | (0x1d<<24) | (0x9c<<16) | 5)
+/* Dword 1 */
+#define CLEARPARAM_CLEAR_RECT		(1 << 16)
+#define CLEARPARAM_ZONE_INIT		(0 << 16)
+#define CLEARPARAM_WRITE_COLOR		(1 << 2)
+#define CLEARPARAM_WRITE_DEPTH		(1 << 1)
+#define CLEARPARAM_WRITE_STENCIL	(1 << 0)
+
+/* 3DSTATE_CONSTANT_BLEND_COLOR, p153 */
+#define _3DSTATE_CONST_BLEND_COLOR_CMD	(CMD_3D | (0x1d<<24) | (0x88<<16))
+
+
+
+/* 3DSTATE_COORD_SET_BINDINGS, p154 */
+#define _3DSTATE_COORD_SET_BINDINGS      (CMD_3D | (0x16<<24))
+#define CSB_TCB(iunit, eunit)           ((eunit)<<((iunit)*3))  /* eunit is placed in the 3-bit slot selected by iunit */
+
+/* p156 */
+#define _3DSTATE_DFLT_DIFFUSE_CMD	(CMD_3D | (0x1d<<24) | (0x99<<16))
+
+/* p157 */
+#define _3DSTATE_DFLT_SPEC_CMD		(CMD_3D | (0x1d<<24) | (0x9a<<16))
+
+/* p158 */
+#define _3DSTATE_DFLT_Z_CMD		(CMD_3D | (0x1d<<24) | (0x98<<16))
+
+
+/* 3DSTATE_DEPTH_OFFSET_SCALE, p159 */
+#define _3DSTATE_DEPTH_OFFSET_SCALE       (CMD_3D | (0x1d<<24) | (0x97<<16))
+/* scale in dword 1 */
+
+
+/* 3DSTATE_DEPTH_SUBRECT_DISABLE, p160 */
+#define _3DSTATE_DEPTH_SUBRECT_DISABLE    (CMD_3D | (0x1c<<24) | (0x11<<19) | 0x2)
+
+/* p161 */
+#define _3DSTATE_DST_BUF_VARS_CMD	(CMD_3D | (0x1d<<24) | (0x85<<16))
+/* Dword 1 */
+#define TEX_DEFAULT_COLOR_OGL           (0<<30)
+#define TEX_DEFAULT_COLOR_D3D           (1<<30)
+#define ZR_EARLY_DEPTH                  (1<<29)
+#define LOD_PRECLAMP_OGL                (1<<28)
+#define LOD_PRECLAMP_D3D                (0<<28)
+#define DITHER_FULL_ALWAYS              (0<<26)
+#define DITHER_FULL_ON_FB_BLEND         (1<<26)
+#define DITHER_CLAMPED_ALWAYS           (2<<26)
+#define LINEAR_GAMMA_BLEND_32BPP        (1<<25)
+#define DEBUG_DISABLE_ENH_DITHER        (1<<24)
+#define DSTORG_HORT_BIAS(x)		((x)<<20)
+#define DSTORG_VERT_BIAS(x)		((x)<<16)
+#define COLOR_4_2_2_CHNL_WRT_ALL	0
+#define COLOR_4_2_2_CHNL_WRT_Y		(1<<12)
+#define COLOR_4_2_2_CHNL_WRT_CR		(2<<12)
+#define COLOR_4_2_2_CHNL_WRT_CB		(3<<12)
+#define COLOR_4_2_2_CHNL_WRT_CRCB	(4<<12)
+#define COLOR_BUF_8BIT			0
+#define COLOR_BUF_RGB555 		(1<<8)
+#define COLOR_BUF_RGB565 		(2<<8)
+#define COLOR_BUF_ARGB8888		(3<<8)
+#define DEPTH_FRMT_16_FIXED		0
+#define DEPTH_FRMT_16_FLOAT		(1<<2)
+#define DEPTH_FRMT_24_FIXED_8_OTHER	(2<<2)
+#define VERT_LINE_STRIDE_1		(1<<1)
+#define VERT_LINE_STRIDE_0		(0<<1)
+#define VERT_LINE_STRIDE_OFS_1		1
+#define VERT_LINE_STRIDE_OFS_0		0
+
+/* p166 */
+#define _3DSTATE_DRAW_RECT_CMD		(CMD_3D|(0x1d<<24)|(0x80<<16)|3)
+/* Dword 1 */
+#define DRAW_RECT_DIS_DEPTH_OFS 	(1<<30)
+#define DRAW_DITHER_OFS_X(x)		((x)<<26)
+#define DRAW_DITHER_OFS_Y(x)		((x)<<24)
+/* Dword 2 */
+#define DRAW_YMIN(x)			((x)<<16)
+#define DRAW_XMIN(x)			(x)
+/* Dword 3 */
+#define DRAW_YMAX(x)			((x)<<16)
+#define DRAW_XMAX(x)			(x)
+/* Dword 4 */
+#define DRAW_YORG(x)			((x)<<16)
+#define DRAW_XORG(x)			(x)
+
+
+/* 3DSTATE_FILTER_COEFFICIENTS_4X4, p170 */
+
+/* 3DSTATE_FILTER_COEFFICIENTS_6X5, p172 */
+
+
+/* _3DSTATE_FOG_COLOR, p173 */
+#define _3DSTATE_FOG_COLOR_CMD		(CMD_3D|(0x15<<24))
+#define FOG_COLOR_RED(x)		((x)<<16)
+#define FOG_COLOR_GREEN(x)		((x)<<8)
+#define FOG_COLOR_BLUE(x)		(x)
+
+/* _3DSTATE_FOG_MODE, p174 */
+#define _3DSTATE_FOG_MODE_CMD		(CMD_3D|(0x1d<<24)|(0x89<<16)|2)
+/* Dword 1 */
+#define FMC1_FOGFUNC_MODIFY_ENABLE	(1<<31)
+#define FMC1_FOGFUNC_VERTEX		(0<<28)
+#define FMC1_FOGFUNC_PIXEL_EXP		(1<<28)
+#define FMC1_FOGFUNC_PIXEL_EXP2		(2<<28)
+#define FMC1_FOGFUNC_PIXEL_LINEAR	(3<<28)
+#define FMC1_FOGFUNC_MASK		(3<<28)
+#define FMC1_FOGINDEX_MODIFY_ENABLE     (1<<27)
+#define FMC1_FOGINDEX_Z		        (0<<25)
+#define FMC1_FOGINDEX_W   		(1<<25)
+#define FMC1_C1_C2_MODIFY_ENABLE	(1<<24)
+#define FMC1_DENSITY_MODIFY_ENABLE	(1<<23)
+#define FMC1_C1_ONE      	        (1<<13)
+#define FMC1_C1_MASK		        (0xffff<<4)
+/* Dword 2 */
+#define FMC2_C2_ONE		        (1<<16)
+/* Dword 3 */
+#define FMC3_D_ONE      		(1<<16)
+
+
+
+/* _3DSTATE_INDEPENDENT_ALPHA_BLEND, p177 */
+#define _3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD	(CMD_3D|(0x0b<<24))
+#define IAB_MODIFY_ENABLE	        (1<<23)
+#define IAB_ENABLE       	        (1<<22)
+#define IAB_MODIFY_FUNC         	(1<<21)
+#define IAB_FUNC_SHIFT          	16
+#define IAB_MODIFY_SRC_FACTOR   	(1<<11)
+#define IAB_SRC_FACTOR_SHIFT		6
+#define IAB_SRC_FACTOR_MASK		(BLENDFACT_MASK<<6)
+#define IAB_MODIFY_DST_FACTOR	        (1<<5)
+#define IAB_DST_FACTOR_SHIFT		0
+#define IAB_DST_FACTOR_MASK		(BLENDFACT_MASK<<0)
+
+
+#define BLENDFUNC_ADD			0x0
+#define BLENDFUNC_SUBTRACT		0x1
+#define BLENDFUNC_REVERSE_SUBTRACT	0x2
+#define BLENDFUNC_MIN			0x3
+#define BLENDFUNC_MAX			0x4
+#define BLENDFUNC_MASK			0x7
+
+/* 3DSTATE_LOAD_INDIRECT, p180 */
+
+#define _3DSTATE_LOAD_INDIRECT	        (CMD_3D|(0x1d<<24)|(0x7<<16))
+#define LI0_STATE_STATIC_INDIRECT       (0x01<<8)
+#define LI0_STATE_DYNAMIC_INDIRECT      (0x02<<8)
+#define LI0_STATE_SAMPLER               (0x04<<8)
+#define LI0_STATE_MAP                   (0x08<<8)
+#define LI0_STATE_PROGRAM               (0x10<<8)
+#define LI0_STATE_CONSTANTS             (0x20<<8)
+
+#define SIS0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define SIS0_FORCE_LOAD                 (1<<1)
+#define SIS0_BUFFER_VALID               (1<<0)
+#define SIS1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define DIS0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define DIS0_BUFFER_RESET               (1<<1)
+#define DIS0_BUFFER_VALID               (1<<0)
+
+#define SSB0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define SSB0_FORCE_LOAD                 (1<<1)
+#define SSB0_BUFFER_VALID               (1<<0)
+#define SSB1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define MSB0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define MSB0_FORCE_LOAD                 (1<<1)
+#define MSB0_BUFFER_VALID               (1<<0)
+#define MSB1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define PSP0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define PSP0_FORCE_LOAD                 (1<<1)
+#define PSP0_BUFFER_VALID               (1<<0)
+#define PSP1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+#define PSC0_BUFFER_ADDRESS(x)          ((x)&~0x3)
+#define PSC0_FORCE_LOAD                 (1<<1)
+#define PSC0_BUFFER_VALID               (1<<0)
+#define PSC1_BUFFER_LENGTH(x)           ((x)&0xff)
+
+
+
+
+
+/* _3DSTATE_RASTERIZATION_RULES */
+#define _3DSTATE_RASTER_RULES_CMD	(CMD_3D|(0x07<<24))
+#define ENABLE_POINT_RASTER_RULE	(1<<15)
+#define OGL_POINT_RASTER_RULE		(1<<13)
+#define ENABLE_TEXKILL_3D_4D            (1<<10)
+#define TEXKILL_3D                      (0<<9)
+#define TEXKILL_4D                      (1<<9)
+#define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
+#define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
+#define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
+
+/* _3DSTATE_SCISSOR_ENABLE, p256 */
+#define _3DSTATE_SCISSOR_ENABLE_CMD	(CMD_3D|(0x1c<<24)|(0x10<<19))
+#define ENABLE_SCISSOR_RECT		((1<<1) | 1)
+#define DISABLE_SCISSOR_RECT		(1<<1)
+
+/* _3DSTATE_SCISSOR_RECTANGLE_0, p257 */
+#define _3DSTATE_SCISSOR_RECT_0_CMD	(CMD_3D|(0x1d<<24)|(0x81<<16)|1)
+/* Dword 1 */
+#define SCISSOR_RECT_0_YMIN(x)		((x)<<16)
+#define SCISSOR_RECT_0_XMIN(x)		(x)
+/* Dword 2 */
+#define SCISSOR_RECT_0_YMAX(x)		((x)<<16)
+#define SCISSOR_RECT_0_XMAX(x)		(x)
+
+/* p189 */
+#define _3DSTATE_LOAD_STATE_IMMEDIATE_1   ((0x3<<29)|(0x1d<<24)|(0x04<<16))
+#define I1_LOAD_S(n)                      (1<<(4+(n)))  /* flag bit 4+n; n is parenthesized so expression arguments work */
+
+#define S0_VB_OFFSET_MASK              0xffffffc
+#define S0_AUTO_CACHE_INV_DISABLE      (1<<0)
+
+#define S1_VERTEX_WIDTH_SHIFT          24
+#define S1_VERTEX_WIDTH_MASK           (0x3f<<24)
+#define S1_VERTEX_PITCH_SHIFT          16
+#define S1_VERTEX_PITCH_MASK           (0x3f<<16)
+
+#define TEXCOORDFMT_2D                 0x0
+#define TEXCOORDFMT_3D                 0x1
+#define TEXCOORDFMT_4D                 0x2
+#define TEXCOORDFMT_1D                 0x3
+#define TEXCOORDFMT_2D_16              0x4
+#define TEXCOORDFMT_4D_16              0x5
+#define TEXCOORDFMT_NOT_PRESENT        0xf
+#define S2_TEXCOORD_FMT0_MASK            0xf
+#define S2_TEXCOORD_FMT1_SHIFT           4
+#define S2_TEXCOORD_FMT(unit, type)    ((type)<<((unit)*4))  /* one 4-bit format field per texcoord unit */
+#define S2_TEXCOORD_NONE               (~0)
+
+/* S3 not interesting */
+
+#define S4_POINT_WIDTH_SHIFT           23
+#define S4_POINT_WIDTH_MASK            (0x1ff<<23)
+#define S4_LINE_WIDTH_SHIFT            19
+#define S4_LINE_WIDTH_ONE              (0x2<<19)
+#define S4_LINE_WIDTH_MASK             (0xf<<19)
+#define S4_FLATSHADE_ALPHA             (1<<18)
+#define S4_FLATSHADE_FOG               (1<<17)
+#define S4_FLATSHADE_SPECULAR          (1<<16)
+#define S4_FLATSHADE_COLOR             (1<<15)
+#define S4_CULLMODE_BOTH	       (0<<13)
+#define S4_CULLMODE_NONE	       (1<<13)
+#define S4_CULLMODE_CW		       (2<<13)
+#define S4_CULLMODE_CCW		       (3<<13)
+#define S4_CULLMODE_MASK	       (3<<13)
+#define S4_VFMT_POINT_WIDTH            (1<<12)
+#define S4_VFMT_SPEC_FOG               (1<<11)
+#define S4_VFMT_COLOR                  (1<<10)
+#define S4_VFMT_DEPTH_OFFSET           (1<<9)
+#define S4_VFMT_XYZ     	       (1<<6)
+#define S4_VFMT_XYZW     	       (2<<6)
+#define S4_VFMT_XY     		       (3<<6)
+#define S4_VFMT_XYW     	       (4<<6)
+#define S4_VFMT_XYZW_MASK              (7<<6)
+#define S4_FORCE_DEFAULT_DIFFUSE       (1<<5)
+#define S4_FORCE_DEFAULT_SPECULAR      (1<<4)
+#define S4_LOCAL_DEPTH_OFFSET_ENABLE   (1<<3)
+#define S4_VFMT_FOG_PARAM              (1<<2)
+#define S4_SPRITE_POINT_ENABLE         (1<<1)
+#define S4_LINE_ANTIALIAS_ENABLE       (1<<0)
+
+#define S4_VFMT_MASK (S4_VFMT_POINT_WIDTH   | 	\
+		      S4_VFMT_SPEC_FOG      |	\
+		      S4_VFMT_COLOR         |	\
+		      S4_VFMT_DEPTH_OFFSET  |	\
+		      S4_VFMT_XYZW_MASK     |	\
+		      S4_VFMT_FOG_PARAM)
+
+
+#define S5_WRITEDISABLE_ALPHA          (1<<31)
+#define S5_WRITEDISABLE_RED            (1<<30)
+#define S5_WRITEDISABLE_GREEN          (1<<29)
+#define S5_WRITEDISABLE_BLUE           (1<<28)
+#define S5_WRITEDISABLE_MASK           (0xf<<28)
+#define S5_FORCE_DEFAULT_POINT_SIZE    (1<<27)
+#define S5_LAST_PIXEL_ENABLE           (1<<26)
+#define S5_GLOBAL_DEPTH_OFFSET_ENABLE  (1<<25)
+#define S5_FOG_ENABLE                  (1<<24)
+#define S5_STENCIL_REF_SHIFT           16
+#define S5_STENCIL_REF_MASK            (0xff<<16)
+#define S5_STENCIL_TEST_FUNC_SHIFT     13
+#define S5_STENCIL_TEST_FUNC_MASK      (0x7<<13)
+#define S5_STENCIL_FAIL_SHIFT          10
+#define S5_STENCIL_FAIL_MASK           (0x7<<10)
+#define S5_STENCIL_PASS_Z_FAIL_SHIFT   7
+#define S5_STENCIL_PASS_Z_FAIL_MASK    (0x7<<7)
+#define S5_STENCIL_PASS_Z_PASS_SHIFT   4
+#define S5_STENCIL_PASS_Z_PASS_MASK    (0x7<<4)
+#define S5_STENCIL_WRITE_ENABLE        (1<<3)
+#define S5_STENCIL_TEST_ENABLE         (1<<2)
+#define S5_COLOR_DITHER_ENABLE         (1<<1)
+#define S5_LOGICOP_ENABLE              (1<<0)
+
+
+#define S6_ALPHA_TEST_ENABLE           (1<<31)
+#define S6_ALPHA_TEST_FUNC_SHIFT       28
+#define S6_ALPHA_TEST_FUNC_MASK        (0x7<<28)
+#define S6_ALPHA_REF_SHIFT             20
+#define S6_ALPHA_REF_MASK              (0xff<<20)
+#define S6_DEPTH_TEST_ENABLE           (1<<19)
+#define S6_DEPTH_TEST_FUNC_SHIFT       16
+#define S6_DEPTH_TEST_FUNC_MASK        (0x7<<16)
+#define S6_CBUF_BLEND_ENABLE           (1<<15)
+#define S6_CBUF_BLEND_FUNC_SHIFT       12
+#define S6_CBUF_BLEND_FUNC_MASK        (0x7<<12)
+#define S6_CBUF_SRC_BLEND_FACT_SHIFT   8
+#define S6_CBUF_SRC_BLEND_FACT_MASK    (0xf<<8)
+#define S6_CBUF_DST_BLEND_FACT_SHIFT   4
+#define S6_CBUF_DST_BLEND_FACT_MASK    (0xf<<4)
+#define S6_DEPTH_WRITE_ENABLE          (1<<3)
+#define S6_COLOR_WRITE_ENABLE          (1<<2)
+#define S6_TRISTRIP_PV_SHIFT           0
+#define S6_TRISTRIP_PV_MASK            (0x3<<0)
+
+#define S7_DEPTH_OFFSET_CONST_MASK     (~0)  /* all bits set */
+
+
+
+#define DST_BLND_FACT(f) ((f)<<S6_CBUF_DST_BLEND_FACT_SHIFT)
+#define SRC_BLND_FACT(f) ((f)<<S6_CBUF_SRC_BLEND_FACT_SHIFT)
+#define DST_ABLND_FACT(f) ((f)<<IAB_DST_FACTOR_SHIFT)
+#define SRC_ABLND_FACT(f) ((f)<<IAB_SRC_FACTOR_SHIFT)
+
+
+
+
+/* 3DSTATE_MAP_DEINTERLACER_PARAMETERS */
+
+/* 3DSTATE_MAP_PALETTE_LOAD_32, p206 */
+#define _3DSTATE_MAP_PALETTE_LOAD_32    (CMD_3D|(0x1d<<24)|(0x8f<<16))
+/* subsequent dwords up to length (max 16) are ARGB8888 color values */
+
+/* _3DSTATE_MODES_4, p218 */
+#define _3DSTATE_MODES_4_CMD		(CMD_3D|(0x0d<<24))
+#define ENABLE_LOGIC_OP_FUNC		(1<<23)
+#define LOGIC_OP_FUNC(x)		((x)<<18)
+#define LOGICOP_MASK			(0xf<<18)
+#define MODE4_ENABLE_STENCIL_TEST_MASK	((1<<17)|(0xff00))
+#define ENABLE_STENCIL_TEST_MASK	(1<<17)
+#define STENCIL_TEST_MASK(x)		(((x)&0xff)<<8)
+#define MODE4_ENABLE_STENCIL_WRITE_MASK	((1<<16)|(0x00ff))
+#define ENABLE_STENCIL_WRITE_MASK	(1<<16)
+#define STENCIL_WRITE_MASK(x)		((x)&0xff)
+
+/* _3DSTATE_MODES_5, p220 */
+#define _3DSTATE_MODES_5_CMD		(CMD_3D|(0x0c<<24))
+#define PIPELINE_FLUSH_RENDER_CACHE	(1<<18)
+#define PIPELINE_FLUSH_TEXTURE_CACHE	(1<<16)
+
+
+/* p221 */
+#define _3DSTATE_PIXEL_SHADER_CONSTANTS  (CMD_3D|(0x1d<<24)|(0x6<<16))
+#define PS1_REG(n)                      (1<<(n))
+#define PS2_CONST_X(n)                  (n)
+#define PS3_CONST_Y(n)                  (n)
+#define PS4_CONST_Z(n)                  (n)
+#define PS5_CONST_W(n)                  (n)
+
+/* p222 */
+
+
+#define I915_MAX_TEX_INDIRECT 4
+#define I915_MAX_TEX_INSN     32
+#define I915_MAX_ALU_INSN     64
+#define I915_MAX_DECL_INSN    27
+#define I915_MAX_TEMPORARY    16
+
+
+/* Each instruction is 3 dwords long, though most don't require all
+ * this space.  Maximum of 123 instructions.  Smaller maxes per insn
+ * type.
+ */
+#define _3DSTATE_PIXEL_SHADER_PROGRAM    (CMD_3D|(0x1d<<24)|(0x5<<16))
+
+#define REG_TYPE_R                 0    /* temporary regs, no need to
+                                         * dcl, must be written before
+                                         * read -- Preserved between
+                                         * phases. 
+                                         */
+#define REG_TYPE_T                 1    /* Interpolated values, must be
+                                         * dcl'ed before use.
+                                         *
+                                         * 0..7: texture coord,
+                                         * 8: diffuse spec,
+                                         * 9: specular color,
+                                         * 10: fog parameter in w.
+                                         */
+#define REG_TYPE_CONST             2    /* Restriction: only one const
+                                         * can be referenced per
+                                         * instruction, though it may be
+                                         * selected for multiple inputs.
+                                         * Constants not initialized
+                                         * default to zero.
+                                         */
+#define REG_TYPE_S                 3    /* sampler */
+#define REG_TYPE_OC                4    /* output color (rgba) */
+#define REG_TYPE_OD                5    /* output depth (w), xyz are
+                                         * temporaries.  If not written,
+                                         * interpolated depth is used?
+                                         */
+#define REG_TYPE_U                 6    /* unpreserved temporaries */
+#define REG_TYPE_MASK              0x7
+#define REG_NR_MASK                0xf
+
+
+/* REG_TYPE_T:
+ */
+#define T_TEX0     0
+#define T_TEX1     1
+#define T_TEX2     2
+#define T_TEX3     3
+#define T_TEX4     4
+#define T_TEX5     5
+#define T_TEX6     6
+#define T_TEX7     7
+#define T_DIFFUSE  8
+#define T_SPECULAR 9
+#define T_FOG_W    10           /* interpolated fog is in W coord */
+
+/* Arithmetic instructions */
+
+/* .replicate_swizzle == selection and replication of a particular
+ * scalar channel, ie., .xxxx, .yyyy, .zzzz or .wwww 
+ */
+#define A0_NOP    (0x0<<24)     /* no operation */
+#define A0_ADD    (0x1<<24)     /* dst = src0 + src1 */
+#define A0_MOV    (0x2<<24)     /* dst = src0 */
+#define A0_MUL    (0x3<<24)     /* dst = src0 * src1 */
+#define A0_MAD    (0x4<<24)     /* dst = src0 * src1 + src2 */
+#define A0_DP2ADD (0x5<<24)     /* dst.xyzw = src0.xy dot src1.xy + src2.replicate_swizzle */
+#define A0_DP3    (0x6<<24)     /* dst.xyzw = src0.xyz dot src1.xyz */
+#define A0_DP4    (0x7<<24)     /* dst.xyzw = src0.xyzw dot src1.xyzw */
+#define A0_FRC    (0x8<<24)     /* dst = src0 - floor(src0) */
+#define A0_RCP    (0x9<<24)     /* dst.xyzw = 1/(src0.replicate_swizzle) */
+#define A0_RSQ    (0xa<<24)     /* dst.xyzw = 1/(sqrt(abs(src0.replicate_swizzle))) */
+#define A0_EXP    (0xb<<24)     /* dst.xyzw = exp2(src0.replicate_swizzle) */
+#define A0_LOG    (0xc<<24)     /* dst.xyzw = log2(abs(src0.replicate_swizzle)) */
+#define A0_CMP    (0xd<<24)     /* dst = (src0 >= 0.0) ? src1 : src2 */
+#define A0_MIN    (0xe<<24)     /* dst = (src0 < src1) ? src0 : src1 */
+#define A0_MAX    (0xf<<24)     /* dst = (src0 >= src1) ? src0 : src1 */
+#define A0_FLR    (0x10<<24)    /* dst = floor(src0) */
+#define A0_MOD    (0x11<<24)    /* dst = src0 fmod 1.0 */
+#define A0_TRC    (0x12<<24)    /* dst = int(src0) */
+#define A0_SGE    (0x13<<24)    /* dst = src0 >= src1 ? 1.0 : 0.0 */
+#define A0_SLT    (0x14<<24)    /* dst = src0 < src1 ? 1.0 : 0.0 */
+#define A0_DEST_SATURATE                 (1<<22)
+#define A0_DEST_TYPE_SHIFT                19
+/* Allow: R, OC, OD, U */
+#define A0_DEST_NR_SHIFT                 14
+/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */
+#define A0_DEST_CHANNEL_X                (1<<10)
+#define A0_DEST_CHANNEL_Y                (2<<10)
+#define A0_DEST_CHANNEL_Z                (4<<10)
+#define A0_DEST_CHANNEL_W                (8<<10)
+#define A0_DEST_CHANNEL_ALL              (0xf<<10)
+#define A0_DEST_CHANNEL_SHIFT            10
+#define A0_SRC0_TYPE_SHIFT               7
+#define A0_SRC0_NR_SHIFT                 2
+
+#define A0_DEST_CHANNEL_XY              (A0_DEST_CHANNEL_X|A0_DEST_CHANNEL_Y)
+#define A0_DEST_CHANNEL_XYZ             (A0_DEST_CHANNEL_XY|A0_DEST_CHANNEL_Z)
+
+
+#define SRC_X        0
+#define SRC_Y        1
+#define SRC_Z        2
+#define SRC_W        3
+#define SRC_ZERO     4
+#define SRC_ONE      5
+
+#define A1_SRC0_CHANNEL_X_NEGATE         (1<<31)
+#define A1_SRC0_CHANNEL_X_SHIFT          28
+#define A1_SRC0_CHANNEL_Y_NEGATE         (1<<27)
+#define A1_SRC0_CHANNEL_Y_SHIFT          24
+#define A1_SRC0_CHANNEL_Z_NEGATE         (1<<23)
+#define A1_SRC0_CHANNEL_Z_SHIFT          20
+#define A1_SRC0_CHANNEL_W_NEGATE         (1<<19)
+#define A1_SRC0_CHANNEL_W_SHIFT          16
+#define A1_SRC1_TYPE_SHIFT               13
+#define A1_SRC1_NR_SHIFT                 8
+#define A1_SRC1_CHANNEL_X_NEGATE         (1<<7)
+#define A1_SRC1_CHANNEL_X_SHIFT          4
+#define A1_SRC1_CHANNEL_Y_NEGATE         (1<<3)
+#define A1_SRC1_CHANNEL_Y_SHIFT          0
+
+#define A2_SRC1_CHANNEL_Z_NEGATE         (1<<31)
+#define A2_SRC1_CHANNEL_Z_SHIFT          28
+#define A2_SRC1_CHANNEL_W_NEGATE         (1<<27)
+#define A2_SRC1_CHANNEL_W_SHIFT          24
+#define A2_SRC2_TYPE_SHIFT               21
+#define A2_SRC2_NR_SHIFT                 16
+#define A2_SRC2_CHANNEL_X_NEGATE         (1<<15)
+#define A2_SRC2_CHANNEL_X_SHIFT          12
+#define A2_SRC2_CHANNEL_Y_NEGATE         (1<<11)
+#define A2_SRC2_CHANNEL_Y_SHIFT          8
+#define A2_SRC2_CHANNEL_Z_NEGATE         (1<<7)
+#define A2_SRC2_CHANNEL_Z_SHIFT          4
+#define A2_SRC2_CHANNEL_W_NEGATE         (1<<3)
+#define A2_SRC2_CHANNEL_W_SHIFT          0
+
+
+
+/* Texture instructions */
+#define T0_TEXLD     (0x15<<24) /* Sample texture using predeclared
+                                 * sampler and address, and output
+                                 * filtered texel data to destination
+                                 * register */
+#define T0_TEXLDP    (0x16<<24) /* Same as texld but performs a
+                                 * perspective divide of the texture
+                                 * coordinate .xyz values by .w before
+                                 * sampling. */
+#define T0_TEXLDB    (0x17<<24) /* Same as texld but biases the
+                                 * computed LOD by w.  Only S4.6 two's
+                                 * comp is used.  This implies that a
+                                 * float to fixed conversion is
+                                 * done. */
+#define T0_TEXKILL   (0x18<<24) /* Does not perform a sampling
+                                 * operation.  Simply kills the pixel
+                                 * if any channel of the address
+                                 * register is < 0.0. */
+#define T0_DEST_TYPE_SHIFT                19
+/* Allow: R, OC, OD, U */
+/* Note: U (unpreserved) regs do not retain their values between
+ * phases (cannot be used for feedback) 
+ *
+ * Note: oC and OD registers can only be used as the destination of a
+ * texture instruction once per phase (this is an implementation
+ * restriction). 
+ */
+#define T0_DEST_NR_SHIFT                 14
+/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */
+#define T0_SAMPLER_NR_SHIFT              0      /* This field ignored for TEXKILL */
+#define T0_SAMPLER_NR_MASK               (0xf<<0)
+
+#define T1_ADDRESS_REG_TYPE_SHIFT        24     /* Reg to use as texture coord */
+/* Allow R, T, OC, OD -- R, OC, OD are 'dependent' reads, new program phase */
+#define T1_ADDRESS_REG_NR_SHIFT          17
+#define T2_MBZ                           0
+
+/* Declaration instructions */
+#define D0_DCL       (0x19<<24) /* Declare a t (interpolated attrib)
+                                 * register or an s (sampler)
+                                 * register. */
+#define D0_SAMPLE_TYPE_SHIFT              22
+#define D0_SAMPLE_TYPE_2D                 (0x0<<22)
+#define D0_SAMPLE_TYPE_CUBE               (0x1<<22)
+#define D0_SAMPLE_TYPE_VOLUME             (0x2<<22)
+#define D0_SAMPLE_TYPE_MASK               (0x3<<22)
+
+#define D0_TYPE_SHIFT                19
+/* Allow: T, S */
+#define D0_NR_SHIFT                  14
+/* Allow T: 0..10, S: 0..15 */
+#define D0_CHANNEL_X                (1<<10)
+#define D0_CHANNEL_Y                (2<<10)
+#define D0_CHANNEL_Z                (4<<10)
+#define D0_CHANNEL_W                (8<<10)
+#define D0_CHANNEL_ALL              (0xf<<10)
+#define D0_CHANNEL_NONE             (0<<10)
+
+#define D0_CHANNEL_XY               (D0_CHANNEL_X|D0_CHANNEL_Y)
+#define D0_CHANNEL_XYZ              (D0_CHANNEL_XY|D0_CHANNEL_Z)
+
+/* I915 Errata: Do not allow (xz), (xw), (xzw) combinations for diffuse
+ * or specular declarations. 
+ *
+ * For T dcls, only allow: (x), (xy), (xyz), (w), (xyzw) 
+ *
+ * Must be zero for S (sampler) dcls
+ */
+#define D1_MBZ                          0
+#define D2_MBZ                          0
+
+
+
+/* p207 */
+#define _3DSTATE_MAP_STATE               (CMD_3D|(0x1d<<24)|(0x0<<16))
+
+#define MS1_MAPMASK_SHIFT               0
+#define MS1_MAPMASK_MASK                (0x8fff<<0)  /* NOTE(review): bits 12-14 are clear in this mask; possibly meant 0xffff -- confirm */
+
+#define MS2_UNTRUSTED_SURFACE           (1<<31)
+#define MS2_ADDRESS_MASK                0xfffffffc
+#define MS2_VERTICAL_LINE_STRIDE        (1<<1)
+#define MS2_VERTICAL_OFFSET             (1<<1)  /* NOTE(review): same bit as MS2_VERTICAL_LINE_STRIDE; possibly meant (1<<0) -- confirm against the hardware docs */
+
+#define MS3_HEIGHT_SHIFT              21
+#define MS3_WIDTH_SHIFT               10
+#define MS3_PALETTE_SELECT            (1<<9)
+#define MS3_MAPSURF_FORMAT_SHIFT      7
+#define MS3_MAPSURF_FORMAT_MASK       (0x7<<7)
+#define    MAPSURF_8BIT		 	   (1<<7)
+#define    MAPSURF_16BIT		   (2<<7)
+#define    MAPSURF_32BIT		   (3<<7)
+#define    MAPSURF_422			   (5<<7)
+#define    MAPSURF_COMPRESSED		   (6<<7)
+#define    MAPSURF_4BIT_INDEXED		   (7<<7)
+#define MS3_MT_FORMAT_MASK         (0x7 << 3)
+#define MS3_MT_FORMAT_SHIFT        3
+#define    MT_4BIT_IDX_ARGB8888	           (7<<3)       /* SURFACE_4BIT_INDEXED */
+#define    MT_8BIT_I8		           (0<<3)       /* SURFACE_8BIT */
+#define    MT_8BIT_L8		           (1<<3)
+#define    MT_8BIT_A8		           (4<<3)
+#define    MT_8BIT_MONO8	           (5<<3)
+#define    MT_16BIT_RGB565 		   (0<<3)       /* SURFACE_16BIT */
+#define    MT_16BIT_ARGB1555		   (1<<3)
+#define    MT_16BIT_ARGB4444		   (2<<3)
+#define    MT_16BIT_AY88		   (3<<3)
+#define    MT_16BIT_88DVDU	           (5<<3)
+#define    MT_16BIT_BUMP_655LDVDU	   (6<<3)
+#define    MT_16BIT_I16	                   (7<<3)
+#define    MT_16BIT_L16	                   (8<<3)
+#define    MT_16BIT_A16	                   (9<<3)
+#define    MT_32BIT_ARGB8888		   (0<<3)       /* SURFACE_32BIT */
+#define    MT_32BIT_ABGR8888		   (1<<3)
+#define    MT_32BIT_XRGB8888		   (2<<3)
+#define    MT_32BIT_XBGR8888		   (3<<3)
+#define    MT_32BIT_QWVU8888		   (4<<3)
+#define    MT_32BIT_AXVU8888		   (5<<3)
+#define    MT_32BIT_LXVU8888	           (6<<3)
+#define    MT_32BIT_XLVU8888	           (7<<3)
+#define    MT_32BIT_ARGB2101010	           (8<<3)
+#define    MT_32BIT_ABGR2101010	           (9<<3)
+#define    MT_32BIT_AWVU2101010	           (0xA<<3)
+#define    MT_32BIT_GR1616	           (0xB<<3)
+#define    MT_32BIT_VU1616	           (0xC<<3)
+#define    MT_32BIT_xI824	           (0xD<<3)
+#define    MT_32BIT_xA824	           (0xE<<3)
+#define    MT_32BIT_xL824	           (0xF<<3)
+#define    MT_422_YCRCB_SWAPY	           (0<<3)       /* SURFACE_422 */
+#define    MT_422_YCRCB_NORMAL	           (1<<3)
+#define    MT_422_YCRCB_SWAPUV	           (2<<3)
+#define    MT_422_YCRCB_SWAPUVY	           (3<<3)
+#define    MT_COMPRESS_DXT1		   (0<<3)       /* SURFACE_COMPRESSED */
+#define    MT_COMPRESS_DXT2_3	           (1<<3)
+#define    MT_COMPRESS_DXT4_5	           (2<<3)
+#define    MT_COMPRESS_FXT1		   (3<<3)
+#define    MT_COMPRESS_DXT1_RGB		   (4<<3)
+#define MS3_USE_FENCE_REGS              (1<<2)
+#define MS3_TILED_SURFACE             (1<<1)
+#define MS3_TILE_WALK                 (1<<0)
+
+#define MS4_PITCH_SHIFT                 21
+#define MS4_CUBE_FACE_ENA_NEGX          (1<<20)
+#define MS4_CUBE_FACE_ENA_POSX          (1<<19)
+#define MS4_CUBE_FACE_ENA_NEGY          (1<<18)
+#define MS4_CUBE_FACE_ENA_POSY          (1<<17)
+#define MS4_CUBE_FACE_ENA_NEGZ          (1<<16)
+#define MS4_CUBE_FACE_ENA_POSZ          (1<<15)
+#define MS4_CUBE_FACE_ENA_MASK          (0x3f<<15)
+#define MS4_MAX_LOD_SHIFT		9
+#define MS4_MAX_LOD_MASK		(0x3f<<9)
+#define MS4_MIP_LAYOUT_LEGACY           (0<<8)
+#define MS4_MIP_LAYOUT_BELOW_LPT        (0<<8)
+#define MS4_MIP_LAYOUT_RIGHT_LPT        (1<<8)
+#define MS4_VOLUME_DEPTH_SHIFT          0
+#define MS4_VOLUME_DEPTH_MASK           (0xff<<0)
+
+/* p244 */
+#define _3DSTATE_SAMPLER_STATE         (CMD_3D|(0x1d<<24)|(0x1<<16))
+
+#define SS1_MAPMASK_SHIFT               0
+#define SS1_MAPMASK_MASK                (0x8fff<<0)
+
+#define SS2_REVERSE_GAMMA_ENABLE        (1<<31)
+#define SS2_PACKED_TO_PLANAR_ENABLE     (1<<30)
+#define SS2_COLORSPACE_CONVERSION       (1<<29)
+#define SS2_CHROMAKEY_SHIFT             27
+#define SS2_BASE_MIP_LEVEL_SHIFT        22
+#define SS2_BASE_MIP_LEVEL_MASK         (0x1f<<22)
+#define SS2_MIP_FILTER_SHIFT            20
+#define SS2_MIP_FILTER_MASK             (0x3<<20)
+#define   MIPFILTER_NONE       	0
+#define   MIPFILTER_NEAREST	1
+#define   MIPFILTER_LINEAR	3
+#define SS2_MAG_FILTER_SHIFT          17
+#define SS2_MAG_FILTER_MASK           (0x7<<17)
+#define   FILTER_NEAREST	0
+#define   FILTER_LINEAR		1
+#define   FILTER_ANISOTROPIC	2
+#define   FILTER_4X4_1    	3
+#define   FILTER_4X4_2    	4
+#define   FILTER_4X4_FLAT 	5
+#define   FILTER_6X5_MONO   	6       /* XXX - check */
+#define SS2_MIN_FILTER_SHIFT          14
+#define SS2_MIN_FILTER_MASK           (0x7<<14)
+#define SS2_LOD_BIAS_SHIFT            5
+#define SS2_LOD_BIAS_ONE              (0x10<<5)
+#define SS2_LOD_BIAS_MASK             (0x1ff<<5)
+/* Shadow requires:
+ *  MT_X8{I,L,A}24 or MT_{I,L,A}16 texture format
+ *  FILTER_4X4_x  MIN and MAG filters
+ */
+#define SS2_SHADOW_ENABLE             (1<<4)
+#define SS2_MAX_ANISO_MASK            (1<<3)
+#define SS2_MAX_ANISO_2               (0<<3)
+#define SS2_MAX_ANISO_4               (1<<3)
+#define SS2_SHADOW_FUNC_SHIFT         0
+#define SS2_SHADOW_FUNC_MASK          (0x7<<0)
+/* SS2_SHADOW_FUNC values: see COMPAREFUNC_* */
+
+#define SS3_MIN_LOD_SHIFT            24
+#define SS3_MIN_LOD_ONE              (0x10<<24)
+#define SS3_MIN_LOD_MASK             (0xff<<24)
+#define SS3_KILL_PIXEL_ENABLE        (1<<17)
+#define SS3_TCX_ADDR_MODE_SHIFT      12
+#define SS3_TCX_ADDR_MODE_MASK       (0x7<<12)
+#define   TEXCOORDMODE_WRAP		0
+#define   TEXCOORDMODE_MIRROR		1
+#define   TEXCOORDMODE_CLAMP_EDGE	2
+#define   TEXCOORDMODE_CUBE       	3
+#define   TEXCOORDMODE_CLAMP_BORDER	4
+#define   TEXCOORDMODE_MIRROR_ONCE      5
+#define SS3_TCY_ADDR_MODE_SHIFT      9
+#define SS3_TCY_ADDR_MODE_MASK       (0x7<<9)
+#define SS3_TCZ_ADDR_MODE_SHIFT      6
+#define SS3_TCZ_ADDR_MODE_MASK       (0x7<<6)
+#define SS3_NORMALIZED_COORDS        (1<<5)
+#define SS3_TEXTUREMAP_INDEX_SHIFT   1
+#define SS3_TEXTUREMAP_INDEX_MASK    (0xf<<1)
+#define SS3_DEINTERLACER_ENABLE      (1<<0)
+
+#define SS4_BORDER_COLOR_MASK        (~0)
+
+/* 3DSTATE_SPAN_STIPPLE, p258
+ */
+#define _3DSTATE_STIPPLE           ((0x3<<29)|(0x1d<<24)|(0x83<<16))
+#define ST1_ENABLE               (1<<16)
+#define ST1_MASK                 (0xffff)
+
+#define _3DSTATE_DEFAULT_Z          ((0x3<<29)|(0x1d<<24)|(0x98<<16))
+#define _3DSTATE_DEFAULT_DIFFUSE    ((0x3<<29)|(0x1d<<24)|(0x99<<16))
+#define _3DSTATE_DEFAULT_SPECULAR   ((0x3<<29)|(0x1d<<24)|(0x9a<<16))
+
+
+#define MI_FLUSH                   ((0<<29)|(4<<23))
+#define FLUSH_MAP_CACHE            (1<<0)
+#define INHIBIT_FLUSH_RENDER_CACHE (1<<2)
+
+
+#define CMD_3D (0x3<<29)
+
+
+#define _3DPRIMITIVE         ((0x3<<29)|(0x1f<<24))
+#define PRIM_INDIRECT            (1<<23)
+#define PRIM_INLINE              (0<<23)
+#define PRIM_INDIRECT_SEQUENTIAL (0<<17)
+#define PRIM_INDIRECT_ELTS       (1<<17)
+
+#define PRIM3D_TRILIST		(0x0<<18)
+#define PRIM3D_TRISTRIP 	(0x1<<18)
+#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18)
+#define PRIM3D_TRIFAN		(0x3<<18)
+#define PRIM3D_POLY		(0x4<<18)
+#define PRIM3D_LINELIST 	(0x5<<18)
+#define PRIM3D_LINESTRIP	(0x6<<18)
+#define PRIM3D_RECTLIST 	(0x7<<18)
+#define PRIM3D_POINTLIST	(0x8<<18)
+#define PRIM3D_DIB		(0x9<<18)
+#define PRIM3D_MASK		(0x1f<<18)
+/* Pack 8-bit channels into A4R4G4B4: the top nibble of a, r, g and b. */
+#define I915PACKCOLOR4444(r,g,b,a) \
+  ((((a) & 0xf0) << 8) | (((r) & 0xf0) << 4) | ((g) & 0xf0) | ((b) >> 4))
+/* Pack into A1R5G5B5: any non-zero a sets bit 15. */
+#define I915PACKCOLOR1555(r,g,b,a) \
+  ((((r) & 0xf8) << 7) | (((g) & 0xf8) << 2) | (((b) & 0xf8) >> 3) | \
+    ((a) ? 0x8000 : 0))
+/* Pack into R5G6B5. */
+#define I915PACKCOLOR565(r,g,b) \
+  ((((r) & 0xf8) << 8) | (((g) & 0xfc) << 3) | (((b) & 0xf8) >> 3))
+/* Pack into A8R8G8B8; each argument is parenthesized so expressions work. */
+#define I915PACKCOLOR8888(r,g,b,a) \
+  (((a)<<24) | ((r)<<16) | ((g)<<8) | (b))
+
+
+
+
+#define BR00_BITBLT_CLIENT   0x40000000
+#define BR00_OP_COLOR_BLT    0x10000000
+#define BR00_OP_SRC_COPY_BLT 0x10C00000
+#define BR13_SOLID_PATTERN   0x80000000
+
+#define XY_COLOR_BLT_CMD		((2<<29)|(0x50<<22)|0x4)
+#define XY_COLOR_BLT_WRITE_ALPHA	(1<<21)
+#define XY_COLOR_BLT_WRITE_RGB		(1<<20)
+
+#define XY_SRC_COPY_BLT_CMD             ((2<<29)|(0x53<<22)|6)
+#define XY_SRC_COPY_BLT_WRITE_ALPHA     (1<<21)
+#define XY_SRC_COPY_BLT_WRITE_RGB       (1<<20)
+
+#define MI_WAIT_FOR_EVENT               ((0x3<<23))
+#define MI_WAIT_FOR_PLANE_B_FLIP        (1<<6)
+#define MI_WAIT_FOR_PLANE_A_FLIP        (1<<2)
+
+#define MI_BATCH_BUFFER                 (0x30<<23)
+#define MI_BATCH_BUFFER_START           (0x31<<23)
+#define MI_BATCH_BUFFER_END             (0xa<<23)
+
+
+
+#define COMPAREFUNC_ALWAYS		0
+#define COMPAREFUNC_NEVER		0x1
+#define COMPAREFUNC_LESS		0x2
+#define COMPAREFUNC_EQUAL		0x3
+#define COMPAREFUNC_LEQUAL		0x4
+#define COMPAREFUNC_GREATER		0x5
+#define COMPAREFUNC_NOTEQUAL		0x6
+#define COMPAREFUNC_GEQUAL		0x7
+
+#define STENCILOP_KEEP			0
+#define STENCILOP_ZERO			0x1
+#define STENCILOP_REPLACE		0x2
+#define STENCILOP_INCRSAT		0x3
+#define STENCILOP_DECRSAT		0x4
+#define STENCILOP_INCR			0x5
+#define STENCILOP_DECR			0x6
+#define STENCILOP_INVERT		0x7
+
+#define LOGICOP_CLEAR			0
+#define LOGICOP_NOR			0x1
+#define LOGICOP_AND_INV 		0x2
+#define LOGICOP_COPY_INV		0x3
+#define LOGICOP_AND_RVRSE		0x4
+#define LOGICOP_INV			0x5
+#define LOGICOP_XOR			0x6
+#define LOGICOP_NAND			0x7
+#define LOGICOP_AND			0x8
+#define LOGICOP_EQUIV			0x9
+#define LOGICOP_NOOP			0xa
+#define LOGICOP_OR_INV			0xb
+#define LOGICOP_COPY			0xc
+#define LOGICOP_OR_RVRSE		0xd
+#define LOGICOP_OR			0xe
+#define LOGICOP_SET			0xf
+
+#define BLENDFACT_ZERO			0x01
+#define BLENDFACT_ONE			0x02
+#define BLENDFACT_SRC_COLR		0x03
+#define BLENDFACT_INV_SRC_COLR 		0x04
+#define BLENDFACT_SRC_ALPHA		0x05
+#define BLENDFACT_INV_SRC_ALPHA 	0x06
+#define BLENDFACT_DST_ALPHA		0x07
+#define BLENDFACT_INV_DST_ALPHA 	0x08
+#define BLENDFACT_DST_COLR		0x09
+#define BLENDFACT_INV_DST_COLR		0x0a
+#define BLENDFACT_SRC_ALPHA_SATURATE	0x0b
+#define BLENDFACT_CONST_COLOR		0x0c
+#define BLENDFACT_INV_CONST_COLOR	0x0d
+#define BLENDFACT_CONST_ALPHA		0x0e
+#define BLENDFACT_INV_CONST_ALPHA	0x0f
+#define BLENDFACT_MASK          	0x0f
+
+#define PCI_CHIP_I915_G			0x2582
+#define PCI_CHIP_I915_GM		0x2592
+#define PCI_CHIP_I945_G			0x2772
+#define PCI_CHIP_I945_GM		0x27A2
+#define PCI_CHIP_I945_GME		0x27AE
+#define PCI_CHIP_G33_G			0x29C2
+#define PCI_CHIP_Q35_G			0x29B2
+#define PCI_CHIP_Q33_G			0x29D2
+
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_resource.c b/src/gallium/drivers/i915/i915_resource.c
new file mode 100644
index 0000000000..499233ceb9
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_resource.c
@@ -0,0 +1,51 @@
+#include "util/u_debug.h"
+
+#include "i915_resource.h"
+#include "i915_context.h"
+#include "i915_screen.h"
+
+
+/* Screen hook: PIPE_BUFFER goes to the buffer backend, every other
+ * target is created as a texture. */
+static struct pipe_resource *
+i915_resource_create(struct pipe_screen *screen,
+                     const struct pipe_resource *template)
+{
+   return (template->target != PIPE_BUFFER)
+      ? i915_texture_create(screen, template)
+      : i915_buffer_create(screen, template);
+}
+
+/* Screen hook: import a winsys handle as a texture.  Buffers cannot be
+ * imported and yield NULL. */
+static struct pipe_resource *
+i915_resource_from_handle(struct pipe_screen * screen,
+                          const struct pipe_resource *template,
+                          struct winsys_handle *whandle)
+{
+   return (template->target == PIPE_BUFFER)
+      ? NULL : i915_texture_from_handle(screen, template, whandle);
+}
+
+/* Install the context-level resource hooks; all are generic u_* helpers. */
+void
+i915_init_resource_functions(struct i915_context *i915)
+{
+   i915->base.get_transfer = u_get_transfer_vtbl;
+   i915->base.transfer_map = u_transfer_map_vtbl;
+   i915->base.transfer_flush_region = u_transfer_flush_region_vtbl;
+   i915->base.transfer_unmap = u_transfer_unmap_vtbl;
+   i915->base.transfer_destroy = u_transfer_destroy_vtbl;
+   i915->base.transfer_inline_write = u_transfer_inline_write_vtbl;
+   i915->base.is_resource_referenced = u_default_is_resource_referenced;
+}
+/* Install the screen-level resource hooks (creation, import, export, destroy). */
+void
+i915_init_screen_resource_functions(struct i915_screen *is)
+{
+   is->base.resource_create = i915_resource_create;
+   is->base.user_buffer_create = i915_user_buffer_create;
+   is->base.resource_from_handle = i915_resource_from_handle;
+   is->base.resource_get_handle = u_resource_get_handle_vtbl;
+   is->base.resource_destroy = u_resource_destroy_vtbl;
+}
diff --git a/src/gallium/drivers/i915/i915_resource.h b/src/gallium/drivers/i915/i915_resource.h
new file mode 100644
index 0000000000..1093e8f41f
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_resource.h
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_RESOURCE_H
+#define I915_RESOURCE_H
+
+struct i915_screen;
+
+#include "util/u_transfer.h"
+#include "util/u_debug.h"
+
+
+struct i915_context;
+struct i915_screen;
+
+
+struct i915_buffer {   /* buffer resource kept in CPU-addressable memory */
+   struct u_resource b;   /* base resource; i915_buffer() casts a pipe_resource* to this struct */
+   uint8_t *data;   /* backing store: allocated by i915_buffer_create() or supplied by the user */
+   boolean free_on_destroy;   /* TRUE when the destroy hook must release data */
+};
+
+#define I915_MAX_TEXTURE_2D_LEVELS 11  /* max 1024x1024 */
+#define I915_MAX_TEXTURE_3D_LEVELS  8  /* max 128x128x128 */
+
+
+
+struct i915_texture {
+   struct u_resource b;   /* base resource; i915_texture() casts a pipe_resource* to this struct */
+
+   unsigned stride;   /* row pitch; multiplied by a block row to form image offsets */
+   unsigned depth_stride;          /* per-image on i945? */
+   unsigned total_nblocksy;   /* total height of the layout, in rows of format blocks */
+
+   unsigned sw_tiled; /**< tiled with software flags */
+   unsigned hw_tiled; /**< tiled with hardware fences */
+
+   unsigned nr_images[I915_MAX_TEXTURE_2D_LEVELS];   /* image count (faces or slices) per mip level */
+
+   /* Explicitly store the offset of each image for each cube face or
+    * depth value.  Each per-level array is allocated by the layout code.
+    */
+   unsigned *image_offset[I915_MAX_TEXTURE_2D_LEVELS];   /**< array [depth] of offsets */
+
+   /* The data is held here:
+    */
+   struct i915_winsys_buffer *buffer;
+};
+
+void i915_init_screen_resource_functions(struct i915_screen *is);
+void i915_init_resource_functions(struct i915_context *i915);
+
+extern struct u_resource_vtbl i915_buffer_vtbl;
+extern struct u_resource_vtbl i915_texture_vtbl;
+
+/* Downcast a pipe_resource to i915_texture; asserts it uses the texture vtbl. */
+static INLINE struct i915_texture *i915_texture(struct pipe_resource *resource)
+{
+   assert(((struct i915_texture *)resource)->b.vtbl == &i915_texture_vtbl);
+   return (struct i915_texture *)resource;
+}
+
+/* Downcast a pipe_resource to i915_buffer; asserts it uses the buffer vtbl. */
+static INLINE struct i915_buffer *i915_buffer(struct pipe_resource *resource)
+{
+   assert(((struct i915_buffer *)resource)->b.vtbl == &i915_buffer_vtbl);
+   return (struct i915_buffer *)resource;
+}
+
+struct pipe_resource *
+i915_texture_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template);
+
+struct pipe_resource *
+i915_texture_from_handle(struct pipe_screen * screen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle);
+
+/* Wrap user memory in a buffer resource; bind becomes the resource's bind flags. */
+struct pipe_resource *
+i915_user_buffer_create(struct pipe_screen *screen,
+                        void *ptr,
+                        unsigned bytes,
+                        unsigned bind);
+
+struct pipe_resource *
+i915_buffer_create(struct pipe_screen *screen,
+		   const struct pipe_resource *template);
+
+#endif /* I915_RESOURCE_H */
diff --git a/src/gallium/drivers/i915/i915_resource_buffer.c b/src/gallium/drivers/i915/i915_resource_buffer.c
new file mode 100644
index 0000000000..0d379497df
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_resource_buffer.c
@@ -0,0 +1,160 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "i915_context.h"
+#include "i915_resource.h"
+
+
+
+/* get_handle vtbl entry: always reports failure and leaves handle untouched. */
+static boolean i915_buffer_get_handle(struct pipe_screen *screen,
+                                      struct pipe_resource *resource,
+                                      struct winsys_handle *handle)
+{
+   return FALSE;
+}
+
+/* Destroy a buffer.  Owned stores come from MALLOC(), so FREE() releases them. */
+static void
+i915_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *resource)
+{
+   struct i915_buffer *buffer = i915_buffer(resource);
+   if (buffer->free_on_destroy)
+      FREE(buffer->data);   /* user-pointer buffers never own their data */
+   FREE(buffer);
+}
+
+
+/* transfer_map vtbl entry: address of byte box.x inside the buffer's store. */
+static void *
+i915_buffer_transfer_map(struct pipe_context *pipe,
+                         struct pipe_transfer *transfer)
+{
+   return i915_buffer(transfer->resource)->data + transfer->box.x;
+}
+
+
+/* transfer_inline_write vtbl entry: copy box->width bytes from data into the
+ * store at byte offset box->x.  sr, usage and both strides are not used. */
+static void
+i915_buffer_transfer_inline_write( struct pipe_context *rm_ctx,
+                                   struct pipe_resource *resource,
+                                   struct pipe_subresource sr,
+                                   unsigned usage,
+                                   const struct pipe_box *box,
+                                   const void *data,
+                                   unsigned stride,
+                                   unsigned slice_stride)
+{
+   uint8_t *dst = i915_buffer(resource)->data + box->x;
+
+   memcpy(dst, data, box->width);
+}
+
+
+struct u_resource_vtbl i915_buffer_vtbl =    /* hooks shared by driver-allocated and user buffers */
+{
+   i915_buffer_get_handle,	     /* get_handle: always fails */
+   i915_buffer_destroy,		     /* resource_destroy */
+   NULL,			     /* is_resource_referenced: not provided */
+   u_default_get_transfer,	     /* get_transfer */
+   u_default_transfer_destroy,	     /* transfer_destroy */
+   i915_buffer_transfer_map,	     /* transfer_map: pointer into the store */
+   u_default_transfer_flush_region,  /* transfer_flush_region */
+   u_default_transfer_unmap,	     /* transfer_unmap */
+   i915_buffer_transfer_inline_write /* transfer_inline_write: memcpy into the store */
+};
+
+
+
+/**
+ * Create a buffer resource backed by template->width0 bytes obtained from
+ * MALLOC().  Returns the embedded pipe_resource with a reference count of
+ * one, or NULL when either allocation fails.
+ */
+struct pipe_resource *
+i915_buffer_create(struct pipe_screen *screen,
+                   const struct pipe_resource *template)
+{
+   struct i915_buffer *buffer = CALLOC_STRUCT(i915_buffer);
+   if (!buffer)
+      return NULL;
+
+   buffer->b.b = *template;
+   buffer->b.vtbl = &i915_buffer_vtbl;
+   pipe_reference_init(&buffer->b.b.reference, 1);
+   buffer->b.b.screen = screen;
+
+   buffer->free_on_destroy = TRUE;
+   buffer->data = MALLOC(template->width0);
+   if (buffer->data)
+      return &buffer->b.b;
+
+   FREE(buffer);   /* backing store allocation failed */
+   return NULL;
+}
+
+
+
+/* Wrap caller-owned memory in a buffer resource without copying it; ptr is
+ * never freed here.  Returns NULL if the wrapper cannot be allocated. */
+struct pipe_resource *
+i915_user_buffer_create(struct pipe_screen *screen, void *ptr,
+                        unsigned bytes, unsigned bind)
+{
+   struct i915_buffer *buffer = CALLOC_STRUCT(i915_buffer);
+   struct pipe_resource *res;
+
+   if (!buffer)
+      return NULL;
+
+   res = &buffer->b.b;
+   pipe_reference_init(&res->reference, 1);
+   buffer->b.vtbl = &i915_buffer_vtbl;
+   res->screen = screen;
+   res->format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   res->usage = PIPE_USAGE_IMMUTABLE;
+   res->bind = bind;
+   res->flags = 0;
+   res->width0 = bytes;
+   res->height0 = 1;
+   res->depth0 = 1;
+   buffer->data = ptr;
+   buffer->free_on_destroy = FALSE;   /* the caller keeps ownership of ptr */
+   return res;
+}
diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c
new file mode 100644
index 0000000000..17fcdee379
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_resource_texture.c
@@ -0,0 +1,857 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  */
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "i915_context.h"
+#include "i915_resource.h"
+#include "i915_screen.h"
+#include "i915_winsys.h"
+
+
+#define DEBUG_TEXTURES 0
+
+/*
+ * Helper function and arrays
+ */
+
+
+/**
+ * Level-0 {x, y} origin of each cube face, in units of one face's size.
+ */
+static const int initial_offsets[6][2] = {
+   [PIPE_TEX_FACE_POS_X] = {0, 0},
+   [PIPE_TEX_FACE_POS_Y] = {1, 0},
+   [PIPE_TEX_FACE_POS_Z] = {1, 1},
+   [PIPE_TEX_FACE_NEG_X] = {0, 2},
+   [PIPE_TEX_FACE_NEG_Y] = {1, 2},
+   [PIPE_TEX_FACE_NEG_Z] = {1, 3},
+};
+
+/**
+ * Per-level {x, y} step of each cube face, in units of the next level's size.
+ */
+static const int step_offsets[6][2] = {
+   [PIPE_TEX_FACE_POS_X] = { 0, 2},
+   [PIPE_TEX_FACE_POS_Y] = {-1, 2},
+   [PIPE_TEX_FACE_POS_Z] = {-1, 1},
+   [PIPE_TEX_FACE_NEG_X] = { 0, 2},
+   [PIPE_TEX_FACE_NEG_Y] = {-1, 2},
+   [PIPE_TEX_FACE_NEG_Z] = {-1, 1},
+};
+
+/**
+ * X position of each face's size-2 level in a compressed i945 cube map.
+ */
+static const int bottom_offsets[6] = {
+   [PIPE_TEX_FACE_POS_X] = 16 + 0 * 8,
+   [PIPE_TEX_FACE_POS_Y] = 16 + 1 * 8,
+   [PIPE_TEX_FACE_POS_Z] = 16 + 2 * 8,
+   [PIPE_TEX_FACE_NEG_X] = 16 + 3 * 8,
+   [PIPE_TEX_FACE_NEG_Y] = 16 + 4 * 8,
+   [PIPE_TEX_FACE_NEG_Z] = 16 + 5 * 8,
+};
+/* Block columns needed for width, rounded up to a multiple of align_to. */
+static INLINE unsigned
+align_nblocksx(enum pipe_format format, unsigned width, unsigned align_to)
+{
+   return align(util_format_get_nblocksx(format, width), align_to);
+}
+/* Block rows, rounded up to align_to; callers pass a height as "width". */
+static INLINE unsigned
+align_nblocksy(enum pipe_format format, unsigned width, unsigned align_to)
+{
+   return align(util_format_get_nblocksy(format, width), align_to);
+}
+/* util_format_get_stride() for width, rounded up to the next power of two. */
+static INLINE unsigned
+get_pot_stride(enum pipe_format format, unsigned width)
+{
+   return util_next_power_of_two(util_format_get_stride(format, width));
+}
+
+/*
+ * More advanced helper funcs
+ */
+/* Record a level's image count and allocate its offset array (entry 0 zeroed). */
+static void
+i915_texture_set_level_info(struct i915_texture *tex,
+                            unsigned level, unsigned nr_images)
+{
+   unsigned *offsets;
+   assert(level < Elements(tex->nr_images));
+   assert(nr_images);
+   assert(!tex->image_offset[level]);   /* each level may be set up only once */
+   offsets = (unsigned *) MALLOC(nr_images * sizeof(unsigned));   /* NOTE(review): result is not checked */
+   offsets[0] = 0;
+   tex->image_offset[level] = offsets;
+   tex->nr_images[level] = nr_images;
+}
+/* Store the offset of image img of level, given its block position (x, y). */
+static void
+i915_texture_set_image_offset(struct i915_texture *tex,
+                              unsigned level, unsigned img,
+                              unsigned x, unsigned y)
+{
+   const unsigned bytes_per_block = util_format_get_blocksize(tex->b.b.format);
+   /* the very first image of the texture must sit at offset zero */
+   assert(!(img == 0 && level == 0) || (x == 0 && y == 0));
+   assert(img < tex->nr_images[level]);
+
+   tex->image_offset[level][img] = y * tex->stride + x * bytes_per_block;
+#if DEBUG_TEXTURES
+   debug_printf("%s: %p level %u, img %u (%u, %u) %p\n", __FUNCTION__,
+                tex, level, img, x, y,
+                (void*)(uintptr_t)tex->image_offset[level][img]);
+#endif
+}
+
+
+/*
+ * Shared layout functions
+ */
+
+
+/**
+ * Special case to deal with scanout textures.  Returns FALSE, without
+ * modifying tex, when the scanout layout does not apply.
+ */
+static boolean
+i9x5_scanout_layout(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+
+   if (pt->last_level > 0 || util_format_get_blocksize(pt->format) != 4)
+      return FALSE;
+
+   /* Reject unsupported sizes before level 0 is allocated: the caller falls
+    * back to another layout function, which sets up level 0 itself. */
+   if (pt->width0 < 240 && !(pt->width0 == 64 && pt->height0 == 64))
+      return FALSE;
+
+   i915_texture_set_level_info(tex, 0, 1);
+   i915_texture_set_image_offset(tex, 0, 0, 0, 0);
+
+   tex->stride = get_pot_stride(pt->format, pt->width0);
+   tex->total_nblocksy = align_nblocksy(pt->format, pt->height0, 8);
+   if (pt->width0 >= 240)
+      tex->hw_tiled = I915_TILE_X;   /* the 64x64 case is left untiled */
+
+#if DEBUG_TEXTURES
+   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+      pt->width0, pt->height0, util_format_get_blocksize(pt->format),
+      tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
+#endif
+
+   return TRUE;
+}
+
+/**
+ * Special case for shared textures: single level, POT stride, X-tiled.
+ */
+static boolean
+i9x5_display_target_layout(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+
+   if (pt->last_level > 0 || util_format_get_blocksize(pt->format) != 4)
+      return FALSE;
+
+   /* fallback to normal textures for small textures */
+   if (pt->width0 < 240)
+      return FALSE;
+
+   i915_texture_set_level_info(tex, 0, 1);
+   i915_texture_set_image_offset(tex, 0, 0, 0, 0);
+
+   tex->stride = get_pot_stride(pt->format, pt->width0);
+   tex->total_nblocksy = align_nblocksy(pt->format, pt->height0, 8);
+   tex->hw_tiled = I915_TILE_X;
+
+#if DEBUG_TEXTURES
+   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+      pt->width0, pt->height0, util_format_get_blocksize(pt->format),
+      tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
+#endif
+
+   return TRUE;
+}
+
+/**
+ * Helper function for special layouts: tries the scanout layout, then the
+ * shared/display-target layout.  Returns TRUE if one of them was applied.
+ */
+static boolean
+i9x5_special_layout(struct i915_texture *tex)
+{
+   const struct pipe_resource *pt = &tex->b.b;
+
+   /* Scanouts needs special care */
+   if ((pt->bind & PIPE_BIND_SCANOUT) && i9x5_scanout_layout(tex))
+      return TRUE;
+
+   /* Shared buffers needs to be compatible with X servers
+    *
+    * XXX: need a better name than shared for this if it is to be part
+    * of core gallium, and probably move the flag to resource.flags,
+    * rather than bindings.
+    */
+   if ((pt->bind & (PIPE_BIND_SHARED | PIPE_BIND_DISPLAY_TARGET)) &&
+       i9x5_display_target_layout(tex))
+      return TRUE;
+
+   return FALSE;
+}
+
+/**
+ * Cube layout used on i915 and for non-compressed textures on i945.
+ */
+static void
+i9x5_texture_layout_cube(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+   const unsigned face_size = util_format_get_nblocksx(pt->format, pt->width0);
+   unsigned face;
+   unsigned level;
+
+   assert(pt->width0 == pt->height0); /* cubemap images are square */
+
+   /* two faces sit side by side, hence the doubled pitch */
+   tex->stride = align(face_size * util_format_get_blocksize(pt->format) * 2, 4);
+   tex->total_nblocksy = face_size * 4;
+
+   for (level = 0; level <= pt->last_level; level++)
+      i915_texture_set_level_info(tex, level, 6);
+
+   for (face = 0; face < 6; face++) {
+      unsigned col = initial_offsets[face][0] * face_size;
+      unsigned row = initial_offsets[face][1] * face_size;
+      unsigned size = face_size;
+
+      for (level = 0; level <= pt->last_level; level++) {
+         i915_texture_set_image_offset(tex, level, face, col, row);
+         size >>= 1;   /* steps are scaled by the next level's size */
+         col += step_offsets[face][0] * size;
+         row += step_offsets[face][1] * size;
+      }
+   }
+}
+
+
+/*
+ * i915 layout functions
+ */
+/* i915 1D/2D layout: mip levels are stacked vertically, each one starting
+ * at x = 0 directly below the previous level. */
+static void
+i915_texture_layout_2d(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+   unsigned level;
+   unsigned height = pt->height0;
+   /* Rows of blocks in level 0: derived from the height (the width was
+    * used here before, which mis-sized non-square textures). */
+   unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
+   unsigned align_y = 2;
+
+   if (util_format_is_s3tc(pt->format))
+      align_y = 1;
+
+   tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
+   tex->total_nblocksy = 0;
+
+   for (level = 0; level <= pt->last_level; level++) {
+      i915_texture_set_level_info(tex, level, 1);
+      i915_texture_set_image_offset(tex, level, 0, 0, tex->total_nblocksy);
+
+      tex->total_nblocksy += nblocksy;
+
+      height = u_minify(height, 1);
+      nblocksy = align_nblocksy(pt->format, height, align_y);
+   }
+}
+/* i915 3D layout: image i of a level is placed at row i * stack_nblocksy. */
+static void
+i915_texture_layout_3d(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+   unsigned level;
+
+   unsigned width = pt->width0;   /* minified below but never read */
+   unsigned height = pt->height0;
+   unsigned depth = pt->depth0;
+   unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
+   unsigned stack_nblocksy = 0;   /* summed block rows of all levels counted below */
+
+   /* Calculate the size of a single slice.
+    */
+   tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
+
+   /* XXX: hardware expects/requires 9 levels at minimum.
+    */
+   for (level = 0; level <= MAX2(8, pt->last_level); level++) {
+      i915_texture_set_level_info(tex, level, depth);   /* depth0 images at every level */
+
+      stack_nblocksy += MAX2(2, nblocksy);   /* every level counts for at least 2 rows */
+
+      width = u_minify(width, 1);
+      height = u_minify(height, 1);
+      nblocksy = util_format_get_nblocksy(pt->format, height);
+   }
+
+   /* Fixup depth image_offsets:
+    * NOTE(review): the offset does not depend on level - confirm intended.
+    */
+   for (level = 0; level <= pt->last_level; level++) {
+      unsigned i;
+      for (i = 0; i < depth; i++) 
+         i915_texture_set_image_offset(tex, level, i, 0, i * stack_nblocksy);
+
+      depth = u_minify(depth, 1);
+   }
+
+   /* Multiply slice size by texture depth for total size.  It's
+    * remarkable how wasteful of memory the i915 texture layouts
+    * are.  They are largely fixed in the i945.
+    */
+   tex->total_nblocksy = stack_nblocksy * pt->depth0;
+}
+/**
+ * Compute the i915 memory layout of tex according to its target.
+ *
+ * Returns TRUE on success, FALSE (after asserting) for an unknown target.
+ */
+static boolean
+i915_texture_layout(struct i915_texture * tex)
+{
+   switch (tex->b.b.target) {
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_2D:
+      if (!i9x5_special_layout(tex))
+         i915_texture_layout_2d(tex);
+      return TRUE;
+   case PIPE_TEXTURE_3D:
+      i915_texture_layout_3d(tex);
+      return TRUE;
+   case PIPE_TEXTURE_CUBE:
+      i9x5_texture_layout_cube(tex);
+      return TRUE;
+   default:
+      assert(0);
+      return FALSE;
+   }
+}
+
+
+/*
+ * i945 layout functions
+ */
+
+/* i945 1D/2D layout: level 1 sits below level 0, levels 2+ stack to its right. */
+static void
+i945_texture_layout_2d(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+   int align_x = 4, align_y = 2;   /* block alignment applied to levels 1 and up */
+   unsigned level;
+   unsigned x = 0;   /* block position of the current level */
+   unsigned y = 0;
+   unsigned width = pt->width0;
+   unsigned height = pt->height0;
+   unsigned nblocksx = util_format_get_nblocksx(pt->format, pt->width0);
+   unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
+
+   if (util_format_is_s3tc(pt->format)) {
+      align_x = 1;
+      align_y = 1;
+   }
+
+   tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
+
+   /* May need to adjust pitch to accommodate the placement of
+    * the 2nd mipmap level.  This occurs when the alignment
+    * constraints of mipmap placement push the right edge of the
+    * 2nd mipmap level out past the width of its parent.
+    */
+   if (pt->last_level > 0) {
+      unsigned mip1_nblocksx =
+         align_nblocksx(pt->format, u_minify(pt->width0, 1), align_x) +
+         util_format_get_nblocksx(pt->format, u_minify(pt->width0, 2));
+
+      if (mip1_nblocksx > nblocksx)
+         tex->stride = mip1_nblocksx * util_format_get_blocksize(pt->format);
+   }
+
+   /* Round the pitch up to a multiple of 64 (which is also whole dwords).
+    */
+   tex->stride = align(tex->stride, 64);
+   tex->total_nblocksy = 0;
+
+   for (level = 0; level <= pt->last_level; level++) {
+      i915_texture_set_level_info(tex, level, 1);
+      i915_texture_set_image_offset(tex, level, 0, x, y);
+
+      /* Because the images are packed better, the final offset
+       * might not be the maximal one:
+       */
+      tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy);
+
+      /* Layout_below: step right after second mipmap level.
+       */
+      if (level == 1) {
+         x += nblocksx;
+      } else {
+         y += nblocksy;
+      }
+
+      width  = u_minify(width, 1);
+      height = u_minify(height, 1);
+      nblocksx = align_nblocksx(pt->format, width, align_x);
+      nblocksy = align_nblocksy(pt->format, height, align_y);
+   }
+}
+/* i945 3D layout: the slices of a level are packed pack_x_nr per row.  From
+ * one level to the next the slot halves in x (while wider than 4 blocks) and
+ * in y (while taller than 2 rows), and the slots per row double with x. */
+static void
+i945_texture_layout_3d(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+   unsigned height = pt->height0;
+   unsigned depth = pt->depth0;
+   unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);   /* was pt->width0 */
+   unsigned pack_x_pitch, pack_x_nr;
+   unsigned pack_y_pitch;
+   unsigned level;
+
+   tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
+   tex->total_nblocksy = 0;
+
+   pack_y_pitch = MAX2(nblocksy, 2);
+   pack_x_pitch = tex->stride / util_format_get_blocksize(pt->format);
+   pack_x_nr = 1;
+
+   for (level = 0; level <= pt->last_level; level++) {
+      int x = 0;
+      int y = 0;
+      unsigned q, j;
+
+      i915_texture_set_level_info(tex, level, depth);
+
+      for (q = 0; q < depth;) {
+         for (j = 0; j < pack_x_nr && q < depth; j++, q++) {
+            i915_texture_set_image_offset(tex, level, q, x, y + tex->total_nblocksy);
+            x += pack_x_pitch;
+         }
+
+         x = 0;
+         y += pack_y_pitch;
+      }
+
+      tex->total_nblocksy += y;
+
+      if (pack_x_pitch > 4) {
+         pack_x_pitch >>= 1;
+         pack_x_nr <<= 1;
+         assert(pack_x_pitch * pack_x_nr * util_format_get_blocksize(pt->format) <= tex->stride);
+      }
+
+      if (pack_y_pitch > 2) {
+         pack_y_pitch >>= 1;
+      }
+
+      height = u_minify(height, 1);
+      depth = u_minify(depth, 1);
+      nblocksy = util_format_get_nblocksy(pt->format, height);
+   }
+}
+
+static void
+i945_texture_layout_cube(struct i915_texture *tex)
+{
+   struct pipe_resource *pt = &tex->b.b;
+   const unsigned nblocks = util_format_get_nblocksx(pt->format, pt->width0);
+   const unsigned dim = pt->width0;
+   unsigned level;
+   unsigned face;
+
+   assert(pt->width0 == pt->height0); /* cubemap images are square */
+   assert(util_next_power_of_two(pt->width0) == pt->width0); /* pot only */
+   assert(util_format_is_s3tc(pt->format)); /* compressed only */
+
+   /*
+    * Depending on the size of the largest images, pitch can be
+    * determined either by the old-style packing of cubemap faces,
+    * or the final row of 4x4, 2x2 and 1x1 faces below this.
+    *
+    * 64  * 2 / 4 = 32
+    * 14 * 2 = 28
+    */
+   if (pt->width0 >= 64)
+      tex->stride = nblocks * 2 * util_format_get_blocksize(pt->format);
+   else
+      tex->stride = 14 * 2 * util_format_get_blocksize(pt->format);
+
+   /*
+    * Something similary apply for height as well.
+    */
+   if (pt->width0 >= 4)
+      tex->total_nblocksy = nblocks * 4 + 1;
+   else
+      tex->total_nblocksy = 1;
+
+   /* Set all the levels to effectively occupy the whole rectangular region */
+   for (level = 0; level <= pt->last_level; level++)
+      i915_texture_set_level_info(tex, level, 6);
+
+   for (face = 0; face < 6; face++) {
+      /* all calculations in pixels */
+      unsigned total_height = tex->total_nblocksy * 4;
+      unsigned x = initial_offsets[face][0] * dim;
+      unsigned y = initial_offsets[face][1] * dim;
+      unsigned d = dim;
+
+      if (dim == 4 && face >= 4) {
+         x = (face - 4) * 8;
+         y = tex->total_nblocksy * 4 - 4; /* 4 = 1 block */
+      } else if (dim < 4 && (face > 0)) {
+         x = face * 8;
+         y = total_height - 4;
+      }
+
+      for (level = 0; level <= pt->last_level; level++) {
+         i915_texture_set_image_offset(tex, level, face,
+                                       util_format_get_nblocksx(pt->format, x),
+                                       util_format_get_nblocksy(pt->format, y));
+
+         d >>= 1;
+
+         switch (d) {
+         case 4:
+            switch (face) {
+            case PIPE_TEX_FACE_POS_X:
+            case PIPE_TEX_FACE_NEG_X:
+               x += step_offsets[face][0] * d;
+               y += step_offsets[face][1] * d;
+               break;
+            case PIPE_TEX_FACE_POS_Y:
+            case PIPE_TEX_FACE_NEG_Y:
+               y += 12;
+               x -= 8;
+               break;
+            case PIPE_TEX_FACE_POS_Z:
+            case PIPE_TEX_FACE_NEG_Z:
+               y = total_height - 4;
+               x = (face - 4) * 8;
+               break;
+            }
+            break;
+         case 2:
+            y = total_height - 4;
+            x = bottom_offsets[face];
+            break;
+         case 1:
+            x += 48;
+            break;
+         default:
+            x += step_offsets[face][0] * d;
+            y += step_offsets[face][1] * d;
+            break;
+         }
+      }
+   }
+}
+
+static boolean
+i945_texture_layout(struct i915_texture * tex)
+{
+   /* Runs the layout routine matching the resource target.  An unknown
+    * target trips the assert and is reported to the caller as FALSE. */
+   struct pipe_resource *res = &tex->b.b;
+
+   if (res->target == PIPE_TEXTURE_1D || res->target == PIPE_TEXTURE_2D) {
+      if (!i9x5_special_layout(tex))
+         i945_texture_layout_2d(tex);
+      return TRUE;
+   }
+   if (res->target == PIPE_TEXTURE_3D) {
+      i945_texture_layout_3d(tex);
+      return TRUE;
+   }
+   if (res->target == PIPE_TEXTURE_CUBE) {
+      /* S3TC cube maps go through the i945-specific cube layout */
+      if (util_format_is_s3tc(res->format))
+         i945_texture_layout_cube(tex);
+      else
+         i9x5_texture_layout_cube(tex);
+      return TRUE;
+   }
+   assert(0);
+   return FALSE;
+}
+
+
+
+/*
+ * Screen texture functions
+ */
+
+
+
+static boolean
+i915_texture_get_handle(struct pipe_screen * screen,
+                        struct pipe_resource *texture,
+                        struct winsys_handle *whandle)
+{
+   /* Forwards to the winsys, handing it the texture's buffer and stride. */
+   struct i915_winsys *winsys = i915_screen(screen)->iws;
+   struct i915_texture *itex = i915_texture(texture);
+
+   return winsys->buffer_get_handle(winsys, itex->buffer, whandle, itex->stride);
+}
+
+
+static void
+i915_texture_destroy(struct pipe_screen *screen,
+                     struct pipe_resource *pt)
+{
+   struct i915_winsys *winsys = i915_screen(screen)->iws;
+   struct i915_texture *itex = i915_texture(pt);
+   uint level = Elements(itex->image_offset);
+
+   /* release the backing buffer, then each per-level offset table */
+   winsys->buffer_destroy(winsys, itex->buffer);
+   while (level-- > 0) {
+      if (itex->image_offset[level])
+         FREE(itex->image_offset[level]);
+   }
+   FREE(itex);
+}
+
+static struct pipe_transfer *
+i915_texture_get_transfer(struct pipe_context *context,
+                          struct pipe_resource *resource,
+                          struct pipe_subresource sr,
+                          unsigned usage,
+                          const struct pipe_box *box)
+{
+   /* Describes the requested region in a freshly allocated pipe_transfer;
+    * nothing is mapped here.  Returns NULL when the allocation fails. */
+   struct pipe_transfer *xfer = CALLOC_STRUCT(pipe_transfer);
+
+   if (xfer != NULL) {
+      xfer->resource = resource;
+      xfer->sr = sr;
+      xfer->usage = usage;
+      xfer->box = *box;
+      xfer->stride = i915_texture(resource)->stride;
+   }
+   return xfer;
+}
+
+
+static void *
+i915_texture_transfer_map(struct pipe_context *pipe,
+			  struct pipe_transfer *transfer)
+{
+   struct pipe_resource *resource = transfer->resource;
+   struct i915_texture *tex = i915_texture(resource);
+   struct i915_winsys *iws = i915_screen(pipe->screen)->iws;
+   struct pipe_subresource sr = transfer->sr;
+   struct pipe_box *box = &transfer->box;
+   enum pipe_format format = resource->format;
+   unsigned offset;
+   char *map;
+   /* image within the level: the face for cubes, box->z for 3D, else 0 */
+   if (resource->target == PIPE_TEXTURE_CUBE) {
+      offset = tex->image_offset[sr.level][sr.face];
+   } else if (resource->target == PIPE_TEXTURE_3D) {
+      offset = tex->image_offset[sr.level][box->z];
+   } else {
+      offset = tex->image_offset[sr.level][0];
+      assert(sr.face == 0);
+      assert(box->z == 0);
+   }
+
+   map = iws->buffer_map(iws, tex->buffer,
+                         (transfer->usage & PIPE_TRANSFER_WRITE) ? TRUE : FALSE);
+   if (map == NULL)
+      return NULL;
+   /* step to the block holding (box->x, box->y); x and y are divided by the block size */
+   return map + offset +
+      box->y / util_format_get_blockheight(format) * transfer->stride +
+      box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+}
+
+static void
+i915_texture_transfer_unmap(struct pipe_context *pipe,
+                            struct pipe_transfer *transfer)
+{
+   struct i915_texture *itex = i915_texture(transfer->resource);
+   struct i915_screen *is = i915_screen(itex->b.b.screen); /* resource's screen, not pipe's */
+   is->iws->buffer_unmap(is->iws, itex->buffer);
+}
+
+
+/* Texture vtable.  Positional initializer: entries must stay in member order. */
+struct u_resource_vtbl i915_texture_vtbl = 
+{
+   i915_texture_get_handle,	      /* get_handle */
+   i915_texture_destroy,	      /* resource_destroy */
+   NULL,			      /* is_resource_referenced */
+   i915_texture_get_transfer,	      /* get_transfer */
+   u_default_transfer_destroy,	      /* transfer_destroy */
+   i915_texture_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   i915_texture_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+
+/* Create a texture from 'template': lay out its mip levels for the chip
+ * generation and allocate the backing winsys buffer; NULL on failure.
+ */
+struct pipe_resource *
+i915_texture_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template)
+{
+   struct i915_screen *is = i915_screen(screen);
+   struct i915_winsys *iws = is->iws;
+   struct i915_texture *tex = CALLOC_STRUCT(i915_texture);
+   size_t tex_size;
+   unsigned buf_usage = 0;
+   uint i;
+
+   if (!tex)
+      return NULL;
+
+   tex->b.b = *template;
+   tex->b.vtbl = &i915_texture_vtbl;
+   pipe_reference_init(&tex->b.b.reference, 1);
+   tex->b.b.screen = screen;
+
+   if (is->is_i945) {
+      if (!i945_texture_layout(tex))
+         goto fail;
+   } else {
+      if (!i915_texture_layout(tex))
+         goto fail;
+   }
+
+   tex_size = tex->stride * tex->total_nblocksy;
+
+   /* for scanouts and cursors, cursors aren't scanouts */
+   /* XXX: use a custom flag for cursors, don't rely on magically
+    * guessing that this is Xorg asking for a cursor
+    */
+   if ((template->bind & PIPE_BIND_SCANOUT) && template->width0 != 64)
+      buf_usage = I915_NEW_SCANOUT;
+   else
+      buf_usage = I915_NEW_TEXTURE;
+
+   tex->buffer = iws->buffer_create(iws, tex_size, 64, buf_usage);
+   if (!tex->buffer)
+      goto fail;
+
+   /* setup any hw fences */
+   if (tex->hw_tiled) {
+      assert(tex->sw_tiled == I915_TILE_NONE);
+      iws->buffer_set_fence_reg(iws, tex->buffer, tex->stride, tex->hw_tiled);
+   }
+
+#if DEBUG_TEXTURES
+   debug_printf("%s: %p size %u, stride %u, blocks (%u, %u)\n", __func__,
+                tex, (unsigned int)tex_size, tex->stride,
+                tex->stride / util_format_get_blocksize(tex->b.b.format),
+                tex->total_nblocksy);
+#endif
+
+   return &tex->b.b;
+
+fail:
+   /* The layout pass may already have allocated per-level offset tables
+    * (the ones i915_texture_destroy() frees); release them here too so a
+    * failed layout or buffer allocation does not leak them.
+    */
+   for (i = 0; i < Elements(tex->image_offset); i++)
+      if (tex->image_offset[i])
+         FREE(tex->image_offset[i]);
+
+   FREE(tex);
+   return NULL;
+}
+
+struct pipe_resource *
+i915_texture_from_handle(struct pipe_screen * screen,
+                         const struct pipe_resource *template,
+                         struct winsys_handle *whandle)
+{
+   struct i915_screen *is = i915_screen(screen);
+   struct i915_texture *tex;
+   struct i915_winsys *iws = is->iws;
+   struct i915_winsys_buffer *buffer;
+   unsigned stride;
+
+   assert(screen);
+   /* Only supports one type.  Checked before the import so that a
+    * rejected template does not leave an imported buffer behind. */
+   if (template->target != PIPE_TEXTURE_2D ||
+       template->last_level != 0 ||
+       template->depth0 != 1) {
+      return NULL;
+   }
+
+   buffer = iws->buffer_from_handle(iws, whandle, &stride);
+   if (!buffer)
+      return NULL;
+
+   tex = CALLOC_STRUCT(i915_texture);
+   if (!tex) {
+      iws->buffer_destroy(iws, buffer); /* release the buffer just imported */
+      return NULL;
+   }
+
+   tex->b.b = *template;
+   tex->b.vtbl = &i915_texture_vtbl;
+   pipe_reference_init(&tex->b.b.reference, 1);
+   tex->b.b.screen = screen;
+   tex->stride = stride;
+   tex->buffer = buffer;
+   i915_texture_set_level_info(tex, 0, 1);
+   i915_texture_set_image_offset(tex, 0, 0, 0, 0);
+   return &tex->b.b;
+}
+
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
new file mode 100644
index 0000000000..f82426520c
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -0,0 +1,334 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+
+#include "i915_reg.h"
+#include "i915_context.h"
+#include "i915_screen.h"
+#include "i915_surface.h"
+#include "i915_resource.h"
+#include "i915_winsys.h"
+
+
+/*
+ * Probe functions
+ */
+
+/* Returns the constant vendor string; 'screen' is not consulted. */
+static const char *
+i915_get_vendor(struct pipe_screen *screen)
+{
+   return "VMware, Inc.";
+}
+
+static const char *
+i915_get_name(struct pipe_screen *screen)
+{
+   /* PCI device id -> chipset label, one row per id this function
+    * has a name for */
+   static const struct {
+      unsigned pci_id;
+      const char *chipset;
+   } known_chips[] = {
+      { PCI_CHIP_I915_G,   "915G" },
+      { PCI_CHIP_I915_GM,  "915GM" },
+      { PCI_CHIP_I945_G,   "945G" },
+      { PCI_CHIP_I945_GM,  "945GM" },
+      { PCI_CHIP_I945_GME, "945GME" },
+      { PCI_CHIP_G33_G,    "G33" },
+      { PCI_CHIP_Q35_G,    "Q35" },
+      { PCI_CHIP_Q33_G,    "Q33" }
+   };
+
+   /* The name is formatted into static storage: the returned pointer
+    * stays valid after return but the next call overwrites its contents.
+    */
+   static char buffer[128];
+   const unsigned num_known = sizeof(known_chips) / sizeof(known_chips[0]);
+   struct i915_winsys *iws = i915_screen(screen)->iws;
+   const char *chipset = "unknown";
+   unsigned i;
+
+   /* first matching row wins; ids without a row keep "unknown" */
+   for (i = 0; i < num_known; i++) {
+      if (known_chips[i].pci_id == iws->pci_id) {
+         chipset = known_chips[i].chipset;
+         break;
+      }
+   }
+
+   util_snprintf(buffer, sizeof(buffer), "i915 (chipset: %s)", chipset);
+   return buffer;
+}
+
+static int
+i915_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   /* Integer capability query.  The answers are constants: 'screen' is not
+    * consulted, and any capability without a case of its own reports 0.
+    */
+
+   switch (param) {
+   /* sampler and render-target counts */
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return 8;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 1;
+
+   /* mipmap chain limits; cube maps report the 2D limit */
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return I915_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return I915_MAX_TEXTURE_3D_LEVELS;
+
+   /* capabilities reported as 1 */
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 1;
+
+   /* Capabilities reported as 0.  They are listed explicitly even
+    * though the default case below gives the same answer.
+    */
+   case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+   case PIPE_CAP_GLSL:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_TIMER_QUERY:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+   case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+      /* disable for now */
+   default:
+      return 0;
+   }
+}
+
+static float
+i915_get_paramf(struct pipe_screen *screen, enum pipe_cap param)
+{
+   float value = 0; /* answer for every capability not listed below */
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      value = 7.5;
+      break;
+   case PIPE_CAP_MAX_POINT_WIDTH:
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      value = 255.0;
+      break;
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      value = 4.0;
+      break;
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      value = 16.0;
+      break;
+   default:
+      break;
+   }
+   return value;
+}
+
+static boolean
+i915_is_format_supported(struct pipe_screen *screen,
+                         enum pipe_format format,
+                         enum pipe_texture_target target,
+                         unsigned sample_count,
+                         unsigned tex_usage,
+                         unsigned geom_flags)
+{
+   /* Each table ends with PIPE_FORMAT_NONE as its sentinel. */
+   static const enum pipe_format sampler_formats[] = {
+      PIPE_FORMAT_B8G8R8A8_UNORM,
+      PIPE_FORMAT_B8G8R8X8_UNORM,
+      PIPE_FORMAT_R8G8B8A8_UNORM,
+#if 0
+      PIPE_FORMAT_R8G8B8X8_UNORM,
+#endif
+      PIPE_FORMAT_B5G6R5_UNORM,
+      PIPE_FORMAT_L8_UNORM,
+      PIPE_FORMAT_A8_UNORM,
+      PIPE_FORMAT_I8_UNORM,
+      PIPE_FORMAT_L8A8_UNORM,
+      PIPE_FORMAT_UYVY,
+      PIPE_FORMAT_YUYV,
+      /* XXX why not?
+      PIPE_FORMAT_Z16_UNORM, */
+      PIPE_FORMAT_Z24X8_UNORM,
+      PIPE_FORMAT_Z24_UNORM_S8_USCALED,
+      PIPE_FORMAT_NONE
+   };
+   static const enum pipe_format color_target_formats[] = {
+      PIPE_FORMAT_B8G8R8A8_UNORM,
+      PIPE_FORMAT_B5G6R5_UNORM,
+      PIPE_FORMAT_NONE
+   };
+   static const enum pipe_format zs_formats[] = {
+      /* XXX why not?
+      PIPE_FORMAT_Z16_UNORM, */
+      PIPE_FORMAT_Z24X8_UNORM,
+      PIPE_FORMAT_Z24_UNORM_S8_USCALED,
+      PIPE_FORMAT_NONE
+   };
+   const enum pipe_format *candidate;
+
+   /* anything with more than one sample is rejected outright */
+   if (sample_count > 1)
+      return FALSE;
+
+   /* a depth/stencil binding takes priority over a render-target one */
+   if (tex_usage & PIPE_BIND_DEPTH_STENCIL)
+      candidate = zs_formats;
+   else if (tex_usage & PIPE_BIND_RENDER_TARGET)
+      candidate = color_target_formats;
+   else
+      candidate = sampler_formats;
+
+   while (*candidate != PIPE_FORMAT_NONE && *candidate != format)
+      candidate++;
+
+   return (*candidate != PIPE_FORMAT_NONE) ? TRUE : FALSE;
+}
+
+
+/*
+ * Fence functions
+ */
+
+
+static void
+i915_fence_reference(struct pipe_screen *screen,
+                     struct pipe_fence_handle **ptr,
+                     struct pipe_fence_handle *fence)
+{
+   /* pure pass-through to the winsys fence_reference hook */
+   struct i915_winsys *winsys = i915_screen(screen)->iws;
+   winsys->fence_reference(winsys, ptr, fence);
+}
+
+static int
+i915_fence_signalled(struct pipe_screen *screen,
+                     struct pipe_fence_handle *fence,
+                     unsigned flags)
+{
+   /* 'flags' is not forwarded; the winsys is asked about the fence alone */
+   struct i915_winsys *winsys = i915_screen(screen)->iws;
+   return winsys->fence_signalled(winsys, fence);
+}
+
+static int
+i915_fence_finish(struct pipe_screen *screen,
+                  struct pipe_fence_handle *fence,
+                  unsigned flags)
+{
+   /* 'flags' is not forwarded; the winsys hook's result is returned as-is */
+   struct i915_winsys *winsys = i915_screen(screen)->iws;
+   return winsys->fence_finish(winsys, fence);
+}
+
+
+/*
+ * Generic functions
+ */
+
+
+static void
+i915_destroy_screen(struct pipe_screen *screen)
+{
+   struct i915_screen *is = i915_screen(screen);
+   struct i915_winsys *winsys = is->iws;
+   /* destroy the winsys (when one is attached) before freeing the screen */
+   if (winsys != NULL)
+      winsys->destroy(winsys);
+   FREE(is);
+}
+
+/**
+ * Create a new i915_screen object; NULL if allocation fails or the PCI id is unknown
+ */
+struct pipe_screen *
+i915_screen_create(struct i915_winsys *iws)
+{
+   struct i915_screen *is = CALLOC_STRUCT(i915_screen);
+
+   if (!is)
+      return NULL;
+   /* classify the chip by PCI id; unknown ids are rejected, iws is left untouched */
+   switch (iws->pci_id) {
+   case PCI_CHIP_I915_G:
+   case PCI_CHIP_I915_GM:
+      is->is_i945 = FALSE;
+      break;
+
+   case PCI_CHIP_I945_G:
+   case PCI_CHIP_I945_GM:
+   case PCI_CHIP_I945_GME:
+   case PCI_CHIP_G33_G:
+   case PCI_CHIP_Q33_G:
+   case PCI_CHIP_Q35_G:
+      is->is_i945 = TRUE;
+      break;
+
+   default:
+      debug_printf("%s: unknown pci id 0x%x, cannot create screen\n", 
+                   __FUNCTION__, iws->pci_id);
+      FREE(is);
+      return NULL;
+   }
+
+   is->iws = iws;
+
+   is->base.winsys = NULL;
+
+   is->base.destroy = i915_destroy_screen;
+
+   is->base.get_name = i915_get_name;
+   is->base.get_vendor = i915_get_vendor;
+   is->base.get_param = i915_get_param;
+   is->base.get_paramf = i915_get_paramf;
+   is->base.is_format_supported = i915_is_format_supported;
+
+   is->base.context_create = i915_create_context;
+
+   is->base.fence_reference = i915_fence_reference;
+   is->base.fence_signalled = i915_fence_signalled;
+   is->base.fence_finish = i915_fence_finish;
+
+   i915_init_screen_resource_functions(is);
+   i915_init_screen_surface_functions(is);
+
+   return &is->base;
+}
diff --git a/src/gallium/drivers/i915/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h
new file mode 100644
index 0000000000..0c4186c68e
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_screen.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_SCREEN_H
+#define I915_SCREEN_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+
+
+struct i915_winsys;
+
+
+/**
+ * Subclass of pipe_screen
+ */
+struct i915_screen
+{
+   struct pipe_screen base; /* first member: i915_screen() casts a pipe_screen pointer to this type */
+
+   struct i915_winsys *iws;
+
+   boolean is_i945; /* TRUE for the 945/G33/Q33/Q35 ids, see i915_screen_create() */
+};
+
+/**
+ * Subclass of pipe_transfer
+ */
+struct i915_transfer
+{
+   struct pipe_transfer base; /* first member: i915_transfer() casts a pipe_transfer pointer to this type */
+
+   unsigned offset;
+};
+
+
+/*
+ * Cast wrappers
+ */
+
+
+static INLINE struct i915_screen *
+i915_screen(struct pipe_screen *pscreen)
+{
+   return (struct i915_screen *) pscreen; /* unchecked downcast */
+}
+
+static INLINE struct i915_transfer *
+i915_transfer(struct pipe_transfer *transfer)
+{
+   return (struct i915_transfer *)transfer; /* unchecked downcast */
+}
+
+
+#endif /* I915_SCREEN_H */
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
new file mode 100644
index 0000000000..e767aa9f8f
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -0,0 +1,863 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "draw/draw_context.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_state_inlines.h"
+#include "i915_fpc.h"
+#include "i915_resource.h"
+
+/* The i915 (and related graphics cores) do not support GL_CLAMP.  The
+ * Intel drivers for "other operating systems" implement GL_CLAMP as
+ * GL_CLAMP_TO_EDGE, so the same is done here.
+ */
+static unsigned
+translate_wrap_mode(unsigned wrap)
+{
+   switch (wrap) {
+   case PIPE_TEX_WRAP_CLAMP:             /* not quite correct */
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return TEXCOORDMODE_CLAMP_EDGE;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return TEXCOORDMODE_CLAMP_BORDER;
+   /*
+   case PIPE_TEX_WRAP_MIRRORED_REPEAT:
+      return TEXCOORDMODE_MIRROR;
+    */
+   case PIPE_TEX_WRAP_REPEAT:
+   default:
+      /* REPEAT, and every mode with no case of its own (including the
+       * disabled mirrored repeat above), ends up as plain wrapping */
+      return TEXCOORDMODE_WRAP;
+   }
+}
+
+static unsigned translate_img_filter( unsigned filter )
+{
+   /* Only NEAREST and LINEAR are expected; anything else asserts and,
+    * when asserts are compiled out, is treated as NEAREST. */
+   if (filter == PIPE_TEX_FILTER_NEAREST)
+      return FILTER_NEAREST;
+   if (filter == PIPE_TEX_FILTER_LINEAR)
+      return FILTER_LINEAR;
+
+   assert(0);
+   return FILTER_NEAREST;
+}
+
+static unsigned translate_mip_filter( unsigned filter )
+{
+   /* unexpected values assert, then fall back to "no mip filtering" */
+   unsigned hw_filter = MIPFILTER_NONE;
+
+   if (filter == PIPE_TEX_MIPFILTER_NEAREST)
+      hw_filter = MIPFILTER_NEAREST;
+   else if (filter == PIPE_TEX_MIPFILTER_LINEAR)
+      hw_filter = MIPFILTER_LINEAR;
+   else
+      assert(filter == PIPE_TEX_MIPFILTER_NONE);
+
+   return hw_filter;
+}
+
+
+/* Build the blend CSO from render target 0 of 'blend'; NULL on OOM.
+ * None of this state is actually used for anything yet. */
+static void *
+i915_create_blend_state(struct pipe_context *pipe,
+                        const struct pipe_blend_state *blend)
+{
+   struct i915_blend_state *cso_data = CALLOC_STRUCT( i915_blend_state );
+
+   /* allocation can fail; everything below writes through cso_data */
+   if (!cso_data)
+      return NULL;
+
+   {
+      unsigned eqRGB  = blend->rt[0].rgb_func;
+      unsigned srcRGB = blend->rt[0].rgb_src_factor;
+      unsigned dstRGB = blend->rt[0].rgb_dst_factor;
+      unsigned eqA    = blend->rt[0].alpha_func;
+      unsigned srcA   = blend->rt[0].alpha_src_factor;
+      unsigned dstA   = blend->rt[0].alpha_dst_factor;
+
+      /* Special handling for MIN/MAX filter modes handled at
+       * state_tracker level.
+       */
+      if (srcA != srcRGB ||
+          dstA != dstRGB ||
+          eqA != eqRGB) {
+         cso_data->iab = (_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
+                          IAB_MODIFY_ENABLE |
+                          IAB_ENABLE |
+                          IAB_MODIFY_FUNC |
+                          IAB_MODIFY_SRC_FACTOR |
+                          IAB_MODIFY_DST_FACTOR |
+                          SRC_ABLND_FACT(i915_translate_blend_factor(srcA)) |
+                          DST_ABLND_FACT(i915_translate_blend_factor(dstA)) |
+                          (i915_translate_blend_func(eqA) << IAB_FUNC_SHIFT));
+      }
+      else {
+         cso_data->iab = (_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
+                          IAB_MODIFY_ENABLE |
+                          0);
+      }
+   }
+
+   cso_data->modes4 |= (_3DSTATE_MODES_4_CMD |
+                        ENABLE_LOGIC_OP_FUNC |
+                        LOGIC_OP_FUNC(i915_translate_logic_op(blend->logicop_func)));
+
+   if (blend->logicop_enable)
+      cso_data->LIS5 |= S5_LOGICOP_ENABLE;
+
+   if (blend->dither)
+      cso_data->LIS5 |= S5_COLOR_DITHER_ENABLE;
+
+   if ((blend->rt[0].colormask & PIPE_MASK_R) == 0)
+      cso_data->LIS5 |= S5_WRITEDISABLE_RED;
+
+   if ((blend->rt[0].colormask & PIPE_MASK_G) == 0)
+      cso_data->LIS5 |= S5_WRITEDISABLE_GREEN;
+
+   if ((blend->rt[0].colormask & PIPE_MASK_B) == 0)
+      cso_data->LIS5 |= S5_WRITEDISABLE_BLUE;
+
+   if ((blend->rt[0].colormask & PIPE_MASK_A) == 0)
+      cso_data->LIS5 |= S5_WRITEDISABLE_ALPHA;
+
+   if (blend->rt[0].blend_enable) {
+      unsigned funcRGB = blend->rt[0].rgb_func;
+      unsigned srcRGB  = blend->rt[0].rgb_src_factor;
+      unsigned dstRGB  = blend->rt[0].rgb_dst_factor;
+      cso_data->LIS6 |= (S6_CBUF_BLEND_ENABLE |
+                         SRC_BLND_FACT(i915_translate_blend_factor(srcRGB)) |
+                         DST_BLND_FACT(i915_translate_blend_factor(dstRGB)) |
+                         (i915_translate_blend_func(funcRGB) << S6_CBUF_BLEND_FUNC_SHIFT));
+   }
+
+   return cso_data;
+}
+
+static void i915_bind_blend_state(struct pipe_context *pipe,
+                                  void *blend)
+{
+   struct i915_context *ctx = i915_context(pipe);
+
+   /* draw_flush() first, then swap in the new CSO and mark blend dirty */
+   draw_flush(ctx->draw);
+   ctx->blend = (struct i915_blend_state *) blend;
+   ctx->dirty |= I915_NEW_BLEND;
+}
+
+
+static void i915_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend); /* 'pipe' is not used; only the CSO itself is released */
+}
+
+static void i915_set_blend_color( struct pipe_context *pipe,
+                                  const struct pipe_blend_color *blend_color )
+{
+   struct i915_context *ctx = i915_context(pipe);
+
+   /* draw_flush() first, then copy the color by value and mark blend dirty */
+   draw_flush(ctx->draw);
+   ctx->blend_color = *blend_color;
+   ctx->dirty |= I915_NEW_BLEND;
+}
+
+static void i915_set_stencil_ref( struct pipe_context *pipe,
+                                  const struct pipe_stencil_ref *stencil_ref )
+{
+   struct i915_context *ctx = i915_context(pipe);
+
+   /* draw_flush() first, then copy the reference values and mark depth/stencil dirty */
+   draw_flush(ctx->draw);
+   ctx->stencil_ref = *stencil_ref;
+   ctx->dirty |= I915_NEW_DEPTH_STENCIL;
+}
+
+static void *
+i915_create_sampler_state(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   struct i915_sampler_state *cso = CALLOC_STRUCT( i915_sampler_state );
+   const unsigned ws = sampler->wrap_s;
+   const unsigned wt = sampler->wrap_t;
+   const unsigned wr = sampler->wrap_r;
+   unsigned minFilt, magFilt;
+   unsigned mipFilt;
+
+   /* Translates 'sampler' into the CSO's state words; NULL on OOM. */
+   if (!cso)
+      return NULL;
+
+   /* NOTE(review): only the pointer is kept, not a copy -- confirm the
+    * caller keeps 'sampler' alive for as long as the CSO is used. */
+   cso->templ = sampler;
+
+   mipFilt = translate_mip_filter(sampler->min_mip_filter);
+   minFilt = translate_img_filter( sampler->min_img_filter );
+   magFilt = translate_img_filter( sampler->mag_img_filter );
+
+   if (sampler->max_anisotropy > 1)
+      minFilt = magFilt = FILTER_ANISOTROPIC;
+   if (sampler->max_anisotropy > 2)
+      cso->state[0] |= SS2_MAX_ANISO_4;
+
+   {
+      int b = (int) (sampler->lod_bias * 16.0);
+      b = CLAMP(b, -256, 255);
+      cso->state[0] |= ((b << SS2_LOD_BIAS_SHIFT) & SS2_LOD_BIAS_MASK);
+   }
+
+   /* Shadow: */
+   if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+   {
+      cso->state[0] |= (SS2_SHADOW_ENABLE |
+                        i915_translate_compare_func(sampler->compare_func));
+      minFilt = FILTER_4X4_FLAT;
+      magFilt = FILTER_4X4_FLAT;
+   }
+
+   cso->state[0] |= ((minFilt << SS2_MIN_FILTER_SHIFT) |
+                     (mipFilt << SS2_MIP_FILTER_SHIFT) |
+                     (magFilt << SS2_MAG_FILTER_SHIFT));
+
+   cso->state[1] |=
+      ((translate_wrap_mode(ws) << SS3_TCX_ADDR_MODE_SHIFT) |
+       (translate_wrap_mode(wt) << SS3_TCY_ADDR_MODE_SHIFT) |
+       (translate_wrap_mode(wr) << SS3_TCZ_ADDR_MODE_SHIFT));
+
+   if (sampler->normalized_coords)
+      cso->state[1] |= SS3_NORMALIZED_COORDS;
+
+   {
+      int minlod = (int) (16.0 * sampler->min_lod);
+      int maxlod = (int) (16.0 * sampler->max_lod);
+      minlod = CLAMP(minlod, 0, 16 * 11);
+      maxlod = CLAMP(maxlod, 0, 16 * 11);
+      if (minlod > maxlod)
+         maxlod = minlod;
+      cso->minlod = minlod;
+      cso->maxlod = maxlod;
+   }
+
+   {
+      ubyte r = float_to_ubyte(sampler->border_color[0]);
+      ubyte g = float_to_ubyte(sampler->border_color[1]);
+      ubyte b = float_to_ubyte(sampler->border_color[2]);
+      ubyte a = float_to_ubyte(sampler->border_color[3]);
+      cso->state[2] = I915PACKCOLOR8888(r, g, b, a);
+   }
+   return cso;
+}
+
+static void i915_bind_sampler_states(struct pipe_context *pipe,
+                                     unsigned num, void **sampler)
+{
+   struct i915_context *ctx = i915_context(pipe);
+   unsigned slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Nothing to do when the very same samplers are already bound. */
+   if (num == ctx->num_samplers &&
+       memcmp(ctx->sampler, sampler, num * sizeof(void *)) == 0)
+      return;
+
+   draw_flush(ctx->draw);
+
+   /* Take over the first 'num' bindings and clear every slot after
+    * them, so stale samplers from an earlier bind do not linger. */
+   for (slot = 0; slot < PIPE_MAX_SAMPLERS; ++slot)
+      ctx->sampler[slot] = (slot < num) ? sampler[slot] : NULL;
+
+   ctx->num_samplers = num;
+
+   ctx->dirty |= I915_NEW_SAMPLER;
+}
+
+static void i915_delete_sampler_state(struct pipe_context *pipe,
+                                      void *sampler)
+{
+   FREE(sampler); /* 'pipe' is not used; only the CSO itself is released */
+}
+
+
+/** XXX move someday?  Or consolidate all these simple state setters
+ * into one file.
+ */
+
+/**
+ * Create a depth/stencil/alpha CSO by translating the gallium template into
+ * i915 state words.  Returns NULL if the allocation fails.
+ */
+static void *
+i915_create_depth_stencil_state(struct pipe_context *pipe,
+                                const struct pipe_depth_stencil_alpha_state *depth_stencil)
+{
+   struct i915_depth_stencil_state *cso = CALLOC_STRUCT( i915_depth_stencil_state );
+
+   /* The allocation can fail; everything below writes through cso. */
+   if (!cso)
+      return NULL;
+
+   {
+      int testmask = depth_stencil->stencil[0].valuemask & 0xff;
+      int writemask = depth_stencil->stencil[0].writemask & 0xff;
+
+      cso->stencil_modes4 |= (_3DSTATE_MODES_4_CMD |
+                              ENABLE_STENCIL_TEST_MASK |
+                              STENCIL_TEST_MASK(testmask) |
+                              ENABLE_STENCIL_WRITE_MASK |
+                              STENCIL_WRITE_MASK(writemask));
+   }
+
+   if (depth_stencil->stencil[0].enabled) {
+      int test = i915_translate_compare_func(depth_stencil->stencil[0].func);
+      int fop  = i915_translate_stencil_op(depth_stencil->stencil[0].fail_op);
+      int dfop = i915_translate_stencil_op(depth_stencil->stencil[0].zfail_op);
+      int dpop = i915_translate_stencil_op(depth_stencil->stencil[0].zpass_op);
+      cso->stencil_LIS5 |= (S5_STENCIL_TEST_ENABLE |
+                            S5_STENCIL_WRITE_ENABLE |
+                            (test << S5_STENCIL_TEST_FUNC_SHIFT) |
+                            (fop  << S5_STENCIL_FAIL_SHIFT) |
+                            (dfop << S5_STENCIL_PASS_Z_FAIL_SHIFT) |
+                            (dpop << S5_STENCIL_PASS_Z_PASS_SHIFT));
+   }
+
+   if (depth_stencil->stencil[1].enabled) {
+      int test  = i915_translate_compare_func(depth_stencil->stencil[1].func);
+      int fop   = i915_translate_stencil_op(depth_stencil->stencil[1].fail_op);
+      int dfop  = i915_translate_stencil_op(depth_stencil->stencil[1].zfail_op);
+      int dpop  = i915_translate_stencil_op(depth_stencil->stencil[1].zpass_op);
+      int tmask = depth_stencil->stencil[1].valuemask & 0xff;
+      int wmask = depth_stencil->stencil[1].writemask & 0xff;
+      cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS |
+                     BFO_ENABLE_STENCIL_FUNCS |
+                     BFO_ENABLE_STENCIL_TWO_SIDE |
+                     BFO_ENABLE_STENCIL_REF |
+                     BFO_STENCIL_TWO_SIDE |
+                     (test << BFO_STENCIL_TEST_SHIFT) |
+                     (fop  << BFO_STENCIL_FAIL_SHIFT) |
+                     (dfop << BFO_STENCIL_PASS_Z_FAIL_SHIFT) |
+                     (dpop << BFO_STENCIL_PASS_Z_PASS_SHIFT));
+      cso->bfo[1] = (_3DSTATE_BACKFACE_STENCIL_MASKS |
+                     BFM_ENABLE_STENCIL_TEST_MASK |
+                     BFM_ENABLE_STENCIL_WRITE_MASK |
+                     (tmask << BFM_STENCIL_TEST_MASK_SHIFT) |
+                     (wmask << BFM_STENCIL_WRITE_MASK_SHIFT));
+   }
+   else {
+      /* This actually disables two-side stencil: the bit set is a
+       * modify-enable bit, the symbolic zero shows the flag is set to off. */
+      cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS |
+                     BFO_ENABLE_STENCIL_TWO_SIDE |
+                     0);
+      cso->bfo[1] = 0;
+   }
+
+   if (depth_stencil->depth.enabled) {
+      int func = i915_translate_compare_func(depth_stencil->depth.func);
+      cso->depth_LIS6 |= (S6_DEPTH_TEST_ENABLE |
+                          (func << S6_DEPTH_TEST_FUNC_SHIFT));
+      if (depth_stencil->depth.writemask)
+         cso->depth_LIS6 |= S6_DEPTH_WRITE_ENABLE;
+   }
+
+   if (depth_stencil->alpha.enabled) {
+      int test = i915_translate_compare_func(depth_stencil->alpha.func);
+      ubyte refByte = float_to_ubyte(depth_stencil->alpha.ref_value);
+
+      cso->depth_LIS6 |= (S6_ALPHA_TEST_ENABLE |
+                          (test << S6_ALPHA_TEST_FUNC_SHIFT) |
+                          (((unsigned) refByte) << S6_ALPHA_REF_SHIFT));
+   }
+
+   return cso;
+}
+
+/* Make depth_stencil the current depth/stencil/alpha CSO and flag it dirty. */
+static void
+i915_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   const struct i915_depth_stencil_state *cso = depth_stencil;
+   struct i915_context *ctx = i915_context(pipe);
+   draw_flush(ctx->draw);   /* flush first, then switch state */
+   ctx->depth_stencil = cso;
+   ctx->dirty |= I915_NEW_DEPTH_STENCIL;
+}
+
+/* Release a depth/stencil/alpha CSO with FREE(); pipe is not used. */
+static void i915_delete_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   FREE(depth_stencil);
+}
+
+
+/* Copy the scissor rectangle into the context and flag I915_NEW_SCISSOR. */
+static void i915_set_scissor_state(struct pipe_context *pipe,
+                                   const struct pipe_scissor_state *scissor)
+{
+   struct i915_context *ctx = i915_context(pipe);
+   draw_flush(ctx->draw);
+   memcpy(&ctx->scissor, scissor, sizeof(*scissor));
+   ctx->dirty |= I915_NEW_SCISSOR;
+}
+
+
+static void i915_set_polygon_stipple( struct pipe_context *pipe,
+                                   const struct pipe_poly_stipple *stipple )
+{  /* No-op: the stipple pattern is neither stored nor flagged dirty here. */
+}  /* NOTE(review): upload_STIPPLE() reads i915->poly_stipple -- confirm where it is filled in. */
+
+
+
+/* Create a fragment shader CSO: duplicate the tokens, scan them and translate
+ * them to i915 instructions.  Returns NULL if an allocation fails. */
+static void *
+i915_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   struct i915_fragment_shader *ifs = CALLOC_STRUCT(i915_fragment_shader);
+   if (!ifs)
+      return NULL;
+   ifs->state.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!ifs->state.tokens) {   /* duplication failed: do not translate a shader without tokens */
+      FREE(ifs);
+      return NULL;
+   }
+   tgsi_scan_shader(templ->tokens, &ifs->info);
+   i915_translate_fragment_program(i915, ifs);   /* compiled to i915 instructions here */
+   return ifs;
+}
+
+/* Select shader as the current fragment shader and flag I915_NEW_FS. */
+static void
+i915_bind_fs_state(struct pipe_context *pipe, void *shader)
+{
+   struct i915_fragment_shader *ifs = shader;
+   struct i915_context *ctx = i915_context(pipe);
+   draw_flush(ctx->draw);
+   ctx->fs = ifs;
+   ctx->dirty |= I915_NEW_FS;
+}
+
+/* Destroy a fragment shader CSO: the translated program, the token array
+ * and finally the shader object itself are freed. */
+static void
+i915_delete_fs_state(struct pipe_context *pipe, void *shader)
+{
+   struct i915_fragment_shader *fs = shader;
+
+   if (fs->program != NULL)
+      FREE(fs->program);
+   fs->program_len = 0;
+   FREE((struct tgsi_token *) fs->state.tokens);
+   FREE(fs);
+}
+
+
+/* Create a vertex shader by handing the template straight to the draw module. */
+static void *
+i915_create_vs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct i915_context *ctx = i915_context(pipe);
+   void *vs = draw_create_vertex_shader(ctx->draw, templ);
+   return vs;
+}
+
+/* Bind a vertex shader in the draw module and flag I915_NEW_VS. */
+static void
+i915_bind_vs_state(struct pipe_context *pipe, void *shader)
+{
+   struct draw_vertex_shader *vs = shader;
+   struct i915_context *ctx = i915_context(pipe);
+   draw_bind_vertex_shader(ctx->draw, vs);
+   ctx->dirty |= I915_NEW_VS;
+}
+
+/* Delete a vertex shader by forwarding the call to the draw module. */
+static void
+i915_delete_vs_state(struct pipe_context *pipe, void *shader)
+{
+   struct draw_vertex_shader *vs = shader;
+   draw_delete_vertex_shader(i915_context(pipe)->draw, vs);
+}
+
+/**
+ * Set the constant buffer of a shader stage (only buffer index 0 is supported).
+ *
+ * The constants are copied because fragment program translation may append
+ * extra constants to the array.  User constants (ex: a material color) may
+ * change often while the program stays the same; then only the first N
+ * constants should be updated, leaving the translation extras alone.
+ * A NULL buf resets the user constant count to zero.
+ */
+static void i915_set_constant_buffer(struct pipe_context *pipe,
+                                     uint shader, uint index,
+                                     struct pipe_resource *buf)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   draw_flush(i915->draw);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   if (buf) {
+      struct i915_buffer *ir = i915_buffer(buf);
+      memcpy(i915->current.constants[shader], ir->data, ir->b.b.width0);
+      /* Count in units of four floats: the divisor must be parenthesized,
+       * "width0 / 4 * sizeof(float)" gives back roughly the byte size. */
+      i915->current.num_user_constants[shader] =
+         ir->b.b.width0 / (4 * sizeof(float));
+   }
+   else {
+      i915->current.num_user_constants[shader] = 0;
+   }
+   i915->dirty |= I915_NEW_CONSTANTS;
+}
+
+
+/**
+ * Bind the fragment sampler views.  Slots below num are pointed at the new
+ * views, previously used slots at or above num are reset to NULL.
+ */
+static void i915_set_fragment_sampler_views(struct pipe_context *pipe,
+                                            unsigned num,
+                                            struct pipe_sampler_view **views)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Nothing to do if the very same views are already bound. */
+   if (num == i915->num_fragment_sampler_views &&
+       memcmp(i915->fragment_sampler_views, views, num * sizeof(views[0])) == 0)
+      return;
+
+   /* Fixes wrong texture in texobj with VBUF */
+   draw_flush(i915->draw);
+
+   for (slot = 0; slot < num; slot++)
+      pipe_sampler_view_reference(&i915->fragment_sampler_views[slot], views[slot]);
+   for (; slot < i915->num_fragment_sampler_views; slot++)
+      pipe_sampler_view_reference(&i915->fragment_sampler_views[slot], NULL);
+
+   i915->num_fragment_sampler_views = num;
+   i915->dirty |= I915_NEW_SAMPLER_VIEW;
+}
+
+
+/* Allocate a sampler view copied from templ; NULL if the allocation fails. */
+static struct pipe_sampler_view *
+i915_create_sampler_view(struct pipe_context *pipe,
+                         struct pipe_resource *texture,
+                         const struct pipe_sampler_view *templ)
+{
+   struct pipe_sampler_view *sv = CALLOC_STRUCT(pipe_sampler_view);
+   if (sv == NULL)
+      return NULL;
+
+   *sv = *templ;
+   sv->reference.count = 1;
+   sv->context = pipe;
+   sv->texture = NULL;   /* clear the pointer copied from templ first */
+   pipe_resource_reference(&sv->texture, texture);
+   return sv;
+}
+
+
+/* Reset the view's texture pointer through pipe_resource_reference(), then free the view. */
+static void
+i915_sampler_view_destroy(struct pipe_context *pipe, struct pipe_sampler_view *view)
+{
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+
+/* Copy the framebuffer size and surface bindings and flag I915_NEW_FRAMEBUFFER. */
+static void
+i915_set_framebuffer_state(struct pipe_context *pipe,
+                           const struct pipe_framebuffer_state *fb)
+{
+   struct i915_context *ctx = i915_context(pipe);
+   int buf;
+
+   draw_flush(ctx->draw);
+
+   ctx->framebuffer.width = fb->width;
+   ctx->framebuffer.height = fb->height;
+   ctx->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++)
+      pipe_surface_reference(&ctx->framebuffer.cbufs[buf], fb->cbufs[buf]);
+   pipe_surface_reference(&ctx->framebuffer.zsbuf, fb->zsbuf);
+   ctx->dirty |= I915_NEW_FRAMEBUFFER;
+}
+
+
+
+/* Forward the clip state to the draw module and flag I915_NEW_CLIP. */
+static void i915_set_clip_state(struct pipe_context *pipe,
+                                const struct pipe_clip_state *clip)
+{
+   struct i915_context *ctx = i915_context(pipe);
+
+   draw_flush(ctx->draw);
+   draw_set_clip_state(ctx->draw, clip);
+   ctx->dirty |= I915_NEW_CLIP;
+}
+
+
+
+/**
+ * Called when the state tracker notices a change to the viewport matrix.
+ * Keeps a copy in the context and hands that copy to the draw module.
+ */
+static void
+i915_set_viewport_state(struct pipe_context *pipe,
+                        const struct pipe_viewport_state *viewport)
+{
+   struct i915_context *ctx = i915_context(pipe);
+
+   ctx->viewport = *viewport;
+   draw_set_viewport_state(ctx->draw, &ctx->viewport);
+
+   ctx->dirty |= I915_NEW_VIEWPORT;
+}
+
+
+/**
+ * Create a rasterizer CSO by translating the gallium template into i915
+ * state words.  Returns NULL if the allocation fails.
+ */
+static void *
+i915_create_rasterizer_state(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *rasterizer)
+{
+   struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state );
+   int line_width, point_size;
+
+   /* The allocation can fail; everything below writes through cso. */
+   if (!cso)
+      return NULL;
+
+   cso->templ = rasterizer;   /* NOTE(review): keeps the caller's pointer, not a copy */
+   cso->color_interp = rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+   cso->light_twoside = rasterizer->light_twoside;
+   cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE;
+   cso->ds[1].f = rasterizer->offset_scale;
+   if (rasterizer->poly_stipple_enable) {
+      cso->st |= ST1_ENABLE;
+   }
+
+   if (rasterizer->scissor)
+      cso->sc[0] = _3DSTATE_SCISSOR_ENABLE_CMD | ENABLE_SCISSOR_RECT;
+   else
+      cso->sc[0] = _3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT;
+
+   switch (rasterizer->cull_face) {
+   case PIPE_FACE_NONE:
+      cso->LIS4 |= S4_CULLMODE_NONE;
+      break;
+   case PIPE_FACE_FRONT:
+      if (rasterizer->front_ccw)
+         cso->LIS4 |= S4_CULLMODE_CCW;
+      else
+         cso->LIS4 |= S4_CULLMODE_CW;
+      break;
+   case PIPE_FACE_BACK:
+      if (rasterizer->front_ccw)
+         cso->LIS4 |= S4_CULLMODE_CW;
+      else
+         cso->LIS4 |= S4_CULLMODE_CCW;
+      break;
+   case PIPE_FACE_FRONT_AND_BACK:
+      cso->LIS4 |= S4_CULLMODE_BOTH;
+      break;
+   }
+
+   line_width = CLAMP((int)(rasterizer->line_width * 2), 1, 0xf);
+   cso->LIS4 |= line_width << S4_LINE_WIDTH_SHIFT;
+   if (rasterizer->line_smooth)
+      cso->LIS4 |= S4_LINE_ANTIALIAS_ENABLE;
+
+   point_size = CLAMP((int) rasterizer->point_size, 1, 0xff);
+   cso->LIS4 |= point_size << S4_POINT_WIDTH_SHIFT;
+
+   if (rasterizer->flatshade) {
+      cso->LIS4 |= (S4_FLATSHADE_ALPHA |
+                    S4_FLATSHADE_COLOR |
+                    S4_FLATSHADE_SPECULAR);
+   }
+   cso->LIS7 = fui( rasterizer->offset_units );
+
+   return cso;
+}
+
+/* Bind a rasterizer CSO (may be NULL), pass its gallium template on to the
+ * draw module and flag I915_NEW_RASTERIZER. */
+static void
+i915_bind_rasterizer_state(struct pipe_context *pipe, void *raster)
+{
+   struct i915_context *ctx = i915_context(pipe);
+   const struct pipe_rasterizer_state *templ = NULL;
+
+   ctx->rasterizer = (struct i915_rasterizer_state *) raster;
+   if (ctx->rasterizer)
+      templ = ctx->rasterizer->templ;
+   draw_set_rasterizer_state(ctx->draw, templ, raster);
+   ctx->dirty |= I915_NEW_RASTERIZER;
+}
+
+/* Release a rasterizer CSO with FREE(); pipe is not used. */
+static void i915_delete_rasterizer_state(struct pipe_context *pipe, void *raster)
+{
+   FREE(raster);
+}
+
+/* Remember the vertex buffers in the context and forward them to the draw
+ * module. */
+static void
+i915_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+                        const struct pipe_vertex_buffer *buffers)
+{
+   struct i915_context *ctx = i915_context(pipe);
+
+   /* State is changed ahead of the draw_set_vertex_buffers call below,
+    * so flush first, just to be sure. */
+   draw_flush(ctx->draw);
+
+   memcpy(ctx->vertex_buffer, buffers, count * sizeof(*buffers));
+   ctx->num_vertex_buffers = count;
+   draw_set_vertex_buffers(ctx->draw, count, buffers);
+}
+
+/* Allocate a CSO holding a copy of count vertex elements; NULL on failure. */
+static void *
+i915_create_vertex_elements_state(struct pipe_context *pipe, unsigned count,
+                                  const struct pipe_vertex_element *attribs)
+{
+   struct i915_velems_state *cso;
+   assert(count <= PIPE_MAX_ATTRIBS);
+   cso = (struct i915_velems_state *) MALLOC(sizeof(*cso));
+   if (cso == NULL)
+      return NULL;
+   cso->count = count;
+   memcpy(cso->velem, attribs, count * sizeof(attribs[0]));
+   return cso;
+}
+
+/* Bind a vertex-elements CSO by forwarding its elements to the draw module;
+ * a NULL CSO only triggers the flush. */
+static void
+i915_bind_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   struct i915_context *ctx = i915_context(pipe);
+   struct i915_velems_state *cso = velems;
+
+   /* State is changed ahead of the draw module call below, so flush
+    * first, just to be sure. */
+   draw_flush(ctx->draw);
+
+   if (cso == NULL)
+      return;
+
+   /* pass-through to draw module */
+   draw_set_vertex_elements(ctx->draw, cso->count, cso->velem);
+}
+
+/* Release a vertex-elements CSO with FREE(); pipe is not used. */
+static void i915_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   FREE(velems);
+}
+
+/* Sample mask hook: the value is accepted and ignored. */
+static void
+i915_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+}
+
+/* Install the state create/bind/delete and set_* hooks in the pipe context. */
+void
+i915_init_state_functions( struct i915_context *i915 )
+{
+   struct pipe_context *pipe = &i915->base;
+
+   pipe->create_blend_state = i915_create_blend_state;
+   pipe->bind_blend_state = i915_bind_blend_state;
+   pipe->delete_blend_state = i915_delete_blend_state;
+   pipe->create_sampler_state = i915_create_sampler_state;
+   pipe->bind_fragment_sampler_states = i915_bind_sampler_states;
+   pipe->delete_sampler_state = i915_delete_sampler_state;
+   pipe->create_depth_stencil_alpha_state = i915_create_depth_stencil_state;
+   pipe->bind_depth_stencil_alpha_state = i915_bind_depth_stencil_state;
+   pipe->delete_depth_stencil_alpha_state = i915_delete_depth_stencil_state;
+   pipe->create_rasterizer_state = i915_create_rasterizer_state;
+   pipe->bind_rasterizer_state = i915_bind_rasterizer_state;
+   pipe->delete_rasterizer_state = i915_delete_rasterizer_state;
+   pipe->create_fs_state = i915_create_fs_state;
+   pipe->bind_fs_state = i915_bind_fs_state;
+   pipe->delete_fs_state = i915_delete_fs_state;
+   pipe->create_vs_state = i915_create_vs_state;
+   pipe->bind_vs_state = i915_bind_vs_state;
+   pipe->delete_vs_state = i915_delete_vs_state;
+   pipe->create_vertex_elements_state = i915_create_vertex_elements_state;
+   pipe->bind_vertex_elements_state = i915_bind_vertex_elements_state;
+   pipe->delete_vertex_elements_state = i915_delete_vertex_elements_state;
+
+   pipe->create_sampler_view = i915_create_sampler_view;
+   pipe->sampler_view_destroy = i915_sampler_view_destroy;
+   pipe->set_fragment_sampler_views = i915_set_fragment_sampler_views;
+
+   pipe->set_blend_color = i915_set_blend_color;
+   pipe->set_stencil_ref = i915_set_stencil_ref;
+   pipe->set_clip_state = i915_set_clip_state;
+   pipe->set_sample_mask = i915_set_sample_mask;
+   pipe->set_constant_buffer = i915_set_constant_buffer;
+   pipe->set_framebuffer_state = i915_set_framebuffer_state;
+   pipe->set_polygon_stipple = i915_set_polygon_stipple;
+   pipe->set_scissor_state = i915_set_scissor_state;
+   pipe->set_viewport_state = i915_set_viewport_state;
+   pipe->set_vertex_buffers = i915_set_vertex_buffers;
+}
diff --git a/src/gallium/drivers/i915/i915_state.h b/src/gallium/drivers/i915/i915_state.h
new file mode 100644
index 0000000000..86c6b0027d
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state.h
@@ -0,0 +1,50 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef I915_STATE_H
+#define I915_STATE_H
+
+struct i915_context;
+
+
+struct i915_tracked_state {
+   unsigned dirty;   /* dirty bits that make this atom's update hook run */
+   void (*update)( struct i915_context * );   /* called with the context when they are set */
+};
+
+void i915_update_immediate( struct i915_context *i915 );
+void i915_update_dynamic( struct i915_context *i915 );
+void i915_update_derived( struct i915_context *i915 );
+void i915_update_samplers( struct i915_context *i915 );
+void i915_update_textures(struct i915_context *i915);
+
+void i915_emit_hardware_state( struct i915_context *i915 );
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
new file mode 100644
index 0000000000..4da46772b5
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -0,0 +1,182 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "i915_context.h"
+#include "i915_state.h"
+#include "i915_reg.h"
+
+
+
+/**
+ * Determine the hardware vertex layout.
+ * Depends on vertex/fragment shader state and on the rasterizer's color
+ * interpolation mode.
+ *
+ * The result is stored in i915->current.vertex_info, and
+ * I915_NEW_VERTEX_FORMAT is flagged, only when it differs from the old one.
+ */
+static void calculate_vertex_layout( struct i915_context *i915 )
+{
+   const struct i915_fragment_shader *fs = i915->fs;
+   const enum interp_mode color_interp = i915->rasterizer->color_interp;
+   boolean want_texcoord[8], want_color[2];
+   boolean want_fog = FALSE, want_w = FALSE;
+   struct vertex_info layout;
+   uint n;
+   int slot;
+
+   memset(want_texcoord, 0, sizeof(want_texcoord));
+   want_color[0] = want_color[1] = FALSE;
+   memset(&layout, 0, sizeof(layout));
+
+   /* First pass: record which fragment program inputs are needed.  The HW
+    * vertex layout is then built below, in the HW-specific attribute order.
+    */
+   for (n = 0; n < fs->info.num_inputs; n++) {
+      const uint index = fs->info.input_semantic_index[n];
+
+      switch (fs->info.input_semantic_name[n]) {
+      case TGSI_SEMANTIC_POSITION:
+         break;
+      case TGSI_SEMANTIC_COLOR:
+         assert(index < 2);
+         want_color[index] = TRUE;
+         break;
+      case TGSI_SEMANTIC_GENERIC:
+         /* usually a texcoord */
+         assert(index < 8);
+         want_texcoord[index] = TRUE;
+         want_w = TRUE;
+         break;
+      case TGSI_SEMANTIC_FOG:
+         want_fog = TRUE;
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   /* pos: four components when W is needed, three otherwise */
+   slot = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
+   if (want_w) {
+      draw_emit_vertex_attr(&layout, EMIT_4F, INTERP_LINEAR, slot);
+      layout.hwfmt[0] |= S4_VFMT_XYZW;
+      layout.attrib[0].emit = EMIT_4F;
+   } else {
+      draw_emit_vertex_attr(&layout, EMIT_3F, INTERP_LINEAR, slot);
+      layout.hwfmt[0] |= S4_VFMT_XYZ;
+      layout.attrib[0].emit = EMIT_3F;
+   }
+
+   /* hardware point size */
+   /* XXX todo */
+
+   /* primary color */
+   if (want_color[0]) {
+      slot = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
+      draw_emit_vertex_attr(&layout, EMIT_4UB_BGRA, color_interp, slot);
+      layout.hwfmt[0] |= S4_VFMT_COLOR;
+   }
+
+   /* secondary color */
+   if (want_color[1]) {
+      slot = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
+      draw_emit_vertex_attr(&layout, EMIT_4UB_BGRA, color_interp, slot);
+      layout.hwfmt[0] |= S4_VFMT_SPEC_FOG;
+   }
+
+   /* fog coord, not fog blend factor */
+   if (want_fog) {
+      slot = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
+      draw_emit_vertex_attr(&layout, EMIT_1F, INTERP_PERSPECTIVE, slot);
+      layout.hwfmt[0] |= S4_VFMT_FOG_PARAM;
+   }
+
+   /* texcoords: every unit gets a 4-bit format field in hwfmt[1] */
+   for (n = 0; n < 8; n++) {
+      uint hwtc = TEXCOORDFMT_NOT_PRESENT;
+
+      if (want_texcoord[n]) {
+         hwtc = TEXCOORDFMT_4D;
+         slot = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, n);
+         draw_emit_vertex_attr(&layout, EMIT_4F, INTERP_PERSPECTIVE, slot);
+      }
+      layout.hwfmt[1] |= hwtc << (n * 4);
+   }
+
+   draw_compute_vertex_size(&layout);
+
+   if (memcmp(&i915->current.vertex_info, &layout, sizeof(layout)) != 0) {
+      /* Need to set this flag so that the LIS2/4 registers get set.
+       * It also means the i915_update_immediate() function must be called
+       * after this one, in i915_update_derived().
+       */
+      i915->dirty |= I915_NEW_VERTEX_FORMAT;
+
+      memcpy(&i915->current.vertex_info, &layout, sizeof(layout));
+   }
+}
+
+
+
+
+/* Recompute derived state for the bits set in i915->dirty, then clear them.
+ * Hopefully this will remain quite simple, otherwise need to pull in
+ * something like the state tracker mechanism. */
+void i915_update_derived( struct i915_context *i915 )
+{
+   if (i915->dirty & (I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS))
+      calculate_vertex_layout( i915 );   /* may add I915_NEW_VERTEX_FORMAT to dirty */
+
+   if (i915->dirty & (I915_NEW_SAMPLER | I915_NEW_SAMPLER_VIEW))
+      i915_update_samplers(i915);
+
+   if (i915->dirty & I915_NEW_SAMPLER_VIEW)
+      i915_update_textures(i915);
+
+   if (i915->dirty)
+      i915_update_immediate( i915 );
+
+   if (i915->dirty)
+      i915_update_dynamic( i915 );
+
+   if (i915->dirty & I915_NEW_FS) {
+      i915->hardware_dirty |= I915_HW_PROGRAM; /* XXX right? */
+   }
+
+   /* HW emit currently references framebuffer state directly:
+    */
+   if (i915->dirty & I915_NEW_FRAMEBUFFER)
+      i915->hardware_dirty |= I915_HW_STATIC;
+
+   i915->dirty = 0;
+}
diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c
new file mode 100644
index 0000000000..9c6723b391
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state_dynamic.c
@@ -0,0 +1,317 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "i915_batch.h"
+#include "i915_state_inlines.h"
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_state.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+
+#define FILE_DEBUG_FLAG DEBUG_STATE
+
+/* State that we have chosen to store in the DYNAMIC segment of the
+ * i915 indirect state mechanism.  
+ *
+ * Can't cache these in the way we do the static state, as there is no
+ * start/size in the command packet, instead an 'end' value that gets
+ * incremented.
+ *
+ * Additionally, there seems to be a requirement to re-issue the full
+ * (active) state every time a 4kb boundary is crossed.
+ */
+
+/* Copy dwords words from src into i915->current.dynamic[], starting at
+ * index offset, and set I915_HW_DYNAMIC in hardware_dirty. */
+static INLINE void set_dynamic_indirect( struct i915_context *i915, unsigned offset,
+                                         const unsigned *src, unsigned dwords )
+{
+   unsigned n = 0;
+   while (n < dwords) {
+      i915->current.dynamic[offset + n] = src[n];
+      n++;
+   }
+   i915->hardware_dirty |= I915_HW_DYNAMIC;
+}
+
+
+/***********************************************************************
+ * Modes4: stencil masks and logicop 
+ */
+/* Combine the MODES4 bits of the bound depth/stencil and blend state
+ * objects into one word and store it in the dynamic state.
+ */
+static void upload_MODES4( struct i915_context *i915 )
+{
+   const unsigned stencil_bits = i915->depth_stencil->stencil_modes4; /* I915_NEW_STENCIL */
+   const unsigned blend_bits = i915->blend->modes4;                   /* I915_NEW_BLEND */
+   unsigned word = stencil_bits | blend_bits;
+
+   /* Always, so that we know when state is in-active: */
+   set_dynamic_indirect( i915,
+                         I915_DYNAMIC_MODES4,
+                         &word,
+                         1 );
+}
+
+/* Atom: run upload_MODES4() when blend or depth/stencil state is dirty. */
+const struct i915_tracked_state i915_upload_MODES4 = {
+   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL,
+   upload_MODES4
+};
+
+
+
+
+/***********************************************************************
+ */
+
+/* Upload the two back-face stencil words of the depth/stencil state, OR-ing
+ * stencil_ref.ref_value[1] into the first one when its enable bit is set. */
+static void upload_BFO( struct i915_context *i915 )
+{
+   unsigned words[2];
+   words[0] = i915->depth_stencil->bfo[0];
+   words[1] = i915->depth_stencil->bfo[1];
+
+   /* I don't get it only allowed to set a ref mask when the enable bit is set? */
+   if ((words[0] & BFO_ENABLE_STENCIL_REF) != 0)
+      words[0] |= i915->stencil_ref.ref_value[1] << BFO_STENCIL_REF_SHIFT;
+
+   set_dynamic_indirect( i915, I915_DYNAMIC_BFO_0, words, 2 );
+}
+
+const struct i915_tracked_state i915_upload_BFO = {
+   I915_NEW_DEPTH_STENCIL,   /* dirty bits that trigger the upload */
+   upload_BFO
+};
+
+
+/***********************************************************************
+ */
+
+
+/* Upload the constant blend color packet: the command word followed by the
+ * four float channels of i915->blend_color packed by pack_ui32_float4().
+ */
+static void upload_BLENDCOLOR( struct i915_context *i915 )
+{
+   /* I915_NEW_BLEND {_COLOR} */
+   const float *rgba = i915->blend_color.color;
+   unsigned packet[2];
+
+   memset( packet, 0, sizeof(packet) );
+
+   packet[0] = _3DSTATE_CONST_BLEND_COLOR_CMD;
+   packet[1] = pack_ui32_float4( rgba[0],
+                                 rgba[1],
+                                 rgba[2],
+                                 rgba[3] );
+
+   set_dynamic_indirect( i915,
+                         I915_DYNAMIC_BC_0,
+                         packet,
+                         2 );
+}
+
+/* Atom: run upload_BLENDCOLOR() whenever I915_NEW_BLEND is dirty. */
+const struct i915_tracked_state i915_upload_BLENDCOLOR = {
+   I915_NEW_BLEND,
+   upload_BLENDCOLOR
+};
+
+/***********************************************************************
+ */
+
+
+/* Copy the iab word of the bound blend state
+ * into the dynamic state. */
+static void upload_IAB( struct i915_context *i915 )
+{
+   unsigned word;
+
+   word = i915->blend->iab;
+   set_dynamic_indirect( i915, I915_DYNAMIC_IAB, &word, 1 );
+}
+
+/* Atom: run upload_IAB() whenever I915_NEW_BLEND is dirty. */
+const struct i915_tracked_state i915_upload_IAB = {
+   I915_NEW_BLEND,
+   upload_IAB
+};
+
+
+/***********************************************************************
+ */
+
+
+
+/* Upload the two depth-offset-scale words that the bound rasterizer
+ * state keeps in ds[], starting at ds[0].u. */
+static void upload_DEPTHSCALE( struct i915_context *i915 )
+{
+   const unsigned *words = &i915->rasterizer->ds[0].u;
+   set_dynamic_indirect( i915, I915_DYNAMIC_DEPTHSCALE_0, words, 2 );
+}
+
+const struct i915_tracked_state i915_upload_DEPTHSCALE = {
+   I915_NEW_RASTERIZER,   /* dirty bits that trigger the upload */
+   upload_DEPTHSCALE
+};
+
+
+
+/***********************************************************************
+ * Polygon stipple
+ *
+ * The i915 supports a 4x4 stipple natively, GL wants 32x32.
+ * Fortunately stipple is usually a repeating pattern.
+ *
+ * XXX: does stipple pattern need to be adjusted according to
+ * the window position?
+ *
+ * XXX: possibly need workaround for conform paths test. 
+ */
+
+/* Upload the polygon stipple packet.
+ *
+ * The second word starts from the st bits of the bound rasterizer state
+ * and then receives the 4x4 pattern: the low nibble of bytes 12, 8, 4
+ * and 0 of i915->poly_stipple.stipple, placed at bit 0, 4, 8 and 12
+ * respectively.
+ */
+static void upload_STIPPLE( struct i915_context *i915 )
+{
+   const ubyte *mask = (const ubyte *)i915->poly_stipple.stipple;
+   unsigned packet[2];
+   unsigned row;
+
+   packet[0] = _3DSTATE_STIPPLE;
+
+   /* I915_NEW_RASTERIZER
+    */
+   packet[1] = i915->rasterizer->st;
+
+   /* I915_NEW_STIPPLE
+    *
+    * Not sure what to do about fallbacks, so for now just dont:
+    */
+   for (row = 0; row < 4; row++) {
+      const ubyte nibble = mask[12 - 4 * row] & 0xf;
+
+      packet[1] |= nibble << (4 * row);
+   }
+
+   set_dynamic_indirect( i915,
+                         I915_DYNAMIC_STP_0,
+                         packet,
+                         2 );
+}
+
+
+/* Atom: run upload_STIPPLE() when the rasterizer or the stipple pattern
+ * is dirty.
+ */
+const struct i915_tracked_state i915_upload_STIPPLE = {
+   I915_NEW_RASTERIZER | I915_NEW_STIPPLE,
+   upload_STIPPLE
+};
+
+
+
+/***********************************************************************
+ * Scissor.
+ */
+/* Upload the scissor enable/disable command word
+ * kept in sc[0] of the bound rasterizer state. */
+static void upload_SCISSOR_ENABLE( struct i915_context *i915 )
+{
+   const unsigned *word = &i915->rasterizer->sc[0];
+   set_dynamic_indirect( i915, I915_DYNAMIC_SC_ENA_0, word, 1 );
+}
+
+const struct i915_tracked_state i915_upload_SCISSOR_ENABLE = {
+   I915_NEW_RASTERIZER,   /* dirty bits that trigger the upload */
+   upload_SCISSOR_ENABLE
+};
+
+
+
+/* Upload the scissor rectangle packet: the command word, then the min and
+ * the max corner, each packed as (y << 16) | (x & 0xffff).
+ *
+ * NOTE(review): maxx/maxy are passed on unchanged -- confirm whether the
+ * hardware expects an inclusive or an exclusive max corner.
+ */
+static void upload_SCISSOR_RECT( struct i915_context *i915 )
+{
+   const unsigned min_x = i915->scissor.minx, min_y = i915->scissor.miny;
+   const unsigned max_x = i915->scissor.maxx, max_y = i915->scissor.maxy;
+   unsigned packet[3];
+
+   packet[0] = _3DSTATE_SCISSOR_RECT_0_CMD;
+   packet[1] = (min_y << 16) | (min_x & 0xffff);
+   packet[2] = (max_y << 16) | (max_x & 0xffff);
+
+   set_dynamic_indirect( i915, I915_DYNAMIC_SC_RECT_0, packet, 3 );
+}
+
+const struct i915_tracked_state i915_upload_SCISSOR_RECT = {
+   I915_NEW_SCISSOR,   /* dirty bits that trigger the upload */
+   upload_SCISSOR_RECT
+};
+
+
+
+
+
+
+static const struct i915_tracked_state *atoms[] = {   /* walked in this order by i915_update_dynamic() */
+   &i915_upload_MODES4,
+   &i915_upload_BFO,
+   &i915_upload_BLENDCOLOR,
+   &i915_upload_IAB,
+   &i915_upload_DEPTHSCALE,
+   &i915_upload_STIPPLE,
+   &i915_upload_SCISSOR_ENABLE,
+   &i915_upload_SCISSOR_RECT
+};
+
+/* Run the update hook of every atom whose dirty bits overlap i915->dirty.
+ * These will be dynamic indirect state commands, but for now just end
+ * up on the batch buffer with everything else. */
+void i915_update_dynamic( struct i915_context *i915 )
+{
+   unsigned n;
+
+   for (n = 0; n < Elements(atoms); n++)
+      if ((i915->dirty & atoms[n]->dirty) != 0)
+         atoms[n]->update( i915 );
+}
+
diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
new file mode 100644
index 0000000000..22082fece8
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state_emit.c
@@ -0,0 +1,404 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "i915_reg.h"
+#include "i915_context.h"
+#include "i915_batch.h"
+#include "i915_reg.h"
+#include "i915_resource.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+
+/**
+ * Map a gallium colour-buffer format onto the hardware COLOR_BUF_* value.
+ * Only B8G8R8A8_UNORM and B5G6R5_UNORM are handled; anything else asserts
+ * and yields 0.
+ */
+static unsigned translate_format( enum pipe_format format )
+{
+   if (format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      return COLOR_BUF_ARGB8888;
+
+   if (format == PIPE_FORMAT_B5G6R5_UNORM)
+      return COLOR_BUF_RGB565;
+
+   /* not a colour format handled here */
+   assert(0);
+   return 0;
+}
+
+/**
+ * Map a gallium depth/stencil format onto the hardware DEPTH_FRMT_* value.
+ * Both 24-bit depth variants share one hardware format; Z16 has its own.
+ * Anything else asserts and yields 0.
+ */
+static unsigned translate_depth_format( enum pipe_format zformat )
+{
+   if (zformat == PIPE_FORMAT_Z24X8_UNORM ||
+       zformat == PIPE_FORMAT_Z24_UNORM_S8_USCALED)
+      return DEPTH_FRMT_24_FIXED_8_OTHER;
+
+   if (zformat == PIPE_FORMAT_Z16_UNORM)
+      return DEPTH_FRMT_16_FIXED;
+
+   /* not a depth format handled here */
+   assert(0);
+   return 0;
+}
+
+
+/**
+ * Determine the framebuffer dimensions from its attached surfaces.
+ *
+ * The first colour buffer is preferred; the depth/stencil buffer is used
+ * when no colour buffer is bound.
+ *
+ * \param fb      framebuffer state to inspect
+ * \param width   receives the width, or 0 when nothing is bound
+ * \param height  receives the height, or 0 when nothing is bound
+ * \return TRUE if a surface was found, FALSE otherwise
+ */
+static boolean
+framebuffer_size(const struct pipe_framebuffer_state *fb,
+                 uint *width, uint *height)
+{
+   const struct pipe_surface *surf = fb->cbufs[0] ? fb->cbufs[0] : fb->zsbuf;
+
+   if (!surf) {
+      *height = 0;
+      *width = 0;
+      return FALSE;
+   }
+
+   *width = surf->width;
+   *height = surf->height;
+   return TRUE;
+}
+
+
+/**
+ * Emit every dirty hardware state group into the batch buffer.
+ *
+ * The function first checks that the batch can take a worst-case state
+ * emission (flushing once if it cannot), then writes out each state group
+ * whose bit is set in i915->hardware_dirty.  The drawing rectangle is
+ * emitted unconditionally.  On return i915->hardware_dirty is cleared.
+ *
+ * (Original note: "Push the state into the sarea and/or texture memory.")
+ */
+void
+i915_emit_hardware_state(struct i915_context *i915 )
+{
+   /* XXX: there must be an easier way */
+   /* Worst-case size: one term per state group below, in emission order. */
+   const unsigned dwords = ( 14 +
+                             7 +
+                             I915_MAX_DYNAMIC +
+                             8 +
+                             2 + I915_TEX_UNITS*3 +
+                             2 + I915_TEX_UNITS*3 +
+                             2 + I915_MAX_CONSTANT*4 +
+#if 0
+                             i915->current.program_len +
+#else
+                             i915->fs->program_len +
+#endif
+                             6
+                           ) * 3/2; /* plus 50% margin */
+   const unsigned relocs = ( I915_TEX_UNITS +
+                             3
+                           ) * 3/2; /* plus 50% margin */
+
+#if 0
+   debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
+#endif
+
+   if (!BEGIN_BATCH(dwords, relocs)) {
+      FLUSH_BATCH(NULL);
+
+      /* Retry outside of assert() so that the call is still made in
+       * builds where assert() compiles to nothing.
+       */
+      if (!BEGIN_BATCH(dwords, relocs)) {
+         /* the state does not even fit into a freshly flushed batch */
+         assert(0);
+      }
+   }
+
+   /* 14 dwords, 0 relocs */
+   if (i915->hardware_dirty & I915_HW_INVARIENT)
+   {
+      OUT_BATCH(_3DSTATE_AA_CMD |
+                AA_LINE_ECAAR_WIDTH_ENABLE |
+                AA_LINE_ECAAR_WIDTH_1_0 |
+                AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
+
+      OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
+      OUT_BATCH(0);
+
+      OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
+      OUT_BATCH(0);
+
+      OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
+      OUT_BATCH(0);
+
+      OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
+                CSB_TCB(0, 0) |
+                CSB_TCB(1, 1) |
+                CSB_TCB(2, 2) |
+                CSB_TCB(3, 3) |
+                CSB_TCB(4, 4) |
+                CSB_TCB(5, 5) |
+                CSB_TCB(6, 6) |
+                CSB_TCB(7, 7));
+
+      OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
+                ENABLE_POINT_RASTER_RULE |
+                OGL_POINT_RASTER_RULE |
+                ENABLE_LINE_STRIP_PROVOKE_VRTX |
+                ENABLE_TRI_FAN_PROVOKE_VRTX |
+                LINE_STRIP_PROVOKE_VRTX(1) |
+                TRI_FAN_PROVOKE_VRTX(2) |
+                ENABLE_TEXKILL_3D_4D |
+                TEXKILL_4D);
+
+      /* Need to initialize this to zero.
+       */
+      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0));
+      OUT_BATCH(0);
+
+      OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
+
+      /* disable indirect state for now
+       */
+      OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);
+      OUT_BATCH(0);
+   }
+
+   /* 7 dwords, 1 relocs */
+   if (i915->hardware_dirty & I915_HW_IMMEDIATE)
+   {
+      /* header + S0, S1, S2, S4, S5, S6 */
+      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+                I1_LOAD_S(0) |
+                I1_LOAD_S(1) |
+                I1_LOAD_S(2) |
+                I1_LOAD_S(4) |
+                I1_LOAD_S(5) |
+                I1_LOAD_S(6) |
+                (5));
+
+      if(i915->vbo)
+         OUT_RELOC(i915->vbo,
+                   I915_USAGE_VERTEX,
+                   i915->current.immediate[I915_IMMEDIATE_S0]);
+      else
+         /* FIXME: we should not do this */
+         OUT_BATCH(0);
+      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S1]);
+      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S2]);
+      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S4]);
+      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S5]);
+      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S6]);
+   }
+
+   /* I915_MAX_DYNAMIC dwords, 0 relocs */
+   if (i915->hardware_dirty & I915_HW_DYNAMIC)
+   {
+      int i;
+      for (i = 0; i < I915_MAX_DYNAMIC; i++) {
+         OUT_BATCH(i915->current.dynamic[i]);
+      }
+   }
+
+   /* 8 dwords, 2 relocs */
+   if (i915->hardware_dirty & I915_HW_STATIC)
+   {
+      struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0];
+      struct pipe_surface *depth_surface = i915->framebuffer.zsbuf;
+
+      if (cbuf_surface) {
+         unsigned ctile = BUF_3D_USE_FENCE;
+         struct i915_texture *tex = i915_texture(cbuf_surface->texture);
+         assert(tex);
+
+         /* tex is dereferenced unconditionally below, so no NULL test */
+         if (tex->sw_tiled) {
+            ctile = BUF_3D_TILED_SURFACE;
+         }
+
+         OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+
+         OUT_BATCH(BUF_3D_ID_COLOR_BACK |
+                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
+                   ctile);
+
+         OUT_RELOC(tex->buffer,
+                   I915_USAGE_RENDER,
+                   cbuf_surface->offset);
+      }
+
+      /* What happens if no zbuf??
+       */
+      if (depth_surface) {
+         unsigned ztile = BUF_3D_USE_FENCE;
+         struct i915_texture *tex = i915_texture(depth_surface->texture);
+         assert(tex);
+
+         /* tex is dereferenced unconditionally below, so no NULL test */
+         if (tex->sw_tiled) {
+            ztile = BUF_3D_TILED_SURFACE;
+         }
+
+         OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+
+         OUT_BATCH(BUF_3D_ID_DEPTH |
+                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
+                   ztile);
+
+         OUT_RELOC(tex->buffer,
+                   I915_USAGE_RENDER,
+                   depth_surface->offset);
+      }
+
+      {
+         unsigned cformat, zformat = 0;
+
+         if (cbuf_surface)
+            cformat = cbuf_surface->format;
+         else
+            cformat = PIPE_FORMAT_B8G8R8A8_UNORM; /* arbitrary */
+         cformat = translate_format(cformat);
+
+         if (depth_surface)
+            zformat = translate_depth_format( i915->framebuffer.zsbuf->format );
+
+         OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
+         OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */
+                   DSTORG_VERT_BIAS(0x8) | /* .5 */
+                   LOD_PRECLAMP_OGL |
+                   TEX_DEFAULT_COLOR_OGL |
+                   cformat |
+                   zformat );
+      }
+   }
+
+#if 01
+   /* texture images */
+   /* 2 + I915_TEX_UNITS*3 dwords, I915_TEX_UNITS relocs */
+   if (i915->hardware_dirty & (I915_HW_MAP | I915_HW_SAMPLER))
+   {
+      const uint nr = i915->current.sampler_enable_nr;
+      if (nr) {
+         const uint enabled = i915->current.sampler_enable_flags;
+         uint unit;
+         uint count = 0;
+         OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr));
+         OUT_BATCH(enabled);
+         for (unit = 0; unit < I915_TEX_UNITS; unit++) {
+            if (enabled & (1 << unit)) {
+               struct i915_texture *texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
+               struct i915_winsys_buffer *buf = texture->buffer;
+               uint offset = 0;
+               assert(buf);
+
+               count++;
+
+               OUT_RELOC(buf, I915_USAGE_SAMPLER, offset);
+               OUT_BATCH(i915->current.texbuffer[unit][0]); /* MS3 */
+               OUT_BATCH(i915->current.texbuffer[unit][1]); /* MS4 */
+            }
+         }
+         /* the enable mask and the enable count must agree */
+         assert(count == nr);
+      }
+   }
+#endif
+
+#if 01
+   /* samplers */
+   /* 2 + I915_TEX_UNITS*3 dwords, 0 relocs */
+   if (i915->hardware_dirty & I915_HW_SAMPLER)
+   {
+      if (i915->current.sampler_enable_nr) {
+         int i;
+
+         OUT_BATCH( _3DSTATE_SAMPLER_STATE |
+                    (3 * i915->current.sampler_enable_nr) );
+
+         OUT_BATCH( i915->current.sampler_enable_flags );
+
+         for (i = 0; i < I915_TEX_UNITS; i++) {
+            if (i915->current.sampler_enable_flags & (1<<i)) {
+               OUT_BATCH( i915->current.sampler[i][0] );
+               OUT_BATCH( i915->current.sampler[i][1] );
+               OUT_BATCH( i915->current.sampler[i][2] );
+            }
+         }
+      }
+   }
+#endif
+
+   /* constants */
+   /* 2 + I915_MAX_CONSTANT*4 dwords, 0 relocs */
+   if (i915->hardware_dirty & I915_HW_PROGRAM)
+   {
+      /* Collate the user-defined constants with the fragment shader's
+       * immediates according to the constant_flags[] array.
+       */
+      const uint nr = i915->fs->num_constants;
+      if (nr) {
+         uint i;
+
+         OUT_BATCH( _3DSTATE_PIXEL_SHADER_CONSTANTS | (nr * 4) );
+         /* Mask with the low nr bits set.  Built from 1u so that the
+          * shift stays well-defined when nr - 1 reaches bit 31.
+          */
+         OUT_BATCH( (1u << (nr - 1)) | ((1u << (nr - 1)) - 1) );
+
+         for (i = 0; i < nr; i++) {
+            const uint *c;
+            if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) {
+               /* grab user-defined constant */
+               c = (uint *) i915->current.constants[PIPE_SHADER_FRAGMENT][i];
+            }
+            else {
+               /* emit program constant */
+               c = (uint *) i915->fs->constants[i];
+            }
+#if 0 /* debug */
+            {
+               float *f = (float *) c;
+               printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3],
+                      (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER
+                       ? "user" : "immediate"));
+            }
+#endif
+            /* four dwords per constant */
+            OUT_BATCH(*c++);
+            OUT_BATCH(*c++);
+            OUT_BATCH(*c++);
+            OUT_BATCH(*c++);
+         }
+      }
+   }
+
+   /* Fragment program */
+   /* i915->fs->program_len dwords, 0 relocs */
+   if (i915->hardware_dirty & I915_HW_PROGRAM)
+   {
+      uint i;
+      /* we should always have, at least, a pass-through program */
+      assert(i915->fs->program_len > 0);
+      for (i = 0; i < i915->fs->program_len; i++) {
+         OUT_BATCH(i915->fs->program[i]);
+      }
+   }
+
+   /* drawing surface size */
+   /* 6 dwords, 0 relocs */
+   {
+      uint w, h;
+      boolean k = framebuffer_size(&i915->framebuffer, &w, &h);
+      (void)k;
+      assert(k);
+
+      OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      /* inclusive maximum corner: x in the low 16 bits, y above */
+      OUT_BATCH(((w - 1) & 0xffff) | ((h - 1) << 16));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+   }
+
+
+   i915->hardware_dirty = 0;
+}
diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c
new file mode 100644
index 0000000000..8cec699285
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state_immediate.c
@@ -0,0 +1,227 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+#include "i915_state_inlines.h"
+#include "i915_context.h"
+#include "i915_state.h"
+#include "i915_reg.h"
+#include "util/u_memory.h"
+
+
+/* All state expressible with the LOAD_STATE_IMMEDIATE_1 packet.
+ * Would like to opportunistically recombine all these fragments into
+ * a single packet containing only what has changed, but for now emit
+ * as multiple packets.
+ */
+
+
+
+
+/***********************************************************************
+ * S0,S1: Vertex buffer state.
+ *
+ * S0 is taken from i915->vbo_offset; S1 carries the vertex size in the
+ * two fields at bits 24 and 16.
+ */
+static void upload_S0S1(struct i915_context *i915)
+{
+   /* I915_NEW_VERTEX_SIZE -- do this where the vertex size is calculated! */
+   const unsigned vertex_size = i915->current.vertex_info.size;
+
+   /* I915_NEW_VBO */
+   /* TODO: re-use vertex buffers here? */
+   const unsigned s0 = i915->vbo_offset;
+   const unsigned s1 = (vertex_size << 16) | (vertex_size << 24);
+
+   /* I915_NEW_VBO */
+   /* TODO: use a vertex generation number to track vbo changes.
+    * Until then the cached words are not compared against the new ones:
+    * they are rewritten and flagged dirty on every call.
+    */
+   i915->current.immediate[I915_IMMEDIATE_S0] = s0;
+   i915->current.immediate[I915_IMMEDIATE_S1] = s1;
+   i915->hardware_dirty |= I915_HW_IMMEDIATE;
+}
+
+/* Tracked-state atom: re-run upload_S0S1 when the vertex buffer or the
+ * vertex format is flagged dirty.
+ */
+const struct i915_tracked_state i915_upload_S0S1 = {
+   I915_NEW_VBO | I915_NEW_VERTEX_FORMAT,
+   upload_S0S1
+};
+
+
+
+
+/***********************************************************************
+ * S2,S4: Vertex format, rasterization state
+ *
+ * S2 comes from vertex_info.hwfmt[1]; S4 is vertex_info.hwfmt[0] OR'ed
+ * with the rasterizer's precomputed LIS4 bits.
+ */
+static void upload_S2S4(struct i915_context *i915)
+{
+   /* I915_NEW_VERTEX_FORMAT */
+   const unsigned s2 = i915->current.vertex_info.hwfmt[1];
+   unsigned s4 = i915->current.vertex_info.hwfmt[0];
+
+   /*
+   debug_printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
+   */
+   assert(s4); /* should never be zero? */
+
+   s4 |= i915->rasterizer->LIS4;
+
+   /* nothing to do when both cached words are already up to date */
+   if (s2 == i915->current.immediate[I915_IMMEDIATE_S2] &&
+       s4 == i915->current.immediate[I915_IMMEDIATE_S4])
+      return;
+
+   i915->current.immediate[I915_IMMEDIATE_S2] = s2;
+   i915->current.immediate[I915_IMMEDIATE_S4] = s4;
+   i915->hardware_dirty |= I915_HW_IMMEDIATE;
+}
+
+
+/* Tracked-state atom: re-run upload_S2S4 when the rasterizer or the
+ * vertex format is flagged dirty.
+ */
+const struct i915_tracked_state i915_upload_S2S4 = {
+   I915_NEW_RASTERIZER | I915_NEW_VERTEX_FORMAT,
+   upload_S2S4
+};
+
+
+
+/***********************************************************************
+ * 
+ */
+/* S5: combine the stencil bits of the depth/stencil state, the stencil
+ * reference value and the blend state's LIS5 bits.  The cached word is
+ * updated (and I915_HW_IMMEDIATE raised) only when it changes.
+ */
+static void upload_S5( struct i915_context *i915 )
+{
+   /* hope it's safe to set stencil ref value even if stencil test is disabled? */
+   unsigned LIS5 = (i915->depth_stencil->stencil_LIS5 |
+                    (i915->stencil_ref.ref_value[0] << S5_STENCIL_REF_SHIFT) |
+                    i915->blend->LIS5);
+
+#if 0
+   /* I915_NEW_RASTERIZER */
+   if (i915->state.Polygon->OffsetFill) {
+      LIS5 |= S5_GLOBAL_DEPTH_OFFSET_ENABLE;
+   }
+#endif
+
+   if (LIS5 == i915->current.immediate[I915_IMMEDIATE_S5])
+      return;
+
+   i915->current.immediate[I915_IMMEDIATE_S5] = LIS5;
+   i915->hardware_dirty |= I915_HW_IMMEDIATE;
+}
+
+/* Tracked-state atom: re-run upload_S5 when the depth/stencil, blend or
+ * rasterizer state is flagged dirty.
+ * NOTE(review): upload_S5 also reads i915->stencil_ref; confirm that a
+ * stencil reference change raises one of the flags listed here.
+ */
+const struct i915_tracked_state i915_upload_S5 = {
+   (I915_NEW_DEPTH_STENCIL | I915_NEW_BLEND | I915_NEW_RASTERIZER),
+   upload_S5
+};
+
+
+/***********************************************************************
+ */
+/* S6: a fixed tristrip provoking-vertex field, colour write enable when
+ * a colour buffer is bound, plus the precomputed blend and depth bits.
+ * The cached word is updated (and I915_HW_IMMEDIATE raised) only when it
+ * changes.
+ */
+static void upload_S6( struct i915_context *i915 )
+{
+   /* I915_NEW_FRAMEBUFFER: only write colour if there is a colour buffer */
+   const unsigned color_write =
+      i915->framebuffer.cbufs[0] ? S6_COLOR_WRITE_ENABLE : 0;
+
+   /* I915_NEW_BLEND and I915_NEW_DEPTH contributions are precomputed */
+   const unsigned s6 = ((2 << S6_TRISTRIP_PV_SHIFT) |
+                        color_write |
+                        i915->blend->LIS6 |
+                        i915->depth_stencil->depth_LIS6);
+
+   if (s6 == i915->current.immediate[I915_IMMEDIATE_S6])
+      return;
+
+   i915->current.immediate[I915_IMMEDIATE_S6] = s6;
+   i915->hardware_dirty |= I915_HW_IMMEDIATE;
+}
+
+/* Tracked-state atom: re-run upload_S6 when the blend, depth/stencil or
+ * framebuffer state is flagged dirty.
+ */
+const struct i915_tracked_state i915_upload_S6 = {
+   I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL | I915_NEW_FRAMEBUFFER,
+   upload_S6
+};
+
+
+/***********************************************************************
+ */
+/* S7: copied verbatim from the rasterizer's precomputed LIS7 word.  The
+ * cached word is updated (and I915_HW_IMMEDIATE raised) only when it
+ * changes.
+ * NOTE(review): the LOAD_STATE_IMMEDIATE_1 packet built in
+ * i915_emit_hardware_state() loads S0-S2 and S4-S6 only; confirm where
+ * the cached S7 word is actually emitted.
+ */
+static void upload_S7( struct i915_context *i915 )
+{
+   /* I915_NEW_RASTERIZER */
+   const unsigned s7 = i915->rasterizer->LIS7;
+
+   if (s7 == i915->current.immediate[I915_IMMEDIATE_S7])
+      return;
+
+   i915->current.immediate[I915_IMMEDIATE_S7] = s7;
+   i915->hardware_dirty |= I915_HW_IMMEDIATE;
+}
+
+/* Tracked-state atom: re-run upload_S7 when the rasterizer state is
+ * flagged dirty.
+ */
+const struct i915_tracked_state i915_upload_S7 = {
+   I915_NEW_RASTERIZER,
+   upload_S7
+};
+
+
+/* All immediate-state atoms of this file, in the order in which
+ * i915_update_immediate() visits them.
+ */
+static const struct i915_tracked_state *atoms[] = {
+   &i915_upload_S0S1,
+   &i915_upload_S2S4,
+   &i915_upload_S5,
+   &i915_upload_S6,
+   &i915_upload_S7
+};
+
+/* Walk the atoms[] table and invoke the update hook of every
+ * immediate-state atom whose dirty mask overlaps i915->dirty.
+ */
+void i915_update_immediate( struct i915_context *i915 )
+{
+   int idx = 0;
+
+   while (idx < Elements(atoms)) {
+      const struct i915_tracked_state *atom = atoms[idx];
+
+      /* i915->dirty is re-read on every iteration, as in a plain loop */
+      if (i915->dirty & atom->dirty)
+         atom->update( i915 );
+
+      idx++;
+   }
+}
diff --git a/src/gallium/drivers/i915/i915_state_inlines.h b/src/gallium/drivers/i915/i915_state_inlines.h
new file mode 100644
index 0000000000..b589117fbf
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state_inlines.h
@@ -0,0 +1,231 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_STATE_INLINES_H
+#define I915_STATE_INLINES_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "util/u_debug.h"
+#include "i915_reg.h"
+
+
+/**
+ * Translate a PIPE_FUNC_* comparison into the hardware COMPAREFUNC_*
+ * value.  Unrecognised inputs map to COMPAREFUNC_ALWAYS.
+ */
+static INLINE unsigned
+i915_translate_compare_func(unsigned func)
+{
+   unsigned hw;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:    hw = COMPAREFUNC_NEVER;    break;
+   case PIPE_FUNC_LESS:     hw = COMPAREFUNC_LESS;     break;
+   case PIPE_FUNC_LEQUAL:   hw = COMPAREFUNC_LEQUAL;   break;
+   case PIPE_FUNC_GREATER:  hw = COMPAREFUNC_GREATER;  break;
+   case PIPE_FUNC_GEQUAL:   hw = COMPAREFUNC_GEQUAL;   break;
+   case PIPE_FUNC_NOTEQUAL: hw = COMPAREFUNC_NOTEQUAL; break;
+   case PIPE_FUNC_EQUAL:    hw = COMPAREFUNC_EQUAL;    break;
+   case PIPE_FUNC_ALWAYS:   /* same result as the fallback */
+   default:                 hw = COMPAREFUNC_ALWAYS;   break;
+   }
+
+   return hw;
+}
+
+/**
+ * Translate a PIPE_STENCIL_OP_* into the hardware STENCILOP_* value.
+ * The gallium INCR/DECR ops map to the hardware's saturating variants and
+ * the gallium *_WRAP ops to the plain hardware INCR/DECR.  Unrecognised
+ * inputs map to STENCILOP_ZERO.
+ */
+static INLINE unsigned
+i915_translate_stencil_op(unsigned op)
+{
+   unsigned hw;
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:      hw = STENCILOP_KEEP;    break;
+   case PIPE_STENCIL_OP_REPLACE:   hw = STENCILOP_REPLACE; break;
+   case PIPE_STENCIL_OP_INCR:      hw = STENCILOP_INCRSAT; break;
+   case PIPE_STENCIL_OP_DECR:      hw = STENCILOP_DECRSAT; break;
+   case PIPE_STENCIL_OP_INCR_WRAP: hw = STENCILOP_INCR;    break;
+   case PIPE_STENCIL_OP_DECR_WRAP: hw = STENCILOP_DECR;    break;
+   case PIPE_STENCIL_OP_INVERT:    hw = STENCILOP_INVERT;  break;
+   case PIPE_STENCIL_OP_ZERO:      /* same result as the fallback */
+   default:                        hw = STENCILOP_ZERO;    break;
+   }
+
+   return hw;
+}
+
+/**
+ * Translate a PIPE_BLENDFACTOR_* into the hardware BLENDFACT_* value.
+ * Unrecognised inputs map to BLENDFACT_ZERO.
+ */
+static INLINE unsigned
+i915_translate_blend_factor(unsigned factor)
+{
+   unsigned hw;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ONE:                hw = BLENDFACT_ONE;                break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:          hw = BLENDFACT_SRC_COLR;           break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      hw = BLENDFACT_INV_SRC_COLR;       break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:          hw = BLENDFACT_SRC_ALPHA;          break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      hw = BLENDFACT_INV_SRC_ALPHA;      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: hw = BLENDFACT_SRC_ALPHA_SATURATE; break;
+   case PIPE_BLENDFACTOR_DST_COLOR:          hw = BLENDFACT_DST_COLR;           break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:      hw = BLENDFACT_INV_DST_COLR;       break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:          hw = BLENDFACT_DST_ALPHA;          break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      hw = BLENDFACT_INV_DST_ALPHA;      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:        hw = BLENDFACT_CONST_COLOR;        break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    hw = BLENDFACT_INV_CONST_COLOR;    break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:        hw = BLENDFACT_CONST_ALPHA;        break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    hw = BLENDFACT_INV_CONST_ALPHA;    break;
+   case PIPE_BLENDFACTOR_ZERO:               /* same result as the fallback */
+   default:                                  hw = BLENDFACT_ZERO;               break;
+   }
+
+   return hw;
+}
+
+/**
+ * Translate a PIPE_BLEND_* equation into the hardware BLENDFUNC_* value.
+ * Unrecognised inputs yield 0.
+ */
+static INLINE unsigned
+i915_translate_blend_func(unsigned mode)
+{
+   unsigned hw = 0;   /* result for anything not listed below */
+
+   switch (mode) {
+   case PIPE_BLEND_ADD:              hw = BLENDFUNC_ADD;              break;
+   case PIPE_BLEND_SUBTRACT:         hw = BLENDFUNC_SUBTRACT;         break;
+   case PIPE_BLEND_REVERSE_SUBTRACT: hw = BLENDFUNC_REVERSE_SUBTRACT; break;
+   case PIPE_BLEND_MIN:              hw = BLENDFUNC_MIN;              break;
+   case PIPE_BLEND_MAX:              hw = BLENDFUNC_MAX;              break;
+   default:                          break;
+   }
+
+   return hw;
+}
+
+
+/**
+ * Translate a PIPE_LOGICOP_* into the hardware LOGICOP_* value.
+ * Unrecognised inputs map to LOGICOP_SET.
+ */
+static INLINE unsigned
+i915_translate_logic_op(unsigned opcode)
+{
+   unsigned hw;
+
+   switch (opcode) {
+   case PIPE_LOGICOP_CLEAR:         hw = LOGICOP_CLEAR;     break;
+   case PIPE_LOGICOP_NOR:           hw = LOGICOP_NOR;       break;
+   case PIPE_LOGICOP_AND_INVERTED:  hw = LOGICOP_AND_INV;   break;
+   case PIPE_LOGICOP_COPY_INVERTED: hw = LOGICOP_COPY_INV;  break;
+   case PIPE_LOGICOP_AND_REVERSE:   hw = LOGICOP_AND_RVRSE; break;
+   case PIPE_LOGICOP_INVERT:        hw = LOGICOP_INV;       break;
+   case PIPE_LOGICOP_XOR:           hw = LOGICOP_XOR;       break;
+   case PIPE_LOGICOP_NAND:          hw = LOGICOP_NAND;      break;
+   case PIPE_LOGICOP_AND:           hw = LOGICOP_AND;       break;
+   case PIPE_LOGICOP_EQUIV:         hw = LOGICOP_EQUIV;     break;
+   case PIPE_LOGICOP_NOOP:          hw = LOGICOP_NOOP;      break;
+   case PIPE_LOGICOP_OR_INVERTED:   hw = LOGICOP_OR_INV;    break;
+   case PIPE_LOGICOP_COPY:          hw = LOGICOP_COPY;      break;
+   case PIPE_LOGICOP_OR_REVERSE:    hw = LOGICOP_OR_RVRSE;  break;
+   case PIPE_LOGICOP_OR:            hw = LOGICOP_OR;        break;
+   case PIPE_LOGICOP_SET:           /* same result as the fallback */
+   default:                         hw = LOGICOP_SET;       break;
+   }
+
+   return hw;
+}
+
+
+
+/**
+ * Check that a vertex count is legal for a hardware primitive type.
+ *
+ * Points need at least 1 vertex, lines at least 2 (line lists an even
+ * number), and triangle-based primitives and polygons at least 3
+ * (triangle lists a multiple of 3).  An illegal count or an unknown
+ * primitive type asserts.
+ *
+ * \param hw_prim  one of the PRIM3D_* primitive types
+ * \param nr       number of vertices
+ * \return non-zero if the count is valid, zero otherwise
+ */
+static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr )
+{
+   boolean ok;
+
+   switch (hw_prim) {
+   case PRIM3D_POINTLIST:
+      ok = (nr >= 1);
+      break;
+   case PRIM3D_LINELIST:
+      ok = (nr >= 2) && (nr % 2) == 0;
+      break;
+   case PRIM3D_LINESTRIP:
+      ok = (nr >= 2);
+      break;
+   case PRIM3D_TRILIST:
+      ok = (nr >= 3) && (nr % 3) == 0;
+      break;
+   case PRIM3D_TRISTRIP:
+   case PRIM3D_TRIFAN:
+   case PRIM3D_POLY:
+      ok = (nr >= 3);
+      break;
+   default:
+      /* unknown primitive type */
+      assert(0);
+      return 0;
+   }
+
+   assert(ok);
+   return ok;
+}
+
+#endif
diff --git a/src/gallium/drivers/i915/i915_state_sampler.c b/src/gallium/drivers/i915/i915_state_sampler.c
new file mode 100644
index 0000000000..77b9bccbb7
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_state_sampler.c
@@ -0,0 +1,312 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "i915_state_inlines.h"
+#include "i915_context.h"
+#include "i915_reg.h"
+#include "i915_state.h"
+#include "i915_resource.h"
+
+
+/*
+ * A note about min_lod & max_lod.
+ *
+ * There is a circular dependency between the sampler state
+ * and the map state to be submitted to hw.
+ *
+ * Two conditions must be met:
+ * min_lod <= max_lod
+ * max_lod <= last_level
+ *
+ *
+ * This would all be fine and dandy if it weren't for the fact that max_lod
+ * is set on the map state instead of the sampler state. That is,
+ * the max_lod we submit on map is:
+ * max_lod = MIN2(last_level, max_lod);
+ *
+ * So we need to update the map state when we change samplers and
+ * we need to change the sampler state when the map state is changed.
+ * The first part is done by calling i915_update_texture in
+ * i915_update_samplers and the second part is done elsewhere in
+ * code tracking the state changes.
+ */
+
+/* Forward declaration: i915_update_samplers() calls this for every
+ * enabled unit, right after update_sampler(), passing
+ * i915->current.texbuffer[unit] as the output array.
+ */
+static void
+i915_update_texture(struct i915_context *i915,
+                    uint unit,
+                    const struct i915_texture *tex,
+                    const struct i915_sampler_state *sampler,
+                    uint state[6]);
+/**
+ * Compute i915 texture sampling state.
+ *
+ * Recalculate all state from scratch.  Perhaps not the most
+ * efficient, but this has gotten complex enough that we need
+ * something which is understandable and reliable.
+ *
+ * The three precomputed sampler words are copied, colourspace conversion
+ * is switched on for the packed YUV formats, and the minimum LOD and the
+ * texture map index are merged into the second word.
+ *
+ * \param i915     context (only referenced by the disabled fallback check)
+ * \param unit     texture unit; written into the map index field
+ * \param sampler  precomputed sampler state for the unit
+ * \param tex      texture bound to the unit
+ * \param state    returns the 3 words of computed state
+ */
+static void update_sampler(struct i915_context *i915,
+                           uint unit,
+                           const struct i915_sampler_state *sampler,
+                           const struct i915_texture *tex,
+                           unsigned state[3] )
+{
+   const struct pipe_resource *pt = &tex->b.b;
+   unsigned minlod, lastlod;
+
+   /* Need to do this after updating the maps, which call the
+    * intel_finalize_mipmap_tree and hence can update firstLevel:
+    */
+   state[0] = sampler->state[0];
+   state[1] = sampler->state[1];
+   state[2] = sampler->state[2];
+
+   if (pt->format == PIPE_FORMAT_UYVY ||
+       pt->format == PIPE_FORMAT_YUYV)
+      state[0] |= SS2_COLORSPACE_CONVERSION;
+
+   /* 3D textures don't seem to respect the border color.
+    * Fallback if there's ever a danger that they might refer to
+    * it.
+    *
+    * Effectively this means fallback on 3D clamp or
+    * clamp_to_border.
+    *
+    * XXX: Check if this is true on i945.
+    * XXX: Check if this bug got fixed in release silicon.
+    */
+#if 0
+   {
+      const unsigned ws = sampler->templ->wrap_s;
+      const unsigned wt = sampler->templ->wrap_t;
+      const unsigned wr = sampler->templ->wrap_r;
+      if (pt->target == PIPE_TEXTURE_3D &&
+          (sampler->templ->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
+           sampler->templ->mag_img_filter != PIPE_TEX_FILTER_NEAREST) &&
+          (ws == PIPE_TEX_WRAP_CLAMP ||
+           wt == PIPE_TEX_WRAP_CLAMP ||
+           wr == PIPE_TEX_WRAP_CLAMP ||
+           ws == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
+           wt == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
+           wr == PIPE_TEX_WRAP_CLAMP_TO_BORDER)) {
+         if (i915->conformance_mode > 0) {
+            assert(0);
+            /* 	    sampler->fallback = true; */
+            /* TODO */
+         }
+      }
+   }
+#endif
+
+   /* See note at the top of file: min_lod must not exceed the last mip
+    * level of the bound texture.  last_level is shifted by 4 before the
+    * comparison (presumably to match the fixed-point units of
+    * sampler->minlod -- TODO confirm).
+    */
+   minlod = sampler->minlod;
+   lastlod = pt->last_level << 4;
+
+   if (lastlod < minlod) {
+      minlod = lastlod;
+   }
+
+   /* Program the clamped value.  The previous code computed the clamp
+    * but then wrote the unclamped sampler->minlod.
+    */
+   state[1] |= (minlod << SS3_MIN_LOD_SHIFT);
+   state[1] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT);
+}
+
+
+/**
+ * Rebuild the sampler and map state of every texture unit.
+ *
+ * A unit counts as enabled when a fragment sampler view is bound to it.
+ * For each enabled unit the sampler words and the map (texbuffer) words
+ * are recomputed, and the unit is recorded in sampler_enable_nr and
+ * sampler_enable_flags.  Both I915_HW_SAMPLER and I915_HW_MAP are always
+ * flagged dirty afterwards.
+ */
+void i915_update_samplers( struct i915_context *i915 )
+{
+   uint unit;
+
+   i915->current.sampler_enable_flags = 0x0;
+   i915->current.sampler_enable_nr = 0;
+
+   for (unit = 0;
+        unit < i915->num_fragment_sampler_views && unit < i915->num_samplers;
+        unit++) {
+      struct i915_texture *texture;
+
+      /* determine unit enable/disable by looking for a bound texture */
+      /* could also examine the fragment program? */
+      if (!i915->fragment_sampler_views[unit])
+         continue;
+
+      texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
+
+      /* sampler words first, then the dependent map words */
+      update_sampler( i915, unit,
+                      i915->sampler[unit],          /* sampler state */
+                      texture,                      /* texture */
+                      i915->current.sampler[unit]   /* the result */
+                      );
+      i915_update_texture( i915, unit,
+                           texture,                 /* texture */
+                           i915->sampler[unit],     /* sampler state */
+                           i915->current.texbuffer[unit] );
+
+      i915->current.sampler_enable_flags |= (1 << unit);
+      i915->current.sampler_enable_nr++;
+   }
+
+   i915->hardware_dirty |= I915_HW_SAMPLER | I915_HW_MAP;
+}
+
+
+/**
+ * Map a gallium pipe_format onto the hardware map-surface bits
+ * (MAPSURF_* | MT_*).
+ *
+ * Unsupported formats are reported through debug_printf(), trip an
+ * assert, and yield 0.
+ */
+static uint
+translate_texture_format(enum pipe_format pipeFormat)
+{
+   uint hw_format = 0;
+
+   switch (pipeFormat) {
+   /* 8 bits per texel */
+   case PIPE_FORMAT_L8_UNORM:
+      hw_format = MAPSURF_8BIT | MT_8BIT_L8;
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      hw_format = MAPSURF_8BIT | MT_8BIT_I8;
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      hw_format = MAPSURF_8BIT | MT_8BIT_A8;
+      break;
+
+   /* 16 bits per texel */
+   case PIPE_FORMAT_L8A8_UNORM:
+      hw_format = MAPSURF_16BIT | MT_16BIT_AY88;
+      break;
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      hw_format = MAPSURF_16BIT | MT_16BIT_RGB565;
+      break;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      hw_format = MAPSURF_16BIT | MT_16BIT_ARGB1555;
+      break;
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      hw_format = MAPSURF_16BIT | MT_16BIT_ARGB4444;
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      hw_format = MAPSURF_16BIT | MT_16BIT_L16;
+      break;
+
+   /* 32 bits per texel */
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      hw_format = MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      break;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      hw_format = MAPSURF_32BIT | MT_32BIT_XRGB8888;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      hw_format = MAPSURF_32BIT | MT_32BIT_ABGR8888;
+      break;
+#if 0
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      hw_format = MAPSURF_32BIT | MT_32BIT_XBGR8888;
+      break;
+#endif
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      hw_format = MAPSURF_32BIT | MT_32BIT_xI824;
+      break;
+
+   /* packed YUV 4:2:2 */
+   case PIPE_FORMAT_YUYV:
+      hw_format = MAPSURF_422 | MT_422_YCRCB_NORMAL;
+      break;
+   case PIPE_FORMAT_UYVY:
+      hw_format = MAPSURF_422 | MT_422_YCRCB_SWAPY;
+      break;
+
+   /* compressed formats are currently compiled out */
+#if 0
+   case PIPE_FORMAT_RGB_FXT1:
+   case PIPE_FORMAT_RGBA_FXT1:
+      hw_format = MAPSURF_COMPRESSED | MT_COMPRESS_FXT1;
+      break;
+#endif
+#if 0
+   case PIPE_FORMAT_RGBA_DXT1:
+   case PIPE_FORMAT_RGB_DXT1:
+      hw_format = MAPSURF_COMPRESSED | MT_COMPRESS_DXT1;
+      break;
+   case PIPE_FORMAT_RGBA_DXT3:
+      hw_format = MAPSURF_COMPRESSED | MT_COMPRESS_DXT2_3;
+      break;
+   case PIPE_FORMAT_RGBA_DXT5:
+      hw_format = MAPSURF_COMPRESSED | MT_COMPRESS_DXT4_5;
+      break;
+#endif
+
+   default:
+      debug_printf("i915: translate_texture_format() bad image format %x\n",
+              pipeFormat);
+      assert(0);
+      break;
+   }
+
+   return hw_format;
+}
+
+
+/**
+ * Compute the two map-state dwords (MS3 and MS4) for one texture unit.
+ *
+ * \param i915     driver context (not read by this function)
+ * \param unit     texture unit index (not read by this function)
+ * \param tex      texture bound to the unit; must not be NULL
+ * \param sampler  sampler state; its maxlod caps the texture's max LOD
+ * \param state    output; state[0] receives MS3 and state[1] receives MS4
+ */
+static void
+i915_update_texture(struct i915_context *i915,
+                    uint unit,
+                    const struct i915_texture *tex,
+                    const struct i915_sampler_state *sampler,
+                    uint state[6])
+{
+   const struct pipe_resource *pt;
+   uint format, pitch;
+   uint width, height, depth;
+   uint last_level;
+   unsigned max_lod;
+   unsigned tiled = MS3_USE_FENCE_REGS;
+
+   /* Validate the pointer before anything is read through it. */
+   assert(tex);
+
+   pt = &tex->b.b;
+   width = pt->width0;
+   height = pt->height0;
+   depth = pt->depth0;
+   last_level = pt->last_level;
+   max_lod = last_level * 4;
+
+   assert(width);
+   assert(height);
+   assert(depth);
+
+   format = translate_texture_format(pt->format);
+   pitch = tex->stride;
+
+   assert(format);
+   assert(pitch);
+
+   if (tex->sw_tiled) {
+      /* the stride of a software-tiled surface must be a power of two */
+      assert(!((pitch - 1) & pitch));
+      tiled = MS3_TILED_SURFACE;
+   }
+
+   /* MS3 state: the size fields are programmed as (dimension - 1) */
+   state[0] =
+      (((height - 1) << MS3_HEIGHT_SHIFT)
+       | ((width - 1) << MS3_WIDTH_SHIFT)
+       | format
+       | tiled);
+
+   /*
+    * XXX When min_filter != mag_filter and there's just one mipmap level,
+    * set max_lod = 1 to make sure i915 chooses between min/mag filtering.
+    */
+
+   /* See note at the top of file */
+   if (max_lod > (sampler->maxlod >> 2))
+      max_lod = sampler->maxlod >> 2;
+
+   /* MS4 state: pitch is programmed in dwords, minus one */
+   state[1] =
+      ((((pitch / 4) - 1) << MS4_PITCH_SHIFT)
+       | MS4_CUBE_FACE_ENA_MASK
+       | ((max_lod) << MS4_MAX_LOD_SHIFT)
+       | ((depth - 1) << MS4_VOLUME_DEPTH_SHIFT));
+}
+
+
+/**
+ * Recompute only the texture-map state (not the sampler state) for
+ * every fragment texture unit that has a sampler view bound.
+ */
+void
+i915_update_textures(struct i915_context *i915)
+{
+   uint unit;
+
+   for (unit = 0; unit < i915->num_fragment_sampler_views; unit++) {
+      struct i915_texture *texture;
+
+      /* Only visit units that also have a sampler slot. */
+      if (!(unit < i915->num_samplers))
+         break;
+
+      /* determine unit enable/disable by looking for a bound texture */
+      /* could also examine the fragment program? */
+      if (!i915->fragment_sampler_views[unit])
+         continue;
+
+      texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
+
+      /* texture + sampler state -> current.texbuffer[unit] */
+      i915_update_texture(i915, unit, texture, i915->sampler[unit],
+                          i915->current.texbuffer[unit]);
+   }
+
+   i915->hardware_dirty |= I915_HW_MAP;
+}
diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
new file mode 100644
index 0000000000..f40876e708
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_surface.c
@@ -0,0 +1,223 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "i915_surface.h"
+#include "i915_resource.h"
+#include "i915_blit.h"
+#include "i915_reg.h"
+#include "i915_screen.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+
+
+/* Copy a width x height rectangle between two texture images with the
+ * blitter.
+ *
+ * Assumes all values are within bounds -- no checking at this level -
+ * do it higher up if required.
+ */
+static void
+i915_surface_copy(struct pipe_context *pipe,
+                  struct pipe_resource *dst, struct pipe_subresource subdst,
+                  unsigned dstx, unsigned dsty, unsigned dstz,
+                  struct pipe_resource *src, struct pipe_subresource subsrc,
+                  unsigned srcx, unsigned srcy, unsigned srcz,
+                  unsigned width, unsigned height)
+{
+   struct i915_texture *src_tex = i915_texture(src);
+   struct i915_texture *dst_tex = i915_texture(dst);
+   struct pipe_resource *spt = &src_tex->b.b;
+   struct pipe_resource *dpt = &dst_tex->b.b;
+   unsigned src_offset, dst_offset;  /* in bytes */
+
+   /* Destination image: indexed by face for cubes, by slice for 3D. */
+   switch (dst->target) {
+   case PIPE_TEXTURE_CUBE:
+      dst_offset = dst_tex->image_offset[subdst.level][subdst.face];
+      break;
+   case PIPE_TEXTURE_3D:
+      dst_offset = dst_tex->image_offset[subdst.level][dstz];
+      break;
+   default:
+      dst_offset = dst_tex->image_offset[subdst.level][0];
+      assert(subdst.face == 0);
+      assert(dstz == 0);
+      break;
+   }
+
+   /* Source image: same selection rule. */
+   switch (src->target) {
+   case PIPE_TEXTURE_CUBE:
+      src_offset = src_tex->image_offset[subsrc.level][subsrc.face];
+      break;
+   case PIPE_TEXTURE_3D:
+      src_offset = src_tex->image_offset[subsrc.level][srcz];
+      break;
+   default:
+      src_offset = src_tex->image_offset[subsrc.level][0];
+      assert(subsrc.face == 0);
+      assert(srcz == 0);
+      break;
+   }
+
+   /* Distinct resources, identical block layout, 1x1 blocks only. */
+   assert( dst != src );
+   assert( util_format_get_blocksize(dpt->format) == util_format_get_blocksize(spt->format) );
+   assert( util_format_get_blockwidth(dpt->format) == util_format_get_blockwidth(spt->format) );
+   assert( util_format_get_blockheight(dpt->format) == util_format_get_blockheight(spt->format) );
+   assert( util_format_get_blockwidth(dpt->format) == 1 );
+   assert( util_format_get_blockheight(dpt->format) == 1 );
+
+   i915_copy_blit( i915_context(pipe),
+                   util_format_get_blocksize(dpt->format),
+                   (unsigned short) src_tex->stride,
+                   src_tex->buffer,
+                   src_offset,
+                   (unsigned short) dst_tex->stride,
+                   dst_tex->buffer,
+                   dst_offset,
+                   (short) srcx, (short) srcy,
+                   (short) dstx, (short) dsty,
+                   (short) width, (short) height );
+}
+
+
+/**
+ * Fill a rectangle of a colour surface with a solid colour using the
+ * blitter.  The float RGBA value is packed to the surface format first.
+ */
+static void
+i915_clear_render_target(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         const float *rgba,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   struct i915_texture *tex = i915_texture(dst->texture);
+   struct pipe_resource *pt = &tex->b.b;
+   union util_color packed;
+
+   /* only formats with 1x1 blocks are handled */
+   assert(util_format_get_blockwidth(pt->format) == 1);
+   assert(util_format_get_blockheight(pt->format) == 1);
+
+   util_pack_color(rgba, dst->format, &packed);
+
+   /* write all four channels */
+   i915_fill_blit( i915,
+                   util_format_get_blocksize(pt->format),
+                   XY_COLOR_BLT_WRITE_ALPHA | XY_COLOR_BLT_WRITE_RGB,
+                   (unsigned short) tex->stride,
+                   tex->buffer,
+                   dst->offset,
+                   (short) dstx,
+                   (short) dsty,
+                   (short) width,
+                   (short) height,
+                   packed.ui );
+}
+
+/**
+ * Fill a rectangle of a depth/stencil surface using the blitter.
+ *
+ * The blit write mask is built from clear_flags: the RGB write bit is
+ * set when depth is cleared, the ALPHA write bit when stencil is cleared
+ * or the surface format is not Z24_UNORM_S8_USCALED.
+ */
+static void
+i915_clear_depth_stencil(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         unsigned clear_flags,
+                         double depth,
+                         unsigned stencil,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
+{
+   struct i915_context *i915 = i915_context(pipe);
+   struct i915_texture *tex = i915_texture(dst->texture);
+   struct pipe_resource *pt = &tex->b.b;
+   unsigned fill_value;
+   unsigned write_mask;
+
+   /* only formats with 1x1 blocks are handled */
+   assert(util_format_get_blockwidth(pt->format) == 1);
+   assert(util_format_get_blockheight(pt->format) == 1);
+
+   fill_value = util_pack_z_stencil(dst->format, depth, stencil);
+
+   write_mask = (clear_flags & PIPE_CLEAR_DEPTH) ? XY_COLOR_BLT_WRITE_RGB : 0;
+
+   /* XXX presumably this does read-modify-write
+      (otherwise this won't work anyway). Hence will only want to
+      do it if really have stencil and it isn't cleared */
+   if ((dst->format != PIPE_FORMAT_Z24_UNORM_S8_USCALED) ||
+       (clear_flags & PIPE_CLEAR_STENCIL))
+      write_mask |= XY_COLOR_BLT_WRITE_ALPHA;
+
+   i915_fill_blit( i915,
+                   util_format_get_blocksize(pt->format),
+                   write_mask,
+                   (unsigned short) tex->stride,
+                   tex->buffer,
+                   dst->offset,
+                   (short) dstx,
+                   (short) dsty,
+                   (short) width,
+                   (short) height,
+                   fill_value );
+}
+
+/*
+ * Screen surface functions
+ */
+
+
+/**
+ * Create a pipe_surface describing one image (face / mip level / slice)
+ * of a texture.  The surface takes a reference on the texture.
+ *
+ * \return the new surface, or NULL if allocation fails.
+ */
+static struct pipe_surface *
+i915_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_resource *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned flags)
+{
+   struct i915_texture *tex = i915_texture(pt);
+   struct pipe_surface *ps;
+   unsigned offset;  /* in bytes */
+
+   /* Image offset: indexed by face for cubes, by slice for 3D. */
+   switch (pt->target) {
+   case PIPE_TEXTURE_CUBE:
+      offset = tex->image_offset[level][face];
+      break;
+   case PIPE_TEXTURE_3D:
+      offset = tex->image_offset[level][zslice];
+      break;
+   default:
+      offset = tex->image_offset[level][0];
+      assert(face == 0);
+      assert(zslice == 0);
+      break;
+   }
+
+   ps = CALLOC_STRUCT(pipe_surface);
+   if (!ps)
+      return NULL;
+
+   pipe_reference_init(&ps->reference, 1);
+   pipe_resource_reference(&ps->texture, pt);
+   ps->format = pt->format;
+   ps->width = u_minify(pt->width0, level);
+   ps->height = u_minify(pt->height0, level);
+   ps->offset = offset;
+   ps->usage = flags;
+
+   return ps;
+}
+
+/**
+ * Destroy a surface created by i915_get_tex_surface(): drop its
+ * reference on the texture, then free the surface itself.
+ */
+static void
+i915_tex_surface_destroy(struct pipe_surface *surf)
+{
+   pipe_resource_reference(&surf->texture, NULL);
+   FREE(surf);
+}
+
+
+/**
+ * Install this file's copy and clear implementations into the
+ * context's function table.
+ */
+void
+i915_init_surface_functions(struct i915_context *i915)
+{
+   /* clears */
+   i915->base.clear_render_target = i915_clear_render_target;
+   i915->base.clear_depth_stencil = i915_clear_depth_stencil;
+
+   /* region copy */
+   i915->base.resource_copy_region = i915_surface_copy;
+}
+
+/* Install the surface create/destroy hooks into the screen's function
+ * table.
+ *
+ * No good reason for these to be in the screen.
+ */
+void
+i915_init_screen_surface_functions(struct i915_screen *is)
+{
+   is->base.tex_surface_destroy = i915_tex_surface_destroy;
+   is->base.get_tex_surface = i915_get_tex_surface;
+}
diff --git a/src/gallium/drivers/i915/i915_surface.h b/src/gallium/drivers/i915/i915_surface.h
new file mode 100644
index 0000000000..448106d566
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_surface.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef I915_SURFACE_H
+#define I915_SURFACE_H
+
+struct i915_context;
+struct i915_screen;
+
+void i915_init_surface_functions( struct i915_context *i915 );
+void i915_init_screen_surface_functions( struct i915_screen *is );
+
+
+#endif /* I915_SURFACE_H */
diff --git a/src/gallium/drivers/i915/i915_winsys.h b/src/gallium/drivers/i915/i915_winsys.h
new file mode 100644
index 0000000000..3aba19fe6a
--- /dev/null
+++ b/src/gallium/drivers/i915/i915_winsys.h
@@ -0,0 +1,232 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef I915_WINSYS_H
+#define I915_WINSYS_H
+
+#include "pipe/p_compiler.h"
+
+struct i915_winsys;
+struct i915_winsys_buffer;
+struct i915_winsys_batchbuffer;
+struct pipe_resource;
+struct pipe_fence_handle;
+struct winsys_handle;
+
+/**
+ * How the hardware is going to use a buffer; passed as the usage
+ * argument of i915_winsys::batchbuffer_reloc.  Each value is a
+ * distinct bit.
+ */
+enum i915_winsys_buffer_usage
+{
+   /* use on textures */
+   I915_USAGE_RENDER    = 0x01,
+   I915_USAGE_SAMPLER   = 0x02,
+   I915_USAGE_2D_TARGET = 0x04,
+   I915_USAGE_2D_SOURCE = 0x08,
+   /* use on vertex */
+   I915_USAGE_VERTEX    = 0x10
+};
+
+/**
+ * Kind of buffer requested from i915_winsys::buffer_create.
+ */
+enum i915_winsys_buffer_type
+{
+   I915_NEW_TEXTURE,
+   I915_NEW_SCANOUT, /**< a texture used for scanning out from */
+   I915_NEW_VERTEX
+};
+
+/**
+ * Tiling mode passed to i915_winsys::buffer_set_fence_reg.
+ */
+enum i915_winsys_buffer_tile
+{
+   I915_TILE_NONE,
+   I915_TILE_X,
+   I915_TILE_Y
+};
+
+/**
+ * A batchbuffer as seen by the driver: the owning winsys plus the raw
+ * fields the driver touches directly while writing commands.
+ */
+struct i915_winsys_batchbuffer {
+
+   struct i915_winsys *iws;
+
+   /**
+    * Values exported to speed up writing the batchbuffer, instead
+    * of having to go through an accessor function for each dword
+    * written.
+    */
+   /*@{*/
+   uint8_t *map;
+   uint8_t *ptr;
+   size_t size;
+
+   size_t relocs;
+   size_t max_relocs;
+   /*@}*/
+};
+
+/**
+ * Function table a window-system backend fills in for the i915 driver:
+ * batchbuffer, buffer and fence operations, plus its own destructor.
+ */
+struct i915_winsys {
+
+   unsigned pci_id; /**< PCI ID for the device */
+
+   /**
+    * Batchbuffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a new batchbuffer.
+    */
+   struct i915_winsys_batchbuffer *
+      (*batchbuffer_create)(struct i915_winsys *iws);
+
+   /**
+    * Emit a relocation to a buffer.
+    * Target position in batchbuffer is the same as ptr.
+    *
+    * @batch batchbuffer the relocation is emitted into.
+    * @reloc buffer address to be inserted into target.
+    * @usage how is the hardware going to use the buffer.
+    * @offset add this to the reloc buffers address
+    *
+    * NOTE(review): this comment used to document a "@target" argument
+    * (buffer where to write the address, null for batchbuffer), but the
+    * signature below has no such parameter.
+    */
+   int (*batchbuffer_reloc)(struct i915_winsys_batchbuffer *batch,
+                            struct i915_winsys_buffer *reloc,
+                            enum i915_winsys_buffer_usage usage,
+                            unsigned offset);
+
+   /**
+    * Flush a batchbuffer.
+    */
+   void (*batchbuffer_flush)(struct i915_winsys_batchbuffer *batch,
+                             struct pipe_fence_handle **fence);
+
+   /**
+    * Destroy a batchbuffer.
+    */
+   void (*batchbuffer_destroy)(struct i915_winsys_batchbuffer *batch);
+   /*@}*/
+
+
+   /**
+    * Buffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a buffer.
+    */
+   struct i915_winsys_buffer *
+      (*buffer_create)(struct i915_winsys *iws,
+                       unsigned size, unsigned alignment,
+                       enum i915_winsys_buffer_type type);
+
+   /**
+    * Creates a buffer from a handle.
+    * Used to implement pipe_screen::resource_from_handle.
+    * Also provides the stride information needed for the
+    * texture via the stride argument.
+    */
+   struct i915_winsys_buffer *
+      (*buffer_from_handle)(struct i915_winsys *iws,
+                            struct winsys_handle *whandle,
+                            unsigned *stride);
+
+   /**
+    * Used to implement pipe_screen::resource_get_handle.
+    * The winsys might need the stride information.
+    */
+   boolean (*buffer_get_handle)(struct i915_winsys *iws,
+                                struct i915_winsys_buffer *buffer,
+                                struct winsys_handle *whandle,
+                                unsigned stride);
+
+   /**
+    * Fence a buffer with a fence reg.
+    * Not to be confused with pipe_fence_handle.
+    */
+   int (*buffer_set_fence_reg)(struct i915_winsys *iws,
+                               struct i915_winsys_buffer *buffer,
+                               unsigned stride,
+                               enum i915_winsys_buffer_tile tile);
+
+   /**
+    * Map a buffer.
+    */
+   void *(*buffer_map)(struct i915_winsys *iws,
+                       struct i915_winsys_buffer *buffer,
+                       boolean write);
+
+   /**
+    * Unmap a buffer.
+    */
+   void (*buffer_unmap)(struct i915_winsys *iws,
+                        struct i915_winsys_buffer *buffer);
+
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follow pipe_buffer_write.
+    */
+   int (*buffer_write)(struct i915_winsys *iws,
+                       struct i915_winsys_buffer *dst,
+                       size_t offset,
+                       size_t size,
+                       const void *data);
+
+   /**
+    * Destroy a buffer.
+    */
+   void (*buffer_destroy)(struct i915_winsys *iws,
+                          struct i915_winsys_buffer *buffer);
+   /*@}*/
+
+
+   /**
+    * Fence functions.
+    */
+   /*@{*/
+   /**
+    * Reference fence and set ptr to fence.
+    */
+   void (*fence_reference)(struct i915_winsys *iws,
+                           struct pipe_fence_handle **ptr,
+                           struct pipe_fence_handle *fence);
+
+   /**
+    * Check if a fence has finished.
+    */
+   int (*fence_signalled)(struct i915_winsys *iws,
+                          struct pipe_fence_handle *fence);
+
+   /**
+    * Wait on a fence to finish.
+    */
+   int (*fence_finish)(struct i915_winsys *iws,
+                       struct pipe_fence_handle *fence);
+   /*@}*/
+
+
+   /**
+    * Destroy the winsys.
+    */
+   void (*destroy)(struct i915_winsys *iws);
+};
+
+
+/**
+ * Create i915 pipe_screen.
+ */
+struct pipe_screen *i915_screen_create(struct i915_winsys *iws);
+
+
+#endif
diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
new file mode 100644
index 0000000000..b0b0970338
--- /dev/null
+++ b/src/gallium/drivers/i965/Makefile
@@ -0,0 +1,74 @@
+# Gallium i965 driver Makefile.
+# Names the library and lists its sources; the rules themselves come
+# from the shared template included at the bottom.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = i965
+
+# C files making up the driver.  Keep this list in step with the
+# 'source' list in the SConscript in this directory.
+C_SOURCES = \
+	brw_cc.c \
+	brw_clip.c \
+	brw_clip_line.c \
+	brw_clip_point.c \
+	brw_clip_state.c \
+	brw_clip_tri.c \
+	brw_clip_unfilled.c \
+	brw_clip_util.c \
+	brw_context.c \
+	brw_curbe.c \
+	brw_disasm.c \
+	brw_draw.c \
+	brw_draw_upload.c \
+	brw_eu.c \
+	brw_eu_debug.c \
+	brw_eu_emit.c \
+	brw_eu_util.c \
+	brw_gs.c \
+	brw_gs_emit.c \
+	brw_gs_state.c \
+	brw_misc_state.c \
+	brw_pipe_blend.c \
+	brw_pipe_depth.c \
+	brw_pipe_fb.c \
+	brw_pipe_query.c \
+	brw_pipe_shader.c \
+	brw_pipe_flush.c \
+	brw_pipe_misc.c \
+	brw_pipe_sampler.c \
+	brw_pipe_vertex.c \
+	brw_pipe_clear.c \
+	brw_pipe_rast.c \
+	brw_resource.c \
+	brw_sf.c \
+	brw_sf_emit.c \
+	brw_sf_state.c \
+	brw_state_batch.c \
+	brw_state_debug.c \
+	brw_state_cache.c \
+	brw_state_upload.c \
+	brw_structs_dump.c \
+	brw_swtnl.c \
+	brw_urb.c \
+	brw_vs.c \
+	brw_vs_emit.c \
+	brw_vs_state.c \
+	brw_vs_surface_state.c \
+	brw_wm.c \
+	brw_wm_debug.c \
+	brw_wm_emit.c \
+	brw_wm_fp.c \
+	brw_wm_iz.c \
+	brw_wm_pass0.c \
+	brw_wm_pass1.c \
+	brw_wm_pass2.c \
+	brw_wm_sampler_state.c \
+	brw_wm_state.c \
+	brw_wm_surface_state.c \
+	brw_screen.c \
+	brw_resource_buffer.c \
+	brw_resource_texture.c \
+	brw_resource_texture_layout.c \
+	brw_screen_surface.c \
+	brw_batchbuffer.c \
+	brw_winsys_debug.c \
+	intel_decode.c
+
+# Shared build rules; presumably consumes LIBNAME and C_SOURCES
+# (the template is not part of this file -- confirm there).
+include ../../Makefile.template
diff --git a/src/gallium/drivers/i965/SConscript b/src/gallium/drivers/i965/SConscript
new file mode 100644
index 0000000000..119f914a16
--- /dev/null
+++ b/src/gallium/drivers/i965/SConscript
@@ -0,0 +1,82 @@
+# SCons build description for the Gallium i965 driver.
+# Builds the sources below into a convenience library named 'i965'
+# and exports it for other SConscripts.
+Import('*')
+
+env = env.Clone()
+
+# The driver is not built with MSVC; bail out early in that case.
+if msvc:
+	print 'warning: not building i965g'
+	Return();
+
+# Keep this list in step with C_SOURCES in the Makefile in this directory.
+i965 = env.ConvenienceLibrary(
+	target = 'i965',
+	source = [
+		'brw_batchbuffer.c',
+		'brw_cc.c',
+		'brw_clip.c',
+		'brw_clip_line.c',
+		'brw_clip_point.c',
+		'brw_clip_state.c',
+		'brw_clip_tri.c',
+		'brw_clip_unfilled.c',
+		'brw_clip_util.c',
+		'brw_context.c',
+		'brw_curbe.c',
+		'brw_disasm.c',
+		'brw_draw.c',
+		'brw_draw_upload.c',
+		'brw_eu.c',
+		'brw_eu_debug.c',
+		'brw_eu_emit.c',
+		'brw_eu_util.c',
+		'brw_gs.c',
+		'brw_gs_emit.c',
+		'brw_gs_state.c',
+		'brw_misc_state.c',
+		'brw_pipe_blend.c',
+		'brw_pipe_clear.c',
+		'brw_pipe_depth.c',
+		'brw_pipe_fb.c',
+		'brw_pipe_flush.c',
+		'brw_pipe_misc.c',
+		'brw_pipe_query.c',
+		'brw_pipe_rast.c',
+		'brw_pipe_sampler.c',
+		'brw_pipe_shader.c',
+		'brw_pipe_vertex.c',
+		'brw_resource.c',
+		'brw_resource_buffer.c',
+		'brw_resource_texture.c',
+		'brw_resource_texture_layout.c',
+		'brw_screen.c',
+		'brw_screen_surface.c',
+		'brw_structs_dump.c',
+		'brw_sf.c',
+		'brw_sf_emit.c',
+		'brw_sf_state.c',
+		'brw_state_batch.c',
+		'brw_state_cache.c',
+		'brw_state_debug.c',
+		'brw_state_upload.c',
+		'brw_swtnl.c',
+		'brw_urb.c',
+		'brw_vs.c',
+		'brw_vs_emit.c',
+		'brw_vs_state.c',
+		'brw_vs_surface_state.c',
+		'brw_winsys_debug.c',
+		'brw_wm.c',
+#		'brw_wm_constant_buffer.c',
+		'brw_wm_debug.c',
+		'brw_wm_emit.c',
+		'brw_wm_fp.c',
+#		'brw_wm_glsl.c',
+		'brw_wm_iz.c',
+		'brw_wm_pass0.c',
+		'brw_wm_pass1.c',
+		'brw_wm_pass2.c',
+		'brw_wm_sampler_state.c',
+		'brw_wm_state.c',
+		'brw_wm_surface_state.c',
+		'intel_decode.c',
+	])
+
+Export('i965')
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
new file mode 100644
index 0000000000..8b3f46f2c1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -0,0 +1,202 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_memory.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_reg.h"
+#include "brw_winsys.h"
+#include "brw_debug.h"
+#include "brw_structs.h"
+
+#define ALWAYS_EMIT_MI_FLUSH 1
+
+/**
+ * Allocate and map a fresh buffer object for the batch and rewind the
+ * write pointer to its start.
+ *
+ * \return PIPE_OK on success, the winsys error code if the allocation
+ *         fails, or PIPE_ERROR_OUT_OF_MEMORY if the new buffer could
+ *         not be mapped (batch->map and batch->ptr are NULL then).
+ */
+enum pipe_error
+brw_batchbuffer_reset(struct brw_batchbuffer *batch)
+{
+   enum pipe_error ret;
+
+   ret = batch->sws->bo_alloc( batch->sws,
+                               BRW_BUFFER_TYPE_BATCH,
+                               BRW_BATCH_SIZE, 4096,
+                               &batch->buf );
+   if (ret)
+      return ret;
+
+   batch->size = BRW_BATCH_SIZE;
+
+   /* With map_range semantics, the winsys can decide whether to
+    * inject a malloc'ed bounce buffer instead of mapping directly.
+    */
+   batch->map = batch->sws->bo_map(batch->buf,
+                                   BRW_DATA_BATCH_BUFFER,
+                                   0, batch->size,
+                                   GL_TRUE,
+                                   GL_TRUE,
+                                   GL_TRUE);
+
+   batch->ptr = batch->map;
+
+   /* Do not report success for a batch nobody can write into.
+    * NOTE(review): assumes bo_map signals failure by returning NULL --
+    * confirm against the winsys implementations.
+    */
+   if (batch->map == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   return PIPE_OK;
+}
+
+/**
+ * Create a batchbuffer object and give it its first buffer.
+ *
+ * \param sws      winsys screen used for all buffer operations
+ * \param chipset  chipset description, copied into the batch
+ * \return the new batchbuffer, or NULL if the object could not be
+ *         allocated or brw_batchbuffer_reset() reported an error.
+ */
+struct brw_batchbuffer *
+brw_batchbuffer_alloc(struct brw_winsys_screen *sws,
+                      struct brw_chipset chipset)
+{
+   struct brw_batchbuffer *batch = CALLOC_STRUCT(brw_batchbuffer);
+   enum pipe_error ret;
+
+   /* CALLOC_STRUCT can fail; do not write through a NULL pointer. */
+   if (batch == NULL)
+      return NULL;
+
+   batch->sws = sws;
+   batch->chipset = chipset;
+
+   /* A batch without a buffer is unusable, so fail the allocation
+    * rather than hand back an object that faults on first use.
+    */
+   ret = brw_batchbuffer_reset(batch);
+   if (ret) {
+      brw_batchbuffer_free(batch);
+      return NULL;
+   }
+
+   return batch;
+}
+
+/**
+ * Destroy a batchbuffer: unmap its buffer if it is still mapped, drop
+ * the buffer reference and free the object.
+ */
+void
+brw_batchbuffer_free(struct brw_batchbuffer *batch)
+{
+   /* Drop the mapping first, if one is still active. */
+   if (batch->map != NULL) {
+      batch->sws->bo_unmap(batch->buf);
+      batch->map = NULL;
+   }
+
+   /* Release our reference on the buffer, then the wrapper itself. */
+   bo_reference(&batch->buf, NULL);
+   FREE(batch);
+}
+
+
+/**
+ * Terminate the current batch, submit it for execution and start a
+ * fresh one.  Does nothing if no bytes have been written.
+ *
+ * \param batch  batchbuffer to flush
+ * \param file   caller's source file, used only in the debug message
+ * \param line   caller's source line, used only in the debug message
+ */
+void
+_brw_batchbuffer_flush(struct brw_batchbuffer *batch, 
+		       const char *file,
+		       int line)
+{
+   GLuint used = batch->ptr - batch->map;
+
+   if (used == 0)
+      return;
+
+   /* Post-swap throttling done by the state tracker.
+    */
+
+   if (BRW_DEBUG & DEBUG_BATCH)
+      debug_printf("%s:%d: Batchbuffer flush with %db used\n", 
+		   file, line, used);
+
+   /* The three dwords appended below are written without a space check.
+    * NOTE(review): presumably covered by the space the header reserves
+    * (BATCH_RESERVED) -- confirm.
+    */
+   if (ALWAYS_EMIT_MI_FLUSH) {
+      *(GLuint *) (batch->ptr) = MI_FLUSH | BRW_FLUSH_STATE_CACHE;
+      batch->ptr += 4;
+      used = batch->ptr - batch->map;
+   }
+
+   /* Round batchbuffer usage to 2 DWORDs.
+    *
+    * If the size is currently a multiple of 8 bytes, add a noop so
+    * that the end marker written next leaves the total a multiple of
+    * 8 bytes again.
+    */
+   if ((used & 4) == 0) {
+      *(GLuint *) (batch->ptr) = 0; /* noop */
+      batch->ptr += 4;
+      used = batch->ptr - batch->map;
+   }
+
+   /* Mark the end of the buffer.
+    */
+   *(GLuint *) (batch->ptr) = MI_BATCH_BUFFER_END;
+   batch->ptr += 4;
+   used = batch->ptr - batch->map;
+
+   /* Hand the written range to the winsys and drop the mapping before
+    * submitting the buffer for execution.
+    */
+   batch->sws->bo_flush_range(batch->buf, 0, used);
+   batch->sws->bo_unmap(batch->buf);
+   batch->map = NULL;
+   batch->ptr = NULL;
+      
+   batch->sws->bo_exec(batch->buf, used );
+
+   if (BRW_DEBUG & DEBUG_SYNC) {
+      /* Wait for the buffer to go idle.  (This comment used to
+       * describe abusing map/unmap to achieve wait-for-fence; the code
+       * now calls bo_wait_idle directly.)
+       *
+       * XXX: hide this inside the winsys and export a fence
+       * interface.
+       */
+      debug_printf("waiting for idle\n");
+      batch->sws->bo_wait_idle(batch->buf);
+   }
+
+   /* Reset the buffer.  The return value is ignored here, so a
+    * failed reset goes unreported to the caller.
+    */
+   brw_batchbuffer_reset(batch);
+}
+
+
+/* The OUT_RELOC() macro ends up here, generating a relocation within
+ * the batch buffer.
+ *
+ * Returns PIPE_ERROR_OUT_OF_MEMORY if the write pointer is already
+ * past the end of the buffer, the winsys error code if bo_emit_reloc
+ * fails, and 0 on success.
+ */
+enum pipe_error
+brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+			   struct brw_winsys_buffer *buffer,
+			   enum brw_buffer_usage usage,
+			   uint32_t delta)
+{
+   int ret;
+
+   /* Refuse to emit once the write pointer has run off the buffer. */
+   if (batch->ptr - batch->map > batch->buf->size) {
+      debug_printf("bad relocation ptr %p map %p offset %li size %i\n",
+                   batch->ptr,
+                   batch->map,
+                   batch->ptr - batch->map,
+                   batch->buf->size);
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   /* The relocation lands at the current write offset. */
+   ret = batch->sws->bo_emit_reloc(batch->buf, usage, delta,
+                                   batch->ptr - batch->map, buffer);
+   if (ret)
+      return ret;
+
+   /* bo_emit_reloc was responsible for writing a zero into the
+    * batchbuffer if necessary.  Just need to update our pointer.
+    */
+   batch->ptr += 4;
+   return 0;
+}
+
+/**
+ * Append a block of already-built command dwords to the batch.
+ *
+ * \param batch          destination batchbuffer
+ * \param data           bytes to copy
+ * \param bytes          number of bytes; must be a multiple of 4
+ * \param cliprect_mode  not read by this function
+ * \return 0 on success, otherwise the error from
+ *         brw_batchbuffer_require_space().
+ */
+enum pipe_error
+brw_batchbuffer_data(struct brw_batchbuffer *batch,
+                       const void *data, GLuint bytes,
+		       enum cliprect_mode cliprect_mode)
+{
+   enum pipe_error ret;
+
+   /* whole dwords only */
+   assert((bytes & 3) == 0);
+
+   ret = brw_batchbuffer_require_space(batch, bytes);
+   if (!ret) {
+      memcpy(batch->ptr, data, bytes);
+      batch->ptr += bytes;
+   }
+
+   return ret;
+}
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
new file mode 100644
index 0000000000..6ca9f617f5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -0,0 +1,148 @@
+#ifndef BRW_BATCHBUFFER_H
+#define BRW_BATCHBUFFER_H
+
+#include "util/u_debug.h"
+
+#include "brw_types.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+
+#define BATCH_SZ 16384
+#define BATCH_RESERVED 16
+
+/* All ignored: BEGIN_BATCH() and brw_batchbuffer_data() accept one of
+ * these but never look at it. */
+enum cliprect_mode {
+   IGNORE_CLIPRECTS,
+   LOOP_CLIPRECTS,
+   NO_LOOP_CLIPRECTS,
+   REFERENCES_CLIPRECTS
+};
+
+
+
+
+struct brw_batchbuffer {
+
+   struct brw_winsys_screen *sws;   /* winsys entry points: bo_unmap, bo_exec, bo_emit_reloc, ... */
+   struct brw_winsys_buffer *buf;   /* buffer object backing the batch */
+   struct brw_chipset chipset;
+
+   /**
+    * Values exported to speed up writing the batchbuffer,
+    * instead of having to go through an accessor function for
+    * each dword written.
+    */
+   /*{@*/
+   uint8_t *map;    /* base of the mapping; set to NULL when the batch is flushed */
+   uint8_t *ptr;    /* next byte to write; also NULL after a flush */
+   size_t size;     /* capacity in bytes; BATCH_RESERVED of it is held back */
+   struct {
+      uint8_t *end_ptr;   /* DEBUG only: where the current packet should end */
+   } emit;
+
+
+   size_t relocs;       /* NOTE(review): presumably relocations emitted so far - confirm */
+   size_t max_relocs;   /* NOTE(review): presumably relocation capacity - confirm */
+   /*@}*/
+};
+
+struct brw_batchbuffer *brw_batchbuffer_alloc( struct brw_winsys_screen *sws,
+                                               struct brw_chipset chipset );
+
+void brw_batchbuffer_free(struct brw_batchbuffer *batch);
+
+void _brw_batchbuffer_flush(struct brw_batchbuffer *batch,
+			      const char *file, int line);
+
+
+enum pipe_error
+brw_batchbuffer_reset(struct brw_batchbuffer *batch);
+
+
+/* Unlike bmBufferData, this currently requires the buffer be mapped.
+ * Consider it a convenience function wrapping multiple
+ * intel_buffer_dword() calls.
+ */
+enum pipe_error brw_batchbuffer_data(struct brw_batchbuffer *batch,
+                            const void *data, GLuint bytes,
+			    enum cliprect_mode cliprect_mode);
+
+
+enum pipe_error brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+			       struct brw_winsys_buffer *buffer,
+			       enum brw_buffer_usage usage,
+			       uint32_t delta);   /* OUT_RELOC asserts delta < buffer->size */
+
+/* Inline functions - might actually be better off with these
+ * non-inlined.  Certainly better off switching all command packets to
+ * be passed as structs rather than dwords, but that's a little bit of
+ * work...  brw_batchbuffer_space() gives the bytes still unwritten once
+ * BATCH_RESERVED bytes are set aside. */
+static INLINE GLint
+brw_batchbuffer_space(struct brw_batchbuffer *batch)
+{
+   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
+}
+
+
+static INLINE void
+brw_batchbuffer_emit_dword(struct brw_batchbuffer *batch, GLuint dword)
+{
+   assert(batch->map);                          /* batch must be mapped */
+   assert(brw_batchbuffer_space(batch) >= 4);   /* with one dword still free */
+   ((GLuint *) batch->ptr)[0] = dword;
+   batch->ptr += sizeof(GLuint);
+}
+
+static INLINE enum pipe_error   /* 0 when 'sz' more bytes fit, else an error */
+brw_batchbuffer_require_space(struct brw_batchbuffer *batch, GLuint sz)
+{
+   GLint space = brw_batchbuffer_space(batch);   /* negative once overfull */
+   assert(sz < batch->size - 8);
+   if (space < 0 || (GLuint) space < sz) {   /* sign test first: no unsigned wrap */
+      assert(0);
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+#ifdef DEBUG
+   batch->emit.end_ptr = batch->ptr + sz;    /* later checked by ADVANCE_BATCH() */
+#endif
+   return 0;
+}
+
+/* Here are the crusty old macros, to be removed.  They expect a local
+ * 'brw' in scope; BEGIN_BATCH drops its cliprect_mode argument. */
+#define BEGIN_BATCH(n, cliprect_mode) do {				\
+      brw_batchbuffer_require_space(brw->batch, (n)*4);			\
+   } while (0)
+
+#define OUT_BATCH(d) brw_batchbuffer_emit_dword(brw->batch, d)
+
+#define OUT_RELOC(buf, usage, delta) do { /* args may be expressions */	\
+      assert((unsigned) (delta) < (buf)->size);				\
+      brw_batchbuffer_emit_reloc(brw->batch, (buf),			\
+				 (usage), (delta));			\
+   } while (0)
+
+#ifdef DEBUG   /* packet-length check; _n is signed so %d shows under-runs as negative */
+#define ADVANCE_BATCH() do {						\
+      int _n = (int) (brw->batch->ptr - brw->batch->emit.end_ptr);	\
+      if (_n != 0) {							\
+	 debug_printf("%s: %d too many bytes emitted to batch\n",	\
+		      __FUNCTION__, _n);				\
+	 abort();							\
+      }									\
+      brw->batch->emit.end_ptr = NULL;					\
+   } while(0)
+#else          /* no length check outside DEBUG builds */
+#define ADVANCE_BATCH()
+#endif
+
+static INLINE void
+brw_batchbuffer_emit_mi_flush(struct brw_batchbuffer *batch)
+{
+   (void) brw_batchbuffer_require_space(batch, 4);   /* return value is not checked */
+   brw_batchbuffer_emit_dword(batch, MI_FLUSH);
+}
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
new file mode 100644
index 0000000000..cc8e380c68
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -0,0 +1,129 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+
+
+/* Pass the current CC viewport to brw_cache_data(); slot CC_RELOC_VP gets the bo. */
+static enum pipe_error prepare_cc_vp( struct brw_context *brw )
+{
+   enum pipe_error ret;
+   ret = brw_cache_data( &brw->cache, BRW_CC_VP, &brw->curr.ccv,
+                         NULL, 0, &brw->cc.reloc[CC_RELOC_VP].bo );
+   return ret;
+}
+
+const struct brw_tracked_state brw_cc_vp = {
+   .dirty = {   /* NOTE(review): presumably the state bits that re-trigger .prepare - confirm */
+      .mesa = PIPE_NEW_VIEWPORT,
+      .brw = BRW_NEW_CONTEXT,
+      .cache = 0
+   },
+   .prepare = prepare_cc_vp   /* hands brw->curr.ccv to brw_cache_data() */
+};
+
+
+/* OR two CC state words together.  The operands are structs, so each
+ * one is viewed as a plain unsigned through a union: */
+static INLINE struct brw_cc3
+combine_cc3( struct brw_cc3 a, struct brw_cc3 b )
+{
+   union { struct brw_cc3 bits; unsigned word; } lhs, rhs;
+   lhs.bits = a;
+   rhs.bits = b;
+   lhs.word = lhs.word | rhs.word;
+   return lhs.bits;
+}
+
+static INLINE struct brw_cc1
+combine_cc1( struct brw_cc1 a, struct brw_cc1 b )
+{
+   union { struct brw_cc1 bits; unsigned word; } lhs, rhs;
+   lhs.bits = a;
+   rhs.bits = b;
+   lhs.word = lhs.word | rhs.word;
+   return lhs.bits;
+}
+
+static INLINE struct brw_cc2
+combine_cc2( struct brw_cc2 a, struct brw_cc2 b )
+{
+   union { struct brw_cc2 bits; unsigned word; } lhs, rhs;
+   lhs.bits = a;
+   rhs.bits = b;
+   lhs.word = lhs.word | rhs.word;
+   return lhs.bits;
+}
+
+static int prepare_cc_unit( struct brw_context *brw )
+{
+   /* Fields taken verbatim from the depth/stencil state: */
+   brw->cc.cc.cc0 = brw->curr.zstencil->cc0;
+   brw->cc.cc.cc7 = brw->curr.zstencil->cc7;
+   /* Fields taken verbatim from the blend state: */
+   brw->cc.cc.cc5 = brw->curr.blend->cc5;
+   brw->cc.cc.cc6 = brw->curr.blend->cc6;
+   /* Fields OR-ed together from two sources: */
+   brw->cc.cc.cc1 = combine_cc1( brw->curr.zstencil->cc1, brw->curr.cc1_stencil_ref );
+   brw->cc.cc.cc2 = combine_cc2( brw->curr.zstencil->cc2, brw->curr.blend->cc2 );
+   brw->cc.cc.cc3 = combine_cc3( brw->curr.zstencil->cc3, brw->curr.blend->cc3 );
+   return brw_cache_data_sz(&brw->cache, BRW_CC_UNIT, &brw->cc.cc,
+                            sizeof(brw->cc.cc), brw->cc.reloc, 1,
+                            &brw->cc.state_bo);
+}
+
+const struct brw_tracked_state brw_cc_unit = {
+   .dirty = {
+      .mesa = PIPE_NEW_DEPTH_STENCIL_ALPHA | PIPE_NEW_BLEND,   /* prepare reads curr.zstencil and curr.blend */
+      .brw = 0,
+      .cache = CACHE_NEW_CC_VP   /* NOTE(review): presumably because reloc[] carries the CC_VP bo - confirm */
+   },
+   .prepare = prepare_cc_unit,
+};
+
+
+/* Initialise reloc[0] with make_reloc(): BRW_USAGE_STATE, the offset of
+ * cc4 in the CC unit state, and NULL last (presumably: no bo yet). */
+void brw_hw_cc_init( struct brw_context *brw )
+{
+   const size_t cc4_offset = offsetof(struct brw_cc_unit_state, cc4);
+
+   make_reloc(&brw->cc.reloc[0], BRW_USAGE_STATE, 0, cc4_offset, NULL);
+}
+
+
+void brw_hw_cc_cleanup( struct brw_context *brw )
+{
+   bo_reference(&brw->cc.state_bo, NULL);      /* presumably releases the CC unit state bo - confirm */
+   bo_reference(&brw->cc.reloc[0].bo, NULL);   /* likewise for the bo held in reloc[0] */
+}
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
new file mode 100644
index 0000000000..ccba205e8c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -0,0 +1,223 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_state.h"
+
+#include "util/u_math.h"
+
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_state.h"
+#include "brw_pipe_rast.h"
+#include "brw_clip.h"
+
+
+#define FRONT_UNFILLED_BIT  0x1
+#define BACK_UNFILLED_BIT   0x2
+
+
+static enum pipe_error
+compile_clip_prog( struct brw_context *brw,
+                   struct brw_clip_prog_key *key,
+                   struct brw_winsys_buffer **bo_out )
+{
+   enum pipe_error ret;
+   struct brw_clip_compile c;
+   const GLuint *program;
+   GLuint program_size;
+   GLuint delta;
+
+   memset(&c, 0, sizeof(c));
+   /* Emit the clip program selected by key->primitive and pass it to
+    * brw_upload_cache() keyed on c.key.  Begin the compilation:
+    */
+   brw_init_compile(brw, &c.func);
+
+   c.func.single_program_flow = 1;
+
+   c.chipset = brw->chipset;
+   c.key = *key;
+   c.need_ff_sync = c.chipset.is_igdng;
+
+   /* Need to locate the two positions present in vertex + header.
+    * These are currently hardcoded:
+    */
+   c.header_position_offset = ATTR_SIZE;
+
+   if (c.chipset.is_igdng)
+       delta = 3 * REG_SIZE;   /* same 3-vs-1 register split as nr_regs below */
+   else
+       delta = REG_SIZE;
+
+   c.offset_hpos = delta + c.key.output_hpos * ATTR_SIZE;
+   /* Outputs that are not present keep the offset 0 left by memset: */
+   if (c.key.output_color0 != BRW_OUTPUT_NOT_PRESENT)
+      c.offset_color0 = delta + c.key.output_color0 * ATTR_SIZE;
+
+   if (c.key.output_color1 != BRW_OUTPUT_NOT_PRESENT)
+      c.offset_color1 = delta + c.key.output_color1 * ATTR_SIZE;
+
+   if (c.key.output_bfc0 != BRW_OUTPUT_NOT_PRESENT)
+      c.offset_bfc0 = delta + c.key.output_bfc0 * ATTR_SIZE;
+
+   if (c.key.output_bfc1 != BRW_OUTPUT_NOT_PRESENT)
+      c.offset_bfc1 = delta + c.key.output_bfc1 * ATTR_SIZE;
+
+   if (c.key.output_edgeflag != BRW_OUTPUT_NOT_PRESENT)
+      c.offset_edgeflag = delta + c.key.output_edgeflag * ATTR_SIZE;
+   /* NOTE(review): BRW_IS_IGDNG(brw) here vs c.chipset.is_igdng above - presumably equivalent; confirm */
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.key.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.key.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
+   c.nr_bytes = c.nr_regs * REG_SIZE;
+
+   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
+
+   /* For some reason the thread is spawned with only 4 channels
+    * unmasked.  
+    */
+   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
+
+
+   /* Would ideally have the option of producing a program which could
+    * do all three:
+    */
+   switch (key->primitive) {
+   case PIPE_PRIM_TRIANGLES: 
+      if (key->do_unfilled)
+	 brw_emit_unfilled_clip( &c );
+      else
+	 brw_emit_tri_clip( &c );
+      break;
+   case PIPE_PRIM_LINES:
+      brw_emit_line_clip( &c );
+      break;
+   case PIPE_PRIM_POINTS:
+      brw_emit_point_clip( &c );
+      break;
+   default:
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+	 
+
+   /* get the program
+    */
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
+
+   /* Upload
+    */
+   ret = brw_upload_cache( &brw->cache,
+                           BRW_CLIP_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->clip.prog_data,
+                           bo_out );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+
+/* Find the clip program matching the current rast, primitive, vertex
+ * shader and user-clip state in the cache, compiling it on a miss.
+ */
+static enum pipe_error
+upload_clip_prog(struct brw_context *brw)
+{
+   struct brw_clip_prog_key key;
+   const struct brw_vertex_shader *shader = brw->curr.vertex_shader;
+   enum pipe_error ret;
+
+   /* The rast state carries an almost-complete key; copy it and fill
+    * in the fields that come from elsewhere.
+    */
+
+   /* PIPE_NEW_RAST */
+   key = brw->curr.rast->clip_key;
+
+   /* PIPE_NEW_CLIP */
+   key.nr_userclip = brw->curr.ucp.nr;
+
+   /* BRW_NEW_REDUCED_PRIMITIVE */
+   key.primitive = brw->reduced_primitive;
+
+   /* PIPE_NEW_VS */
+   key.output_hpos     = shader->output_hpos;
+   key.output_color0   = shader->output_color0;
+   key.output_color1   = shader->output_color1;
+   key.output_bfc0     = shader->output_bfc0;
+   key.output_bfc1     = shader->output_bfc1;
+   key.output_edgeflag = shader->output_edgeflag;
+
+   /* CACHE_NEW_VS_PROG
+    * XXX: if edgeflag is moved to a proper TGSI vs output, can remove
+    * this dependency.
+    */
+   key.nr_attrs        = brw->vs.prog_data->nr_outputs;
+
+   /* Only a cache miss needs a compile:
+    */
+   if (!brw_search_cache(&brw->cache, BRW_CLIP_PROG,
+                         &key, sizeof(key),
+                         NULL, 0,
+                         &brw->clip.prog_data,
+                         &brw->clip.prog_bo)) {
+      ret = compile_clip_prog( brw, &key, &brw->clip.prog_bo );
+      if (ret)
+         return ret;
+   }
+
+   /* Found in the cache, or compiled successfully:
+    */
+   return PIPE_OK;
+}
+
+
+const struct brw_tracked_state brw_clip_prog = {
+   .dirty = {
+      .mesa  = (PIPE_NEW_RAST | 
+		PIPE_NEW_CLIP),
+      .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
+      .cache = CACHE_NEW_VS_PROG   /* NOTE(review): upload_clip_prog also tags reads as PIPE_NEW_VS, not listed in .mesa - confirm */
+   },
+   .prepare = upload_clip_prog
+};
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
new file mode 100644
index 0000000000..80e3a11a37
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -0,0 +1,199 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#ifndef BRW_CLIP_H
+#define BRW_CLIP_H
+
+#include "pipe/p_state.h"
+#include "brw_reg.h"
+#include "brw_eu.h"
+
+#define MAX_VERTS (3+6+6)	
+
+/* Note that if unfilled primitives are being emitted, we have to fix
+ * up polygon offset and flatshading at this point:
+ */
+struct brw_clip_prog_key {
+   GLuint nr_attrs:6;		/* set from vs prog_data->nr_outputs */
+   GLuint primitive:4;		/* set from brw->reduced_primitive */
+   GLuint nr_userclip:3;	/* set from brw->curr.ucp.nr */
+   GLuint do_flat_shading:1;
+   GLuint do_unfilled:1;	/* picks the unfilled triangle clipper */
+   GLuint fill_cw:2;		/* includes cull information */
+   GLuint fill_ccw:2;		/* includes cull information */
+   GLuint offset_cw:1;
+   GLuint offset_ccw:1;
+   GLuint copy_bfc_cw:1;
+   GLuint copy_bfc_ccw:1;
+   GLuint clip_mode:3;		/* copied into prog_data.clip_mode */
+   GLuint output_hpos:6;        /* not always zero? */
+
+   GLuint output_color0:6;	/* output_*: vertex shader output slots, */
+   GLuint output_color1:6;	/* or BRW_OUTPUT_NOT_PRESENT when absent */
+   GLuint output_bfc0:6;
+   GLuint output_bfc1:6;
+   GLuint output_edgeflag:6;
+   GLuint pad1:2;		/* the bitfield widths above add up to 32 + 32 bits */
+   
+   GLfloat offset_factor;
+   GLfloat offset_units;
+};
+
+struct brw_clip_prog_data {
+   GLuint curb_read_length;	/* user planes?  line clip: (6 + nr_userclip + 1) / 2, or 0 without user planes */
+   GLuint clip_mode;		/* copied from key.clip_mode in compile_clip_prog */
+   GLuint urb_read_length;	/* line clip sets this to c->nr_regs */
+   GLuint total_grf;		/* line clip: count of GRFs handed out */
+};
+
+#define CLIP_LINE   0
+#define CLIP_POINT  1
+#define CLIP_FILL   2
+#define CLIP_CULL   3
+
+
+#define PRIM_MASK  (0x1f)
+
+struct brw_clip_compile {
+   struct brw_compile func;		/* passed to brw_init_compile() and the instruction emitters */
+   struct brw_clip_prog_key key;	/* copy of the key being compiled */
+   struct brw_clip_prog_data prog_data;	/* handed to brw_upload_cache() with the program */
+   
+   struct {
+      struct brw_reg R0;
+      struct brw_reg vertex[MAX_VERTS];
+
+      struct brw_reg t;
+      struct brw_reg t0, t1;
+      struct brw_reg dp0, dp1;
+
+      struct brw_reg dpPrev;
+      struct brw_reg dp;
+      struct brw_reg loopcount;
+      struct brw_reg nr_verts;
+      struct brw_reg planemask;
+
+      struct brw_reg inlist;
+      struct brw_reg outlist;
+      struct brw_reg freelist;
+
+      struct brw_reg dir;
+      struct brw_reg tmp0, tmp1;
+      struct brw_reg offset;
+      
+      struct brw_reg fixed_planes;
+      struct brw_reg plane_equation;
+       
+      struct brw_reg ff_sync;		/* only assigned when need_ff_sync is set */
+   } reg;
+
+   /* 3 different ways of expressing vertex size, including
+    * key.nr_attrs.
+    */
+   GLuint nr_regs;			/* (nr_attrs + 1) / 2, plus 1 (plus 3 on IGDNG) */
+   GLuint nr_bytes;			/* nr_regs * REG_SIZE */
+
+   GLuint first_tmp;
+   GLuint last_tmp;
+
+   GLboolean need_direction;
+   struct brw_chipset chipset;
+
+   GLuint last_mrf;
+
+   GLuint header_position_offset;	/* set to ATTR_SIZE by compile_clip_prog */
+   GLboolean need_ff_sync;		/* set from chipset.is_igdng */
+
+   GLuint nr_color_attrs;
+   GLuint offset_color0;		/* offset_*: delta + output index * ATTR_SIZE, */
+   GLuint offset_color1;		/* left at 0 when the output is not present */
+   GLuint offset_bfc0;
+   GLuint offset_bfc1;
+
+   GLuint offset_hpos;
+   GLuint offset_edgeflag;
+};
+
+#define ATTR_SIZE  (4*4)
+
+/* Points are only culled, so no need for a clip routine, however it
+ * works out easier to have a dummy one.
+ */
+void brw_emit_unfilled_clip( struct brw_clip_compile *c );
+void brw_emit_tri_clip( struct brw_clip_compile *c );
+void brw_emit_line_clip( struct brw_clip_compile *c );
+void brw_emit_point_clip( struct brw_clip_compile *c );
+
+/* brw_clip_tri.c, for use by the unfilled clip routine:
+ */
+void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
+void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
+void brw_clip_tri( struct brw_clip_compile *c );
+void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
+void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
+			      GLuint nr_verts );
+
+
+/* Utils:
+ */
+
+void brw_clip_interp_vertex( struct brw_clip_compile *c,
+			     struct brw_indirect dest_ptr,
+			     struct brw_indirect v0_ptr, /* from */
+			     struct brw_indirect v1_ptr, /* to */
+			     struct brw_reg t0,
+			     GLboolean force_edgeflag );
+
+void brw_clip_init_planes( struct brw_clip_compile *c );
+
+void brw_clip_emit_vue(struct brw_clip_compile *c, 
+		       struct brw_indirect vert,
+		       GLboolean allocate,
+		       GLboolean eot,
+		       GLuint header);
+
+void brw_clip_kill_thread(struct brw_clip_compile *c);
+
+struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
+struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
+
+void brw_clip_copy_colors( struct brw_clip_compile *c,
+			   GLuint to, GLuint from );
+
+void brw_clip_init_clipmask( struct brw_clip_compile *c );
+
+struct brw_reg get_tmp( struct brw_clip_compile *c );
+
+void brw_clip_project_position(struct brw_clip_compile *c,
+             struct brw_reg pos );
+void brw_clip_ff_sync(struct brw_clip_compile *c);
+void brw_clip_init_ff_sync(struct brw_clip_compile *c);
+#endif
diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c
new file mode 100644
index 0000000000..66caadc4d5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_line.c
@@ -0,0 +1,270 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_debug.h"
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_clip.h"
+
+
+
+
+static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
+{
+   GLuint i = 0,j;   /* i: number of the next GRF to hand out */
+
+   /* Register usage is static, precompute here: GRFs are handed out in
+    * order through 'i', whose final value becomes total_grf. */
+   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+   if (c->key.nr_userclip) {
+      c->reg.fixed_planes = brw_vec4_grf(i, 0);
+      i += (6 + c->key.nr_userclip + 1) / 2;
+      /* same count again: (6 + nr_userclip) halved, rounded up */
+      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
+   }
+   else
+      c->prog_data.curb_read_length = 0;
+
+
+   /* Payload vertices plus space for more generated vertices:
+    */
+   for (j = 0; j < 4; j++) {
+      c->reg.vertex[j] = brw_vec4_grf(i, 0);
+      i += c->nr_regs;
+   }
+   /* t, t0, t1, planemask and plane_equation all live in one GRF: */
+   c->reg.t           = brw_vec1_grf(i, 0);
+   c->reg.t0          = brw_vec1_grf(i, 1);
+   c->reg.t1          = brw_vec1_grf(i, 2);
+   c->reg.planemask   = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
+   c->reg.plane_equation = brw_vec4_grf(i, 4);
+   i++;
+
+   c->reg.dp0         = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
+   c->reg.dp1         = brw_vec1_grf(i, 4);
+   i++;
+   /* Without user clip planes, fixed_planes gets a GRF here instead: */
+   if (!c->key.nr_userclip) {
+      c->reg.fixed_planes = brw_vec8_grf(i, 0); 
+      i++;
+   }
+
+   if (c->need_ff_sync) {
+      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+      i++;
+   }
+
+   c->first_tmp = i;
+   c->last_tmp = i;
+
+   c->prog_data.urb_read_length = c->nr_regs; /* ? */
+   c->prog_data.total_grf = i;
+}
+
+
+
+/* Line clipping, roughly following this algorithm:
+ *
+ *  for (p=0;p<MAX_PLANES;p++) {
+ *     if (clipmask & (1 << p)) {
+ *        GLfloat dp0 = DOTPROD( vtx0, plane[p] );
+ *        GLfloat dp1 = DOTPROD( vtx1, plane[p] );
+ *
+ *        if (IS_NEGATIVE(dp1)) {
+ *           GLfloat t = dp1 / (dp1 - dp0);
+ *           if (t > t1) t1 = t;
+ *        } else {
+ *           GLfloat t = dp0 / (dp0 - dp1);
+ *           if (t > t0) t0 = t;
+ *        }
+ *  
+ *        if (t0 + t1 >= 1.0)
+ *           return;
+ *     }
+ *  }
+ *
+ *  interp( ctx, newvtx0, vtx0, vtx1, t0 );
+ *  interp( ctx, newvtx1, vtx1, vtx0, t1 );
+ *
+ */
+static void clip_and_emit_line( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_indirect vtx0     = brw_indirect(0, 0);
+   struct brw_indirect vtx1      = brw_indirect(1, 0);
+   struct brw_indirect newvtx0   = brw_indirect(2, 0);
+   struct brw_indirect newvtx1   = brw_indirect(3, 0);
+   struct brw_indirect plane_ptr = brw_indirect(4, 0);
+   struct brw_instruction *plane_loop;
+   struct brw_instruction *plane_active;
+   struct brw_instruction *is_negative;
+   struct brw_instruction *is_neg2 = NULL;
+   struct brw_instruction *not_culled;
+   struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
+   /* Load the addresses of vertex[0..3] and of the first clip plane: */
+   brw_MOV(p, get_addr_reg(vtx0),      brw_address(c->reg.vertex[0]));
+   brw_MOV(p, get_addr_reg(vtx1),      brw_address(c->reg.vertex[1]));
+   brw_MOV(p, get_addr_reg(newvtx0),   brw_address(c->reg.vertex[2]));
+   brw_MOV(p, get_addr_reg(newvtx1),   brw_address(c->reg.vertex[3]));
+   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
+
+   /* Note: init t0, t1 together - they are adjacent elements of one
+    * register (see brw_clip_line_alloc_regs), so a vec2 MOV covers both. */
+   brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
+
+   brw_clip_init_planes(c);
+   brw_clip_init_clipmask(c);
+
+   /* -ve (negative) rhw workaround: test bit 20 of R0 element 2, then OR 0x3f into planemask */
+   if (c->chipset.is_965) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+              brw_imm_ud(1<<20));
+      brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
+   }
+
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   plane_loop = brw_DO(p, BRW_EXECUTE_1);
+   {
+      /* if (planemask & 1)
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
+      
+      plane_active = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 if (c->key.nr_userclip)
+	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
+	 else
+	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
+
+	 /* dp = DP4(vtx->position, plane) 
+	  */
+	 brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset_hpos), c->reg.plane_equation);
+
+	 /* if (IS_NEGATIVE(dp1)) 
+	  */
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset_hpos), c->reg.plane_equation);
+	 is_negative = brw_IF(p, BRW_EXECUTE_1);
+	 {
+             /*
+              * Both can be negative on GM965/G965 due to RHW workaround
+              * if so, this object should be rejected.
+              */
+             if (c->chipset.is_965) {
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
+                 is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+                 {
+                     brw_clip_kill_thread(c);
+                 }
+                 brw_ENDIF(p, is_neg2);
+             }
+
+             brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));   /* t = dp1 / (dp1 - dp0) */
+             brw_math_invert(p, c->reg.t, c->reg.t);
+             brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
+
+             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );   /* if (t > t1) t1 = t */
+             brw_MOV(p, c->reg.t1, c->reg.t);
+             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 } 
+	 is_negative = brw_ELSE(p, is_negative);
+	 {
+             /* Coming back in.  We know that both cannot be negative
+              * because the line would have been culled in that case.
+              */
+
+             /* If both are positive, do nothing */
+             /* Only on GM965/G965 */
+             if (c->chipset.is_965) {
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
+                 is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+             }
+
+             {
+                 brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));   /* t = dp0 / (dp0 - dp1) */
+                 brw_math_invert(p, c->reg.t, c->reg.t);
+                 brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
+
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );   /* if (t > t0) t0 = t */
+                 brw_MOV(p, c->reg.t0, c->reg.t);
+                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+             }
+
+             if (c->chipset.is_965) {
+                 brw_ENDIF(p, is_neg2);
+             }
+         }
+	 brw_ENDIF(p, is_negative);	 
+      }
+      brw_ENDIF(p, plane_active);
+      
+      /* plane_ptr++;
+       */
+      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
+
+      /* while (planemask>>=1) != 0
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
+   }
+   brw_WHILE(p, plane_loop);
+   /* Emit the clipped line only when t0 + t1 < 1.0: */
+   brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
+   not_culled = brw_IF(p, BRW_EXECUTE_1);
+   {
+      brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, FALSE);
+      brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, FALSE);
+
+      brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+      brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); 
+   }
+   brw_ENDIF(p, not_culled);
+   brw_clip_kill_thread(c);   /* emitted after the ENDIF, so on both paths */
+}
+
+
+
+void brw_emit_line_clip( struct brw_clip_compile *c )
+{
+   brw_clip_line_alloc_regs(c);
+   brw_clip_init_ff_sync(c);
+   /* Flat shading: copy colors to vertex 0 from vertex 1 (to, from): */
+   if (c->key.do_flat_shading)
+      brw_clip_copy_colors(c, 0, 1);
+   /* Then emit the plane loop and the clipped line itself: */
+   clip_and_emit_line(c);
+}
diff --git a/src/gallium/drivers/i965/brw_clip_point.c b/src/gallium/drivers/i965/brw_clip_point.c
new file mode 100644
index 0000000000..124156c1b5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_point.c
@@ -0,0 +1,47 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_clip.h"
+
+
+/* Point clipping, nothing to do?  Allocate registers for zero vertices,
+ * init ff_sync, then end the thread. */
+void brw_emit_point_clip( struct brw_clip_compile *c )
+{
+   /* Send an empty message to kill the thread:
+    */
+   brw_clip_tri_alloc_regs(c, 0);
+   brw_clip_init_ff_sync(c);
+
+   brw_clip_kill_thread(c);
+}
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
new file mode 100644
index 0000000000..5c3ccfd8d0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -0,0 +1,209 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_math.h"
+
+#include "brw_context.h"
+#include "brw_clip.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+/* Key under which a clip unit state is looked up in, and uploaded to,
+ * the state cache (filled in by clip_unit_populate_key). */
+struct brw_clip_unit_key {
+   unsigned total_grf;                /* CACHE_NEW_CLIP_PROG */
+   unsigned urb_entry_read_length;
+   unsigned curb_entry_read_length;
+   unsigned clip_mode;
+   unsigned curbe_offset;             /* BRW_NEW_CURBE_OFFSETS */
+   unsigned nr_urb_entries;           /* BRW_NEW_URB_FENCE */
+   unsigned urb_size;
+   GLboolean depth_clamp;             /* currently always 0 */
+};
+
+static void
+clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
+{
+   /* Clear all bytes; the cache gets the key as pointer + size. */
+   memset(key, 0, sizeof *key);
+   /* BRW_NEW_URB_FENCE */
+   key->urb_size = brw->urb.vsize;
+   key->nr_urb_entries = brw->urb.nr_clip_entries;
+
+   /* BRW_NEW_CURBE_OFFSETS */
+   key->curbe_offset = brw->curbe.clip_start;
+
+   /* CACHE_NEW_CLIP_PROG */
+   key->clip_mode = brw->clip.prog_data->clip_mode;
+   key->curb_entry_read_length = brw->clip.prog_data->curb_read_length;
+   key->urb_entry_read_length = brw->clip.prog_data->urb_read_length;
+   key->total_grf = brw->clip.prog_data->total_grf;
+
+   /* XXX: add this to gallium: ctx->Transform.DepthClamp; */
+   key->depth_clamp = 0;
+}
+
+static enum pipe_error
+clip_unit_create_from_key(struct brw_context *brw,
+                          struct brw_clip_unit_key *key,
+                          struct brw_winsys_reloc *reloc,
+                          struct brw_winsys_buffer **bo_out)
+{
+   struct brw_clip_unit_state clip;
+   enum pipe_error ret;
+   unsigned max_threads;
+
+   memset(&clip, 0, sizeof clip);
+
+   /* total_grf aligned to 16, in units of 16, minus one */
+   clip.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+   clip.thread0.kernel_start_pointer = 0;   /* reloc */
+
+   clip.thread1.single_program_flow = 1;
+   clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+   clip.thread3.urb_entry_read_offset = 0;
+   clip.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   clip.thread3.dispatch_grf_start_reg = 1;
+   clip.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+   clip.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
+
+   clip.thread4.urb_entry_allocation_size = key->urb_size - 1;
+   clip.thread4.nr_urb_entries = key->nr_urb_entries;
+
+   /* Choose the thread count from the number of clip URB entries. */
+   if (key->nr_urb_entries < 10) {
+      assert(key->nr_urb_entries >= 5);
+      max_threads = 1;
+   } else {
+      /* Half of the URB entries go to each thread, and it has to be an
+       * even number.
+       */
+      assert(key->nr_urb_entries % 2 == 0);
+
+      /* Although up to 16 concurrent Clip threads are allowed on IGDNG,
+       * only 2 threads can output VUEs at a time.
+       */
+      max_threads = BRW_IS_IGDNG(brw) ? 16 : 2;
+   }
+
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      max_threads = 1;
+
+   /* The field is programmed with the thread count minus one. */
+   clip.thread4.max_threads = max_threads - 1;
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      clip.thread4.stats_enable = 1;
+
+   clip.clip5.clip_mode = key->clip_mode;
+   clip.clip5.api_mode = BRW_CLIP_API_OGL;
+   clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
+   clip.clip5.viewport_xy_clip_enable = 1;
+   clip.clip5.viewport_z_clip_enable = key->depth_clamp ? 0 : 1;
+   clip.clip5.guard_band_enable = 0;
+   clip.clip5.userclip_must_clip = 1;
+   clip.clip5.userclip_enable_flags = 0x7f;
+
+   if (BRW_IS_G4X(brw))
+      clip.clip5.negative_w_clip_test = 1;
+
+   clip.clip6.clipper_viewport_state_ptr = 0;
+   clip.viewport_xmin = -1;
+   clip.viewport_ymin = -1;
+   clip.viewport_xmax = 1;
+   clip.viewport_ymax = 1;
+
+   ret = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
+                          key, sizeof(*key),
+                          reloc, 1,
+                          &clip, sizeof(clip),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+
+static int upload_clip_unit( struct brw_context *brw )
+{
+   struct brw_winsys_reloc reloc[1];
+   struct brw_clip_unit_key key;
+   unsigned grf_reg_count;
+   enum pipe_error ret;
+
+   clip_unit_populate_key(brw, &key);
+
+   grf_reg_count = align(key.total_grf, 16) / 16 - 1;
+
+   /* Relocation for the clip program, applied at the offset of thread0
+    * in the unit state, with grf_reg_count << 1 as its delta.
+    *
+    * XXX: these reloc structs are long lived and only need to be
+    * updated when the bound BO changes.  Hopefully the stuff mixed
+    * in the delta's is non-orthogonal.
+    */
+   assert(brw->clip.prog_bo);
+   make_reloc(&reloc[0],
+              BRW_USAGE_STATE,
+              grf_reg_count << 1,
+              offsetof(struct brw_clip_unit_state, thread0),
+              brw->clip.prog_bo);
+
+   /* A non-zero search result ends the work here with state_bo as the
+    * search left it; otherwise a new state is created and uploaded.
+    */
+   if (!brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
+                         &key, sizeof(key),
+                         reloc, 1,
+                         NULL,
+                         &brw->clip.state_bo)) {
+      ret = clip_unit_create_from_key(brw, &key,
+                                      reloc,
+                                      &brw->clip.state_bo);
+      if (ret)
+         return ret;
+   }
+
+   return PIPE_OK;
+}
+
+/* Tracked-state entry for the clip unit: prepare hook and dirty flags. */
+const struct brw_tracked_state brw_clip_unit = {
+   .dirty = {
+      .mesa  = 0,
+      .brw   = BRW_NEW_CURBE_OFFSETS | BRW_NEW_URB_FENCE,
+      .cache = CACHE_NEW_CLIP_PROG,
+   },
+   .prepare = upload_clip_unit,
+};
diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c
new file mode 100644
index 0000000000..069524bc14
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_tri.c
@@ -0,0 +1,594 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_clip.h"
+/* Rewind last_tmp to first_tmp, the value brw_clip_tri_alloc_regs set. */
+static void release_tmps( struct brw_clip_compile *c )
+{
+   c->last_tmp = c->first_tmp;
+}
+/* Assign the fixed GRF registers used by the clip programs and record
+ * the resulting counts in c->prog_data; nr_verts sizes c->reg.vertex[]. */
+void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
+			      GLuint nr_verts )
+{
+   GLuint i = 0,j;
+
+   /* Register usage is static, precompute here:
+    * (i is the number of the next unassigned GRF) */
+   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+   if (c->key.nr_userclip) {
+      c->reg.fixed_planes = brw_vec4_grf(i, 0);
+      i += (6 + c->key.nr_userclip + 1) / 2;
+
+      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
+   }
+   else
+      c->prog_data.curb_read_length = 0;
+
+
+   /* Payload vertices plus space for more generated vertices:
+    * (each vertex occupies c->nr_regs registers) */
+   for (j = 0; j < nr_verts; j++) {
+      c->reg.vertex[j] = brw_vec4_grf(i, 0);
+      i += c->nr_regs;
+   }
+   /* Odd nr_attrs: store 0.0f at offset delta in vertex[0..2] (NOTE(review): even when nr_verts < 3). */
+   if (c->key.nr_attrs & 1) {
+      for (j = 0; j < 3; j++) {
+	 GLuint delta = c->key.nr_attrs*16 + 32;
+
+         if (c->chipset.is_igdng)
+             delta = c->key.nr_attrs * 16 + 32 * 3;
+
+	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
+      }
+   }
+   /* t, loopcount, nr_verts, planemask and plane_equation share one GRF. */
+   c->reg.t          = brw_vec1_grf(i, 0);
+   c->reg.loopcount  = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D);
+   c->reg.nr_verts   = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
+   c->reg.planemask  = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
+   c->reg.plane_equation = brw_vec4_grf(i, 4);
+   i++;
+
+   c->reg.dpPrev     = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
+   c->reg.dp         = brw_vec1_grf(i, 4);
+   i++;
+
+   c->reg.inlist     = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+   i++;
+
+   c->reg.outlist    = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+   i++;
+
+   c->reg.freelist   = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+   i++;
+   /* Without user clip planes fixed_planes is placed here, not after R0. */
+   if (!c->key.nr_userclip) {
+      c->reg.fixed_planes = brw_vec8_grf(i, 0); 
+      i++;
+   }
+
+   if (c->key.do_unfilled) {
+      c->reg.dir     = brw_vec4_grf(i, 0);
+      c->reg.offset  = brw_vec4_grf(i, 4);
+      i++;
+      c->reg.tmp0    = brw_vec4_grf(i, 0);
+      c->reg.tmp1    = brw_vec4_grf(i, 4);
+      i++;
+   }
+
+   if (c->need_ff_sync) {
+      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+      i++;
+   }
+   /* first_tmp/last_tmp start at the first register not assigned above. */
+   c->first_tmp = i;
+   c->last_tmp = i;
+
+   c->prog_data.urb_read_length = c->nr_regs; /* ? */
+   c->prog_data.total_grf = i;
+}
+/* Emit code that builds the initial inlist of vertex addresses, zeroes
+ * the outlist register and sets nr_verts to 3.  The first two entries
+ * are swapped for _3DPRIM_TRISTRIP_REVERSE primitives. */
+void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
+   struct brw_instruction *is_rev;
+
+   /* Initial list of indices for incoming vertexes:
+    * tmp0 = element 2 of R0 masked with PRIM_MASK, tested below */
+   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
+   brw_CMP(p, 
+	   vec1(brw_null_reg()), 
+	   BRW_CONDITIONAL_EQ, 
+	   tmp0,
+	   brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
+
+   /* XXX: Is there an easier way to do this?  Need to reverse every
+    * second tristrip element:  Can ignore sometimes?
+    */
+   is_rev = brw_IF(p, BRW_EXECUTE_1);
+   {   
+      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[1]) );
+      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[0]) );
+      if (c->need_direction)
+	 brw_MOV(p, c->reg.dir, brw_imm_f(-1));
+   }
+   is_rev = brw_ELSE(p, is_rev);
+   {
+      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[0]) );
+      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[1]) );
+      if (c->need_direction)
+	 brw_MOV(p, c->reg.dir, brw_imm_f(1));
+   }
+   brw_ENDIF(p, is_rev);
+   /* Third entry, zeroed outlist register, and a vertex count of 3. */
+   brw_MOV(p, get_element(c->reg.inlist, 2),  brw_address(c->reg.vertex[2]) );
+   brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
+   brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
+}
+/* Emit the flat-shading colour copies.  _3DPRIM_POLYGON uses the pairs
+ * (1, 0) and (2, 0), everything else (0, 2) and (1, 2); presumably the
+ * arguments of brw_clip_copy_colors are (c, to, from) - TODO confirm. */
+void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *is_poly;
+   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
+   /* tmp0 = element 2 of R0 masked with PRIM_MASK, compared to POLYGON */
+   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
+   brw_CMP(p, 
+	   vec1(brw_null_reg()), 
+	   BRW_CONDITIONAL_EQ, 
+	   tmp0,
+	   brw_imm_ud(_3DPRIM_POLYGON));
+
+   is_poly = brw_IF(p, BRW_EXECUTE_1);
+   {   
+      brw_clip_copy_colors(c, 1, 0);
+      brw_clip_copy_colors(c, 2, 0);
+   }
+   is_poly = brw_ELSE(p, is_poly);
+   {
+      brw_clip_copy_colors(c, 0, 2);
+      brw_clip_copy_colors(c, 1, 2);
+   }
+   brw_ENDIF(p, is_poly);
+}
+/* Use mesa's clipping algorithms, translated to GEN4 assembly.
+ *
+ * Emits a loop over the bits of c->reg.planemask; for each set bit the
+ * vertices listed in inlist are clipped against that plane into
+ * outlist, which is then copied back over inlist for the next plane. */
+void brw_clip_tri( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_indirect vtx = brw_indirect(0, 0);
+   struct brw_indirect vtxPrev = brw_indirect(1, 0);
+   struct brw_indirect vtxOut = brw_indirect(2, 0);
+   struct brw_indirect plane_ptr = brw_indirect(3, 0);
+   struct brw_indirect inlist_ptr = brw_indirect(4, 0);
+   struct brw_indirect outlist_ptr = brw_indirect(5, 0);
+   struct brw_indirect freelist_ptr = brw_indirect(6, 0);
+   struct brw_instruction *plane_loop;
+   struct brw_instruction *plane_active;
+   struct brw_instruction *vertex_loop;
+   struct brw_instruction *next_test;
+   struct brw_instruction *prev_test;
+   /* Initial pointers: vtxPrev starts at vertex[2]. */
+   brw_MOV(p, get_addr_reg(vtxPrev),     brw_address(c->reg.vertex[2]) );
+   brw_MOV(p, get_addr_reg(plane_ptr),   brw_clip_plane0_address(c));
+   brw_MOV(p, get_addr_reg(inlist_ptr),  brw_address(c->reg.inlist));
+   brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
+   /* Generated vertices are taken from vertex[3] upward. */
+   brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
+
+   plane_loop = brw_DO(p, BRW_EXECUTE_1);
+   {
+      /* if (planemask & 1)
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
+      
+      plane_active = brw_IF(p, BRW_EXECUTE_1);
+      {
+	 /* vtxOut = freelist_ptr++ 
+	  * (the pointer advances by one vertex: nr_regs * REG_SIZE bytes) */
+	 brw_MOV(p, get_addr_reg(vtxOut),       get_addr_reg(freelist_ptr) );
+	 brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
+
+	 if (c->key.nr_userclip)
+	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
+	 else
+	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
+	 /* loopcount = nr_verts; nr_verts = 0 (recounted as vertices are output) */
+	 brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+	 brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
+
+	 vertex_loop = brw_DO(p, BRW_EXECUTE_1);
+	 {
+	    /* vtx = *input_ptr;
+	     */
+	    brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
+
+	    /* IS_NEGATIVE(prev) */
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	    brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset_hpos), c->reg.plane_equation);
+	    prev_test = brw_IF(p, BRW_EXECUTE_1);
+	    {
+	       /* IS_POSITIVE(next)
+		*/
+	       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
+	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset_hpos), c->reg.plane_equation);
+	       next_test = brw_IF(p, BRW_EXECUTE_1);
+	       {
+
+		  /* Coming back in.
+		   * t = dpPrev / (dpPrev - dp) */
+		  brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
+		  brw_math_invert(p, c->reg.t, c->reg.t);
+		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
+
+		  /* If (vtxOut == 0) vtxOut = vtxPrev
+		   * (presumably the MOV is predicated on the CMP result) */
+		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
+		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) );
+		  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+		  brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, GL_FALSE);
+
+		  /* *outlist_ptr++ = vtxOut;
+		   * nr_verts++; 
+		   * vtxOut = 0;
+		   */
+		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
+		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
+	       }
+	       brw_ENDIF(p, next_test);
+	       
+	    }
+	    prev_test = brw_ELSE(p, prev_test);
+	    {
+	       /* *outlist_ptr++ = vtxPrev;
+		* nr_verts++;
+		*/
+	       brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
+	       brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+	       brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+
+	       /* IS_NEGATIVE(next)
+		*/
+	       brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset_hpos), c->reg.plane_equation);
+	       next_test = brw_IF(p, BRW_EXECUTE_1);
+	       {
+		  /* Going out of bounds.  Avoid division by zero as we
+		   * know dp != dpPrev from DIFFERENT_SIGNS, above.
+		   * t = dp / (dp - dpPrev) */
+		  brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
+		  brw_math_invert(p, c->reg.t, c->reg.t);
+		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
+
+		  /* If (vtxOut == 0) vtxOut = vtx
+		   * (presumably the MOV is predicated on the CMP result) */
+		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
+		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) );
+		  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+		  brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, GL_TRUE);		  
+
+		  /* *outlist_ptr++ = vtxOut;
+		   * nr_verts++; 
+		   * vtxOut = 0;
+		   */
+		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
+		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
+	       } 	       
+	       brw_ENDIF(p, next_test);
+	    }
+	    brw_ENDIF(p, prev_test);
+	    
+	    /* vtxPrev = vtx;
+	     * inlist_ptr++;
+	     */
+	    brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
+	    brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
+
+	    /* while (--loopcount != 0)
+	     */
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	    brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+	 } 
+	 brw_WHILE(p, vertex_loop);
+
+	 /* vtxPrev = *(outlist_ptr-1)  OR: outlist[nr_verts-1]
+	  * inlist = outlist
+	  * inlist_ptr = &inlist[0]
+	  * outlist_ptr = &outlist[0]
+	  */
+	 brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
+	 brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
+	 brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
+	 brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
+	 brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
+      }
+      brw_ENDIF(p, plane_active);
+      
+      /* plane_ptr++;
+       */
+      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
+
+      /* nr_verts >= 3 
+       */
+      brw_CMP(p,
+	      vec1(brw_null_reg()),
+	      BRW_CONDITIONAL_GE,
+	      c->reg.nr_verts,
+	      brw_imm_ud(3));
+   
+      /* && (planemask>>=1) != 0
+       */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
+   }
+   brw_WHILE(p, plane_loop);
+}
+/* Emit the vertices listed in c->reg.inlist as a single trifan, but
+ * only when nr_verts - 2 is greater than zero.  R02_PRIM_START is set
+ * on the first vertex and R02_PRIM_END on the last. */
+void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *loop, *if_insn;
+
+   /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
+    */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
+   brw_ADD(p,
+	   c->reg.loopcount,
+	   c->reg.nr_verts,
+	   brw_imm_d(-2));
+
+   if_insn = brw_IF(p, BRW_EXECUTE_1);
+   {
+      struct brw_indirect v0 = brw_indirect(0, 0);
+      struct brw_indirect vptr = brw_indirect(1, 0);
+      /* vptr steps through inlist 2 bytes at a time; v0 is the entry read */
+      brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
+      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
+      /* First vertex starts the primitive. */
+      brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START));
+      
+      brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
+      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
+
+      loop = brw_DO(p, BRW_EXECUTE_1);
+      {
+	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2));
+  
+	 brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
+	 brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
+
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+      }
+      brw_WHILE(p, loop);
+      /* Last vertex ends the primitive. */
+      brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END));
+   }
+   brw_ENDIF(p, if_insn);
+}
+/* Call brw_clip_init_planes(), then emit the clip loop (brw_clip_tri). */
+static void do_clip_tri( struct brw_clip_compile *c )
+{
+   brw_clip_init_planes(c);
+
+   brw_clip_tri(c);
+}
+/* Like do_clip_tri(), but the emitted clip code is wrapped in a
+ * run-time test of planemask against zero. */
+static void maybe_do_clip_tri( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_planes;
+
+   /* if (planemask != 0) */
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
+           c->reg.planemask, brw_imm_ud(0));
+   if_planes = brw_IF(p, BRW_EXECUTE_1);
+   do_clip_tri(c);
+   brw_ENDIF(p, if_planes);
+}
+/* Emit a cliptest: kill fully-outside tris, recompute planemask bits 0-5. */
+static void brw_clip_test( struct brw_clip_compile *c )
+{
+    struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    /* v0-v2 receive copies of the three vertex positions (offset_hpos) */
+    struct brw_reg v0 = get_tmp(c);
+    struct brw_reg v1 = get_tmp(c);
+    struct brw_reg v2 = get_tmp(c);
+
+    struct brw_indirect vt0 = brw_indirect(0, 0);
+    struct brw_indirect vt1 = brw_indirect(1, 0);
+    struct brw_indirect vt2 = brw_indirect(2, 0);
+
+    struct brw_compile *p = &c->func;
+    struct brw_instruction *is_outside;
+    struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
+    /* Load the positions and clear the low six planemask bits. */
+    brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
+    brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
+    brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
+    brw_MOV(p, v0, deref_4f(vt0, c->offset_hpos));
+    brw_MOV(p, v1, deref_4f(vt1, c->offset_hpos));
+    brw_MOV(p, v2, deref_4f(vt2, c->offset_hpos));
+    brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
+
+    /* test nearz, xmin, ymin plane */
+    /* clip.xyz < -clip.w */
+    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3))); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3))); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3))); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* All vertices are outside of a plane, rejected */
+    brw_AND(p, t, t1, t2);
+    brw_AND(p, t, t, t3);
+    brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+    brw_OR(p, tmp0, tmp0, get_element(t, 2));
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+    brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+    is_outside = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_clip_kill_thread(c);
+    }
+    brw_ENDIF(p, is_outside);
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    /* elements 0, 1 and 2 of t select planemask bits 5, 3 and 1 below */
+    /* some vertices are inside a plane, some are outside,need to clip */
+    brw_XOR(p, t, t1, t2);
+    brw_XOR(p, t1, t2, t3);
+    brw_OR(p, t, t, t1);
+    brw_AND(p, t, t, brw_imm_ud(0x1));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 0), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 1), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 2), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* test farz, xmax, ymax plane */
+    /* clip.xyz > clip.w */
+    brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* All vertices are outside of a plane, rejected */
+    brw_AND(p, t, t1, t2);
+    brw_AND(p, t, t, t3);
+    brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+    brw_OR(p, tmp0, tmp0, get_element(t, 2));
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+    brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+    is_outside = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_clip_kill_thread(c);
+    }
+    brw_ENDIF(p, is_outside);
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    /* elements 0, 1 and 2 of t select planemask bits 4, 2 and 0 below */
+    /* some vertices are inside a plane, some are outside,need to clip */
+    brw_XOR(p, t, t1, t2);
+    brw_XOR(p, t1, t2, t3);
+    brw_OR(p, t, t, t1);
+    brw_AND(p, t, t, brw_imm_ud(0x1));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 0), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 1), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 2), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    release_tmps(c);
+}
+/* Build the complete triangle clip program: setup, optional cliptest,
+ * optional flat shading, clipping, trifan output, thread termination. */
+void brw_emit_tri_clip( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_workaround;
+   int always_clip;
+
+   /* 3 incoming vertices plus 6 + nr_userclip further vertex slots. */
+   brw_clip_tri_alloc_regs(c, c->key.nr_userclip + 9);
+   brw_clip_tri_init_vertices(c);
+   brw_clip_init_clipmask(c);
+   brw_clip_init_ff_sync(c);
+
+   /* if -ve rhw workaround bit is set,
+      do cliptest */
+   if (c->chipset.is_965) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+              brw_imm_ud(1<<20));
+      if_workaround = brw_IF(p, BRW_EXECUTE_1);
+      brw_clip_test(c);
+      brw_ENDIF(p, if_workaround);
+   }
+
+   /* Can't push into do_clip_tri because with polygon (or quad)
+    * flatshading, need to apply the flatshade here because we don't
+    * respect the PV when converting to trifan for emit:
+    */
+   if (c->key.do_flat_shading)
+      brw_clip_tri_flat_shade(c);
+
+   always_clip = (c->key.clip_mode == BRW_CLIPMODE_NORMAL ||
+                  c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP);
+   if (!always_clip)
+      maybe_do_clip_tri(c);
+   else
+      do_clip_tri(c);
+
+   brw_clip_tri_emit_polygon(c);
+   brw_clip_kill_thread(c);   /* empty message ends the thread */
+}
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
new file mode 100644
index 0000000000..aec835b8ce
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -0,0 +1,497 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+
+/* This is performed against the original triangles, so no indirection
+ * required:
+BZZZT!
+ */
+static void compute_tri_direction( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg edge0 = c->reg.tmp0;
+   struct brw_reg edge1 = c->reg.tmp1;
+   struct brw_reg ndc[3];
+   GLuint i;
+
+   /* Multiply c->reg.dir by the cross product of two triangle edges,
+    * computed from the projected positions of the three vertices.
+    */
+
+   /* Work on copies held in temporaries: the original vertex
+    * coordinates must not be modified, as that may impact further
+    * operations.
+    *
+    * TBD-KC: try to optimize unnecessary MOV's.
+    */
+   for (i = 0; i < 3; i++)
+      ndc[i] = get_tmp(c);
+
+   for (i = 0; i < 3; i++)
+      brw_MOV(p, ndc[i], byte_offset(c->reg.vertex[i], c->offset_hpos));
+
+   for (i = 0; i < 3; i++)
+      brw_clip_project_position(c, ndc[i]);
+
+   /* The two edge vectors, both taken relative to vertex 2:
+    */
+   brw_ADD(p, edge0, ndc[0], negate(ndc[2]));
+   brw_ADD(p, edge1, ndc[1], negate(ndc[2]));
+
+   /* Their crossproduct, left in edge0 (MUL to the null register
+    * followed by a MAC):
+    */
+   brw_set_access_mode(p, BRW_ALIGN_16);
+   brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(edge0, 1,2,0,3), brw_swizzle(edge1, 2,0,1,3));
+   brw_MAC(p, vec4(edge0), negate(brw_swizzle(edge0, 2,0,1,3)), brw_swizzle(edge1, 1,2,0,3));
+   brw_set_access_mode(p, BRW_ALIGN_1);
+
+   /* Fold the result into the direction register:
+    */
+   brw_MUL(p, c->reg.dir, c->reg.dir, vec4(edge0));
+}
+
+
+/* Emit code that kills the thread when the triangle faces the culled
+ * direction, judged by the sign of element 2 of c->reg.dir.
+ */
+static void cull_direction( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_culled;
+   GLuint cull_cond;
+
+   assert (!(c->key.fill_ccw == CLIP_CULL &&
+	     c->key.fill_cw == CLIP_CULL));
+
+   /* Compare >= 0 when ccw is the culled facing, < 0 otherwise: */
+   cull_cond = (c->key.fill_ccw == CLIP_CULL) ? BRW_CONDITIONAL_GE
+                                              : BRW_CONDITIONAL_L;
+
+   brw_CMP(p, vec1(brw_null_reg()), cull_cond,
+           get_element(c->reg.dir, 2), brw_imm_f(0));
+
+   /* Terminate the thread inside the IF: */
+   if_culled = brw_IF(p, BRW_EXECUTE_1);
+   {
+      brw_clip_kill_thread(c);
+   }
+   brw_ENDIF(p, if_culled);
+}
+
+
+
+/* Emit code that overwrites the front colors of the three original
+ * vertices with their back-face colors whenever the direction test
+ * selected by c->key.copy_bfc_ccw passes.
+ */
+static void copy_bfc( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *ccw;
+   GLuint conditional;
+
+   /* Do we have any colors to copy?
+    */
+   if ((c->offset_color0 == 0 || c->offset_bfc0 == 0) &&
+       (c->offset_color1 == 0 || c->offset_bfc1 == 0))
+      return;
+
+   /* In some weird degenerate cases we can end up testing the
+    * direction twice, once for culling and once for bfc copying.  Oh
+    * well, that's what you get for setting weird GL state.
+    */
+   if (c->key.copy_bfc_ccw)
+      conditional = BRW_CONDITIONAL_GE;
+   else
+      conditional = BRW_CONDITIONAL_L;
+
+   brw_CMP(p, vec1(brw_null_reg()), conditional,
+           get_element(c->reg.dir, 2), brw_imm_f(0));
+
+   ccw = brw_IF(p, BRW_EXECUTE_1);
+   {
+      GLuint i;
+
+      for (i = 0; i < 3; i++) {
+         if (c->offset_color0 && c->offset_bfc0)
+            brw_MOV(p, byte_offset(c->reg.vertex[i], c->offset_color0),
+                    byte_offset(c->reg.vertex[i], c->offset_bfc0));
+
+         /* Secondary color: bfc1 goes to color1, not to the 0 pair. */
+         if (c->offset_color1 && c->offset_bfc1)
+            brw_MOV(p, byte_offset(c->reg.vertex[i], c->offset_color1),
+                    byte_offset(c->reg.vertex[i], c->offset_bfc1));
+      }
+   }
+   brw_ENDIF(p, ccw);
+}
+
+
+
+
+/*
+  GLfloat iz	= 1.0 / dir.z;
+  GLfloat ac	= dir.x * iz;
+  GLfloat bc	= dir.y * iz;
+  offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
+  offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
+  offset *= MRD;
+*/
+static void compute_offset( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg dir = c->reg.dir;
+   struct brw_reg off = c->reg.offset;
+   struct brw_reg abs_0 = brw_abs(get_element(off, 0));
+   struct brw_reg abs_1 = brw_abs(get_element(off, 1));
+
+   /* off[2] = 1 / dir[2], then off[0..1] = dir[0..1] * off[2]: */
+   brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
+   brw_MUL(p, vec2(off), dir, get_element(off, 2));
+
+   /* off[0] = the larger of |off[0]| and |off[1]| (CMP >=, then SEL): */
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_GE, abs_0, abs_1);
+   brw_SEL(p, vec1(off), abs_0, abs_1);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   /* Scale by the offset factor, then add the offset units: */
+   brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor));
+   brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units));
+}
+
+
+static void merge_edgeflags( struct brw_clip_compile *c )
+{
+   /* R0.2 bit tested -> index of the vertex whose edgeflag is zeroed */
+   static const GLuint flag_bit[2] = { 1<<8, 1<<9 };
+   static const GLuint flag_vert[2] = { 0, 2 };
+   struct brw_compile *p = &c->func;
+   struct brw_reg r0_2 = get_element_ud(c->reg.R0, 2);
+   struct brw_reg prim = get_element_ud(c->reg.tmp0, 0);
+   struct brw_instruction *if_polygon;
+   GLuint i;
+
+   /* Is the incoming primitive type _3DPRIM_POLYGON? */
+   brw_AND(p, prim, r0_2, brw_imm_ud(PRIM_MASK));
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, prim,
+           brw_imm_ud(_3DPRIM_POLYGON));
+
+   /* Get away with using reg.vertex because we know that this is not
+    * a _3DPRIM_TRISTRIP_REVERSE:
+    */
+   if_polygon = brw_IF(p, BRW_EXECUTE_1);
+   for (i = 0; i < 2; i++) {
+      /* presumably the MOV only takes effect when the tested bit is 0 */
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
+      brw_AND(p, vec1(brw_null_reg()), r0_2, brw_imm_ud(flag_bit[i]));
+      brw_MOV(p, byte_offset(c->reg.vertex[flag_vert[i]], c->offset_edgeflag),
+              brw_imm_f(0));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
+   brw_ENDIF(p, if_polygon);
+}
+
+
+
+/* Add vec1(c->reg.offset) to the third float of the header position of 'vert'. */
+static void apply_one_offset( struct brw_clip_compile *c, struct brw_indirect vert )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg pos_z =
+      deref_1f(vert, c->header_position_offset + 2 * type_sz(BRW_REGISTER_TYPE_F));
+
+   brw_ADD(p, pos_z, pos_z, vec1(c->reg.offset));
+}
+
+
+
+/***********************************************************************
+ * Output clipped polygon as an unfilled primitive:
+ */
+static void emit_lines(struct brw_clip_compile *c, GLboolean do_offset)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *loop_head;
+   struct brw_instruction *if_edge;
+   struct brw_indirect vert = brw_indirect(0, 0);
+   struct brw_indirect next_vert = brw_indirect(1, 0);
+   struct brw_indirect list_ptr = brw_indirect(2, 0);
+   struct brw_indirect list_end = brw_indirect(3, 0);
+   struct brw_reg nr_verts_uw = retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW);
+
+   /* Offsetting needs a separate loop over the nr_verts inlist entries:
+    */
+   if (do_offset) {
+      brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+      brw_MOV(p, get_addr_reg(list_ptr), brw_address(c->reg.inlist));
+
+      loop_head = brw_DO(p, BRW_EXECUTE_1);
+      {
+         brw_MOV(p, get_addr_reg(vert), deref_1uw(list_ptr, 0));
+         brw_ADD(p, get_addr_reg(list_ptr), get_addr_reg(list_ptr), brw_imm_uw(2));
+
+         apply_one_offset(c, vert);
+
+         brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
+         brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+      }
+      brw_WHILE(p, loop_head);
+   }
+
+   /* Close the outline by repeating the first entry after the last:
+    *    list_end = &inlist[nr_verts]
+    *    *list_end = inlist[0]
+    */
+   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+   brw_MOV(p, get_addr_reg(list_ptr), brw_address(c->reg.inlist));
+   brw_ADD(p, get_addr_reg(list_end), get_addr_reg(list_ptr), nr_verts_uw);
+   brw_ADD(p, get_addr_reg(list_end), get_addr_reg(list_end), nr_verts_uw);
+   brw_MOV(p, deref_1uw(list_end, 0), deref_1uw(list_ptr, 0));
+
+   loop_head = brw_DO(p, BRW_EXECUTE_1);
+   {
+      /* vert = list_ptr[0], next_vert = list_ptr[1], then advance: */
+      brw_MOV(p, get_addr_reg(vert), deref_1uw(list_ptr, 0));
+      brw_MOV(p, get_addr_reg(next_vert), deref_1uw(list_ptr, 2));
+      brw_ADD(p, get_addr_reg(list_ptr), get_addr_reg(list_ptr), brw_imm_uw(2));
+
+      /* Emit the edge as a two-vertex line strip if edgeflag != 0: */
+      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
+              deref_1f(vert, c->offset_edgeflag), brw_imm_f(0));
+      if_edge = brw_IF(p, BRW_EXECUTE_1);
+      {
+         brw_clip_emit_vue(c, vert, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+         brw_clip_emit_vue(c, next_vert, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
+      }
+      brw_ENDIF(p, if_edge);
+
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+   }
+   brw_WHILE(p, loop_head);
+}
+
+
+
+/* Emit each vertex of the clipped polygon whose edgeflag is nonzero as
+ * a standalone point, optionally applying the polygon offset first.
+ */
+static void emit_points(struct brw_clip_compile *c, GLboolean do_offset )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *loop_head;
+   struct brw_instruction *if_flagged;
+   struct brw_indirect vert = brw_indirect(0, 0);
+   struct brw_indirect list_ptr = brw_indirect(2, 0);
+   const GLuint header = (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END;
+
+   /* Iterate nr_verts times over the entries of inlist: */
+   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+   brw_MOV(p, get_addr_reg(list_ptr), brw_address(c->reg.inlist));
+
+   loop_head = brw_DO(p, BRW_EXECUTE_1);
+   {
+      brw_MOV(p, get_addr_reg(vert), deref_1uw(list_ptr, 0));
+      brw_ADD(p, get_addr_reg(list_ptr), get_addr_reg(list_ptr), brw_imm_uw(2));
+
+      /* draw if edgeflag != 0 */
+      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
+              deref_1f(vert, c->offset_edgeflag), brw_imm_f(0));
+      if_flagged = brw_IF(p, BRW_EXECUTE_1);
+      {
+         if (do_offset)
+            apply_one_offset(c, vert);
+
+         brw_clip_emit_vue(c, vert, 1, 0, header);
+      }
+      brw_ENDIF(p, if_flagged);
+
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+   }
+   brw_WHILE(p, loop_head);
+}
+
+
+
+
+
+
+
+/* Emit the (clipped) polygon according to the given fill mode:
+ * CLIP_FILL as a polygon, CLIP_LINE as edges, CLIP_POINT as points.
+ * CLIP_CULL must never get here; any other value emits nothing.
+ */
+static void emit_primitives( struct brw_clip_compile *c,
+			     GLuint mode, 
+			     GLboolean do_offset )
+{
+   if (mode == CLIP_FILL) {
+      brw_clip_tri_emit_polygon(c);
+   }
+   else if (mode == CLIP_LINE) {
+      emit_lines(c, do_offset);
+   }
+   else if (mode == CLIP_POINT) {
+      emit_points(c, do_offset);
+   }
+   else if (mode == CLIP_CULL) {
+      /* Callers are expected to filter out culled facings first. */
+      assert(0);
+   }
+}
+
+
+
+/* Emit the primitives for whichever facing(s) are not culled.  A
+ * run-time test on the sign of dir element 2 is only emitted when the
+ * two facings use different, non-culled fill modes.
+ */
+static void emit_unfilled_primitives( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_ccw;
+   const GLboolean cw_drawn = (c->key.fill_cw != CLIP_CULL);
+   const GLboolean ccw_drawn = (c->key.fill_ccw != CLIP_CULL);
+
+   /* Direction culling has already been done.
+    */
+   if (c->key.fill_ccw != c->key.fill_cw && ccw_drawn && cw_drawn) {
+      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_GE,
+              get_element(c->reg.dir, 2), brw_imm_f(0));
+
+      if_ccw = brw_IF(p, BRW_EXECUTE_1);
+      {
+         emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
+      }
+      if_ccw = brw_ELSE(p, if_ccw);
+      {
+         emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
+      }
+      brw_ENDIF(p, if_ccw);
+   }
+   else if (cw_drawn) {
+      emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
+   }
+   else if (ccw_drawn) {
+      emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
+   }
+}
+
+
+
+
+/* Emit code that kills the thread if nr_verts is less than 3. */
+static void check_nr_verts( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_too_few;
+   struct brw_reg min_verts = brw_imm_d(3);
+
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, min_verts);
+   if_too_few = brw_IF(p, BRW_EXECUTE_1);
+   brw_clip_kill_thread(c);
+   brw_ENDIF(p, if_too_few);
+}
+
+
+/* Build the clip program for triangles with per-facing fill modes
+ * (fill/line/point/cull), optional polygon offset and bfc copying. */
+void brw_emit_unfilled_clip( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_clip;
+   const GLboolean cull_ccw = (c->key.fill_ccw == CLIP_CULL);
+   const GLboolean cull_cw = (c->key.fill_cw == CLIP_CULL);
+   const GLboolean any_offset = (c->key.offset_ccw || c->key.offset_cw);
+   const GLboolean any_bfc = (c->key.copy_bfc_ccw || c->key.copy_bfc_cw);
+
+   /* Does any of the steps below need the triangle direction? */
+   c->need_direction = (any_offset ||
+                        (c->key.fill_ccw != c->key.fill_cw) ||
+                        cull_ccw || cull_cw || any_bfc);
+
+   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
+   brw_clip_tri_init_vertices(c);
+   brw_clip_init_ff_sync(c);
+
+   assert(c->offset_edgeflag);
+
+   /* Both facings culled: just end the thread. */
+   if (cull_ccw && cull_cw) {
+      brw_clip_kill_thread(c);
+      return;
+   }
+
+   merge_edgeflags(c);
+
+   /* Need to use the inlist indirection here: */
+   if (c->need_direction)
+      compute_tri_direction(c);
+
+   if (cull_ccw || cull_cw)
+      cull_direction(c);
+
+   if (any_offset)
+      compute_offset(c);
+
+   if (any_bfc)
+      copy_bfc(c);
+
+   /* Need to do this whether we clip or not:
+    */
+   if (c->key.do_flat_shading)
+      brw_clip_tri_flat_shade(c);
+
+   /* Clip only when the plane mask is nonzero: */
+   brw_clip_init_clipmask(c);
+   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
+   if_clip = brw_IF(p, BRW_EXECUTE_1);
+   {
+      brw_clip_init_planes(c);
+      brw_clip_tri(c);
+      check_nr_verts(c);
+   }
+   brw_ENDIF(p, if_clip);
+
+   emit_unfilled_primitives(c);
+   brw_clip_kill_thread(c);
+}
+
+
+
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
new file mode 100644
index 0000000000..23e51ee9bc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -0,0 +1,387 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_clip.h"
+
+
+
+
+/* Allocate the next temporary GRF (vec4); total_grf tracks the peak. */
+struct brw_reg get_tmp( struct brw_clip_compile *c )
+{
+   struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
+   c->last_tmp++;
+   if (c->prog_data.total_grf < c->last_tmp)
+      c->prog_data.total_grf = c->last_tmp;
+   return tmp;
+}
+
+/* Give 'tmp' back, but only if it is the most recently allocated one. */
+static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
+{
+   if (c->last_tmp-1 == tmp.nr) c->last_tmp -= 1;
+}
+
+/* Pack x, y, z, w into bits 0-7, 8-15, 16-23, 24-31 of a UD immediate. */
+static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w)
+{
+   return brw_imm_ud(x | (y<<8) | (z<<16) | (w<<24));
+}
+
+
+/* Without user clip planes, load the six packed fixed planes. */
+void brw_clip_init_planes( struct brw_clip_compile *c )
+{
+   static const GLuint fixed[6][4] = {   /* x, y, z, w */
+      {0, 0, 0xff, 1}, {0, 0, 1, 1}, {0, 0xff, 0, 1},
+      {0, 1, 0, 1}, {0xff, 0, 0, 1}, {1, 0, 0, 1} };
+   struct brw_compile *p = &c->func;
+   GLuint i;
+   if (c->key.nr_userclip) return;
+   for (i = 0; i < 6; i++)
+      brw_MOV(p, get_element_ud(c->reg.fixed_planes, i),
+              make_plane_ud(fixed[i][0], fixed[i][1], fixed[i][2], fixed[i][3]));
+}
+
+
+
+#define W 3
+
+/* Project 'pos' to screen space (or back again), overwrite with results:
+ */
+void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg pos_w = get_element(pos, W);
+   struct brw_reg pos_xyz = brw_writemask(pos, BRW_WRITEMASK_XYZ);
+
+   /* pos.w = 1 / pos.w (rhw) */
+   brw_math_invert(p, pos_w, pos_w);
+
+   /* pos.xyz *= pos.w, written through an XYZ writemask in align16 */
+   brw_set_access_mode(p, BRW_ALIGN_16);
+   brw_MUL(p, pos_xyz, pos, brw_swizzle1(pos, W));
+   brw_set_access_mode(p, BRW_ALIGN_1);
+}
+
+
+/* Rebuild the header position of the vertex at 'vert_addr' by
+ * projecting a copy of its hpos. */
+static void brw_clip_project_vertex( struct brw_clip_compile *c, 
+				     struct brw_indirect vert_addr )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg projected = get_tmp(c);
+   struct brw_reg hpos = deref_4f(vert_addr, c->offset_hpos);
+   struct brw_reg header_pos = deref_4f(vert_addr, c->header_position_offset);
+
+   brw_MOV(p, projected, hpos);                /* copy original position */
+   brw_clip_project_position(c, projected);    /* project the copy       */
+   brw_MOV(p, header_pos, projected);          /* store in the header    */
+   release_tmp(c, projected);
+}
+
+
+
+
+/* Interpolate between two vertices and put the result into a0.0.  
+ * Increment a0.0 accordingly.
+ */
+void brw_clip_interp_vertex( struct brw_clip_compile *c,
+			     struct brw_indirect dest_ptr,
+			     struct brw_indirect v0_ptr, /* from */
+			     struct brw_indirect v1_ptr, /* to */
+			     struct brw_reg t0,
+			     GLboolean force_edgeflag)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg scratch = get_tmp(c);
+   /* Byte offset of attribute 0: 32, or 32 * 3 on IGDNG. */
+   const GLuint attr_base = c->chipset.is_igdng ? 32 * 3 : 32;
+   const GLuint nr_attrs = c->key.nr_attrs;
+   GLuint attr;
+
+   /* Build in 'dest_ptr' a vertex interpolated between 'v0_ptr' and
+    * 'v1_ptr' with weight 't0': the header is copied from v0, every
+    * attribute is interpolated except the edgeflag, and the header
+    * position is finally rebuilt by brw_clip_project_vertex().
+    */
+
+   /* Just copy the vertex header:
+    *
+    * After CLIP stage, only first 256 bits of the VUE are read
+    * back on IGDNG, so needn't change it
+    */
+   brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
+
+   /* Iterate over each attribute (could be done in pairs?); attribute
+    * i lives at byte attr_base + i * 16.
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+      const GLuint delta = attr_base + attr * 16;
+      struct brw_reg dst = deref_4f(dest_ptr, delta);
+      struct brw_reg from = deref_4f(v0_ptr, delta);
+
+      if (delta != c->offset_edgeflag) {
+         /* Interpolate:
+          *
+          *        New = attr0 + t*attr1 - t*attr0
+          */
+         brw_MUL(p, vec4(brw_null_reg()),
+                 deref_4f(v1_ptr, delta), t0);
+         brw_MAC(p, scratch, negate(from), t0);
+         brw_ADD(p, dst, from, scratch);
+      }
+      else if (force_edgeflag) {
+         /* The edgeflag is not interpolated: force it to 1 ... */
+         brw_MOV(p, dst, brw_imm_f(1));
+      }
+      else {
+         /* ... or take it unchanged from the 'from' vertex. */
+         brw_MOV(p, dst, from);
+      }
+   }
+
+   /* With an odd number of attributes, zero the slot following the
+    * last one:
+    */
+   if (nr_attrs & 1) {
+      const GLuint pad = attr_base + nr_attrs * 16;
+
+      brw_MOV(p, deref_4f(dest_ptr, pad), brw_imm_f(0));
+   }
+
+   release_tmp(c, scratch);
+
+   /* Recreate the projected (NDC) coordinate in the new vertex
+    * header:
+    */
+   brw_clip_project_vertex(c, dest_ptr );
+}
+
+
+
+
+#define MAX_MRF 16
+
+void brw_clip_emit_vue(struct brw_clip_compile *c, 
+		       struct brw_indirect vert,
+		       GLboolean allocate,
+		       GLboolean eot,
+		       GLuint header)
+{
+   struct brw_compile *p = &c->func;
+   GLuint start = c->last_mrf;
+   struct brw_reg urb_dest;
+   GLuint response_len;
+
+   /* Emit a URB write of the vertex at 'vert', with R0.2 replaced by
+    * 'header'.  With 'allocate' the response is written back to R0;
+    * 'allocate' and 'eot' are mutually exclusive.
+    */
+   brw_clip_ff_sync(c);
+
+   assert(!(allocate && eot));
+
+   /* Cycle through mrf regs - probably futile as we have to wait for
+    * the allocation response anyway.  Also, the order this function
+    * is invoked doesn't correspond to the order the instructions will
+    * be executed, so it won't have any effect in many cases.
+    */
+#if 0
+   if (start + c->nr_regs + 1 >= MAX_MRF)
+      start = 0;
+
+   c->last_mrf = start + c->nr_regs + 1;
+#endif
+
+   /* Copy the vertex from vertn into m1..mN+1: */
+   brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs);
+
+   /* Overwrite PrimType and PrimStart in the message header, for
+    * each vertex in turn: */
+   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+   /* Send each vertex as a separate write to the urb.  This differs
+    * from brw_sf_emit.c, where subsequent writes build up a single urb
+    * entry.  Each of these writes instantiates a separate urb entry -
+    * (I think... what about 'allocate'?) */
+   urb_dest = allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD);
+   response_len = allocate ? 1 : 0;
+
+   brw_urb_WRITE(p, urb_dest, start, c->reg.R0,
+                 allocate,
+                 1,                /* used */
+                 c->nr_regs + 1,   /* msg length */
+                 response_len,     /* response_length */
+                 eot,              /* eot */
+                 1,                /* writes_complete */
+                 0,                /* urb offset */
+                 BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+void brw_clip_kill_thread(struct brw_clip_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg no_dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UD);
+
+   brw_clip_ff_sync(c);
+
+   /* Terminate the thread with an empty URB write: just the R0
+    * header (message length 1), no allocation, no response and the
+    * end-of-thread flag set.  This also releases any allocated urb
+    * entry.
+    */
+   brw_urb_WRITE(p, no_dest, 0, c->reg.R0,
+                 0,    /* allocate */
+                 0,    /* used */
+                 1,    /* msg len */
+                 0,    /* response len */
+                 1,    /* eot */
+                 1,    /* writes complete */
+                 0, BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+
+/* Address of the first clip plane: the start of reg.fixed_planes. */
+struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
+{
+   struct brw_reg planes = c->reg.fixed_planes;
+
+   return brw_address(planes);
+}
+
+/* Stride between consecutive planes (presumably in bytes), as a UW
+ * immediate: 16 when user clip planes are present, otherwise 4.
+ */
+struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
+{
+   return brw_imm_uw(c->key.nr_userclip ? 16 : 4);
+}
+
+
+/* If flatshading, distribute color from provoking vertex prior to
+ * clipping.
+ */
+void brw_clip_copy_colors( struct brw_clip_compile *c,
+			   GLuint to, GLuint from )
+{
+   struct brw_compile *p = &c->func;
+   GLuint offsets[4];
+   GLuint i;
+
+   /* Emit MOVs copying the color attributes of vertex 'from' to
+    * vertex 'to' (both index c->reg.vertex).  Order: color0, color1,
+    * bfc0, bfc1; attributes whose offset is zero are skipped.
+    */
+   offsets[0] = c->offset_color0;
+   offsets[1] = c->offset_color1;
+   offsets[2] = c->offset_bfc0;
+   offsets[3] = c->offset_bfc1;
+
+   for (i = 0; i < 4; i++) {
+      if (!offsets[i])
+         continue;
+
+      brw_MOV(p,
+              byte_offset(c->reg.vertex[to], offsets[i]),
+              byte_offset(c->reg.vertex[from], offsets[i]));
+   }
+}
+
+
+
+void brw_clip_init_clipmask( struct brw_clip_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg r0_2 = get_element_ud(c->reg.R0, 2);
+   struct brw_reg user_bits;
+
+   /* planemask = R0.2 >> 26: the lowest outcode bit becomes bit 0. */
+   brw_SHR(p, c->reg.planemask, r0_2, brw_imm_ud(26));
+
+   if (!c->key.nr_userclip)
+      return;
+
+   /* Userclip outcodes are taken from bits 14..19 of R0.2 and shifted
+    * down by 8 so that they come directly after the fixed plane bits.
+    */
+   user_bits = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
+   brw_AND(p, user_bits, r0_2, brw_imm_ud(0x3f<<14));
+   brw_SHR(p, user_bits, user_bits, brw_imm_ud(8));
+   brw_OR(p, c->reg.planemask, c->reg.planemask, user_bits);
+
+   release_tmp(c, user_bits);
+}
+
+void brw_clip_ff_sync(struct brw_clip_compile *c)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_instruction *if_unsynced;
+
+    if (!c->need_ff_sync)
+        return;
+
+    /* Bit 0 of reg.ff_sync records that the message was already sent;
+     * send it only while that bit is still clear, and set the bit. */
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+    brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
+    if_unsynced = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
+        brw_ff_sync(p, c->reg.R0, 0, c->reg.R0,
+                    1,
+                    1,    /* used */
+                    1,    /* msg length */
+                    1,    /* response length */
+                    0,    /* eot */
+                    1,    /* write complete */
+                    0,    /* urb offset */
+                    BRW_URB_SWIZZLE_NONE);
+    }
+    brw_ENDIF(p, if_unsynced);
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+/* Zero the ff_sync tracking register when FF_SYNC is in use. */
+void brw_clip_init_ff_sync(struct brw_clip_compile *c)
+{
+    struct brw_compile *p = &c->func;
+
+    if (c->need_ff_sync)
+        brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
+}
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
new file mode 100644
index 0000000000..227bc790de
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -0,0 +1,158 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+#include "util/u_simple_list.h"
+
+#include "brw_context.h"
+#include "brw_draw.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+#include "brw_winsys.h"
+#include "brw_resource.h"
+#include "brw_screen.h"
+
+
+/* pipe_context::destroy hook: flush, tear down every sub-module, drop
+ * all surface and buffer references, then free the context itself.
+ */
+static void brw_destroy_context( struct pipe_context *pipe )
+{
+   struct brw_context *brw = brw_context(pipe);
+   int i;
+
+   brw_context_flush( brw );
+   brw_batchbuffer_free( brw->batch );
+   brw_destroy_state(brw);
+   brw_draw_cleanup( brw );
+
+   brw_pipe_blend_cleanup( brw );
+   brw_pipe_depth_stencil_cleanup( brw );
+   brw_pipe_framebuffer_cleanup( brw );
+   brw_pipe_flush_cleanup( brw );
+   brw_pipe_misc_cleanup( brw );
+   brw_pipe_query_cleanup( brw );
+   brw_pipe_rast_cleanup( brw );
+   brw_pipe_sampler_cleanup( brw );
+   brw_pipe_shader_cleanup( brw );
+   brw_pipe_vertex_cleanup( brw );
+   brw_pipe_clear_cleanup( brw );
+   brw_hw_cc_cleanup( brw );
+
+   FREE(brw->wm.compile_data);
+
+   for (i = 0; i < brw->curr.fb.nr_cbufs; i++)
+      pipe_surface_reference(&brw->curr.fb.cbufs[i], NULL);
+   brw->curr.fb.nr_cbufs = 0;
+   pipe_surface_reference(&brw->curr.fb.zsbuf, NULL);
+
+   bo_reference(&brw->curbe.curbe_bo, NULL);
+   bo_reference(&brw->vs.prog_bo, NULL);
+   bo_reference(&brw->vs.state_bo, NULL);
+   bo_reference(&brw->vs.bind_bo, NULL);
+   bo_reference(&brw->gs.prog_bo, NULL);
+   bo_reference(&brw->gs.state_bo, NULL);
+   bo_reference(&brw->clip.prog_bo, NULL);
+   bo_reference(&brw->clip.state_bo, NULL);
+   bo_reference(&brw->clip.vp_bo, NULL);
+   bo_reference(&brw->sf.prog_bo, NULL);
+   bo_reference(&brw->sf.state_bo, NULL);
+   bo_reference(&brw->sf.vp_bo, NULL);
+
+   for (i = 0; i < Elements(brw->wm.sdc_bo); i++)
+      bo_reference(&brw->wm.sdc_bo[i], NULL);
+   bo_reference(&brw->wm.bind_bo, NULL);
+   for (i = 0; i < Elements(brw->wm.surf_bo); i++)
+      bo_reference(&brw->wm.surf_bo[i], NULL);
+   bo_reference(&brw->wm.sampler_bo, NULL);
+   bo_reference(&brw->wm.prog_bo, NULL);
+   bo_reference(&brw->wm.state_bo, NULL);
+
+   /* The struct was allocated in brw_create_context(); release it. */
+   FREE(brw);
+}
+
+
+/* Create an i965 context for 'screen'; returns NULL (and frees the
+ * context struct) if the context or its batchbuffer cannot be allocated. */
+struct pipe_context *brw_create_context(struct pipe_screen *screen,
+					void *priv)
+{
+   struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
+
+   if (!brw) {
+      debug_printf("%s: failed to alloc context\n", __FUNCTION__);
+      return NULL;
+   }
+
+   brw->base.screen = screen;
+   brw->base.priv = priv;
+   brw->base.destroy = brw_destroy_context;
+   brw->sws = brw_screen(screen)->sws;
+   brw->chipset = brw_screen(screen)->chipset;
+
+   brw_init_resource_functions( brw );
+   brw_pipe_blend_init( brw );
+   brw_pipe_depth_stencil_init( brw );
+   brw_pipe_framebuffer_init( brw );
+   brw_pipe_flush_init( brw );
+   brw_pipe_misc_init( brw );
+   brw_pipe_query_init( brw );
+   brw_pipe_rast_init( brw );
+   brw_pipe_sampler_init( brw );
+   brw_pipe_shader_init( brw );
+   brw_pipe_vertex_init( brw );
+   brw_pipe_clear_init( brw );
+   brw_hw_cc_init( brw );
+   brw_init_state( brw );
+   brw_draw_init( brw );
+
+   brw->state.dirty.mesa = ~0;
+   brw->state.dirty.brw = ~0;
+   brw->flags.always_emit_state = 0;
+   make_empty_list(&brw->query.active_head);
+
+   brw->batch = brw_batchbuffer_alloc( brw->sws, brw->chipset );
+   if (brw->batch == NULL)
+      goto fail;
+
+   return &brw->base;
+
+fail:
+   /* NOTE(review): state set up by the init calls above is not unwound */
+   if (brw->batch)
+      brw_batchbuffer_free( brw->batch );
+   FREE(brw);
+   return NULL;
+}
+
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
new file mode 100644
index 0000000000..94c9c443f0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -0,0 +1,862 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRWCONTEXT_INC
+#define BRWCONTEXT_INC
+
+#include "brw_structs.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
+
+
+/* Glossary:
+ *
+ * URB - unified return buffer.  A mid-sized buffer which is
+ * partitioned between the fixed function units and used for passing
+ * values (vertices, primitives, constants) between them.
+ *
+ * CURBE - constant URB entry.  An urb region (entry) used to hold
+ * constant values which the fixed function units can be instructed to
+ * preload into the GRF when spawning a thread.
+ *
+ * VUE - vertex URB entry.  An urb entry holding a vertex and usually
+ * a vertex header.  The header contains control information and
+ * things like primitive type, Begin/end flags and clip codes.  
+ *
+ * PUE - primitive URB entry.  An urb entry produced by the setup (SF)
+ * unit holding rasterization and interpolation parameters.
+ *
+ * GRF - general register file.  One of several register files
+ * addressable by programmed threads.  The inputs (r0, payload, curbe,
+ * urb) of the thread are preloaded to this area before the thread is
+ * spawned.  The registers are individually 8 dwords wide and suitable
+ * for general usage.  Registers holding thread input values are not
+ * special and may be overwritten.
+ *
+ * MRF - message register file.  Threads communicate (and terminate)
+ * by sending messages.  Message parameters are placed in contiguous
+ * MRF registers.  All program output is via these messages.  URB
+ * entries are populated by sending a message to the shared URB
+ * function containing the new data, together with a control word,
+ * often an unmodified copy of R0.
+ *
+ * R0 - GRF register 0.  Typically holds control information used when
+ * sending messages to other threads.
+ *
+ * EU or GEN4 EU: The name of the programmable subsystem of the
+ * i965 hardware.  Threads are executed by the EU, the registers
+ * described above are part of the EU architecture.
+ *
+ * Fixed function units:
+ *
+ * CS - Command streamer.  Notional first unit, little software
+ * interaction.  Holds the URB entries used for constant data, ie the
+ * CURBEs.
+ *
+ * VF/VS - Vertex Fetch / Vertex Shader.  The fixed function part of
+ * this unit is responsible for pulling vertices out of vertex buffers
+ * in vram and injecting them into the processing pipe as VUEs.  If
+ * enabled, it first passes them to a VS thread which is a good place
+ * for the driver to implement any active vertex shader.
+ *
+ * GS - Geometry Shader.  This corresponds to a new DX10 concept.  If
+ * enabled, incoming strips etc are passed to GS threads in individual
+ * line/triangle/point units.  The GS thread may perform arbitrary
+ * computation and emit whatever primitives with whatever vertices it
+ * chooses.  This makes GS an excellent place to implement GL's
+ * unfilled polygon modes, though of course it is capable of much
+ * more.  Additionally, GS is used to translate away primitives not
+ * handled by later units, including Quads and Lineloops.
+ *
+ * CLIP - Clipper.  Mesa's clipping algorithms are imported to run on
+ * this unit.  The fixed function part performs cliptesting against
+ * the 6 fixed clipplanes and makes decisions on whether or not the
+ * incoming primitive needs to be passed to a thread for clipping.
+ * User clip planes are handled via cooperation with the VS thread.
+ *
+ * SF - Strips Fans or Setup: Triangles are prepared for
+ * rasterization.  Interpolation coefficients are calculated.
+ * Flatshading and two-side lighting usually performed here.
+ *
+ * WM - Windower.  Interpolation of vertex attributes performed here.
+ * Fragment shader implemented here.  SIMD aspects of EU taken full
+ * advantage of, as pixels are processed in blocks of 16.
+ *
+ * CC - Color Calculator.  No EU threads associated with this unit.
+ * Handles blending and (presumably) depth and stencil testing.
+ */
+
+/* Number of entries in brw_wm_prog_data::param[], i.e. the most
+ * parameter pointers a WM program can track.
+ */
+#define BRW_MAX_CURBE                    (32*16)
+
+
+/* Need a value to say a particular vertex shader output isn't
+ * present.  Limits us to 63 outputs currently.
+ *
+ * The value (63) is the all-ones pattern of a 6-bit field, the width of
+ * the output_* bitfields in struct brw_vertex_shader.
+ */
+#define BRW_OUTPUT_NOT_PRESENT           ((1<<6)-1)
+
+
+struct brw_context;
+
+/**
+ * Driver-side depth/stencil state object.  Holds the words of the
+ * color-calculator (CC) unit state that this state category contributes.
+ */
+struct brw_depth_stencil_state {
+   /* Precalculated hardware state:
+    */
+   struct brw_cc0 cc0;
+   struct brw_cc1 cc1;
+   struct brw_cc2 cc2;
+   struct brw_cc3 cc3;
+   struct brw_cc7 cc7;
+
+   /* NOTE(review): presumably a key into the WM depth/stencil ("IZ")
+    * lookup table -- confirm against the code that consumes it.
+    */
+   unsigned iz_lookup;
+};
+
+
+/**
+ * Driver-side blend state object.  Holds the CC unit words that blending
+ * contributes, plus one surface-state word.
+ */
+struct brw_blend_state {
+   /* Precalculated hardware state:
+    */
+   struct brw_cc2 cc2;
+   struct brw_cc3 cc3;
+   struct brw_cc5 cc5;
+   struct brw_cc6 cc6;
+
+   /* NOTE(review): presumably blend-related bits to be merged into the
+    * render target surface state -- confirm.
+    */
+   struct brw_surf_ss0 ss0;
+};
+
+struct brw_rasterizer_state;
+
+/**
+ * A counted array of 4-component float vectors.  Embedded in the vertex
+ * and fragment shader structs as their "immediates".
+ */
+struct brw_immediate_data {
+   unsigned nr;          /* number of float[4] entries in data */
+   float (*data)[4];     /* pointer to the first float[4] entry */
+};
+
+/**
+ * Driver-side vertex shader object: the TGSI token stream together with
+ * information about it that the driver keeps alongside.
+ */
+struct brw_vertex_shader {
+   const struct tgsi_token *tokens;
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+
+   struct tgsi_shader_info info;
+   struct brw_immediate_data immediates;
+
+   GLuint has_flow_control:1;
+   GLuint use_const_buffer:1;
+
+   /* Offsets of special vertex shader outputs required for clipping.
+    * Each field is 6 bits wide, so it can hold BRW_OUTPUT_NOT_PRESENT;
+    * presumably that value marks an output the shader does not write
+    * (TODO confirm).
+    */
+   GLuint output_hpos:6;        /* not always zero? */
+   GLuint output_color0:6;
+   GLuint output_color1:6;
+   GLuint output_bfc0:6;
+   GLuint output_bfc1:6;
+   GLuint output_edgeflag:6;
+
+   /* NOTE(review): presumably a unique id taken from
+    * brw_context::program_id -- confirm.
+    */
+   unsigned id;
+};
+
+/**
+ * Description of a fragment shader's inputs: a count followed by one
+ * packed 32-bit record (3 + 5 + 24 bits) per input.
+ */
+struct brw_fs_signature {
+   GLuint nr_inputs;            /* number of valid entries in input[] */
+   struct {
+      GLuint interp:3;          /* TGSI_INTERPOLATE_x */
+      GLuint semantic:5;        /* TGSI_SEMANTIC_x */
+      GLuint semantic_index:24;
+   } input[PIPE_MAX_SHADER_INPUTS];
+};
+
+/* Size in bytes of the used part of a signature: everything before
+ * input[] plus the first nr_inputs entries of input[].  's' is a pointer
+ * to a struct brw_fs_signature.
+ */
+#define brw_fs_signature_size(s) (offsetof(struct brw_fs_signature, input) + \
+                                  ((s)->nr_inputs * sizeof (s)->input[0])) 
+
+
+/**
+ * Driver-side fragment shader object: the TGSI token stream together
+ * with its input signature and other information kept alongside it.
+ */
+struct brw_fragment_shader {
+   const struct tgsi_token *tokens;
+   struct tgsi_shader_info info;
+
+   struct brw_fs_signature signature;     /* per-input interp/semantic data */
+   struct brw_immediate_data immediates;
+
+   unsigned iz_lookup;
+   /*unsigned wm_lookup;*/
+   
+   unsigned  uses_depth:1;
+   unsigned  has_flow_control:1;
+
+   /* NOTE(review): presumably a unique id taken from
+    * brw_context::program_id -- confirm.
+    */
+   unsigned id;
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;
+};
+
+
+/**
+ * Driver-side sampler state object: sampler-state words ss0, ss1 and
+ * ss3, plus the border color as four floats.
+ */
+struct brw_sampler {
+   struct brw_ss0 ss0;
+   struct brw_ss1 ss1;
+   float border_color[4];
+   struct brw_ss3 ss3;
+};
+
+
+
+/* Dirty bits, one per category of state-tracker state.  The bits are
+ * contiguous, running from bit 0 up to bit 23.
+ * NOTE(review): presumably accumulated in brw_state_flags::mesa -- confirm.
+ */
+#define PIPE_NEW_DEPTH_STENCIL_ALPHA    (1 << 0)
+#define PIPE_NEW_RAST                   (1 << 1)
+#define PIPE_NEW_BLEND                  (1 << 2)
+#define PIPE_NEW_VIEWPORT               (1 << 3)
+#define PIPE_NEW_SAMPLERS               (1 << 4)
+#define PIPE_NEW_VERTEX_BUFFER          (1 << 5)
+#define PIPE_NEW_VERTEX_ELEMENT         (1 << 6)
+#define PIPE_NEW_FRAGMENT_SHADER        (1 << 7)
+#define PIPE_NEW_VERTEX_SHADER          (1 << 8)
+#define PIPE_NEW_FRAGMENT_CONSTANTS     (1 << 9)
+#define PIPE_NEW_VERTEX_CONSTANTS       (1 << 10)
+#define PIPE_NEW_CLIP                   (1 << 11)
+#define PIPE_NEW_INDEX_BUFFER           (1 << 12)
+#define PIPE_NEW_INDEX_RANGE            (1 << 13)
+#define PIPE_NEW_BLEND_COLOR            (1 << 14)
+#define PIPE_NEW_POLYGON_STIPPLE        (1 << 15)
+#define PIPE_NEW_FRAMEBUFFER_DIMENSIONS (1 << 16)
+#define PIPE_NEW_DEPTH_BUFFER           (1 << 17)
+#define PIPE_NEW_COLOR_BUFFERS          (1 << 18)
+#define PIPE_NEW_QUERY                  (1 << 19)
+#define PIPE_NEW_SCISSOR                (1 << 20)
+#define PIPE_NEW_BOUND_TEXTURES         (1 << 21)
+#define PIPE_NEW_NR_CBUFS               (1 << 22)
+#define PIPE_NEW_FRAGMENT_SIGNATURE     (1 << 23)
+
+
+
+/* Driver-internal dirty bits.  The numbering has gaps (0x200, 0x400,
+ * 0x8000 and 0x20000 are unassigned) and 0x2000 is a retired bit kept
+ * as a placeholder.
+ * NOTE(review): presumably accumulated in brw_state_flags::brw -- confirm.
+ */
+#define BRW_NEW_URB_FENCE               0x1
+#define BRW_NEW_FRAGMENT_PROGRAM        0x2
+#define BRW_NEW_VERTEX_PROGRAM          0x4
+#define BRW_NEW_INPUT_DIMENSIONS        0x8
+#define BRW_NEW_CURBE_OFFSETS           0x10
+#define BRW_NEW_REDUCED_PRIMITIVE       0x20
+#define BRW_NEW_PRIMITIVE               0x40
+#define BRW_NEW_CONTEXT                 0x80
+#define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
+#define BRW_NEW_PSP                     0x800
+#define BRW_NEW_WM_SURFACES		0x1000
+#define BRW_NEW_xxx                     0x2000 /* was FENCE */
+#define BRW_NEW_INDICES			0x4000
+
+/**
+ * Used for any batch entry with a relocated pointer that will be used
+ * by any 3D rendering.  Need to re-emit these fresh in each
+ * batchbuffer as the referenced buffers may be relocated in the
+ * meantime.
+ */
+#define BRW_NEW_BATCH			0x10000
+#define BRW_NEW_NR_WM_SURFACES		0x40000
+#define BRW_NEW_NR_VS_SURFACES		0x80000
+#define BRW_NEW_INDEX_BUFFER		0x100000
+
+/**
+ * Three independent dirty-bit words.  The cache word takes CACHE_NEW_*
+ * bits; brw_create_context() sets mesa and brw to all-ones.
+ */
+struct brw_state_flags {
+   /** State update flags signalled by mesa internals */
+   GLuint mesa;
+   /**
+    * State update flags signalled as the result of brw_tracked_state updates
+    */
+   GLuint brw;
+   /** State update flags signalled by brw_state_cache.c searches */
+   GLuint cache;
+};
+
+
+
+/* Data about a particular attempt to compile a program.  Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs:
+ */
+struct brw_wm_prog_data {
+   GLuint curb_read_length;
+   GLuint urb_read_length;
+
+   GLuint first_curbe_grf;
+   GLuint total_grf;
+   GLuint total_scratch;
+
+   GLuint nr_params;       /**< number of float params/constants */
+   GLboolean error;
+
+   /* Pointer to tracked values (only valid once
+    * _mesa_load_state_parameters has been called at runtime).
+    *
+    * NOTE(review): the reference to _mesa_load_state_parameters looks
+    * inherited from the classic Mesa driver -- confirm it still applies.
+    * The array holds BRW_MAX_CURBE entries.
+    */
+   const GLfloat *param[BRW_MAX_CURBE];
+};
+
+/**
+ * Data recorded for an SF (setup) program.
+ */
+struct brw_sf_prog_data {
+   GLuint urb_read_length;
+   GLuint total_grf;
+
+   /* Each vertex may have up to 12 attributes, 4 components each,
+    * except WPOS which requires only 2.  (11*4 + 2) == 46, which
+    * rounds up to 12 rows.
+    *
+    * Actually we use 4 for each, so call it 12 rows.
+    */
+   GLuint urb_entry_size;
+};
+
+
+struct brw_clip_prog_data;
+
+/**
+ * Data recorded for a GS program.
+ */
+struct brw_gs_prog_data {
+   GLuint urb_read_length;
+   GLuint total_grf;
+};
+
+/**
+ * Data recorded for a VS program.
+ */
+struct brw_vs_prog_data {
+   GLuint curb_read_length;
+   GLuint urb_read_length;
+   GLuint total_grf;
+
+   GLuint nr_outputs;
+   GLuint nr_inputs;
+
+   GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
+
+   GLboolean writes_psiz;
+
+   /* Used for calculating urb partitions:
+    */
+   GLuint urb_entry_size;
+};
+
+
+/* One size byte per possible vertex shader output.
+ * Size == 0 if output either not written, or always [0,0,0,1]
+ */
+struct brw_vs_output_sizes {
+   GLubyte output_size[PIPE_MAX_SHADER_OUTPUTS];
+};
+
+
+/** Number of texture sampler units */
+#define BRW_MAX_TEX_UNIT 16
+
+/** Max number of render targets in a shader */
+#define BRW_MAX_DRAW_BUFFERS 4
+
+/**
+ * Size of our surface binding table for the WM.
+ * This contains pointers to the drawing surfaces, the current texture
+ * objects and one fragment shader constant buffer (the +1).
+ */
+#define BRW_WM_MAX_SURF (BRW_MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
+
+/**
+ * Helpers to convert drawing buffers, textures and constant buffers
+ * to surface binding table indexes, for WM.
+ *
+ * Layout: color buffers occupy the first BRW_MAX_DRAW_BUFFERS slots,
+ * the fragment constant buffer the next one, textures the rest.
+ */
+#define BTI_COLOR_BUF(d)          (d)
+#define BTI_FRAGMENT_CONSTANTS    (BRW_MAX_DRAW_BUFFERS) 
+#define BTI_TEXTURE(t)            (BRW_MAX_DRAW_BUFFERS + 1 + (t))
+
+/**
+ * Size of surface binding table for the VS.
+ * Only one constant buffer for now.
+ */
+#define BRW_VS_MAX_SURF 1
+
+/**
+ * Only a VS constant buffer
+ */
+#define SURF_INDEX_VERT_CONST_BUFFER 0
+
+
+/* Bit of a hack to align these with the winsys buffer_data_type enum.
+ *
+ * Every id takes the value of the matching BRW_DATA_* enumerant.
+ * BRW_MAX_CACHE follows the last id and is used to size the per-id
+ * arrays in struct brw_cache.  The CACHE_NEW_* flags are (1 << id).
+ */
+enum brw_cache_id {
+   BRW_CC_VP         = BRW_DATA_GS_CC_VP,
+   BRW_CC_UNIT       = BRW_DATA_GS_CC_UNIT,
+   BRW_WM_PROG       = BRW_DATA_GS_WM_PROG,
+   BRW_SAMPLER_DEFAULT_COLOR    = BRW_DATA_GS_SAMPLER_DEFAULT_COLOR,
+   BRW_SAMPLER       = BRW_DATA_GS_SAMPLER,
+   BRW_WM_UNIT       = BRW_DATA_GS_WM_UNIT,
+   BRW_SF_PROG       = BRW_DATA_GS_SF_PROG,
+   BRW_SF_VP         = BRW_DATA_GS_SF_VP,
+   BRW_SF_UNIT       = BRW_DATA_GS_SF_UNIT,
+   BRW_VS_UNIT       = BRW_DATA_GS_VS_UNIT,
+   BRW_VS_PROG       = BRW_DATA_GS_VS_PROG,
+   BRW_GS_UNIT       = BRW_DATA_GS_GS_UNIT,
+   BRW_GS_PROG       = BRW_DATA_GS_GS_PROG,
+   BRW_CLIP_VP       = BRW_DATA_GS_CLIP_VP,
+   BRW_CLIP_UNIT     = BRW_DATA_GS_CLIP_UNIT,
+   BRW_CLIP_PROG     = BRW_DATA_GS_CLIP_PROG,
+   BRW_SS_SURFACE    = BRW_DATA_SS_SURFACE,
+   BRW_SS_SURF_BIND  = BRW_DATA_SS_SURF_BIND,
+
+   BRW_MAX_CACHE
+};
+
+/**
+ * One entry of a brw_cache: a key (with its hash and relocations) and
+ * the buffer object cached for it.  Entries are chained through 'next'.
+ */
+struct brw_cache_item {
+   /**
+    * Effectively part of the key, cache_id identifies what kind of state
+    * buffer is involved, and also which brw->state.dirty.cache flag should
+    * be set when this cache item is chosen.
+    */
+   enum brw_cache_id cache_id;
+   /** 32-bit hash of the key data */
+   GLuint hash;
+   GLuint key_size;		/* for variable-sized keys */
+   const void *key;
+   struct brw_winsys_reloc *relocs;
+   GLuint nr_relocs;
+
+   struct brw_winsys_buffer *bo;
+   GLuint data_size;
+
+   /* Next item in the chain (presumably the same hash bucket -- confirm) */
+   struct brw_cache_item *next;
+};   
+
+
+
+/**
+ * A cache of state buffer objects.  struct brw_context holds two of
+ * these: 'cache' (non-surface items) and 'surface_cache'.
+ */
+struct brw_cache {
+   struct brw_context *brw;
+   struct brw_winsys_screen *sws;
+
+   /* Array of item chains, with its size and the total item count.
+    * NOTE(review): presumably a hash table indexed by hash % size -- confirm.
+    */
+   struct brw_cache_item **items;
+   GLuint size, n_items;
+
+   enum brw_buffer_type buffer_type;
+
+   /* Per-cache_id bookkeeping, indexed by enum brw_cache_id: */
+   GLuint key_size[BRW_MAX_CACHE];		/* for fixed-size keys */
+   GLuint aux_size[BRW_MAX_CACHE];
+   char *name[BRW_MAX_CACHE];
+   
+
+   /* Record of the last BOs chosen for each cache_id.  Used to set
+    * brw->state.dirty.cache when a new cache item is chosen.
+    */
+   struct brw_winsys_buffer *last_bo[BRW_MAX_CACHE];
+};
+
+
+/**
+ * A unit of tracked state: a set of dirty flags plus two callbacks
+ * that take the context.
+ * NOTE(review): presumably the callbacks run when any flag in 'dirty'
+ * is set, and return 0 on success -- confirm against brw_state code.
+ */
+struct brw_tracked_state {
+   struct brw_state_flags dirty;
+   int (*prepare)( struct brw_context *brw );
+   int (*emit)( struct brw_context *brw );
+};
+
+/* Flags for brw->state.dirty.cache.  Each flag is the bit whose
+ * position is the corresponding enum brw_cache_id value.
+ */
+#define CACHE_NEW_CC_VP                  (1<<BRW_CC_VP)
+#define CACHE_NEW_CC_UNIT                (1<<BRW_CC_UNIT)
+#define CACHE_NEW_WM_PROG                (1<<BRW_WM_PROG)
+#define CACHE_NEW_SAMPLER_DEFAULT_COLOR  (1<<BRW_SAMPLER_DEFAULT_COLOR)
+#define CACHE_NEW_SAMPLER                (1<<BRW_SAMPLER)
+#define CACHE_NEW_WM_UNIT                (1<<BRW_WM_UNIT)
+#define CACHE_NEW_SF_PROG                (1<<BRW_SF_PROG)
+#define CACHE_NEW_SF_VP                  (1<<BRW_SF_VP)
+#define CACHE_NEW_SF_UNIT                (1<<BRW_SF_UNIT)
+#define CACHE_NEW_VS_UNIT                (1<<BRW_VS_UNIT)
+#define CACHE_NEW_VS_PROG                (1<<BRW_VS_PROG)
+#define CACHE_NEW_GS_UNIT                (1<<BRW_GS_UNIT)
+#define CACHE_NEW_GS_PROG                (1<<BRW_GS_PROG)
+#define CACHE_NEW_CLIP_VP                (1<<BRW_CLIP_VP)
+#define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT)
+#define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG)
+#define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE)
+#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND)
+
+/**
+ * Node of a singly linked list hanging off
+ * brw_context::cached_batch_items: a packet header pointer and a size.
+ * NOTE(review): presumably used to skip re-emitting identical batch
+ * packets; units of 'sz' not visible here -- confirm.
+ */
+struct brw_cached_batch_item {
+   struct header *header;
+   GLuint sz;
+   struct brw_cached_batch_item *next;
+};
+   
+
+
+/* Protect against a future where VERT_ATTRIB_MAX > 32.  Wouldn't life
+ * be easier if C allowed arrays of packed elements?
+ *
+ * Number of 32-bit words needed for one bit per shader input
+ * (PIPE_MAX_SHADER_INPUTS rounded up to a multiple of 32).
+ */
+#define VS_INPUT_BITMASK_DWORDS  ((PIPE_MAX_SHADER_INPUTS+31)/32)
+
+
+
+
+/**
+ * Packed per-input sizes: two bits per vertex shader input, hence twice
+ * as many dwords as a one-bit-per-input mask.
+ */
+struct brw_vertex_info {
+   GLuint sizes[VS_INPUT_BITMASK_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
+};
+
+
+/**
+ * A query object.  Active queries are linked into the list headed by
+ * brw_context::query.active_head.
+ */
+struct brw_query_object {
+   /** Doubly linked list of active query objects in the context. */
+   struct brw_query_object *prev, *next;
+
+   /** Last query BO associated with this query. */
+   struct brw_winsys_buffer *bo;
+   /** First index in bo with query data for this object. */
+   int first_index;
+   /** Last index in bo with query data for this object. */
+   int last_index;
+
+   /* Total count of pixels from previous BOs */
+   uint64_t result;
+};
+
+/* NOTE(review): presumably the index into brw_context::cc.reloc[] of the
+ * relocation for the CC viewport -- confirm against the CC state code.
+ */
+#define CC_RELOC_VP 0
+
+
+/**
+ * brw_context is derived from pipe_context
+ *
+ * 'base' must remain the first member: brw_context() converts a
+ * pipe_context pointer to a brw_context pointer with a plain cast.
+ */
+struct brw_context 
+{
+   struct pipe_context base;
+   struct brw_chipset chipset;     /* copied from the screen at creation */
+
+   struct brw_winsys_screen *sws;  /* copied from the screen at creation */
+
+   struct brw_batchbuffer *batch;
+
+   GLuint primitive;
+   GLuint reduced_primitive;
+
+   /* Active state from the state tracker: 
+    */
+   struct {
+      struct brw_vertex_shader *vertex_shader;
+      struct brw_fragment_shader *fragment_shader;
+      const struct brw_blend_state *blend;
+      const struct brw_rasterizer_state *rast;
+      const struct brw_depth_stencil_state *zstencil;
+      const struct brw_vertex_element_packet *velems;
+
+      const struct brw_sampler *sampler[PIPE_MAX_SAMPLERS];
+      unsigned num_samplers;
+
+      struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+      struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+      unsigned num_fragment_sampler_views;
+      unsigned num_vertex_buffers;
+
+      struct pipe_scissor_state scissor;
+      struct pipe_viewport_state viewport;
+      struct pipe_stencil_ref stencil_ref;
+      struct pipe_framebuffer_state fb;
+      struct pipe_clip_state ucp;
+      struct pipe_resource *vertex_constants;
+      struct pipe_resource *fragment_constants;
+
+      struct brw_blend_constant_color bcc;
+      struct brw_cc1 cc1_stencil_ref;
+      struct brw_polygon_stipple bps;
+      struct brw_cc_viewport ccv;
+
+      /**
+       * Index buffer for this draw_prims call.
+       *
+       * Updates are signaled by PIPE_NEW_INDEX_BUFFER.
+       */
+      struct pipe_resource *index_buffer;
+      unsigned index_size;
+
+      /* Updates are signalled by PIPE_NEW_INDEX_RANGE:
+       */
+      unsigned min_index;
+      unsigned max_index;
+
+   } curr;
+
+   struct {
+      /* Accumulated dirty bits; mesa and brw are set to all-ones when
+       * the context is created.
+       */
+      struct brw_state_flags dirty;
+
+      /**
+       * List of buffers accumulated in brw_validate_state to receive
+       * dri_bo_check_aperture treatment before exec, so we can know if we
+       * should flush the batch and try again before emitting primitives.
+       *
+       * This can be a fixed number as we only have a limited number of
+       * objects referenced from the batchbuffer in a primitive emit,
+       * consisting of the vertex buffers, pipelined state pointers,
+       * the CURBE, the depth buffer, and a query BO.
+       */
+      struct brw_winsys_buffer *validated_bos[PIPE_MAX_SHADER_INPUTS + 16];
+      int validated_bo_count;
+   } state;
+
+   struct brw_cache cache;  /**< non-surface items */
+   struct brw_cache surface_cache;  /**< surface items */
+   struct brw_cached_batch_item *cached_batch_items;
+
+   struct {
+      struct u_upload_mgr *upload_vertex;
+      struct u_upload_mgr *upload_index;
+      
+      /* Information on uploaded vertex buffers:
+       */
+      struct {
+	 unsigned stride;	/* in bytes between successive vertices */
+	 unsigned offset;	/* in bytes, of first vertex in bo */
+	 unsigned vertex_count;	/* count of valid vertices which may be accessed */
+	 struct brw_winsys_buffer *bo;
+      } vb[PIPE_MAX_ATTRIBS];
+
+      unsigned nr_vb;		/* currently the same as curr.num_vertex_buffers */
+   } vb;
+
+   struct {
+      /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
+      struct brw_winsys_buffer *bo;
+      unsigned int offset;
+      unsigned int size;
+      /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
+       * avoid re-uploading the IB packet over and over if we're actually
+       * referencing the same index buffer.
+       */
+      unsigned int start_vertex_offset;
+   } ib;
+
+
+   /* BRW_NEW_URB_ALLOCATIONS:
+    *
+    * NOTE(review): no BRW_NEW_URB_ALLOCATIONS flag is defined in this
+    * header; BRW_NEW_URB_FENCE is presumably the flag meant -- confirm.
+    */
+   struct {
+      GLuint vsize;		/* vertex size plus header in urb registers */
+      GLuint csize;		/* constant buffer size in urb registers */
+      GLuint sfsize;		/* setup data size in urb registers */
+
+      GLboolean constrained;
+
+      /* Number of URB entries given to each unit: */
+      GLuint nr_vs_entries;
+      GLuint nr_gs_entries;
+      GLuint nr_clip_entries;
+      GLuint nr_sf_entries;
+      GLuint nr_cs_entries;
+
+      /* Start of each unit's region within the URB: */
+      GLuint vs_start;
+      GLuint gs_start;
+      GLuint clip_start;
+      GLuint sf_start;
+      GLuint cs_start;
+   } urb;
+
+   
+   /* BRW_NEW_CURBE_OFFSETS: 
+    */
+   struct {
+      GLuint wm_start;  /**< pos of first wm const in CURBE buffer */
+      GLuint wm_size;   /**< number of float[4] consts, multiple of 16 */
+      GLuint clip_start;
+      GLuint clip_size;
+      GLuint vs_start;
+      GLuint vs_size;
+      GLuint total_size;
+
+      struct brw_winsys_buffer *curbe_bo;
+      /** Offset within curbe_bo of space for current curbe entry */
+      GLuint curbe_offset;
+      /** Offset within curbe_bo of space for next curbe entry */
+      GLuint curbe_next_offset;
+
+      GLfloat *last_buf;
+      GLuint last_bufsz;
+      /**
+       * Whether we should create a new bo instead of reusing the old one
+       * (if we just dispatched the batch pointing at the old one).
+       */
+      GLboolean need_new_bo;
+   } curbe;
+
+   /* Vertex shader unit: current program data and buffer objects. */
+   struct {
+      struct brw_vs_prog_data *prog_data;
+
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+
+      /** Binding table of pointers to surf_bo entries */
+      struct brw_winsys_buffer *bind_bo;
+      struct brw_winsys_buffer *surf_bo[BRW_VS_MAX_SURF];
+      GLuint nr_surfaces;      
+   } vs;
+
+   /* Geometry shader unit. */
+   struct {
+      struct brw_gs_prog_data *prog_data;
+
+      GLboolean prog_active;
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+   } gs;
+
+   /* Clipper unit. */
+   struct {
+      struct brw_clip_prog_data *prog_data;
+
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+      struct brw_winsys_buffer *vp_bo;
+   } clip;
+
+
+   /* Setup (SF) unit. */
+   struct {
+      struct brw_sf_prog_data *prog_data;
+
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+      struct brw_winsys_buffer *vp_bo;
+   } sf;
+
+   /* Windower (WM) unit. */
+   struct {
+      struct brw_wm_prog_data *prog_data;
+      struct brw_wm_compile *compile_data;
+
+      /** Input sizes, calculated from active vertex program.
+       * One bit per fragment program input attribute.
+       */
+      /*GLbitfield input_size_masks[4];*/
+
+      /** Array of surface default colors (texture border color) */
+      struct brw_winsys_buffer *sdc_bo[BRW_MAX_TEX_UNIT];
+
+      GLuint render_surf;
+      GLuint nr_surfaces;      
+
+      GLuint max_threads;
+      struct brw_winsys_buffer *scratch_bo;
+
+      GLuint sampler_count;
+      struct brw_winsys_buffer *sampler_bo;
+
+      /** Binding table of pointers to surf_bo entries */
+      struct brw_winsys_buffer *bind_bo;
+      struct brw_winsys_buffer *surf_bo[BRW_WM_MAX_SURF];
+
+      struct brw_winsys_buffer *prog_bo;
+      struct brw_winsys_buffer *state_bo;
+   } wm;
+
+
+   /* Color calculator (CC) unit. */
+   struct {
+      struct brw_winsys_buffer *state_bo;
+
+      struct brw_cc_unit_state cc;
+      struct brw_winsys_reloc reloc[1];
+   } cc;
+
+   struct {
+      /* List head, initialised with make_empty_list() at context creation */
+      struct brw_query_object active_head;
+      struct brw_winsys_buffer *bo;
+      int index;
+      GLboolean active;
+      int stats_wm;
+   } query;
+
+   /* always_emit_state is cleared explicitly at context creation. */
+   struct {
+      unsigned always_emit_state:1;
+      unsigned always_flush_batch:1;
+      unsigned force_swtnl:1;
+      unsigned no_swtnl:1;
+   } flags;
+
+   /* Used to give every program string a unique id
+    */
+   GLuint program_id;
+};
+
+
+
+/*======================================================================
+ * brw_queryobj.c
+ */
+void brw_init_query(struct brw_context *brw);
+enum pipe_error brw_prepare_query_begin(struct brw_context *brw);
+void brw_emit_query_begin(struct brw_context *brw);
+void brw_emit_query_end(struct brw_context *brw);
+
+/*======================================================================
+ * brw_state_dump.c
+ */
+void brw_debug_batch(struct brw_context *intel);
+
+
+/*======================================================================
+ * brw_pipe_*.c
+ *
+ * One init/cleanup pair per state category.  brw_create_context() calls
+ * every *_init function below.
+ */
+void brw_pipe_blend_init( struct brw_context *brw );
+void brw_pipe_depth_stencil_init( struct brw_context *brw );
+void brw_pipe_framebuffer_init( struct brw_context *brw );
+void brw_pipe_flush_init( struct brw_context *brw );
+void brw_pipe_misc_init( struct brw_context *brw );
+void brw_pipe_query_init( struct brw_context *brw );
+void brw_pipe_rast_init( struct brw_context *brw );
+void brw_pipe_sampler_init( struct brw_context *brw );
+void brw_pipe_shader_init( struct brw_context *brw );
+void brw_pipe_vertex_init( struct brw_context *brw );
+void brw_pipe_clear_init( struct brw_context *brw );
+
+
+void brw_pipe_blend_cleanup( struct brw_context *brw );
+void brw_pipe_depth_stencil_cleanup( struct brw_context *brw );
+void brw_pipe_framebuffer_cleanup( struct brw_context *brw );
+void brw_pipe_flush_cleanup( struct brw_context *brw );
+void brw_pipe_misc_cleanup( struct brw_context *brw );
+void brw_pipe_query_cleanup( struct brw_context *brw );
+void brw_pipe_rast_cleanup( struct brw_context *brw );
+void brw_pipe_sampler_cleanup( struct brw_context *brw );
+void brw_pipe_shader_cleanup( struct brw_context *brw );
+void brw_pipe_vertex_cleanup( struct brw_context *brw );
+void brw_pipe_clear_cleanup( struct brw_context *brw );
+
+/* CC unit init/cleanup; brw_hw_cc_init() is also called at context
+ * creation.
+ */
+void brw_hw_cc_init( struct brw_context *brw );
+void brw_hw_cc_cleanup( struct brw_context *brw );
+
+
+
+void brw_context_flush( struct brw_context *brw );
+
+
+/* brw_urb.c
+ */
+int brw_upload_urb_fence(struct brw_context *brw);
+
+/* brw_curbe.c
+ */
+int brw_upload_cs_urb_state(struct brw_context *brw);
+
+/* brw_context.c
+ *
+ * Returns NULL when the context cannot be created.
+ */
+struct pipe_context *brw_create_context(struct pipe_screen *screen,
+					void *priv);
+
+/*======================================================================
+ * Typed downcast helper, replacing the cast macros used previously.
+ *
+ * Converts a pipe_context pointer into a pointer to the brw_context that
+ * embeds it.  Because 'base' is the first member of struct brw_context,
+ * both pointers refer to the same address and a plain cast is enough.
+ */
+static INLINE struct brw_context *
+brw_context( struct pipe_context *ctx )
+{
+   struct brw_context *brw = (struct brw_context *) ctx;
+
+   return brw;
+}
+
+
+/* Chipset-family tests.  'brw' is a struct brw_context pointer; each
+ * macro reads one flag of its chipset member.
+ */
+#define BRW_IS_965(brw)    ((brw)->chipset.is_965)
+#define BRW_IS_IGDNG(brw)  ((brw)->chipset.is_igdng)
+#define BRW_IS_G4X(brw)    ((brw)->chipset.is_g4x)
+
+
+#endif
+
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
new file mode 100644
index 0000000000..a701de33f5
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -0,0 +1,382 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+#include "brw_debug.h"
+
+
+/**
+ * Partition the CURBE between the various users of constant values.
+ * The layout is WM constants first, then the clip planes, then VS
+ * constants.
+ *
+ * Note that vertex and fragment shaders can now fetch constants out
+ * of constant buffers.  We no longer allocate a block of the GRF for
+ * constants.  That greatly reduces the demand for space in the CURBE.
+ * Some of the comments within are dated...
+ *
+ * Raises BRW_NEW_CURBE_OFFSETS when a new layout is chosen; returns 0.
+ */
+static int calculate_curbe_offsets( struct brw_context *brw )
+{
+   /* CACHE_NEW_WM_PROG */
+   const GLuint wm_regs = brw->wm.prog_data->curb_read_length;
+
+   /* BRW_NEW_VERTEX_PROGRAM */
+   const GLuint vs_regs = brw->vs.prog_data->curb_read_length;
+
+   /* PIPE_NEW_CLIP: once user clip planes are in use the six fixed
+    * planes are counted too; four values per plane, rounded up to
+    * whole 16-value registers.
+    */
+   const GLuint clip_regs = brw->curr.ucp.nr
+      ? ((GLuint)(6 + brw->curr.ucp.nr) * 4 + 15) / 16
+      : 0;
+   const GLuint total_regs = wm_regs + vs_regs + clip_regs;
+
+   /* When this is > 32, want to use a true constant buffer to hold
+    * the extra constants.
+    */
+   assert(total_regs <= 32);
+
+   /* Lazy resize: keep the current layout while the WM and VS sections
+    * are still large enough, the clip section is unchanged, and the
+    * total is either small (<= 16) or no more than about four times
+    * what is needed.
+    */
+   if (wm_regs <= brw->curbe.wm_size &&
+       vs_regs <= brw->curbe.vs_size &&
+       clip_regs == brw->curbe.clip_size &&
+       (total_regs >= brw->curbe.total_size / 4 ||
+        brw->curbe.total_size <= 16))
+      return 0;
+
+   /* Calculate a new layout:
+    */
+   brw->curbe.wm_start   = 0;
+   brw->curbe.wm_size    = wm_regs;
+   brw->curbe.clip_start = wm_regs;
+   brw->curbe.clip_size  = clip_regs;
+   brw->curbe.vs_start   = wm_regs + clip_regs;
+   brw->curbe.vs_size    = vs_regs;
+   brw->curbe.total_size = wm_regs + clip_regs + vs_regs;
+
+   if (BRW_DEBUG & DEBUG_CURBE)
+      debug_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
+                   brw->curbe.wm_start, brw->curbe.wm_size,
+                   brw->curbe.clip_start, brw->curbe.clip_size,
+                   brw->curbe.vs_start, brw->curbe.vs_size );
+
+   brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
+
+   return 0;
+}
+
+
+/* State atom for calculate_curbe_offsets() and the dirty bits it reads. */
+const struct brw_tracked_state brw_curbe_offsets = {
+   .prepare = calculate_curbe_offsets,
+   .dirty = {
+      .cache = CACHE_NEW_WM_PROG,
+      .brw   = BRW_NEW_VERTEX_PROGRAM,
+      .mesa  = PIPE_NEW_CLIP
+   }
+};
+
+
+
+/* Define the number of curbes within CS's urb allocation.  Multiple
+ * urb entries -> multiple curbes.  These will be used by
+ * fixed-function hardware in a double-buffering scheme to avoid a
+ * pipeline stall each time the contents of the curbe are changed.
+ */
+int brw_upload_cs_urb_state(struct brw_context *brw)
+{
+   struct brw_cs_urb_state packet;
+
+   /* BRW_NEW_URB_FENCE */
+   assert(brw->urb.nr_cs_entries);
+
+   /* It appears that this is the state packet for the CS unit, ie. the
+    * urb entries detailed here are housed in the CS range from the
+    * URB_FENCE command.
+    */
+   memset(&packet, 0, sizeof(packet));
+   packet.header.opcode = CMD_CS_URB_STATE;
+   packet.header.length = sizeof(packet)/4 - 2;
+   packet.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
+   packet.bits0.urb_entry_size = brw->urb.csize - 1;
+
+   BRW_CACHED_BATCH_STRUCT(brw, &packet);
+   return 0;
+}
+
+/* The six fixed planes, four coefficients each.  They are copied in
+ * front of the user clip planes by prepare_curbe_buffer() and are never
+ * written, hence const. */
+static const GLfloat fixed_plane[6][4] = {
+   { 0,  0, -1, 1 },   { 0,  0,  1, 1 },
+   { 0, -1,  0, 1 },   { 0,  1,  0, 1 },
+   {-1,  0,  0, 1 },   { 1,  0,  0, 1 }
+};
+
+/* Upload a new set of constants.  Too much variability to go into the
+ * cache mechanism, but the new data is compared against the last
+ * upload (brw->curbe.last_buf) so identical constants are not re-sent.
+ * Returns 0, PIPE_ERROR_OUT_OF_MEMORY, or the error from bo_alloc().
+ */
+static enum pipe_error prepare_curbe_buffer(struct brw_context *brw)
+{
+   const GLuint sz = brw->curbe.total_size;
+   const GLuint bufsz = sz * 16 * sizeof(GLfloat);
+   enum pipe_error ret;
+   GLfloat *buf;
+   GLuint i;
+
+   if (sz == 0) {
+      if (brw->curbe.last_buf) {
+         FREE(brw->curbe.last_buf);
+         brw->curbe.last_buf = NULL;
+         brw->curbe.last_bufsz  = 0;
+      }
+      return 0;
+   }
+
+   /* Zero-filled staging copy of the CURBE contents.  Every section
+    * below writes through buf, so bail out if the allocation failed.
+    */
+   buf = (GLfloat *) CALLOC(bufsz, 1);
+   if (buf == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* fragment shader constants */
+   if (brw->curbe.wm_size) {
+      const struct brw_fragment_shader *fs = brw->curr.fragment_shader;
+      GLuint offset = brw->curbe.wm_start * 16;
+      GLuint nr_immediate, nr_const;
+
+      nr_immediate = fs->immediates.nr;
+      if (nr_immediate) {
+         memcpy(&buf[offset],
+                fs->immediates.data,
+                nr_immediate * 4 * sizeof(float));
+
+         offset += nr_immediate * 4;
+      }
+
+      nr_const = fs->info.file_max[TGSI_FILE_CONSTANT] + 1;
+/*      nr_const = brw->wm.prog_data->nr_params; */
+      if (nr_const) {
+         pipe_buffer_read(&brw->base, brw->curr.fragment_constants,
+                          0, nr_const * 4 * sizeof(float),
+                          &buf[offset]);
+      }
+   }
+
+   /* The clipplanes are actually delivered to both CLIP and VS units.
+    * VS uses them to calculate the outcode bitmasks.
+    */
+   if (brw->curbe.clip_size) {
+      GLuint offset = brw->curbe.clip_start * 16;
+      GLuint j;
+
+      /* If any planes are going this way, send them all this way:
+       */
+      for (i = 0; i < 6; i++) {
+         buf[offset + i * 4 + 0] = fixed_plane[i][0];
+         buf[offset + i * 4 + 1] = fixed_plane[i][1];
+         buf[offset + i * 4 + 2] = fixed_plane[i][2];
+         buf[offset + i * 4 + 3] = fixed_plane[i][3];
+      }
+
+      /* Clip planes: appended after the fixed ones, so i keeps counting.
+       */
+      assert(brw->curr.ucp.nr <= 6);
+      for (j = 0; j < brw->curr.ucp.nr; j++) {
+         buf[offset + i * 4 + 0] = brw->curr.ucp.ucp[j][0];
+         buf[offset + i * 4 + 1] = brw->curr.ucp.ucp[j][1];
+         buf[offset + i * 4 + 2] = brw->curr.ucp.ucp[j][2];
+         buf[offset + i * 4 + 3] = brw->curr.ucp.ucp[j][3];
+         i++;
+      }
+   }
+
+   /* vertex shader constants */
+   if (brw->curbe.vs_size) {
+      GLuint offset = brw->curbe.vs_start * 16;
+      const struct brw_vertex_shader *vs = brw->curr.vertex_shader;
+      GLuint nr_immediate, nr_const;
+
+      nr_immediate = vs->immediates.nr;
+      if (nr_immediate) {
+         memcpy(&buf[offset],
+                vs->immediates.data,
+                nr_immediate * 4 * sizeof(float));
+
+         offset += nr_immediate * 4;
+      }
+
+      nr_const = vs->info.file_max[TGSI_FILE_CONSTANT] + 1;
+      if (nr_const) {
+         /* XXX: note that constant buffers are currently *already* in
+          * buffer objects.  If we want to keep on putting them into the
+          * curbe, makes sense to treat constbuf's specially with malloc.
+          */
+         /* XXX: what if user's constant buffer is too small?
+          */
+         pipe_buffer_read(&brw->base, brw->curr.vertex_constants,
+                          0, nr_const * 4 * sizeof(float),
+                          &buf[offset]);
+      }
+   }
+
+   if (BRW_DEBUG & DEBUG_CURBE) {
+      for (i = 0; i < sz*16; i+=4)
+         debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+                      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
+
+      /* last_buf only holds last_bufsz bytes: compare equal sizes only. */
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+                   (void *)brw->curbe.last_buf, (void *)buf,
+                   bufsz, brw->curbe.last_bufsz,
+                   (brw->curbe.last_buf && bufsz == brw->curbe.last_bufsz) ?
+                   memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
+   }
+
+   if (brw->curbe.curbe_bo != NULL &&
+       brw->curbe.last_buf &&
+       bufsz == brw->curbe.last_bufsz &&
+       memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
+      /* constants have not changed */
+      FREE(buf);
+   }
+   else {
+      /* constants have changed */
+      FREE(brw->curbe.last_buf);
+
+      brw->curbe.last_buf = buf;
+      brw->curbe.last_bufsz = bufsz;
+
+      if (brw->curbe.curbe_bo != NULL &&
+          (brw->curbe.need_new_bo ||
+           brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size))
+      {
+         bo_reference(&brw->curbe.curbe_bo, NULL);
+      }
+
+      if (brw->curbe.curbe_bo == NULL) {
+         /* Allocate a single page for CURBE entries for this
+          * batchbuffer.  They're generally around 64b.  We will
+          * discard the curbe buffer after the batch is flushed to
+          * avoid synchronous updates.
+          */
+         ret = brw->sws->bo_alloc(brw->sws,
+                                  BRW_BUFFER_TYPE_CURBE,
+                                  4096, 1 << 6,
+                                  &brw->curbe.curbe_bo);
+         if (ret)
+            return ret;
+
+         brw->curbe.curbe_next_offset = 0;
+      }
+
+      brw->curbe.curbe_offset = brw->curbe.curbe_next_offset;
+      brw->curbe.curbe_next_offset += bufsz;
+      brw->curbe.curbe_next_offset = align(brw->curbe.curbe_next_offset, 64);
+
+      /* Copy data to the buffer:
+       */
+      brw->sws->bo_subdata(brw->curbe.curbe_bo,
+                           BRW_DATA_CONSTANT_BUFFER,
+                           brw->curbe.curbe_offset, bufsz,
+                           buf, NULL, 0);
+   }
+
+   brw_add_validated_bo(brw, brw->curbe.curbe_bo);
+
+   /* Because this provokes an action (ie copy the constants into the
+    * URB), it shouldn't be shortcircuited if identical to the
+    * previous time - because eg. the urb destination may have
+    * changed, or the urb contents different to last time.
+    *
+    * Note that the data referred to is actually copied internally,
+    * not just used in place according to passed pointer.
+    *
+    * It appears that the CS unit takes care of using each available
+    * URB entry (Const URB Entry == CURBE) in turn, and issuing
+    * flushes as necessary when doublebuffering of CURBEs isn't
+    * possible.
+    */
+
+   return 0;
+}
+
+/* Emit the two-dword CONST_BUFFER command: bit 8 plus a relocation into
+ * curbe_bo when there is CURBE data, otherwise a zero second dword. */
+static enum pipe_error emit_curbe_buffer(struct brw_context *brw)
+{
+   const GLuint nr_regs = brw->curbe.total_size;
+   BEGIN_BATCH(2, IGNORE_CLIPRECTS);
+   if (nr_regs != 0) {
+      OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2));
+      OUT_RELOC(brw->curbe.curbe_bo, BRW_USAGE_STATE,
+                (nr_regs - 1) + brw->curbe.curbe_offset);
+   } else {
+      OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
+      OUT_BATCH(0);
+   }
+   ADVANCE_BATCH();
+   return 0;
+}
+
+/* State atom for the CURBE data: .prepare builds it, .emit points at it. */
+const struct brw_tracked_state brw_curbe_buffer = {
+   .prepare = prepare_curbe_buffer,
+   .emit = emit_curbe_buffer,
+   .dirty = {
+      .mesa = (PIPE_NEW_FRAGMENT_CONSTANTS | PIPE_NEW_VERTEX_CONSTANTS |
+               PIPE_NEW_CLIP),
+      .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
+               BRW_NEW_VERTEX_PROGRAM |
+               BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
+               BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
+               BRW_NEW_CURBE_OFFSETS |
+               BRW_NEW_BATCH),
+      .cache = (CACHE_NEW_WM_PROG)
+   }
+};
+
diff --git a/src/gallium/drivers/i965/brw_debug.h b/src/gallium/drivers/i965/brw_debug.h
new file mode 100644
index 0000000000..ae8e9254a6
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_debug.h
@@ -0,0 +1,43 @@
+#ifndef BRW_DEBUG_H
+#define BRW_DEBUG_H
+
+/* ================================================================
+ * Debugging: one bit per category, tested as (BRW_DEBUG & DEBUG_xxx).
+ */
+
+#define DEBUG_TEXTURE           (1 << 0)
+#define DEBUG_STATE             (1 << 1)
+#define DEBUG_IOCTL             (1 << 2)
+#define DEBUG_BLIT              (1 << 3)
+#define DEBUG_CURBE             (1 << 4)
+#define DEBUG_FALLBACKS         (1 << 5)
+#define DEBUG_VERBOSE           (1 << 6)
+#define DEBUG_BATCH             (1 << 7)
+#define DEBUG_PIXEL             (1 << 8)
+#define DEBUG_WINSYS            (1 << 9)
+#define DEBUG_MIN_URB           (1 << 10)
+#define DEBUG_DISASSEM          (1 << 11)
+#define DEBUG_unused3           (1 << 12)
+#define DEBUG_SYNC              (1 << 13)
+#define DEBUG_PRIMS             (1 << 14)
+#define DEBUG_VERTS             (1 << 15)
+#define DEBUG_unused4           (1 << 16)
+#define DEBUG_DMA               (1 << 17)
+#define DEBUG_SANITY            (1 << 18)
+#define DEBUG_SLEEP             (1 << 19)
+#define DEBUG_STATS             (1 << 20)
+#define DEBUG_unused5           (1 << 21)
+#define DEBUG_SINGLE_THREAD     (1 << 22)
+#define DEBUG_WM                (1 << 23)
+#define DEBUG_URB               (1 << 24)
+#define DEBUG_VS                (1 << 25)
+/* A real variable in DEBUG builds; otherwise the constant 0. */
+#ifdef DEBUG
+extern int BRW_DEBUG;
+#else
+#define BRW_DEBUG 0
+#endif
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
new file mode 100644
index 0000000000..e201ce4d7c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -0,0 +1,847 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+
+#ifndef BRW_DEFINES_H
+#define BRW_DEFINES_H
+
+/* 3D state:
+ */
+#define _3DOP_3DSTATE_PIPELINED       0x0
+#define _3DOP_3DSTATE_NONPIPELINED    0x1
+#define _3DOP_3DCONTROL               0x2
+#define _3DOP_3DPRIMITIVE             0x3
+/* Sub-opcode values; numbering restarts at 0x00 for each run below. */
+#define _3DSTATE_PIPELINED_POINTERS       0x00
+#define _3DSTATE_BINDING_TABLE_POINTERS   0x01
+#define _3DSTATE_VERTEX_BUFFERS           0x08
+#define _3DSTATE_VERTEX_ELEMENTS          0x09
+#define _3DSTATE_INDEX_BUFFER             0x0A
+#define _3DSTATE_VF_STATISTICS            0x0B
+#define _3DSTATE_DRAWING_RECTANGLE            0x00
+#define _3DSTATE_CONSTANT_COLOR               0x01
+#define _3DSTATE_SAMPLER_PALETTE_LOAD         0x02
+#define _3DSTATE_CHROMA_KEY                   0x04
+#define _3DSTATE_DEPTH_BUFFER                 0x05
+#define _3DSTATE_POLY_STIPPLE_OFFSET          0x06
+#define _3DSTATE_POLY_STIPPLE_PATTERN         0x07
+#define _3DSTATE_LINE_STIPPLE                 0x08
+#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP    0x09
+#define _3DCONTROL    0x00
+
+#define PIPE_CONTROL_NOWRITE          0x00
+#define PIPE_CONTROL_WRITEIMMEDIATE   0x01
+#define PIPE_CONTROL_WRITEDEPTH       0x02
+#define PIPE_CONTROL_WRITETIMESTAMP   0x03
+
+#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00
+#define PIPE_CONTROL_GTTWRITE_GLOBAL        0x01
+/* Primitive topology codes (0x01..0x15, contiguous). */
+#define _3DPRIM_POINTLIST         0x01
+#define _3DPRIM_LINELIST          0x02
+#define _3DPRIM_LINESTRIP         0x03
+#define _3DPRIM_TRILIST           0x04
+#define _3DPRIM_TRISTRIP          0x05
+#define _3DPRIM_TRIFAN            0x06
+#define _3DPRIM_QUADLIST          0x07
+#define _3DPRIM_QUADSTRIP         0x08
+#define _3DPRIM_LINELIST_ADJ      0x09
+#define _3DPRIM_LINESTRIP_ADJ     0x0A
+#define _3DPRIM_TRILIST_ADJ       0x0B
+#define _3DPRIM_TRISTRIP_ADJ      0x0C
+#define _3DPRIM_TRISTRIP_REVERSE  0x0D
+#define _3DPRIM_POLYGON           0x0E
+#define _3DPRIM_RECTLIST          0x0F
+#define _3DPRIM_LINELOOP          0x10
+#define _3DPRIM_POINTLIST_BF      0x11
+#define _3DPRIM_LINESTRIP_CONT    0x12
+#define _3DPRIM_LINESTRIP_BF      0x13
+#define _3DPRIM_LINESTRIP_CONT_BF 0x14
+#define _3DPRIM_TRIFAN_NOSTIPPLE  0x15
+
+#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0
+#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     1
+/* Anisotropy ratio N is encoded as N/2 - 1. */
+#define BRW_ANISORATIO_2     0 
+#define BRW_ANISORATIO_4     1 
+#define BRW_ANISORATIO_6     2 
+#define BRW_ANISORATIO_8     3 
+#define BRW_ANISORATIO_10    4 
+#define BRW_ANISORATIO_12    5 
+#define BRW_ANISORATIO_14    6 
+#define BRW_ANISORATIO_16    7
+/* Blend factors: each INV_ variant is the base value with bit 0x10 set. */
+#define BRW_BLENDFACTOR_ONE                 0x1
+#define BRW_BLENDFACTOR_SRC_COLOR           0x2
+#define BRW_BLENDFACTOR_SRC_ALPHA           0x3
+#define BRW_BLENDFACTOR_DST_ALPHA           0x4
+#define BRW_BLENDFACTOR_DST_COLOR           0x5
+#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE  0x6
+#define BRW_BLENDFACTOR_CONST_COLOR         0x7
+#define BRW_BLENDFACTOR_CONST_ALPHA         0x8
+#define BRW_BLENDFACTOR_SRC1_COLOR          0x9
+#define BRW_BLENDFACTOR_SRC1_ALPHA          0x0A
+#define BRW_BLENDFACTOR_ZERO                0x11
+#define BRW_BLENDFACTOR_INV_SRC_COLOR       0x12
+#define BRW_BLENDFACTOR_INV_SRC_ALPHA       0x13
+#define BRW_BLENDFACTOR_INV_DST_ALPHA       0x14
+#define BRW_BLENDFACTOR_INV_DST_COLOR       0x15
+#define BRW_BLENDFACTOR_INV_CONST_COLOR     0x17
+#define BRW_BLENDFACTOR_INV_CONST_ALPHA     0x18
+#define BRW_BLENDFACTOR_INV_SRC1_COLOR      0x19
+#define BRW_BLENDFACTOR_INV_SRC1_ALPHA      0x1A
+
+#define BRW_BLENDFUNCTION_ADD               0
+#define BRW_BLENDFUNCTION_SUBTRACT          1
+#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT  2
+#define BRW_BLENDFUNCTION_MIN               3
+#define BRW_BLENDFUNCTION_MAX               4
+
+#define BRW_ALPHATEST_FORMAT_UNORM8         0
+#define BRW_ALPHATEST_FORMAT_FLOAT32        1
+
+#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH  0
+#define BRW_CHROMAKEY_REPLACE_BLACK      1
+
+#define BRW_CLIP_API_OGL     0
+#define BRW_CLIP_API_DX      1
+
+#define BRW_CLIPMODE_NORMAL              0
+#define BRW_CLIPMODE_CLIP_ALL            1
+#define BRW_CLIPMODE_CLIP_NON_REJECTED   2
+#define BRW_CLIPMODE_REJECT_ALL          3
+#define BRW_CLIPMODE_ACCEPT_ALL          4
+#define BRW_CLIPMODE_KERNEL_CLIP         5
+
+#define BRW_CLIP_NDCSPACE     0
+#define BRW_CLIP_SCREENSPACE  1
+/* Comparison functions; BRW_PREFILTER_* uses the same value ordering. */
+#define BRW_COMPAREFUNCTION_ALWAYS       0
+#define BRW_COMPAREFUNCTION_NEVER        1
+#define BRW_COMPAREFUNCTION_LESS         2
+#define BRW_COMPAREFUNCTION_EQUAL        3
+#define BRW_COMPAREFUNCTION_LEQUAL       4
+#define BRW_COMPAREFUNCTION_GREATER      5
+#define BRW_COMPAREFUNCTION_NOTEQUAL     6
+#define BRW_COMPAREFUNCTION_GEQUAL       7
+
+#define BRW_COVERAGE_PIXELS_HALF     0
+#define BRW_COVERAGE_PIXELS_1        1
+#define BRW_COVERAGE_PIXELS_2        2
+#define BRW_COVERAGE_PIXELS_4        3
+
+#define BRW_CULLMODE_BOTH        0
+#define BRW_CULLMODE_NONE        1
+#define BRW_CULLMODE_FRONT       2
+#define BRW_CULLMODE_BACK        3
+
+#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM      0
+#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT  1
+/* Depth buffer formats (codes 3 and 4 are not defined here). */
+#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT     0
+#define BRW_DEPTHFORMAT_D32_FLOAT                1
+#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT        2
+#define BRW_DEPTHFORMAT_D16_UNORM                5
+
+#define BRW_FLOATING_POINT_IEEE_754        0
+#define BRW_FLOATING_POINT_NON_IEEE_754    1
+
+#define BRW_FRONTWINDING_CW      0
+#define BRW_FRONTWINDING_CCW     1
+
+#define BRW_SPRITE_POINT_ENABLE  16
+
+#define BRW_INDEX_BYTE     0
+#define BRW_INDEX_WORD     1
+#define BRW_INDEX_DWORD    2
+/* Logic ops: with the OpenGL meaning of each name, the value is the 4-bit truth table indexed by (src << 1) | dst. */
+#define BRW_LOGICOPFUNCTION_CLEAR            0
+#define BRW_LOGICOPFUNCTION_NOR              1
+#define BRW_LOGICOPFUNCTION_AND_INVERTED     2
+#define BRW_LOGICOPFUNCTION_COPY_INVERTED    3
+#define BRW_LOGICOPFUNCTION_AND_REVERSE      4
+#define BRW_LOGICOPFUNCTION_INVERT           5
+#define BRW_LOGICOPFUNCTION_XOR              6
+#define BRW_LOGICOPFUNCTION_NAND             7
+#define BRW_LOGICOPFUNCTION_AND              8
+#define BRW_LOGICOPFUNCTION_EQUIV            9
+#define BRW_LOGICOPFUNCTION_NOOP             10
+#define BRW_LOGICOPFUNCTION_OR_INVERTED      11
+#define BRW_LOGICOPFUNCTION_COPY             12
+#define BRW_LOGICOPFUNCTION_OR_REVERSE       13
+#define BRW_LOGICOPFUNCTION_OR               14
+#define BRW_LOGICOPFUNCTION_SET              15  
+
+#define BRW_MAPFILTER_NEAREST        0x0 
+#define BRW_MAPFILTER_LINEAR         0x1 
+#define BRW_MAPFILTER_ANISOTROPIC    0x2
+/* Mip filter modes; value 2 is not defined here. */
+#define BRW_MIPFILTER_NONE        0   
+#define BRW_MIPFILTER_NEAREST     1   
+#define BRW_MIPFILTER_LINEAR      3
+
+#define BRW_POLYGON_FRONT_FACING     0
+#define BRW_POLYGON_BACK_FACING      1
+/* Same value ordering as BRW_COMPAREFUNCTION_*. */
+#define BRW_PREFILTER_ALWAYS     0x0 
+#define BRW_PREFILTER_NEVER      0x1
+#define BRW_PREFILTER_LESS       0x2
+#define BRW_PREFILTER_EQUAL      0x3
+#define BRW_PREFILTER_LEQUAL     0x4
+#define BRW_PREFILTER_GREATER    0x5
+#define BRW_PREFILTER_NOTEQUAL   0x6
+#define BRW_PREFILTER_GEQUAL     0x7
+
+#define BRW_PROVOKING_VERTEX_0    0
+#define BRW_PROVOKING_VERTEX_1    1 
+#define BRW_PROVOKING_VERTEX_2    2
+
+#define BRW_RASTRULE_UPPER_LEFT  0    
+#define BRW_RASTRULE_UPPER_RIGHT 1
+/* These are listed as "Reserved, but not seen as useful"
+ * in Intel documentation (page 212, "Point Rasterization Rule",
+ * section 7.4 "SF Pipeline State Summary", of document
+ * "Intel® 965 Express Chipset Family and Intel® G35 Express
+ * Chipset Graphics Controller Programmer's Reference Manual,
+ * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
+ * available at 
+ *     http://intellinuxgraphics.org/documentation.html
+ * at the time of this writing).
+ *
+ * These appear to be supported on at least some
+ * i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT
+ * is useful when using OpenGL to render to a FBO
+ * (which has the pixel coordinate Y orientation inverted
+ * with respect to the normal OpenGL pixel coordinate system).
+ */
+#define BRW_RASTRULE_LOWER_LEFT  2
+#define BRW_RASTRULE_LOWER_RIGHT 3
+
+#define BRW_RENDERTARGET_CLAMPRANGE_UNORM    0
+#define BRW_RENDERTARGET_CLAMPRANGE_SNORM    1
+#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT   2
+
+#define BRW_STENCILOP_KEEP               0
+#define BRW_STENCILOP_ZERO               1
+#define BRW_STENCILOP_REPLACE            2
+#define BRW_STENCILOP_INCRSAT            3
+#define BRW_STENCILOP_DECRSAT            4
+#define BRW_STENCILOP_INCR               5
+#define BRW_STENCILOP_DECR               6
+#define BRW_STENCILOP_INVERT             7
+
+#define BRW_SURFACE_MIPMAPLAYOUT_BELOW   0
+#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT   1
+/* Surface format codes; the value space is sparse and 0xFFF is the INVALID marker. */
+#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT             0x000 
+#define BRW_SURFACEFORMAT_R32G32B32A32_SINT              0x001 
+#define BRW_SURFACEFORMAT_R32G32B32A32_UINT              0x002 
+#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM             0x003 
+#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM             0x004 
+#define BRW_SURFACEFORMAT_R64G64_FLOAT                   0x005 
+#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT             0x006 
+#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED           0x007
+#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED           0x008
+#define BRW_SURFACEFORMAT_R32G32B32_FLOAT                0x040 
+#define BRW_SURFACEFORMAT_R32G32B32_SINT                 0x041 
+#define BRW_SURFACEFORMAT_R32G32B32_UINT                 0x042 
+#define BRW_SURFACEFORMAT_R32G32B32_UNORM                0x043 
+#define BRW_SURFACEFORMAT_R32G32B32_SNORM                0x044 
+#define BRW_SURFACEFORMAT_R32G32B32_SSCALED              0x045 
+#define BRW_SURFACEFORMAT_R32G32B32_USCALED              0x046 
+#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM             0x080 
+#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM             0x081 
+#define BRW_SURFACEFORMAT_R16G16B16A16_SINT              0x082 
+#define BRW_SURFACEFORMAT_R16G16B16A16_UINT              0x083 
+#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT             0x084 
+#define BRW_SURFACEFORMAT_R32G32_FLOAT                   0x085 
+#define BRW_SURFACEFORMAT_R32G32_SINT                    0x086 
+#define BRW_SURFACEFORMAT_R32G32_UINT                    0x087 
+#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS       0x088 
+#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT        0x089 
+#define BRW_SURFACEFORMAT_L32A32_FLOAT                   0x08A 
+#define BRW_SURFACEFORMAT_R32G32_UNORM                   0x08B 
+#define BRW_SURFACEFORMAT_R32G32_SNORM                   0x08C 
+#define BRW_SURFACEFORMAT_R64_FLOAT                      0x08D 
+#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM             0x08E 
+#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT             0x08F 
+#define BRW_SURFACEFORMAT_A32X32_FLOAT                   0x090 
+#define BRW_SURFACEFORMAT_L32X32_FLOAT                   0x091 
+#define BRW_SURFACEFORMAT_I32X32_FLOAT                   0x092 
+#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED           0x093
+#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED           0x094
+#define BRW_SURFACEFORMAT_R32G32_SSCALED                 0x095
+#define BRW_SURFACEFORMAT_R32G32_USCALED                 0x096
+#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM                 0x0C0 
+#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB            0x0C1 
+#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM              0x0C2 
+#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB         0x0C3 
+#define BRW_SURFACEFORMAT_R10G10B10A2_UINT               0x0C4 
+#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM       0x0C5 
+#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM                 0x0C7 
+#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB            0x0C8 
+#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM                 0x0C9 
+#define BRW_SURFACEFORMAT_R8G8B8A8_SINT                  0x0CA 
+#define BRW_SURFACEFORMAT_R8G8B8A8_UINT                  0x0CB 
+#define BRW_SURFACEFORMAT_R16G16_UNORM                   0x0CC 
+#define BRW_SURFACEFORMAT_R16G16_SNORM                   0x0CD 
+#define BRW_SURFACEFORMAT_R16G16_SINT                    0x0CE 
+#define BRW_SURFACEFORMAT_R16G16_UINT                    0x0CF 
+#define BRW_SURFACEFORMAT_R16G16_FLOAT                   0x0D0 
+#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM              0x0D1 
+#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB         0x0D2 
+#define BRW_SURFACEFORMAT_R11G11B10_FLOAT                0x0D3 
+#define BRW_SURFACEFORMAT_R32_SINT                       0x0D6 
+#define BRW_SURFACEFORMAT_R32_UINT                       0x0D7 
+#define BRW_SURFACEFORMAT_R32_FLOAT                      0x0D8 
+#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS          0x0D9 
+#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT           0x0DA 
+#define BRW_SURFACEFORMAT_L16A16_UNORM                   0x0DF 
+#define BRW_SURFACEFORMAT_I24X8_UNORM                    0x0E0 
+#define BRW_SURFACEFORMAT_L24X8_UNORM                    0x0E1 
+#define BRW_SURFACEFORMAT_A24X8_UNORM                    0x0E2 
+#define BRW_SURFACEFORMAT_I32_FLOAT                      0x0E3 
+#define BRW_SURFACEFORMAT_L32_FLOAT                      0x0E4 
+#define BRW_SURFACEFORMAT_A32_FLOAT                      0x0E5 
+#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM                 0x0E9 
+#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB            0x0EA 
+#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM                 0x0EB 
+#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB            0x0EC 
+#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP             0x0ED 
+#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM              0x0EE 
+#define BRW_SURFACEFORMAT_L16A16_FLOAT                   0x0F0 
+#define BRW_SURFACEFORMAT_R32_UNORM                      0x0F1 
+#define BRW_SURFACEFORMAT_R32_SNORM                      0x0F2 
+#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED            0x0F3
+#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED               0x0F4
+#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED               0x0F5
+#define BRW_SURFACEFORMAT_R16G16_SSCALED                 0x0F6
+#define BRW_SURFACEFORMAT_R16G16_USCALED                 0x0F7
+#define BRW_SURFACEFORMAT_R32_SSCALED                    0x0F8
+#define BRW_SURFACEFORMAT_R32_USCALED                    0x0F9
+#define BRW_SURFACEFORMAT_B5G6R5_UNORM                   0x100 
+#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB              0x101 
+#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM                 0x102 
+#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB            0x103 
+#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM                 0x104 
+#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB            0x105 
+#define BRW_SURFACEFORMAT_R8G8_UNORM                     0x106 
+#define BRW_SURFACEFORMAT_R8G8_SNORM                     0x107 
+#define BRW_SURFACEFORMAT_R8G8_SINT                      0x108 
+#define BRW_SURFACEFORMAT_R8G8_UINT                      0x109 
+#define BRW_SURFACEFORMAT_R16_UNORM                      0x10A 
+#define BRW_SURFACEFORMAT_R16_SNORM                      0x10B 
+#define BRW_SURFACEFORMAT_R16_SINT                       0x10C 
+#define BRW_SURFACEFORMAT_R16_UINT                       0x10D 
+#define BRW_SURFACEFORMAT_R16_FLOAT                      0x10E 
+#define BRW_SURFACEFORMAT_I16_UNORM                      0x111 
+#define BRW_SURFACEFORMAT_L16_UNORM                      0x112 
+#define BRW_SURFACEFORMAT_A16_UNORM                      0x113 
+#define BRW_SURFACEFORMAT_L8A8_UNORM                     0x114 
+#define BRW_SURFACEFORMAT_I16_FLOAT                      0x115
+#define BRW_SURFACEFORMAT_L16_FLOAT                      0x116
+#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117
+#define BRW_SURFACEFORMAT_L8A8_UNORM_SRGB                0x118
+#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
+#define BRW_SURFACEFORMAT_R8G8_SSCALED                   0x11C
+#define BRW_SURFACEFORMAT_R8G8_USCALED                   0x11D
+#define BRW_SURFACEFORMAT_R16_SSCALED                    0x11E
+#define BRW_SURFACEFORMAT_R16_USCALED                    0x11F
+#define BRW_SURFACEFORMAT_R8_UNORM                       0x140 
+#define BRW_SURFACEFORMAT_R8_SNORM                       0x141 
+#define BRW_SURFACEFORMAT_R8_SINT                        0x142 
+#define BRW_SURFACEFORMAT_R8_UINT                        0x143 
+#define BRW_SURFACEFORMAT_A8_UNORM                       0x144 
+#define BRW_SURFACEFORMAT_I8_UNORM                       0x145 
+#define BRW_SURFACEFORMAT_L8_UNORM                       0x146 
+#define BRW_SURFACEFORMAT_P4A4_UNORM                     0x147 
+#define BRW_SURFACEFORMAT_A4P4_UNORM                     0x148
+#define BRW_SURFACEFORMAT_R8_SSCALED                     0x149
+#define BRW_SURFACEFORMAT_R8_USCALED                     0x14A
+#define BRW_SURFACEFORMAT_L8_UNORM_SRGB                  0x14C
+#define BRW_SURFACEFORMAT_R1_UINT                        0x181 
+#define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182 
+#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183 
+#define BRW_SURFACEFORMAT_BC1_UNORM                      0x186 
+#define BRW_SURFACEFORMAT_BC2_UNORM                      0x187 
+#define BRW_SURFACEFORMAT_BC3_UNORM                      0x188 
+#define BRW_SURFACEFORMAT_BC4_UNORM                      0x189 
+#define BRW_SURFACEFORMAT_BC5_UNORM                      0x18A 
+#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB                 0x18B 
+#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB                 0x18C 
+#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB                 0x18D 
+#define BRW_SURFACEFORMAT_MONO8                          0x18E 
+#define BRW_SURFACEFORMAT_YCRCB_SWAPUV                   0x18F 
+#define BRW_SURFACEFORMAT_YCRCB_SWAPY                    0x190 
+#define BRW_SURFACEFORMAT_DXT1_RGB                       0x191 
+#define BRW_SURFACEFORMAT_FXT1                           0x192 
+#define BRW_SURFACEFORMAT_R8G8B8_UNORM                   0x193 
+#define BRW_SURFACEFORMAT_R8G8B8_SNORM                   0x194 
+#define BRW_SURFACEFORMAT_R8G8B8_SSCALED                 0x195 
+#define BRW_SURFACEFORMAT_R8G8B8_USCALED                 0x196 
+#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT             0x197 
+#define BRW_SURFACEFORMAT_R64G64B64_FLOAT                0x198 
+#define BRW_SURFACEFORMAT_BC4_SNORM                      0x199 
+#define BRW_SURFACEFORMAT_BC5_SNORM                      0x19A 
+#define BRW_SURFACEFORMAT_R16G16B16_UNORM                0x19C 
+#define BRW_SURFACEFORMAT_R16G16B16_SNORM                0x19D 
+#define BRW_SURFACEFORMAT_R16G16B16_SSCALED              0x19E 
+#define BRW_SURFACEFORMAT_R16G16B16_USCALED              0x19F
+#define BRW_SURFACEFORMAT_INVALID                        0xFFF
+
+#define BRW_SURFACERETURNFORMAT_FLOAT32  0
+#define BRW_SURFACERETURNFORMAT_S1       1
+
+#define BRW_SURFACE_1D      0
+#define BRW_SURFACE_2D      1
+#define BRW_SURFACE_3D      2
+#define BRW_SURFACE_CUBE    3
+#define BRW_SURFACE_BUFFER  4
+#define BRW_SURFACE_NULL    7
+
+#define BRW_TEXCOORDMODE_WRAP            0
+#define BRW_TEXCOORDMODE_MIRROR          1
+#define BRW_TEXCOORDMODE_CLAMP           2
+#define BRW_TEXCOORDMODE_CUBE            3
+#define BRW_TEXCOORDMODE_CLAMP_BORDER    4
+#define BRW_TEXCOORDMODE_MIRROR_ONCE     5
+
+#define BRW_THREAD_PRIORITY_NORMAL   0
+#define BRW_THREAD_PRIORITY_HIGH     1
+
+#define BRW_TILEWALK_XMAJOR                 0
+#define BRW_TILEWALK_YMAJOR                 1
+
+#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS  0
+#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS  1
+
+/* Execution Unit (EU) defines
+ */
+
+#define BRW_ALIGN_1   0
+#define BRW_ALIGN_16  1
+
+#define BRW_ADDRESS_DIRECT                        0
+#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER    1
+
+#define BRW_CHANNEL_X     0
+#define BRW_CHANNEL_Y     1
+#define BRW_CHANNEL_Z     2
+#define BRW_CHANNEL_W     3
+
+#define BRW_COMPRESSION_NONE          0
+#define BRW_COMPRESSION_2NDHALF       1
+#define BRW_COMPRESSION_COMPRESSED    2
+
+/* Conditional-modifier codes.  EQ and NEQ are second names for the same
+ * numeric values as Z and NZ (1 and 2), so they cannot be told apart by value.
+ */
+#define BRW_CONDITIONAL_NONE  0
+#define BRW_CONDITIONAL_Z     1
+#define BRW_CONDITIONAL_NZ    2
+#define BRW_CONDITIONAL_EQ    1	/* Z */
+#define BRW_CONDITIONAL_NEQ   2	/* NZ */
+#define BRW_CONDITIONAL_G     3
+#define BRW_CONDITIONAL_GE    4
+#define BRW_CONDITIONAL_L     5
+#define BRW_CONDITIONAL_LE    6
+#define BRW_CONDITIONAL_R     7
+#define BRW_CONDITIONAL_O     8
+#define BRW_CONDITIONAL_U     9
+
+#define BRW_DEBUG_NONE        0
+#define BRW_DEBUG_BREAKPOINT  1
+
+#define BRW_DEPENDENCY_NORMAL         0
+#define BRW_DEPENDENCY_NOTCLEARED     1
+#define BRW_DEPENDENCY_NOTCHECKED     2
+#define BRW_DEPENDENCY_DISABLE        3
+
+#define BRW_EXECUTE_1     0
+#define BRW_EXECUTE_2     1
+#define BRW_EXECUTE_4     2
+#define BRW_EXECUTE_8     3
+#define BRW_EXECUTE_16    4
+#define BRW_EXECUTE_32    5
+
+#define BRW_HORIZONTAL_STRIDE_0   0
+#define BRW_HORIZONTAL_STRIDE_1   1
+#define BRW_HORIZONTAL_STRIDE_2   2
+#define BRW_HORIZONTAL_STRIDE_4   3
+
+#define BRW_INSTRUCTION_NORMAL    0
+#define BRW_INSTRUCTION_SATURATE  1
+
+#define BRW_MASK_ENABLE   0
+#define BRW_MASK_DISABLE  1
+
+#define BRW_OPCODE_MOV        1
+#define BRW_OPCODE_SEL        2
+#define BRW_OPCODE_NOT        4
+#define BRW_OPCODE_AND        5
+#define BRW_OPCODE_OR         6
+#define BRW_OPCODE_XOR        7
+#define BRW_OPCODE_SHR        8
+#define BRW_OPCODE_SHL        9
+#define BRW_OPCODE_RSR        10
+#define BRW_OPCODE_RSL        11
+#define BRW_OPCODE_ASR        12
+#define BRW_OPCODE_CMP        16
+#define BRW_OPCODE_CMPN       17
+#define BRW_OPCODE_JMPI       32
+#define BRW_OPCODE_IF         34
+#define BRW_OPCODE_IFF        35
+#define BRW_OPCODE_ELSE       36
+#define BRW_OPCODE_ENDIF      37
+#define BRW_OPCODE_DO         38
+#define BRW_OPCODE_WHILE      39
+#define BRW_OPCODE_BREAK      40
+#define BRW_OPCODE_CONTINUE   41
+#define BRW_OPCODE_HALT       42
+#define BRW_OPCODE_MSAVE      44
+#define BRW_OPCODE_MRESTORE   45
+#define BRW_OPCODE_PUSH       46
+#define BRW_OPCODE_POP        47
+#define BRW_OPCODE_WAIT       48
+#define BRW_OPCODE_SEND       49
+#define BRW_OPCODE_ADD        64
+#define BRW_OPCODE_MUL        65
+#define BRW_OPCODE_AVG        66
+#define BRW_OPCODE_FRC        67
+#define BRW_OPCODE_RNDU       68
+#define BRW_OPCODE_RNDD       69
+#define BRW_OPCODE_RNDE       70
+#define BRW_OPCODE_RNDZ       71
+#define BRW_OPCODE_MAC        72
+#define BRW_OPCODE_MACH       73
+#define BRW_OPCODE_LZD        74
+#define BRW_OPCODE_SAD2       80
+#define BRW_OPCODE_SADA2      81
+#define BRW_OPCODE_DP4        84
+#define BRW_OPCODE_DPH        85
+#define BRW_OPCODE_DP3        86
+#define BRW_OPCODE_DP2        87
+#define BRW_OPCODE_DPA2       88
+#define BRW_OPCODE_LINE       89
+#define BRW_OPCODE_NOP        126
+
+#define BRW_PREDICATE_NONE             0
+#define BRW_PREDICATE_NORMAL           1
+#define BRW_PREDICATE_ALIGN1_ANYV             2
+#define BRW_PREDICATE_ALIGN1_ALLV             3
+#define BRW_PREDICATE_ALIGN1_ANY2H            4
+#define BRW_PREDICATE_ALIGN1_ALL2H            5
+#define BRW_PREDICATE_ALIGN1_ANY4H            6
+#define BRW_PREDICATE_ALIGN1_ALL4H            7
+#define BRW_PREDICATE_ALIGN1_ANY8H            8
+#define BRW_PREDICATE_ALIGN1_ALL8H            9
+#define BRW_PREDICATE_ALIGN1_ANY16H           10
+#define BRW_PREDICATE_ALIGN1_ALL16H           11
+#define BRW_PREDICATE_ALIGN16_REPLICATE_X     2
+#define BRW_PREDICATE_ALIGN16_REPLICATE_Y     3
+#define BRW_PREDICATE_ALIGN16_REPLICATE_Z     4
+#define BRW_PREDICATE_ALIGN16_REPLICATE_W     5
+#define BRW_PREDICATE_ALIGN16_ANY4H           6
+#define BRW_PREDICATE_ALIGN16_ALL4H           7
+
+#define BRW_ARCHITECTURE_REGISTER_FILE    0
+#define BRW_GENERAL_REGISTER_FILE         1
+#define BRW_MESSAGE_REGISTER_FILE         2
+#define BRW_IMMEDIATE_VALUE               3
+
+/* Register / immediate type codes.  Two values are defined twice: 5 is both
+ * B and VF, and 6 is both HF and V.  The inline notes say VF and V apply to
+ * immediates, so which name a code means depends on the operand kind.
+ */
+#define BRW_REGISTER_TYPE_UD  0
+#define BRW_REGISTER_TYPE_D   1
+#define BRW_REGISTER_TYPE_UW  2
+#define BRW_REGISTER_TYPE_W   3
+#define BRW_REGISTER_TYPE_UB  4
+#define BRW_REGISTER_TYPE_B   5
+#define BRW_REGISTER_TYPE_VF  5	/* packed float vector, immediates only? */
+#define BRW_REGISTER_TYPE_HF  6
+#define BRW_REGISTER_TYPE_V   6	/* packed int vector, immediates only, uword dest only */
+#define BRW_REGISTER_TYPE_F   7
+
+#define BRW_ARF_NULL                  0x00
+#define BRW_ARF_ADDRESS               0x10
+#define BRW_ARF_ACCUMULATOR           0x20   
+#define BRW_ARF_FLAG                  0x30
+#define BRW_ARF_MASK                  0x40
+#define BRW_ARF_MASK_STACK            0x50
+#define BRW_ARF_MASK_STACK_DEPTH      0x60
+#define BRW_ARF_STATE                 0x70
+#define BRW_ARF_CONTROL               0x80
+#define BRW_ARF_NOTIFICATION_COUNT    0x90
+#define BRW_ARF_IP                    0xA0
+
+#define BRW_AMASK   0
+#define BRW_IMASK   1
+#define BRW_LMASK   2
+#define BRW_CMASK   3
+
+
+
+#define BRW_THREAD_NORMAL     0
+#define BRW_THREAD_ATOMIC     1
+#define BRW_THREAD_SWITCH     2
+
+#define BRW_VERTICAL_STRIDE_0                 0
+#define BRW_VERTICAL_STRIDE_1                 1
+#define BRW_VERTICAL_STRIDE_2                 2
+#define BRW_VERTICAL_STRIDE_4                 3
+#define BRW_VERTICAL_STRIDE_8                 4
+#define BRW_VERTICAL_STRIDE_16                5
+#define BRW_VERTICAL_STRIDE_32                6
+#define BRW_VERTICAL_STRIDE_64                7
+#define BRW_VERTICAL_STRIDE_128               8
+#define BRW_VERTICAL_STRIDE_256               9
+#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL   0xF
+
+#define BRW_WIDTH_1       0
+#define BRW_WIDTH_2       1
+#define BRW_WIDTH_4       2
+#define BRW_WIDTH_8       3
+#define BRW_WIDTH_16      4
+
+#define BRW_STATELESS_BUFFER_BOUNDARY_1K      0
+#define BRW_STATELESS_BUFFER_BOUNDARY_2K      1
+#define BRW_STATELESS_BUFFER_BOUNDARY_4K      2
+#define BRW_STATELESS_BUFFER_BOUNDARY_8K      3
+#define BRW_STATELESS_BUFFER_BOUNDARY_16K     4
+#define BRW_STATELESS_BUFFER_BOUNDARY_32K     5
+#define BRW_STATELESS_BUFFER_BOUNDARY_64K     6
+#define BRW_STATELESS_BUFFER_BOUNDARY_128K    7
+#define BRW_STATELESS_BUFFER_BOUNDARY_256K    8
+#define BRW_STATELESS_BUFFER_BOUNDARY_512K    9
+#define BRW_STATELESS_BUFFER_BOUNDARY_1M      10
+#define BRW_STATELESS_BUFFER_BOUNDARY_2M      11
+
+#define BRW_POLYGON_FACING_FRONT      0
+#define BRW_POLYGON_FACING_BACK       1
+
+#define BRW_MESSAGE_TARGET_NULL               0
+#define BRW_MESSAGE_TARGET_MATH               1
+#define BRW_MESSAGE_TARGET_SAMPLER            2
+#define BRW_MESSAGE_TARGET_GATEWAY            3
+#define BRW_MESSAGE_TARGET_DATAPORT_READ      4
+#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5
+#define BRW_MESSAGE_TARGET_URB                6
+#define BRW_MESSAGE_TARGET_THREAD_SPAWNER     7
+
+#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0
+#define BRW_SAMPLER_RETURN_FORMAT_UINT32      2
+#define BRW_SAMPLER_RETURN_FORMAT_SINT32      3
+
+/* Sampler message type codes.  The same small integers (0..3) are reused
+ * under several SIMD-specific names, so a code alone does not identify the
+ * message; presumably the SIMD mode in use disambiguates -- TODO confirm.
+ */
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE              0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE             0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0
+#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX             1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
+#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO             2
+#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD                3
+#define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
+
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG            0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG          0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG           0
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG       1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG     1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG      1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG        2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG      2
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG       2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG    3
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG   3
+
+/* for IGDNG only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8                     1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16                    2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64                 3
+
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
+#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
+#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS     3
+#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS     4
+
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD     0
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS    2
+
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
+
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
+
+#define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0
+#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1
+#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE   2
+
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE                0
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED     1
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01         2
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23         3
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01       4
+
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE                0
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE           1
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE                2
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE            3
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE              4
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
+#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
+
+#define BRW_MATH_FUNCTION_INV                              1
+#define BRW_MATH_FUNCTION_LOG                              2
+#define BRW_MATH_FUNCTION_EXP                              3
+#define BRW_MATH_FUNCTION_SQRT                             4
+#define BRW_MATH_FUNCTION_RSQ                              5
+#define BRW_MATH_FUNCTION_SIN                              6 /* was 7 */
+#define BRW_MATH_FUNCTION_COS                              7 /* was 8 */
+#define BRW_MATH_FUNCTION_SINCOS                           8 /* was 6 */
+#define BRW_MATH_FUNCTION_TAN                              9
+#define BRW_MATH_FUNCTION_POW                              10
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
+#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER                13
+
+#define BRW_MATH_INTEGER_UNSIGNED     0
+#define BRW_MATH_INTEGER_SIGNED       1
+
+#define BRW_MATH_PRECISION_FULL        0
+#define BRW_MATH_PRECISION_PARTIAL     1
+
+#define BRW_MATH_SATURATE_NONE         0
+#define BRW_MATH_SATURATE_SATURATE     1
+
+#define BRW_MATH_DATA_VECTOR  0
+#define BRW_MATH_DATA_SCALAR  1
+
+#define BRW_URB_OPCODE_WRITE  0
+
+#define BRW_URB_SWIZZLE_NONE          0
+#define BRW_URB_SWIZZLE_INTERLEAVE    1
+#define BRW_URB_SWIZZLE_TRANSPOSE     2
+
+#define BRW_SCRATCH_SPACE_SIZE_1K     0
+#define BRW_SCRATCH_SPACE_SIZE_2K     1
+#define BRW_SCRATCH_SPACE_SIZE_4K     2
+#define BRW_SCRATCH_SPACE_SIZE_8K     3
+#define BRW_SCRATCH_SPACE_SIZE_16K    4
+#define BRW_SCRATCH_SPACE_SIZE_32K    5
+#define BRW_SCRATCH_SPACE_SIZE_64K    6
+#define BRW_SCRATCH_SPACE_SIZE_128K   7
+#define BRW_SCRATCH_SPACE_SIZE_256K   8
+#define BRW_SCRATCH_SPACE_SIZE_512K   9
+#define BRW_SCRATCH_SPACE_SIZE_1M     10
+#define BRW_SCRATCH_SPACE_SIZE_2M     11
+
+
+
+
+#define CMD_URB_FENCE                 0x6000
+#define CMD_CS_URB_STATE              0x6001
+#define CMD_CONST_BUFFER              0x6002
+
+#define CMD_STATE_BASE_ADDRESS        0x6101
+#define CMD_STATE_INSN_POINTER        0x6102
+#define CMD_PIPELINE_SELECT_965       0x6104
+#define CMD_PIPELINE_SELECT_GM45      0x6904
+
+#define CMD_PIPELINED_STATE_POINTERS  0x7800
+#define CMD_BINDING_TABLE_PTRS        0x7801
+
+#define CMD_VERTEX_BUFFER             0x7808
+# define BRW_VB0_INDEX_SHIFT		27
+# define BRW_VB0_ACCESS_VERTEXDATA	(0 << 26)
+# define BRW_VB0_ACCESS_INSTANCEDATA	(1 << 26)
+# define BRW_VB0_PITCH_SHIFT		0
+
+#define CMD_VERTEX_ELEMENT            0x7809
+# define BRW_VE0_INDEX_SHIFT		27
+# define BRW_VE0_FORMAT_SHIFT		16
+# define BRW_VE0_VALID			(1 << 26)
+# define BRW_VE0_SRC_OFFSET_SHIFT	0
+# define BRW_VE1_COMPONENT_NOSTORE	0
+# define BRW_VE1_COMPONENT_STORE_SRC	1
+# define BRW_VE1_COMPONENT_STORE_0	2
+# define BRW_VE1_COMPONENT_STORE_1_FLT	3
+# define BRW_VE1_COMPONENT_STORE_1_INT	4
+# define BRW_VE1_COMPONENT_STORE_VID	5
+# define BRW_VE1_COMPONENT_STORE_IID	6
+# define BRW_VE1_COMPONENT_STORE_PID	7
+# define BRW_VE1_COMPONENT_0_SHIFT	28
+# define BRW_VE1_COMPONENT_1_SHIFT	24
+# define BRW_VE1_COMPONENT_2_SHIFT	20
+# define BRW_VE1_COMPONENT_3_SHIFT	16
+# define BRW_VE1_DST_OFFSET_SHIFT	0
+
+#define CMD_INDEX_BUFFER              0x780a
+#define CMD_VF_STATISTICS_965         0x780b
+#define CMD_VF_STATISTICS_GM45        0x680b
+
+#define CMD_DRAW_RECT                 0x7900
+#define CMD_BLEND_CONSTANT_COLOR      0x7901
+#define CMD_CHROMA_KEY                0x7904
+#define CMD_DEPTH_BUFFER              0x7905
+#define CMD_POLY_STIPPLE_OFFSET       0x7906
+#define CMD_POLY_STIPPLE_PATTERN      0x7907
+#define CMD_LINE_STIPPLE_PATTERN      0x7908
+#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909
+#define CMD_AA_LINE_PARAMETERS        0x790a
+
+#define CMD_PIPE_CONTROL              0x7a00
+
+#define CMD_3D_PRIM                   0x7b00
+
+#define CMD_MI_FLUSH                  0x0200
+
+
+/* Various values from the R0 vertex header:
+ */
+#define R02_PRIM_END    0x1
+#define R02_PRIM_START  0x2
+
+/* Evaluates to 1024 when BRW_IS_IGDNG(brw) is true, otherwise 384 when
+ * BRW_IS_G4X(brw) is true, otherwise 256.  The argument appears twice in the
+ * expansion, so it should be free of side effects.
+ */
+#define URB_SIZES(brw)                  (BRW_IS_IGDNG(brw) ? 1024 : \
+                                         (BRW_IS_G4X(brw) ? 384 : 256))  /* 512 bit units */
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c
new file mode 100644
index 0000000000..28c83515ba
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_disasm.c
@@ -0,0 +1,922 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "brw_disasm.h"
+#include "brw_structs.h"
+#include "brw_reg.h"
+#include "brw_defines.h"
+
+/*
+ * Per-opcode disassembly info, indexed by the BRW_OPCODE_* value.  Slots
+ * without an initializer keep a NULL name, which print_opcode() reports as an
+ * invalid opcode.  nsrc/ndst are presumably the number of source and
+ * destination operands to print; their consumer is outside this view.
+ */
+struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+} opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    /* NOTE(review): "01" below is an octal literal equal to 1, while the
+     * neighbouring if/else/endif/while entries use .ndst = 0 -- confirm this
+     * is intended and not a typo for 0. */
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 01 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+/* Suffix text per conditional-modifier code; codes not listed stay NULL. */
+char *conditional_modifier[16] = {
+    [BRW_CONDITIONAL_NONE] = "",
+    [BRW_CONDITIONAL_EQ]   = ".e",	/* same code as BRW_CONDITIONAL_Z */
+    [BRW_CONDITIONAL_NEQ]  = ".ne",	/* same code as BRW_CONDITIONAL_NZ */
+    [BRW_CONDITIONAL_G]    = ".g",
+    [BRW_CONDITIONAL_GE]   = ".ge",
+    [BRW_CONDITIONAL_L]    = ".l",
+    [BRW_CONDITIONAL_LE]   = ".le",
+    [BRW_CONDITIONAL_R]    = ".r",
+    [BRW_CONDITIONAL_O]    = ".o",
+    [BRW_CONDITIONAL_U]    = ".u",
+};
+
+/* Prefix text for the negate flag, indexed by 0/1. */
+char *negate[2] = { "", "-" };
+
+/* Prefix text for the absolute-value flag, indexed by 0/1. */
+char *_abs[2] = { "", "(abs)" };
+
+/*
+ * Text for each vertical-stride encoding (BRW_VERTICAL_STRIDE_*).
+ * Encodings 7..9 (strides 64, 128 and 256) are defined in brw_defines.h, so
+ * they are listed here rather than being reported as invalid by control().
+ * 15 is the BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL encoding; the remaining slots
+ * stay NULL.
+ */
+char *vert_stride[16] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4",
+    [4] = "8",
+    [5] = "16",
+    [6] = "32",
+    [7] = "64",
+    [8] = "128",
+    [9] = "256",
+    [15] = "VxH",
+};
+
+/* Region width text per encoding; entries 5..7 are left NULL. */
+char *width[8] = { "1", "2", "4", "8", "16" };
+
+/* Horizontal stride text per encoding (encoding 3 is a stride of 4). */
+char *horiz_stride[4] = { "0", "1", "2", "4" };
+
+/* Channel letters, indexed 0..3. */
+char *chan_sel[4] = { "x", "y", "z", "w" };
+
+/* Every one of the 16 entries is NULL. */
+char *dest_condmod[16] = { NULL };
+
+/* Suffix for the debug-control bit. */
+char *debug_ctrl[2] = { "", ".breakpoint" };
+
+/* Suffix for the saturate bit. */
+char *saturate[2] = { "", ".sat" };
+
+/* Execution size text per encoding; entries 6 and 7 are left NULL. */
+char *exec_size[8] = { "1", "2", "4", "8", "16", "32" };
+
+/* Sign printed for the predicate-inverse bit. */
+char *pred_inv[2] = { "+", "-" };
+
+/*
+ * Predicate-control suffixes for align16 mode.  BRW_PREDICATE_NONE (0) has no
+ * entry and therefore stays NULL, as do codes 8..15.
+ */
+char *pred_ctrl_align16[16] = {
+    [BRW_PREDICATE_NORMAL]              = "",
+    [BRW_PREDICATE_ALIGN16_REPLICATE_X] = ".x",
+    [BRW_PREDICATE_ALIGN16_REPLICATE_Y] = ".y",
+    [BRW_PREDICATE_ALIGN16_REPLICATE_Z] = ".z",
+    [BRW_PREDICATE_ALIGN16_REPLICATE_W] = ".w",
+    [BRW_PREDICATE_ALIGN16_ANY4H]       = ".any4h",
+    [BRW_PREDICATE_ALIGN16_ALL4H]       = ".all4h",
+};
+
+/*
+ * Predicate-control suffixes for align1 mode.  BRW_PREDICATE_NONE (0) has no
+ * entry and therefore stays NULL, as do codes 12..15.
+ */
+char *pred_ctrl_align1[16] = {
+    [BRW_PREDICATE_NORMAL]        = "",
+    [BRW_PREDICATE_ALIGN1_ANYV]   = ".anyv",
+    [BRW_PREDICATE_ALIGN1_ALLV]   = ".allv",
+    [BRW_PREDICATE_ALIGN1_ANY2H]  = ".any2h",
+    [BRW_PREDICATE_ALIGN1_ALL2H]  = ".all2h",
+    [BRW_PREDICATE_ALIGN1_ANY4H]  = ".any4h",
+    [BRW_PREDICATE_ALIGN1_ALL4H]  = ".all4h",
+    [BRW_PREDICATE_ALIGN1_ANY8H]  = ".any8h",
+    [BRW_PREDICATE_ALIGN1_ALL8H]  = ".all8h",
+    [BRW_PREDICATE_ALIGN1_ANY16H] = ".any16h",
+    [BRW_PREDICATE_ALIGN1_ALL16H] = ".all16h",
+};
+
+/*
+ * Thread-control option text, indexed by the BRW_THREAD_* code.  Code 1
+ * (BRW_THREAD_ATOMIC) is a defined encoding, so it gets a name instead of
+ * being left NULL and reported as invalid.  Code 3 stays NULL.
+ */
+char *thread_ctrl[4] = {
+    [0] = "",		/* BRW_THREAD_NORMAL */
+    [1] = "atomic",	/* BRW_THREAD_ATOMIC */
+    [2] = "switch"	/* BRW_THREAD_SWITCH */
+};
+
+/* Compression-control option text; code 3 stays NULL. */
+char *compr_ctrl[4] = {
+    [BRW_COMPRESSION_NONE]       = "",
+    [BRW_COMPRESSION_2NDHALF]    = "sechalf",
+    [BRW_COMPRESSION_COMPRESSED] = "compr",
+};
+
+/* Dependency-control option text. */
+char *dep_ctrl[4] = {
+    [BRW_DEPENDENCY_NORMAL]     = "",
+    [BRW_DEPENDENCY_NOTCLEARED] = "NoDDClr",
+    [BRW_DEPENDENCY_NOTCHECKED] = "NoDDChk",
+    [BRW_DEPENDENCY_DISABLE]    = "NoDDClr,NoDDChk",
+};
+
+/* Mask-control option text; codes 2 and 3 stay NULL. */
+char *mask_ctrl[4] = {
+    [BRW_MASK_ENABLE]  = "",
+    [BRW_MASK_DISABLE] = "nomask",
+};
+
+/* Access-mode names. */
+char *access_mode[2] = {
+    [BRW_ALIGN_1]  = "align1",
+    [BRW_ALIGN_16] = "align16",
+};
+
+/* Type letters for register operands; code 6 has no entry and stays NULL. */
+char *reg_encoding[8] = {
+    [BRW_REGISTER_TYPE_UD] = "UD",
+    [BRW_REGISTER_TYPE_D]  = "D",
+    [BRW_REGISTER_TYPE_UW] = "UW",
+    [BRW_REGISTER_TYPE_W]  = "W",
+    [BRW_REGISTER_TYPE_UB] = "UB",
+    [BRW_REGISTER_TYPE_B]  = "B",
+    [BRW_REGISTER_TYPE_F]  = "F"
+};
+
+/* Type letters for immediate operands; code 4 has no entry and stays NULL. */
+char *imm_encoding[8] = {
+    [BRW_REGISTER_TYPE_UD] = "UD",
+    [BRW_REGISTER_TYPE_D]  = "D",
+    [BRW_REGISTER_TYPE_UW] = "UW",
+    [BRW_REGISTER_TYPE_W]  = "W",
+    [BRW_REGISTER_TYPE_VF] = "VF",
+    [BRW_REGISTER_TYPE_V]  = "V",
+    [BRW_REGISTER_TYPE_F]  = "F"
+};
+
+/* Register-file prefixes. */
+char *reg_file[4] = {
+    [BRW_ARCHITECTURE_REGISTER_FILE] = "A",
+    [BRW_GENERAL_REGISTER_FILE]      = "g",
+    [BRW_MESSAGE_REGISTER_FILE]      = "m",
+    [BRW_IMMEDIATE_VALUE]            = "imm",
+};
+
+/*
+ * Write-mask suffix per 4-bit mask value (bit 0 = x, bit 1 = y, bit 2 = z,
+ * bit 3 = w).  An empty mask prints "." and a full mask prints nothing.
+ */
+char *writemask[16] = {
+    ".",    ".x",   ".y",   ".xy",
+    ".z",   ".xz",  ".yz",  ".xyz",
+    ".w",   ".xw",  ".yw",  ".xyw",
+    ".zw",  ".xzw", ".yzw", "",
+};
+
+/* Marker for the end-of-thread bit. */
+char *end_of_thread[2] = { "", "EOT" };
+
+/* Message-target names by BRW_MESSAGE_TARGET_* code; codes 8..15 stay NULL. */
+char *target_function[16] = {
+    [BRW_MESSAGE_TARGET_NULL]           = "null",
+    [BRW_MESSAGE_TARGET_MATH]           = "math",
+    [BRW_MESSAGE_TARGET_SAMPLER]        = "sampler",
+    [BRW_MESSAGE_TARGET_GATEWAY]        = "gateway",
+    [BRW_MESSAGE_TARGET_DATAPORT_READ]  = "read",
+    [BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write",
+    [BRW_MESSAGE_TARGET_URB]            = "urb",
+    [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner",
+};
+
+/*
+ * Math-function names by BRW_MATH_FUNCTION_* code.  Code 0 and codes 14..15
+ * have no entry and stay NULL.  The quotient-only function prints "intdiv"
+ * and the remainder-only function prints "intmod"; the two labels were
+ * previously swapped.
+ */
+char *math_function[16] = {
+    [BRW_MATH_FUNCTION_INV] = "inv",
+    [BRW_MATH_FUNCTION_LOG] = "log",
+    [BRW_MATH_FUNCTION_EXP] = "exp",
+    [BRW_MATH_FUNCTION_SQRT] = "sqrt",
+    [BRW_MATH_FUNCTION_RSQ] = "rsq",
+    [BRW_MATH_FUNCTION_SIN] = "sin",
+    [BRW_MATH_FUNCTION_COS] = "cos",
+    [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+    [BRW_MATH_FUNCTION_TAN] = "tan",
+    [BRW_MATH_FUNCTION_POW] = "pow",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
+    [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
+};
+
+/* Option words for the math message flags, each indexed by 0/1. */
+char *math_saturate[2] = { "", "sat" };
+
+char *math_signed[2] = { "", "signed" };
+
+char *math_scalar[2] = { "", "scalar" };
+
+char *math_precision[2] = { "", "partial_precision" };
+
+/* URB swizzle names (none / interleave / transpose); entry 3 stays NULL. */
+char *urb_swizzle[4] = { "", "interleave", "transpose" };
+
+/* Option words for the URB message flags, each indexed by 0/1. */
+char *urb_allocate[2] = { "", "allocate" };
+
+char *urb_used[2] = { "", "used" };
+
+char *urb_complete[2] = { "", "complete" };
+
+/* Sampler return-format letters; code 1 has no entry and stays NULL. */
+char *sampler_target_format[4] = { "F", NULL, "UD", "D" };
+
+
+/* Current output column: advanced by string(), reset to 0 by newline(). */
+static int column;
+
+/*
+ * Write `string` to `file` and advance the column counter by its length.
+ * Always returns 0; the result of fputs() is not checked.
+ */
+static int string (FILE *file, char *string)
+{
+    size_t len = strlen (string);
+
+    fputs (string, file);
+    column += len;
+    return 0;
+}
+
+/*
+ * printf-style front end to string(): expand the arguments into a local
+ * 1024-byte buffer and emit the result, so the column counter is kept up to
+ * date.  Output that does not fit is truncated.  Always returns 0.
+ */
+static int format (FILE *f, char *format, ...)
+{
+    va_list ap;
+    char    text[1024];
+
+    va_start (ap, format);
+    vsnprintf (text, sizeof (text) - 1, format, ap);
+    va_end (ap);
+
+    string (f, text);
+    return 0;
+}
+
+/* End the current output line and reset the column counter.  Returns 0. */
+static int newline (FILE *f)
+{
+    fputc ('\n', f);
+    column = 0;
+    return 0;
+}
+
+/*
+ * Emit blanks until the column counter reaches `c`.  One blank is always
+ * written first, even when the output is already at or past `c`.  Returns 0.
+ */
+static int pad (FILE *f, int c)
+{
+    for (;;) {
+	string (f, " ");
+	if (column >= c)
+	    break;
+    }
+    return 0;
+}
+
+/*
+ * Print the text that table `ctrl` holds for field value `id`.
+ *
+ *   name   label used in the diagnostic when the value has no entry
+ *   ctrl   table of strings; a NULL slot marks an invalid value
+ *   id     index into `ctrl`; the table size is not known here, so the
+ *          caller must pass a table large enough for the field's range
+ *   space  optional separator state: when non-NULL and already set, a blank
+ *          is written before the text, and it is set to 1 after any
+ *          non-empty text is printed
+ *
+ * Returns 1 after printing a diagnostic for a NULL slot, 0 otherwise.  An
+ * empty string prints nothing and leaves *space untouched.
+ */
+static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space)
+{
+    if (!ctrl[id]) {
+	/* Use format() rather than fprintf() so the column counter stays in
+	 * step with what was written; id is unsigned, hence %u. */
+	format (file, "*** invalid %s value %u ", name, id);
+	return 1;
+    }
+    if (ctrl[id][0])
+    {
+	if (space && *space)
+	    string (file, " ");
+	string (file, ctrl[id]);
+	if (space)
+	    *space = 1;
+    }
+    return 0;
+}
+
+/*
+ * Print the mnemonic for opcode `id`.  Returns 0 on success; returns 1 after
+ * printing a diagnostic when `id` lies outside the opcode table or names a
+ * slot with no mnemonic.
+ */
+static int print_opcode (FILE *file, int id)
+{
+    const int nr_opcodes = (int) (sizeof (opcode) / sizeof (opcode[0]));
+
+    /* Range-check before indexing so a bad id cannot read past the table. */
+    if (id < 0 || id >= nr_opcodes || !opcode[id].name) {
+	format (file, "*** invalid opcode value %d ", id);
+	return 1;
+    }
+    string (file, opcode[id].name);
+    return 0;
+}
+
+/*
+ * Print a register name.
+ *
+ * For the architecture register file the high nibble of `_reg_nr` selects the
+ * register class and the low nibble is printed as its number; classes without
+ * a case print as "ARF<nr>".  For any other file the reg_file prefix is
+ * printed followed by the register number.
+ *
+ * Returns -1 after printing "null" or "ip"; the callers in this file treat
+ * that as "nothing more to print for this operand".  Otherwise returns the
+ * accumulated error flag (non-zero if the register file value was invalid).
+ */
+static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr)
+{
+    int	err = 0;
+    if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
+	switch (_reg_nr & 0xf0) {
+	case BRW_ARF_NULL:
+	    string (file, "null");
+	    return -1;
+	case BRW_ARF_ADDRESS:
+	    format (file, "a%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_ACCUMULATOR:
+	    format (file, "acc%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_FLAG:
+	    format (file, "f%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK:
+	    format (file, "mask%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK_STACK:
+	    /* "ms" is the mask stack; "msd" belongs to the depth register. */
+	    format (file, "ms%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK_STACK_DEPTH:
+	    format (file, "msd%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_STATE:
+	    format (file, "sr%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_CONTROL:
+	    format (file, "cr%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_NOTIFICATION_COUNT:
+	    format (file, "n%u", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_IP:
+	    string (file, "ip");
+	    return -1;
+	default:
+	    format (file, "ARF%u", _reg_nr);
+	    break;
+	}
+    } else {
+	err  |= control (file, "src reg file", reg_file, _reg_file, NULL);
+	format (file, "%u", _reg_nr);
+    }
+    return err;
+}
+
+/*
+ * Print the destination operand of `inst`.
+ *
+ * Align1 mode handles direct and register-indirect addressing; align16 mode
+ * handles direct addressing only and reports indirect addressing as an error.
+ * When reg() returns -1 (it printed "null" or "ip") nothing further is
+ * printed and 0 is returned.
+ *
+ * The horizontal stride field is decoded through the horiz_stride table, the
+ * same way source regions are, so encoding 3 prints as a stride of 4.
+ *
+ * Returns 0 on success and non-zero if any field could not be decoded.
+ */
+static int dest (FILE *file, const struct brw_instruction *inst)
+{
+    int	err = 0;
+
+    if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+	    if (err == -1)
+		return 0;
+	    if (inst->bits1.da1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da1.dest_subreg_nr);
+	    string (file, "<");
+	    err |= control (file, "horiz stride", horiz_stride, inst->bits1.da1.dest_horiz_stride, NULL);
+	    string (file, ">");
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+	}
+	else
+	{
+	    string (file, "g[a0");
+	    if (inst->bits1.ia1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.ia1.dest_subreg_nr);
+	    if (inst->bits1.ia1.dest_indirect_offset)
+		format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+	    string (file, "]");
+	    string (file, "<");
+	    err |= control (file, "horiz stride", horiz_stride, inst->bits1.ia1.dest_horiz_stride, NULL);
+	    string (file, ">");
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+	}
+    }
+    else
+    {
+	if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+	    if (err == -1)
+		return 0;
+	    if (inst->bits1.da16.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da16.dest_subreg_nr);
+	    string (file, "<1>");
+	    err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+	}
+	else
+	{
+	    err = 1;
+	    string (file, "Indirect align16 address mode not supported");
+	}
+    }
+
+    /* Hand the accumulated status back instead of discarding it. */
+    return err;
+}
+
+/*
+ * Print an align1 source region as "<vert,width,horiz>", decoding each field
+ * through its table.  Returns non-zero if any field had no table entry.
+ */
+static int src_align1_region (FILE *file,
+			      GLuint _vert_stride, GLuint _width, GLuint _horiz_stride)
+{
+    int bad;
+
+    string (file, "<");
+    bad = control (file, "vert stride", vert_stride, _vert_stride, NULL);
+
+    string (file, ",");
+    bad |= control (file, "width", width, _width, NULL);
+
+    string (file, ",");
+    bad |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+
+    string (file, ">");
+    return bad;
+}
+
+/*
+ * Print a directly addressed align1 source operand: negate/abs prefixes,
+ * register name, optional sub-register, region and type letters.
+ *
+ * When reg() returns -1 (it printed "null" or "ip") nothing further is
+ * printed and 0 is returned.  Otherwise returns non-zero if any field,
+ * including the region, could not be decoded.
+ */
+static int src_da1 (FILE *file, GLuint type, GLuint _reg_file,
+		    GLuint _vert_stride, GLuint _width, GLuint _horiz_stride,
+		    GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, reg_num);
+    if (err == -1)
+	return 0;
+    if (sub_reg_num)
+	format (file, ".%d", sub_reg_num);
+    /* Keep the region's status; it used to be dropped. */
+    err |= src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+/*
+ * Print an indirectly addressed align1 source operand in the form
+ * "g[a0.<subreg> <imm>]<vert,width,horiz><type>", preceded by the
+ * negate/abs modifiers.  The ".<subreg>" and " <imm>" parts are only
+ * printed when the respective value is non-zero.
+ *
+ * _reg_file and _addr_mode are accepted for symmetry with the caller
+ * but are not used here.
+ *
+ * Returns the OR of the status values of all helpers that were called,
+ * including the region printer, so an invalid region encoding is
+ * reported to the caller like any other invalid field.
+ */
+static int src_ia1 (FILE *file,
+		    GLuint type,
+		    GLuint _reg_file,
+		    GLint _addr_imm,
+		    GLuint _addr_subreg_nr,
+		    GLuint _negate,
+		    GLuint __abs,
+		    GLuint _addr_mode,
+		    GLuint _horiz_stride,
+		    GLuint _width,
+		    GLuint _vert_stride)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    string (file, "g[a0");
+    if (_addr_subreg_nr)
+	format (file, ".%d", _addr_subreg_nr);
+    if (_addr_imm)
+	format (file, " %d", _addr_imm);
+    string (file, "]");
+    err |= src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+/*
+ * Print a directly addressed align16 source operand: negate/abs
+ * modifiers, register, optional ".subreg", a "<vert,1,1>" region, the
+ * register type and, unless it is the identity, the channel swizzle.
+ *
+ * Returns the OR of the status values of the helpers that were called.
+ */
+static int src_da16 (FILE *file,
+		     GLuint _reg_type,
+		     GLuint _reg_file,
+		     GLuint _vert_stride,
+		     GLuint _reg_nr,
+		     GLuint _subreg_nr,
+		     GLuint __abs,
+		     GLuint _negate,
+		     GLuint swz_x,
+		     GLuint swz_y,
+		     GLuint swz_z,
+		     GLuint swz_w)
+{
+    /* xyzw mapped onto itself: nothing needs to be printed. */
+    const int identity_swizzle = (swz_x == BRW_CHANNEL_X &&
+				  swz_y == BRW_CHANNEL_Y &&
+				  swz_z == BRW_CHANNEL_Z &&
+				  swz_w == BRW_CHANNEL_W);
+    /* One channel replicated to all four: print it only once. */
+    const int replicated_swizzle = (swz_x == swz_y &&
+				    swz_x == swz_z &&
+				    swz_x == swz_w);
+    int status = 0;
+
+    status |= control (file, "negate", negate, _negate, NULL);
+    status |= control (file, "abs", _abs, __abs, NULL);
+
+    status |= reg (file, _reg_file, _reg_nr);
+    if (status == -1)
+	return 0;
+
+    if (_subreg_nr)
+	format (file, ".%d", _subreg_nr);
+
+    string (file, "<");
+    status |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",1,1>");
+    status |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+
+    /*
+     * Swizzle display comes in three flavours:
+     *  identity - nothing printed
+     *  1->all   - the single replicated channel
+     *  1->1     - the full four channel mapping
+     */
+    if (!identity_swizzle)
+    {
+	string (file, ".");
+	status |= control (file, "channel select", chan_sel, swz_x, NULL);
+	if (!replicated_swizzle)
+	{
+	    status |= control (file, "channel select", chan_sel, swz_y, NULL);
+	    status |= control (file, "channel select", chan_sel, swz_z, NULL);
+	    status |= control (file, "channel select", chan_sel, swz_w, NULL);
+	}
+    }
+
+    return status;
+}
+
+
+/*
+ * Print the immediate operand stored in inst->bits3, formatted
+ * according to the register type `type` and followed by a type suffix
+ * (UD, D, UW, W, UB, V, F).  Vector-float immediates are not decoded;
+ * only the text "Vector Float" is printed.  Unknown types print
+ * nothing.
+ *
+ * Always returns 0.
+ */
+static int imm (FILE *file, GLuint type, const struct brw_instruction *inst) {
+    switch (type) {
+    case BRW_REGISTER_TYPE_UD:
+	format (file, "0x%08xUD", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_D:
+	format (file, "%dD", inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UW:
+	format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_W:
+	format (file, "%dW", (int16_t) inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UB:
+	/* Truncate as unsigned: a signed 8 bit cast would sign-extend
+	 * values >= 0x80 during argument promotion and print them as
+	 * 0xffffffXX.
+	 */
+	format (file, "0x%02xUB", (uint8_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_VF:
+	format (file, "Vector Float");
+	break;
+    case BRW_REGISTER_TYPE_V:
+	format (file, "0x%08xV", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_F:
+	format (file, "%-gF", inst->bits3.f);
+	break;
+    default:
+	/* Unknown immediate type: nothing is printed. */
+	break;
+    }
+    return 0;
+}
+
+/*
+ * Print the first source operand of `inst`.
+ *
+ * Dispatches on the operand kind: immediate, align1 direct, align1
+ * indirect or align16 direct.  Indirect align16 addressing is not
+ * supported; a message is printed and 1 is returned in that case.
+ * Otherwise the status of the selected printer is returned.
+ */
+static int src0 (FILE *file, const struct brw_instruction *inst)
+{
+    /* Immediates take precedence over any addressing mode. */
+    if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src0_reg_type, inst);
+
+    if (inst->header.access_mode != BRW_ALIGN_1)
+    {
+	/* align16: only direct addressing can be printed. */
+	if (inst->bits2.da16.src0_address_mode != BRW_ADDRESS_DIRECT)
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+
+	return src_da16 (file,
+			 inst->bits1.da16.src0_reg_type,
+			 inst->bits1.da16.src0_reg_file,
+			 inst->bits2.da16.src0_vert_stride,
+			 inst->bits2.da16.src0_reg_nr,
+			 inst->bits2.da16.src0_subreg_nr,
+			 inst->bits2.da16.src0_abs,
+			 inst->bits2.da16.src0_negate,
+			 inst->bits2.da16.src0_swz_x,
+			 inst->bits2.da16.src0_swz_y,
+			 inst->bits2.da16.src0_swz_z,
+			 inst->bits2.da16.src0_swz_w);
+    }
+
+    /* align1, direct register addressing. */
+    if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT)
+	return src_da1 (file,
+			inst->bits1.da1.src0_reg_type,
+			inst->bits1.da1.src0_reg_file,
+			inst->bits2.da1.src0_vert_stride,
+			inst->bits2.da1.src0_width,
+			inst->bits2.da1.src0_horiz_stride,
+			inst->bits2.da1.src0_reg_nr,
+			inst->bits2.da1.src0_subreg_nr,
+			inst->bits2.da1.src0_abs,
+			inst->bits2.da1.src0_negate);
+
+    /* align1, register-indirect addressing. */
+    return src_ia1 (file,
+		    inst->bits1.ia1.src0_reg_type,
+		    inst->bits1.ia1.src0_reg_file,
+		    inst->bits2.ia1.src0_indirect_offset,
+		    inst->bits2.ia1.src0_subreg_nr,
+		    inst->bits2.ia1.src0_negate,
+		    inst->bits2.ia1.src0_abs,
+		    inst->bits2.ia1.src0_address_mode,
+		    inst->bits2.ia1.src0_horiz_stride,
+		    inst->bits2.ia1.src0_width,
+		    inst->bits2.ia1.src0_vert_stride);
+}
+
+/*
+ * Print the second source operand of `inst`.
+ *
+ * Dispatches on the operand kind: immediate, align1 direct, align1
+ * indirect or align16 direct.  Indirect align16 addressing is not
+ * supported; a message is printed and 1 is returned in that case.
+ * Otherwise the status of the selected printer is returned.
+ */
+static int src1 (FILE *file, const struct brw_instruction *inst)
+{
+    /* Immediates take precedence over any addressing mode. */
+    if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src1_reg_type, inst);
+
+    if (inst->header.access_mode != BRW_ALIGN_1)
+    {
+	/* align16: only direct addressing can be printed. */
+	if (inst->bits3.da16.src1_address_mode != BRW_ADDRESS_DIRECT)
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+
+	return src_da16 (file,
+			 inst->bits1.da16.src1_reg_type,
+			 inst->bits1.da16.src1_reg_file,
+			 inst->bits3.da16.src1_vert_stride,
+			 inst->bits3.da16.src1_reg_nr,
+			 inst->bits3.da16.src1_subreg_nr,
+			 inst->bits3.da16.src1_abs,
+			 inst->bits3.da16.src1_negate,
+			 inst->bits3.da16.src1_swz_x,
+			 inst->bits3.da16.src1_swz_y,
+			 inst->bits3.da16.src1_swz_z,
+			 inst->bits3.da16.src1_swz_w);
+    }
+
+    /* align1, direct register addressing. */
+    if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT)
+	return src_da1 (file,
+			inst->bits1.da1.src1_reg_type,
+			inst->bits1.da1.src1_reg_file,
+			inst->bits3.da1.src1_vert_stride,
+			inst->bits3.da1.src1_width,
+			inst->bits3.da1.src1_horiz_stride,
+			inst->bits3.da1.src1_reg_nr,
+			inst->bits3.da1.src1_subreg_nr,
+			inst->bits3.da1.src1_abs,
+			inst->bits3.da1.src1_negate);
+
+    /* align1, register-indirect addressing. */
+    return src_ia1 (file,
+		    inst->bits1.ia1.src1_reg_type,
+		    inst->bits1.ia1.src1_reg_file,
+		    inst->bits3.ia1.src1_indirect_offset,
+		    inst->bits3.ia1.src1_subreg_nr,
+		    inst->bits3.ia1.src1_negate,
+		    inst->bits3.ia1.src1_abs,
+		    inst->bits3.ia1.src1_address_mode,
+		    inst->bits3.ia1.src1_horiz_stride,
+		    inst->bits3.ia1.src1_width,
+		    inst->bits3.ia1.src1_vert_stride);
+}
+
+/*
+ * Disassemble a single instruction to `file`.
+ *
+ * The output consists of, in order: an optional "(pred)" prefix, the
+ * opcode with its saturate/debug/conditional modifiers, the execution
+ * size, the destination and source operands (as many as the opcode
+ * table says the opcode has), for SEND a second line describing the
+ * message, and finally a "{ ... }" block with the instruction control
+ * fields, terminated by ";" and a newline.
+ *
+ * Returns the OR of the status values of all helpers that were called.
+ */
+int brw_disasm_insn (FILE *file, const struct brw_instruction *inst)
+{
+    int	err = 0;
+    int space = 0;
+
+    /* Predicate prefix: "(<inverse>f0[.n]<control>) ". */
+    if (inst->header.predicate_control) {
+	string (file, "(");
+	err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+	string (file, "f0");
+	if (inst->bits2.da1.flag_reg_nr)
+	    format (file, ".%d", inst->bits2.da1.flag_reg_nr);
+	/* The predicate control field is decoded with a different table
+	 * depending on the access mode.
+	 */
+	if (inst->header.access_mode == BRW_ALIGN_1)
+	    err |= control (file, "predicate control align1", pred_ctrl_align1,
+			    inst->header.predicate_control, NULL);
+	else
+	    err |= control (file, "predicate control align16", pred_ctrl_align16,
+			    inst->header.predicate_control, NULL);
+	string (file, ") ");
+    }
+
+    err |= print_opcode (file, inst->header.opcode);
+    err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+    err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+    /* destreg__conditionalmod is a shared field: for SEND it is printed
+     * below as a plain number, for everything else it is decoded as the
+     * conditional modifier.
+     */
+    if (inst->header.opcode != BRW_OPCODE_SEND)
+	err |= control (file, "conditional modifier", conditional_modifier,
+			inst->header.destreg__conditionalmod, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "(");
+	err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+	string (file, ")");
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND)
+	format (file, " %d", inst->header.destreg__conditionalmod);
+
+    /* Operands, each padded out to its own column.  The opcode table
+     * tells how many destinations/sources the opcode has.
+     */
+    if (opcode[inst->header.opcode].ndst > 0) {
+	pad (file, 16);
+	err |= dest (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 0) {
+	pad (file, 32);
+	err |= src0 (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 1) {
+	pad (file, 48);
+	err |= src1 (file, inst);
+    }
+
+    /* SEND: describe the message on a line of its own.  What is printed
+     * after the target function depends on the message target.
+     */
+    if (inst->header.opcode == BRW_OPCODE_SEND) {
+	newline (file);
+	pad (file, 16);
+	space = 0;
+	err |= control (file, "target function", target_function,
+			inst->bits3.generic.msg_target, &space);
+	switch (inst->bits3.generic.msg_target) {
+	case BRW_MESSAGE_TARGET_MATH:
+	    err |= control (file, "math function", math_function,
+			    inst->bits3.math.function, &space);
+	    err |= control (file, "math saturate", math_saturate,
+			    inst->bits3.math.saturate, &space);
+	    err |= control (file, "math signed", math_signed,
+			    inst->bits3.math.int_type, &space);
+	    err |= control (file, "math scalar", math_scalar,
+			    inst->bits3.math.data_type, &space);
+	    err |= control (file, "math precision", math_precision,
+			    inst->bits3.math.precision, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_SAMPLER:
+	    /* "(binding table index, sampler, return format)" */
+	    format (file, " (%d, %d, ",
+		    inst->bits3.sampler.binding_table_index,
+		    inst->bits3.sampler.sampler);
+	    err |= control (file, "sampler target format", sampler_target_format,
+			    inst->bits3.sampler.return_format, NULL);
+	    string (file, ")");
+	    break;
+	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
+	    /* The pixel scoreboard clear bit is folded into the printed
+	     * message control value as bit 3.
+	     */
+	    format (file, " (%d, %d, %d, %d)",
+		    inst->bits3.dp_write.binding_table_index,
+		    (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+		    inst->bits3.dp_write.msg_control,
+		    inst->bits3.dp_write.msg_type,
+		    inst->bits3.dp_write.send_commit_msg);
+	    break;
+	case BRW_MESSAGE_TARGET_URB:
+	    format (file, " %d", inst->bits3.urb.offset);
+	    space = 1;
+	    err |= control (file, "urb swizzle", urb_swizzle,
+			    inst->bits3.urb.swizzle_control, &space);
+	    err |= control (file, "urb allocate", urb_allocate,
+			    inst->bits3.urb.allocate, &space);
+	    err |= control (file, "urb used", urb_used,
+			    inst->bits3.urb.used, &space);
+	    err |= control (file, "urb complete", urb_complete,
+			    inst->bits3.urb.complete, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
+	    /* Nothing target specific to print. */
+	    break;
+	default:
+	    /* Unknown targets are reported in the output only; err is
+	     * not touched here.
+	     */
+	    format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+	    break;
+	}
+	if (space)
+	    string (file, " ");
+	format (file, "mlen %d",
+		inst->bits3.generic.msg_length);
+	format (file, " rlen %d",
+		inst->bits3.generic.response_length);
+    }
+    pad (file, 64);
+    /* Instruction control block; omitted for NOP. */
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "{");
+	space = 1;
+	err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+	err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+	err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+	err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space);
+	err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+	if (inst->header.opcode == BRW_OPCODE_SEND)
+	    err |= control (file, "end of thread", end_of_thread,
+			    inst->bits3.generic.end_of_thread, &space);
+	if (space)
+	    string (file, " ");
+	string (file, "}");
+    }
+    string (file, ";");
+    newline (file);
+    return err;
+}
+
+
+/*
+ * Disassemble `count` consecutive instructions starting at `inst` to
+ * `file`, followed by one empty line.
+ *
+ * Stops at the first instruction for which brw_disasm_insn() reports a
+ * non-zero status and returns that status; returns 0 when all
+ * instructions were printed.
+ */
+int brw_disasm (FILE *file, 
+                const struct brw_instruction *inst,
+                unsigned count)
+{
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      /* Write to the stream the caller asked for, so that the listing
+       * and the trailing newline end up in the same place.
+       */
+      int err = brw_disasm_insn(file, &inst[i]);
+      if (err)
+         return err;
+   }
+
+   fprintf(file, "\n");
+   return 0;
+}
+
diff --git a/src/gallium/drivers/i965/brw_disasm.h b/src/gallium/drivers/i965/brw_disasm.h
new file mode 100644
index 0000000000..ba5b109c48
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_disasm.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#ifndef BRW_DISASM_H
+#define BRW_DISASM_H
+
+#include <stdio.h>
+
+struct brw_instruction;
+
+/* Disassemble the single instruction `inst` to `file`.
+ * Returns 0 on success, non-zero if a field could not be decoded.
+ */
+int brw_disasm_insn (FILE *file, const struct brw_instruction *inst);
+
+/* Disassemble `count` instructions starting at `inst`.
+ * Returns 0 on success, or the first non-zero status reported by
+ * brw_disasm_insn().
+ */
+int brw_disasm (FILE *file, 
+                const struct brw_instruction *inst,
+                unsigned count);
+
+#endif
+
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
new file mode 100644
index 0000000000..4625c2048f
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -0,0 +1,291 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_inlines.h"
+#include "util/u_prim.h"
+#include "util/u_upload_mgr.h"
+
+#include "brw_draw.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_debug.h"
+
+#include "brw_batchbuffer.h"
+
+
+/* Translation table from a gallium primitive type to the hardware
+ * _3DPRIM_* topology.  It is indexed with the PIPE_PRIM_* value, so it
+ * only covers primitives up to and including PIPE_PRIM_POLYGON.
+ *
+ * NOTE(review): the entries rely on the numeric order of the PIPE_PRIM_*
+ * values matching the order below -- confirm against p_defines.h.
+ */
+static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
+   _3DPRIM_POINTLIST,
+   _3DPRIM_LINELIST,
+   _3DPRIM_LINELOOP,
+   _3DPRIM_LINESTRIP,
+   _3DPRIM_TRILIST,
+   _3DPRIM_TRISTRIP,
+   _3DPRIM_TRIFAN,
+   _3DPRIM_QUADLIST,
+   _3DPRIM_QUADSTRIP,
+   _3DPRIM_POLYGON
+};
+
+
+
+/* When the primitive changes, set a state bit and re-validate.  Not
+ * the nicest and would rather deal with this by having all the
+ * programs be immune to the active primitive (ie. cope with all
+ * possibilities).  That may not be realistic however.
+ */
+/* Record `prim` as the current primitive and return the matching
+ * hardware topology from prim_to_hw_prim[].
+ *
+ * When the primitive differs from the one stored in the context,
+ * BRW_NEW_PRIMITIVE is raised; when the reduced primitive changes as
+ * well, BRW_NEW_REDUCED_PRIMITIVE is raised too.
+ *
+ * `prim` must not exceed PIPE_PRIM_POLYGON, the last entry of the
+ * translation table.
+ */
+static int brw_set_prim(struct brw_context *brw, unsigned prim )
+{
+   /* Catch primitive types the translation table has no entry for
+    * before they are used as an array index.
+    */
+   assert(prim <= PIPE_PRIM_POLYGON);
+
+   if (BRW_DEBUG & DEBUG_PRIMS)
+      debug_printf("PRIM: %s\n", u_prim_name(prim));
+   
+   if (prim != brw->primitive) {
+      unsigned reduced_prim;
+
+      brw->primitive = prim;
+      brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
+
+      reduced_prim = u_reduced_prim(prim);
+      if (reduced_prim != brw->reduced_primitive) {
+	 brw->reduced_primitive = reduced_prim;
+	 brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
+      }
+   }
+
+   return prim_to_hw_prim[prim];
+}
+
+
+
+/* Fill in a CMD_3D_PRIM packet for one draw and copy it into the
+ * batchbuffer.
+ *
+ * \param start    first vertex (or first index when `indexed`)
+ * \param count    number of vertices; nothing is emitted when it is 0
+ * \param indexed  whether the draw reads an index buffer
+ * \param hw_prim  hardware topology, as returned by brw_set_prim()
+ *
+ * Returns 0 on success or the non-zero status of brw_batchbuffer_data().
+ */
+static int brw_emit_prim(struct brw_context *brw,
+			 unsigned start,
+			 unsigned count,
+			 boolean indexed,
+			 uint32_t hw_prim)
+{
+   struct brw_3d_primitive prim_packet;
+   int ret;
+
+   if (BRW_DEBUG & DEBUG_PRIMS)
+      debug_printf("%s start %d count %d indexed %d hw_prim %d\n",
+                   __FUNCTION__, start, count, indexed, hw_prim); 
+
+   prim_packet.header.opcode = CMD_3D_PRIM;
+   /* Packet size in dwords, minus two. */
+   prim_packet.header.length = sizeof(prim_packet)/4 - 2;
+   prim_packet.header.pad = 0;
+   prim_packet.header.topology = hw_prim;
+   prim_packet.header.indexed = indexed;
+
+   prim_packet.verts_per_instance = count;
+   prim_packet.start_vert_location = start;
+   /* For indexed draws, add the element offset of the index data inside
+    * its buffer, as computed when the index buffer was prepared.
+    */
+   if (indexed)
+      prim_packet.start_vert_location += brw->ib.start_vertex_offset;
+   prim_packet.instance_count = 1;
+   prim_packet.start_instance_location = 0;
+   prim_packet.base_vert_location = 0; /* prim->basevertex; XXX: add this to gallium */
+
+
+   /* If we're set to always flush, do it before and after the primitive emit.
+    * We want to catch both missed flushes that hurt the instruction/state
+    * cache and missed flushes of the render cache before its contents are
+    * used outside of the draw code.
+    *
+    * Both flushes are currently compiled out with if (0).
+    */
+   if (0) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
+      ADVANCE_BATCH();
+   }
+   /* A draw without vertices is silently dropped. */
+   if (prim_packet.verts_per_instance) {
+      ret = brw_batchbuffer_data( brw->batch, &prim_packet,
+				  sizeof(prim_packet), LOOP_CLIPRECTS);
+      if (ret)
+	 return ret;
+   }
+   if (0) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
+      ADVANCE_BATCH();
+   }
+
+   return 0;
+}
+
+
+/* May fail if out of video memory for texture or vbo upload, or on
+ * fallback conditions.
+ */
+/* One attempt at a draw: validate state, check that the validated
+ * buffers fit, upload state and emit the primitive.  The first step
+ * that reports a non-zero status aborts the attempt and that status is
+ * returned; 0 means the draw was emitted.
+ */
+static int
+try_draw_range_elements(struct brw_context *brw,
+			struct pipe_resource *index_buffer,
+			unsigned hw_prim, 
+			unsigned start, unsigned count)
+{
+   const boolean indexed = (index_buffer != NULL);
+   int status;
+
+   status = brw_validate_state(brw);
+   if (status != 0)
+      return status;
+
+   /* The state has to fit in next to what the current batchbuffer
+    * already references; a failure here makes the caller flush and
+    * try again.
+    */
+   status = brw->sws->check_aperture_space(brw->sws,
+					   brw->state.validated_bos,
+					   brw->state.validated_bo_count);
+   if (status != 0)
+      return status;
+
+   status = brw_upload_state(brw);
+   if (status != 0)
+      return status;
+
+   status = brw_emit_prim(brw, start, count, indexed, hw_prim);
+   if (status != 0)
+      return status;
+
+   if (brw->flags.always_flush_batch)
+      brw_context_flush( brw );
+
+   return 0;
+}
+
+
+/* pipe_context::draw_range_elements implementation.
+ *
+ * Records the primitive, index buffer and index range in the context
+ * (raising the matching dirty flags when they change) and then tries
+ * to draw.  When the first attempt fails the batch is flushed and the
+ * draw is attempted once more.
+ *
+ * index_bias is not supported and must be 0.
+ */
+static void
+brw_draw_range_elements(struct pipe_context *pipe,
+			struct pipe_resource *index_buffer,
+			unsigned index_size, int index_bias,
+			unsigned min_index,
+			unsigned max_index,
+			unsigned mode, unsigned start, unsigned count)
+{
+   struct brw_context *brw = brw_context(pipe);
+   const uint32_t hw_prim = brw_set_prim(brw, mode);
+   int status;
+
+   if (BRW_DEBUG & DEBUG_PRIMS)
+      debug_printf("PRIM: %s start %d count %d index_buffer %p\n",
+                   u_prim_name(mode), start, count, (void *)index_buffer);
+
+   assert(index_bias == 0);
+
+   /* A different index buffer or index size may need a new upload.
+    *
+    * XXX: do we need to go through state validation to achieve this?
+    * Could just call upload code directly.
+    */
+   if (brw->curr.index_buffer != index_buffer ||
+       brw->curr.index_size != index_size)
+   {
+      pipe_resource_reference( &brw->curr.index_buffer, index_buffer );
+      brw->curr.index_size = index_size;
+      brw->state.dirty.mesa |= PIPE_NEW_INDEX_BUFFER;
+   }
+
+   /* Track the index range as well.
+    *
+    * XXX: do we really care?
+    */
+   if (brw->curr.min_index != min_index ||
+       brw->curr.max_index != max_index)
+   {
+      brw->curr.min_index = min_index;
+      brw->curr.max_index = max_index;
+      brw->state.dirty.mesa |= PIPE_NEW_INDEX_RANGE;
+   }
+
+   /* First attempt; done if it succeeds.
+    */
+   status = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
+   if (status == 0)
+      return;
+
+   /* Otherwise flush and retry once.  A second failure is only caught
+    * by the assert.
+    */
+   brw_context_flush( brw );
+   status = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
+   assert(status == 0);
+}
+
+/* pipe_context::draw_elements implementation: an indexed draw without
+ * range information, forwarded to brw_draw_range_elements() with the
+ * widest possible index range.
+ */
+static void
+brw_draw_elements(struct pipe_context *pipe,
+		  struct pipe_resource *index_buffer,
+		  unsigned index_size, int index_bias,
+		  unsigned mode, 
+		  unsigned start, unsigned count)
+{
+   const unsigned min_index = 0;
+   const unsigned max_index = 0xffffffff;
+
+   brw_draw_range_elements(pipe, index_buffer, index_size, index_bias,
+                           min_index, max_index, mode, start, count);
+}
+
+/* pipe_context::draw_arrays implementation: a non-indexed draw,
+ * expressed as brw_draw_elements() without an index buffer.
+ */
+static void
+brw_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   struct pipe_resource *const no_index_buffer = NULL;
+   const unsigned index_size = 0;
+   const int index_bias = 0;
+
+   brw_draw_elements(pipe, no_index_buffer, index_size, index_bias,
+                     mode, start, count);
+}
+
+
+
+/* Hook the draw entry points into the pipe context and create the two
+ * upload managers used for user vertex and index buffers.
+ *
+ * Returns TRUE on success and FALSE when either upload manager could
+ * not be created.
+ *
+ * NOTE(review): when creating the index uploader fails, the vertex
+ * uploader created just before is left in brw->vb.upload_vertex; it is
+ * not visible here whether the caller releases it.
+ */
+boolean brw_draw_init( struct brw_context *brw )
+{
+   /* Drawing functions:
+    */
+   brw->base.draw_range_elements = brw_draw_range_elements;
+   brw->base.draw_elements = brw_draw_elements;
+   brw->base.draw_arrays = brw_draw_arrays;
+
+   /* Upload helper for vertex data in user buffers:
+    */
+   brw->vb.upload_vertex = u_upload_create( &brw->base,
+					    128 * 1024,
+					    64,
+					    PIPE_BIND_VERTEX_BUFFER );
+   if (!brw->vb.upload_vertex)
+      return FALSE;
+
+   /* Upload helper for index data in user buffers:
+    */
+   brw->vb.upload_index = u_upload_create( &brw->base,
+					   32 * 1024,
+					   64,
+					   PIPE_BIND_INDEX_BUFFER );
+   if (!brw->vb.upload_index)
+      return FALSE;
+
+   return TRUE;
+}
+
+/* Counterpart of brw_draw_init(): destroy both upload managers and drop
+ * the context's reference on the current index buffer bo.
+ */
+void brw_draw_cleanup( struct brw_context *brw )
+{
+   u_upload_destroy( brw->vb.upload_vertex );
+   u_upload_destroy( brw->vb.upload_index );
+
+   /* Referencing NULL releases whatever brw->ib.bo pointed at. */
+   bo_reference(&brw->ib.bo, NULL);
+}
diff --git a/src/gallium/drivers/i965/brw_draw.h b/src/gallium/drivers/i965/brw_draw.h
new file mode 100644
index 0000000000..8dc5dbce62
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_draw.h
@@ -0,0 +1,39 @@
+ /**************************************************************************
+ * 
+ * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_DRAW_H
+#define BRW_DRAW_H
+
+#include "brw_types.h"
+
+struct brw_context;
+
+/* Install the draw functions and create the upload helpers.
+ * Returns FALSE if a helper could not be created.
+ */
+boolean brw_draw_init( struct brw_context *brw );
+
+/* Release what brw_draw_init() and the draw path allocated. */
+void brw_draw_cleanup( struct brw_context *brw );
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
new file mode 100644
index 0000000000..337eee8cd9
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -0,0 +1,352 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+
+#include "util/u_upload_mgr.h"
+#include "util/u_math.h"
+
+#include "brw_draw.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
+#include "brw_debug.h"
+#include "brw_resource.h"
+
+
+
+
+/* Map the size of one index in bytes (1, 2 or 4) to the matching
+ * BRW_INDEX_* format.  Any other value trips an assert and yields 0.
+ */
+static unsigned get_index_type(int type)
+{
+   if (type == 1)
+      return BRW_INDEX_BYTE;
+
+   if (type == 2)
+      return BRW_INDEX_WORD;
+
+   if (type == 4)
+      return BRW_INDEX_DWORD;
+
+   assert(0);
+   return 0;
+}
+
+
+/* Prepare step of the brw_vertices state atom.
+ *
+ * For every current vertex buffer this resolves the winsys buffer that
+ * holds the data -- uploading user buffers through the vertex upload
+ * manager first -- and records bo, offset, stride and vertex count in
+ * brw->vb.vb[].  All resulting bos are then added to the validated
+ * list.
+ *
+ * Returns 0 on success or the non-zero status of u_upload_buffer().
+ */
+static int brw_prepare_vertices(struct brw_context *brw)
+{
+   unsigned int min_index = brw->curr.min_index;
+   unsigned int max_index = brw->curr.max_index;
+   GLuint i;
+   int ret;
+
+   if (BRW_DEBUG & DEBUG_VERTS)
+      debug_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
+
+
+   for (i = 0; i < brw->curr.num_vertex_buffers; i++) {
+      struct pipe_vertex_buffer *vb = &brw->curr.vertex_buffer[i];
+      struct brw_winsys_buffer *bo;
+      struct pipe_resource *upload_buf = NULL;
+      unsigned offset;
+      
+      if (BRW_DEBUG & DEBUG_VERTS)
+	 debug_printf("%s vb[%d] user:%d offset:0x%x sz:0x%x stride:0x%x\n",
+		      __FUNCTION__, i,
+		      brw_buffer_is_user_buffer(vb->buffer),
+		      vb->buffer_offset,
+		      vb->buffer->width0,
+		      vb->stride);
+
+      if (brw_buffer_is_user_buffer(vb->buffer)) {
+
+	 /* XXX: simplify this.  Stop the state trackers from generating
+	  * zero-stride buffers & have them use additional constants (or
+	  * add support for >1 constant buffer) instead.
+	  */
+	 /* Zero stride: upload everything after buffer_offset.
+	  * Otherwise: the larger of that and the span covered by the
+	  * index range [min_index, max_index].
+	  */
+	 unsigned size = (vb->stride == 0 ? 
+			  vb->buffer->width0 - vb->buffer_offset :
+			  MAX2(vb->buffer->width0 - vb->buffer_offset,
+			       vb->stride * (max_index + 1 - min_index)));
+
+	 /* NOTE(review): the source offset is advanced by
+	  * min_index * stride while `size` is not reduced accordingly --
+	  * confirm this cannot read past the end of the user buffer.
+	  */
+	 ret = u_upload_buffer( brw->vb.upload_vertex, 
+				vb->buffer_offset + min_index * vb->stride,
+				size,
+				vb->buffer,
+				&offset,
+				&upload_buf );
+	 if (ret)
+	    return ret;
+
+	 bo = brw_buffer(upload_buf)->bo;
+	 
+	 assert(offset + size <= bo->size);
+      }
+      else
+      {
+	 /* Already a hardware buffer: use it in place. */
+	 offset = vb->buffer_offset;
+	 bo = brw_buffer(vb->buffer)->bo;
+      }
+
+      assert(offset < bo->size);
+      
+      /* Set up post-upload info about this vertex buffer:
+       */
+      brw->vb.vb[i].offset = offset;
+      brw->vb.vb[i].stride = vb->stride;
+      /* Zero-stride buffers count as a single vertex; otherwise as many
+       * as fit between offset and the end of the bo.
+       */
+      brw->vb.vb[i].vertex_count = (vb->stride == 0 ?
+				    1 :
+				    (bo->size - offset) / vb->stride);
+
+      bo_reference( &brw->vb.vb[i].bo,  bo );
+
+      /* Don't need to retain this reference.  We have a reference on
+       * the underlying winsys buffer:
+       */
+      pipe_resource_reference( &upload_buf, NULL );
+   }
+
+   brw->vb.nr_vb = i;
+   brw_prepare_query_begin(brw);
+
+   /* Make sure all buffers take part in the aperture space check. */
+   for (i = 0; i < brw->vb.nr_vb; i++) {
+      brw_add_validated_bo(brw, brw->vb.vb[i].bo);
+   }
+
+   return 0;
+}
+
+/* Emit one CMD_VERTEX_BUFFER packet describing all buffers prepared in
+ * brw->vb.vb[]: a header dword followed by four dwords per buffer.
+ * Nothing is emitted when there are no vertex buffers.
+ *
+ * Always returns 0.
+ */
+static int brw_emit_vertex_buffers( struct brw_context *brw )
+{
+   int i;
+
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), just bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (brw->vb.nr_vb == 0) {
+      if (BRW_DEBUG & DEBUG_VERTS)
+	 debug_printf("%s: no active vertex buffers\n", __FUNCTION__);
+
+      return 0;
+   }
+
+   /* Emit VB state packets.
+    */
+   BEGIN_BATCH(1 + brw->vb.nr_vb * 4, IGNORE_CLIPRECTS);
+   /* Header: opcode plus total dword count minus two. */
+   OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
+	     ((1 + brw->vb.nr_vb * 4) - 2));
+
+   for (i = 0; i < brw->vb.nr_vb; i++) {
+      /* dword 0: buffer index, access type and pitch (stride) */
+      OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
+		BRW_VB0_ACCESS_VERTEXDATA |
+		(brw->vb.vb[i].stride << BRW_VB0_PITCH_SHIFT));
+      /* dword 1: start address */
+      OUT_RELOC(brw->vb.vb[i].bo,
+		BRW_USAGE_VERTEX,
+		brw->vb.vb[i].offset);
+      /* dword 2: on IGDNG the address of the last byte of the bo,
+       * otherwise the vertex count (0 for zero-stride buffers).
+       */
+      if (BRW_IS_IGDNG(brw)) {
+	 OUT_RELOC(brw->vb.vb[i].bo,
+		   BRW_USAGE_VERTEX,
+		   brw->vb.vb[i].bo->size - 1);
+      } else
+	 OUT_BATCH(brw->vb.vb[i].stride ? brw->vb.vb[i].vertex_count : 0);
+      OUT_BATCH(0); /* Instance data step rate */
+   }
+   ADVANCE_BATCH();
+   return 0;
+}
+
+
+
+/* Copy the current vertex element packet (brw->curr.velems) into the
+ * batchbuffer.  The packet size is taken from its own header: length
+ * field plus two, in dwords.
+ *
+ * Returns the status of brw_batchbuffer_data(), so that a failure to
+ * add the packet is reported to the caller instead of being dropped.
+ */
+static int brw_emit_vertex_elements(struct brw_context *brw)
+{
+   const struct brw_vertex_element_packet *brw_velems = brw->curr.velems;
+   unsigned size = brw_velems->header.length + 2;
+
+   /* why is this here */
+   brw_emit_query_begin(brw);
+
+   return brw_batchbuffer_data(brw->batch, brw_velems, size * 4, IGNORE_CLIPRECTS);
+}
+
+
+/* Emit step of the brw_vertices state atom: vertex buffers first, then
+ * vertex elements.  The elements are skipped when emitting the buffers
+ * fails; the first non-zero status is returned.
+ */
+static int brw_emit_vertices( struct brw_context *brw )
+{
+   int status = brw_emit_vertex_buffers( brw );
+
+   /* XXX should separate this? */
+   if (status == 0)
+      status = brw_emit_vertex_elements( brw );
+
+   return status;
+}
+
+
+/* State atom for vertex buffers and vertex elements.  Its dirty mask
+ * lists a change of index range, vertex buffers or vertex elements,
+ * and a new batch.
+ */
+const struct brw_tracked_state brw_vertices = {
+   .dirty = {
+      .mesa = (PIPE_NEW_INDEX_RANGE |
+               PIPE_NEW_VERTEX_BUFFER |
+               PIPE_NEW_VERTEX_ELEMENT),
+      .brw = BRW_NEW_BATCH,
+      .cache = 0,
+   },
+   .prepare = brw_prepare_vertices,
+   .emit = brw_emit_vertices,
+};
+
+
+/* Prepare step of the brw_indices state atom.
+ *
+ * Resolves the winsys buffer holding the current index data (uploading
+ * user buffers through the index upload manager), records the start
+ * offset in units of indices, and raises BRW_NEW_INDEX_BUFFER when the
+ * bo or its size changed.  Does nothing without an index buffer.
+ *
+ * Returns 0 on success or the non-zero status of u_upload_buffer().
+ */
+static int brw_prepare_indices(struct brw_context *brw)
+{
+   struct pipe_resource *index_buffer = brw->curr.index_buffer;
+   struct pipe_resource *upload_buf = NULL;
+   struct brw_winsys_buffer *bo = NULL;
+   GLuint offset;
+   GLuint index_size;
+   GLuint ib_size;
+   int ret;
+
+   if (index_buffer == NULL)
+      return 0;
+
+   if (BRW_DEBUG & DEBUG_VERTS)
+      debug_printf("%s: index_size:%d index_buffer->size:%d\n",
+		   __FUNCTION__,
+		   brw->curr.index_size,
+		   brw->curr.index_buffer->width0);
+
+   ib_size = index_buffer->width0;
+   index_size = brw->curr.index_size;
+
+   /* Turn userbuffer into a proper hardware buffer?
+    */
+   if (brw_buffer_is_user_buffer(index_buffer)) {
+
+      /* Upload the whole user buffer; `offset` receives its position
+       * inside the upload manager's buffer.
+       */
+      ret = u_upload_buffer( brw->vb.upload_index,
+			     0,
+			     ib_size,
+			     index_buffer,
+			     &offset,
+			     &upload_buf );
+      if (ret)
+	 return ret;
+
+      bo = brw_buffer(upload_buf)->bo;
+
+      /* XXX: annotate the userbuffer with the upload information so
+       * that successive calls don't get re-uploaded.
+       */
+   }
+   else {
+      /* Hardware buffer: use the whole bo, starting at its beginning. */
+      bo = brw_buffer(index_buffer)->bo;
+      ib_size = bo->size;
+      offset = 0;
+   }
+
+   /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading the
+    * index buffer state when we're just moving the start index of our
+    * drawing.
+    *
+    * In gallium this will happen in the case where successive draw
+    * calls are made with (distinct?) userbuffers, but the upload_mgr
+    * places the data into a single winsys buffer.
+    * 
+    * This statechange doesn't raise any state flags and is always
+    * just merged into the final draw packet:
+    */
+   if (1) {
+      /* The mask test relies on index_size being a power of two. */
+      assert((offset & (index_size - 1)) == 0);
+      brw->ib.start_vertex_offset = offset / index_size;
+   }
+
+   /* These statechanges trigger a new CMD_INDEX_BUFFER packet:
+    */
+   if (brw->ib.bo != bo ||
+       brw->ib.size != ib_size)
+   {
+      bo_reference(&brw->ib.bo, bo);
+      brw->ib.size = ib_size;
+      brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
+   }
+
+   /* brw->ib.bo holds its own reference by now (taken here or on an
+    * earlier call), so the upload buffer reference can be dropped.
+    */
+   pipe_resource_reference( &upload_buf, NULL );
+   brw_add_validated_bo(brw, brw->ib.bo);
+   return 0;
+}
+
+/* State atom that prepares the index buffer; its dirty mask lists
+ * PIPE_NEW_INDEX_BUFFER.  It has no emit step of its own -- the packet
+ * is emitted by brw_emit_index_buffer().
+ */
+const struct brw_tracked_state brw_indices = {
+   .dirty = {
+      .mesa = PIPE_NEW_INDEX_BUFFER,
+      .brw = 0,
+      .cache = 0,
+   },
+   .prepare = brw_prepare_indices,
+};
+
+/* Emit the CMD_INDEX_BUFFER packet for the bo recorded in brw->ib:
+ * header, start address, address of the last byte, and a zero dword.
+ * Nothing is emitted when no index bo is set.
+ *
+ * Always returns 0.
+ */
+static int brw_emit_index_buffer(struct brw_context *brw)
+{
+   /* Emit the indexbuffer packet:
+    */
+   if (brw->ib.bo)
+   {
+      struct brw_indexbuffer ib;
+
+      memset(&ib, 0, sizeof(ib));
+
+      ib.header.bits.opcode = CMD_INDEX_BUFFER;
+      ib.header.bits.length = sizeof(ib)/4 - 2;
+      /* The index format follows from the size of a single index (1, 2
+       * or 4 bytes).  brw->ib.size is the byte size of the whole buffer
+       * and must not be used here.
+       *
+       * NOTE(review): this atom is only flagged by BRW_NEW_BATCH and
+       * BRW_NEW_INDEX_BUFFER; confirm that a change of the index size
+       * alone leads to one of them being raised.
+       */
+      ib.header.bits.index_format = get_index_type(brw->curr.index_size);
+      ib.header.bits.cut_index_enable = 0;
+
+      BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+      OUT_BATCH( ib.header.dword );
+      OUT_RELOC(brw->ib.bo,
+		BRW_USAGE_VERTEX,
+		brw->ib.offset);
+      OUT_RELOC(brw->ib.bo,
+		BRW_USAGE_VERTEX,
+		brw->ib.offset + brw->ib.size - 1);
+      OUT_BATCH( 0 );
+      ADVANCE_BATCH();
+   }
+
+   return 0;
+}
+
+/* State atom that emits the index buffer packet; its dirty mask lists
+ * a new batch and BRW_NEW_INDEX_BUFFER (raised by
+ * brw_prepare_indices()).
+ */
+const struct brw_tracked_state brw_index_buffer = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER,
+      .cache = 0,
+   },
+   .emit = brw_emit_index_buffer,
+};
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
new file mode 100644
index 0000000000..a8fcb5f97e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -0,0 +1,262 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+  
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+
+/* How does predicate control work when execution_size != 8?  Do I
+ * need to test/set for 0xffff when execution_size is 16?
+ */
+void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value )
+{
+   /* Predication is switched off first, so that a flag-register load
+    * emitted below is itself generated with BRW_PREDICATE_NONE.
+    */
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+   /* 0xff requests no predication at all: nothing more to do. */
+   if (value == 0xff)
+      return;
+
+   /* Reload the flag register only when the cached value is stale. */
+   if (p->flag_value != value) {
+      brw_push_insn_state(p);
+      brw_MOV(p, brw_flag_reg(), brw_imm_uw(value));
+      p->flag_value = value;
+      brw_pop_insn_state(p);
+   }
+
+   p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+}
+
+/* Each setter below stores one field into the header of the current
+ * default instruction state (p->current).
+ */
+
+void brw_set_predicate_control( struct brw_compile *p, GLuint pc )
+{
+   struct brw_instruction *state = p->current;
+
+   state->header.predicate_control = pc;
+}
+
+void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional )
+{
+   struct brw_instruction *state = p->current;
+
+   state->header.destreg__conditionalmod = conditional;
+}
+
+void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
+{
+   struct brw_instruction *state = p->current;
+
+   state->header.access_mode = access_mode;
+}
+
+void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control )
+{
+   struct brw_instruction *state = p->current;
+
+   state->header.compression_control = compression_control;
+}
+
+void brw_set_mask_control( struct brw_compile *p, GLuint value )
+{
+   struct brw_instruction *state = p->current;
+
+   state->header.mask_control = value;
+}
+
+void brw_set_saturate( struct brw_compile *p, GLuint value )
+{
+   struct brw_instruction *state = p->current;
+
+   state->header.saturate = value;
+}
+
+/* Duplicate the current default instruction state into the next stack
+ * slot and make that copy current.
+ */
+void brw_push_insn_state( struct brw_compile *p )
+{
+   struct brw_instruction *top = p->current;
+
+   /* The last slot must still be free. */
+   assert(top != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
+
+   memcpy(&top[1], top, sizeof(struct brw_instruction));
+   p->current = top + 1;
+}
+
+/* Drop the current default instruction state, returning to the one
+ * below it on the stack.
+ */
+void brw_pop_insn_state( struct brw_compile *p )
+{
+   /* Popping the bottom entry is a caller bug. */
+   assert(p->current != p->stack);
+   p->current -= 1;
+}
+
+
+/***********************************************************************
+ * Prepare a brw_compile for emitting a new program.
+ *
+ * Resets the instruction count, points the state stack at its first
+ * (zeroed) entry, clears the label/call lists and the error flag, and
+ * installs the default instruction state.
+ *
+ * \param brw  context stored in the compile for later use
+ * \param p    compile state to initialise
+ */
+void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
+{
+   p->brw = brw;
+   p->nr_insn = 0;
+   p->current = p->stack;
+   memset(p->current, 0, sizeof(p->current[0]));
+
+   /* The label/call lists are walked until a NULL link and the error
+    * flag is tested by brw_get_program(), so none of them may be left
+    * holding whatever the caller's memory happened to contain.
+    */
+   p->first_label = NULL;
+   p->first_call = NULL;
+   p->error = 0;
+
+   /* Some defaults?
+    */
+   brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
+   brw_set_saturate(p, 0);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_predicate_control_flag_value(p, 0xff); 
+}
+
+
+/* Finish the program and hand back a pointer to it.
+ *
+ * Eight NOPs are appended first.  On success *data points at the
+ * instruction store and *sz holds the program size in bytes; if the
+ * compile was flagged as failed, PIPE_ERROR_BAD_INPUT is returned and
+ * the outputs are left untouched.
+ */
+enum pipe_error brw_get_program( struct brw_compile *p,
+                                 const GLuint **data,
+                                 GLuint *sz )
+{
+   GLuint pad = 8;
+
+   while (pad--)
+      brw_NOP(p);
+
+   /* Is the generated program malformed for some reason?
+    */
+   if (p->error)
+      return PIPE_ERROR_BAD_INPUT;
+
+   *data = (const GLuint *)p->store;
+   *sz = p->nr_insn * sizeof(struct brw_instruction);
+   return PIPE_OK;
+}
+
+
+
+/**
+ * Subroutine calls require special attention.
+ * Mesa instructions may be expanded into multiple hardware instructions
+ * so the prog_instruction::BranchTarget field can't be used as an index
+ * into the hardware instructions.
+ *
+ * The BranchTarget field isn't needed, however.  Mesa's GLSL compiler
+ * emits CAL and BGNSUB instructions with labels that can be used to map
+ * subroutine calls to actual subroutine code blocks.
+ *
+ * The structures and function here implement patching of CAL instructions
+ * so they jump to the right subroutine code...
+ */
+
+
+/**
+ * For each OPCODE_BGNSUB we create one of these.
+ * Records are kept in a singly linked list headed by
+ * brw_compile::first_label and are freed by brw_resolve_cals().
+ */
+struct brw_eu_label
+{
+   GLuint label;     /**< the label number */
+   GLuint position;  /**< the position of the brw instruction for this label */
+   struct brw_eu_label *next;  /**< next in linked list */
+};
+
+
+/**
+ * For each OPCODE_CAL we create one of these.
+ * Records are kept in a singly linked list headed by
+ * brw_compile::first_call and are freed by brw_resolve_cals().
+ */
+struct brw_eu_call
+{
+   GLuint call_inst_pos;  /**< location of the CAL instruction */
+   GLuint label;          /**< label number of the subroutine being called */
+   struct brw_eu_call *next;  /**< next in linked list */
+};
+
+
+/**
+ * Called for each OPCODE_BGNSUB.
+ *
+ * Pushes a (label, position) record onto the head of the compile's
+ * label list.  If the record cannot be allocated the compile is
+ * flagged as failed (c->error) rather than dereferencing NULL.
+ *
+ * \param c         compile state owning the label list
+ * \param l         label number
+ * \param position  index of the instruction the label refers to
+ */
+void
+brw_save_label(struct brw_compile *c, unsigned l, GLuint position)
+{
+   struct brw_eu_label *label = CALLOC_STRUCT(brw_eu_label);
+
+   if (!label) {
+      /* out of memory: mark the program as bad instead of crashing */
+      c->error = 1;
+      return;
+   }
+
+   label->label = l;
+   label->position = position;
+   label->next = c->first_label;
+   c->first_label = label;
+}
+
+
+/**
+ * Called for each OPCODE_CAL.
+ *
+ * Pushes a (call position, label) record onto the head of the
+ * compile's call list so the CAL can be patched later.  If the record
+ * cannot be allocated the compile is flagged as failed (c->error)
+ * rather than dereferencing NULL.
+ *
+ * \param c         compile state owning the call list
+ * \param label     label number of the called subroutine
+ * \param call_pos  index of the CAL instruction
+ */
+void
+brw_save_call(struct brw_compile *c, GLuint label, GLuint call_pos)
+{
+   struct brw_eu_call *call = CALLOC_STRUCT(brw_eu_call);
+
+   if (!call) {
+      /* out of memory: mark the program as bad instead of crashing */
+      c->error = 1;
+      return;
+   }
+
+   call->call_inst_pos = call_pos;
+   call->label = label;
+   call->next = c->first_call;
+   c->first_call = call;
+}
+
+
+/**
+ * Find the record for label number \p l and return its instruction
+ * position.  An unknown label aborts the process.
+ */
+static GLuint
+brw_lookup_label(struct brw_compile *c, unsigned l)
+{
+   const struct brw_eu_label *cur = c->first_label;
+
+   while (cur) {
+      if (cur->label == l)
+         return cur->position;
+      cur = cur->next;
+   }
+
+   abort();  /* should never happen */
+   return ~0;
+}
+
+
+/**
+ * Called once code generation is finished: patches every recorded CAL
+ * so that it jumps to its subroutine, then releases the call and label
+ * lists.
+ */
+void
+brw_resolve_cals(struct brw_compile *c)
+{
+   const struct brw_eu_call *pending;
+   struct brw_eu_call *dead_call;
+   struct brw_eu_label *dead_label;
+
+   /* Pass 1: write the jump distance into src1 of each CAL. */
+   for (pending = c->first_call; pending; pending = pending->next) {
+      const GLuint target = brw_lookup_label(c, pending->label);
+      struct brw_instruction *cal = &c->store[pending->call_inst_pos];
+      struct brw_instruction *sub = &c->store[target];
+      GLint distance = sub - cal;   /* in instructions */
+
+      /* the immediate is the instruction distance scaled by 16 */
+      brw_set_src1(cal, brw_imm_d(distance * 16));
+   }
+
+   /* Pass 2: free the call records. */
+   while ((dead_call = c->first_call) != NULL) {
+      c->first_call = dead_call->next;
+      FREE(dead_call);
+   }
+
+   /* Pass 3: free the label records. */
+   while ((dead_label = c->first_label) != NULL) {
+      c->first_label = dead_label->next;
+      FREE(dead_label);
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
new file mode 100644
index 0000000000..af509b2e5f
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -0,0 +1,992 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+   
+
+#ifndef BRW_EU_H
+#define BRW_EU_H
+
+#include "util/u_debug.h"
+#include "pipe/p_defines.h"
+
+#include "brw_structs.h"
+#include "brw_defines.h"
+
+/* A swizzle packs four 2-bit channel selectors into one byte, the
+ * first selector in bits 1:0 and the last in bits 7:6.
+ * BRW_GET_SWZ extracts selector number 'idx' (0..3) again.
+ */
+#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
+#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
+
+#define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
+
+#define BRW_WRITEMASK_NONE     0x00
+#define BRW_WRITEMASK_X        0x01
+#define BRW_WRITEMASK_Y        0x02
+#define BRW_WRITEMASK_XY       0x03
+#define BRW_WRITEMASK_Z        0x04
+#define BRW_WRITEMASK_XZ       0x05
+#define BRW_WRITEMASK_YZ       0x06
+#define BRW_WRITEMASK_XYZ      0x07
+#define BRW_WRITEMASK_W        0x08
+#define BRW_WRITEMASK_XW       0x09
+#define BRW_WRITEMASK_YW       0x0A
+#define BRW_WRITEMASK_XYW      0x0B
+#define BRW_WRITEMASK_ZW       0x0C
+#define BRW_WRITEMASK_XZW      0x0D
+#define BRW_WRITEMASK_YZW      0x0E
+#define BRW_WRITEMASK_XYZW     0x0F
+
+
+#define REG_SIZE (8*4)
+
+
+/* These aren't hardware structs, just something useful for us to pass around:
+ *
+ * Align1 operation has a lot of control over input ranges.  Used in
+ * WM programs to implement shaders decomposed into "channel serial"
+ * or "structure of array" form:
+ *
+ * The first dword is a set of bitfields describing the register and
+ * its region; the second dword (dw1) is either the align16/indirect
+ * bits or, for immediates, the immediate value itself.
+ */
+struct brw_reg
+{
+   GLuint type:4;		/* one of BRW_REGISTER_TYPE_x */
+   GLuint file:2;		/* one of BRW_x_REGISTER_FILE */
+   GLuint nr:8;			/* register number */
+   GLuint subnr:5;		/* :1 in align16 */
+   GLuint negate:1;		/* source only */
+   GLuint abs:1;		/* source only */
+   GLuint vstride:4;		/* source only */
+   GLuint width:3;		/* src only, align1 only */
+   GLuint hstride:2;   		/* align1 only */
+   GLuint address_mode:1;	/* relative addressing, hopefully! */
+   GLuint pad0:1;
+
+   union {      
+      struct {
+	 GLuint swizzle:8;		/* src only, align16 only */
+	 GLuint writemask:4;		/* dest only, align16 only */
+	 GLint  indirect_offset:10;	/* relative addressing offset */
+	 GLuint pad1:10;		/* two dwords total */
+      } bits;
+
+      /* immediate values, as filled in by the brw_imm_x() helpers: */
+      GLfloat f;
+      GLint   d;
+      GLuint ud;
+   } dw1;      
+};
+
+
+/* Handle for register-indirect addressing: names an address-register
+ * sub-register plus a signed offset.  See brw_indirect() and the
+ * deref_x() helpers, which turn it into a brw_reg.
+ */
+struct brw_indirect {
+   GLuint addr_subnr:4;   /* address register sub-number */
+   GLint addr_offset:10;  /* signed offset added on dereference */
+   GLuint pad:18;
+};
+
+
+struct brw_eu_label;
+struct brw_eu_call;
+
+
+
+#define BRW_EU_MAX_INSN_STACK 5
+#define BRW_EU_MAX_INSN 10000
+
+/* State for assembling one EU program: the instruction store, a small
+ * stack of default instruction state, and bookkeeping used to patch
+ * subroutine calls.
+ */
+struct brw_compile {
+   struct brw_instruction store[BRW_EU_MAX_INSN];   /* emitted instructions */
+   GLuint nr_insn;                                   /* number of entries used in store[] */
+
+   /* Allow clients to push/pop instruction state:
+    */
+   struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
+   struct brw_instruction *current;   /* top of stack[]: the current default state */
+
+   GLuint flag_value;   /* last value loaded into the flag register by
+                         * brw_set_predicate_control_flag_value() */
+   GLboolean single_program_flow;
+   struct brw_context *brw;
+
+   struct brw_eu_label *first_label;  /**< linked list of labels */
+   struct brw_eu_call *first_call;    /**< linked list of CALs */
+
+   boolean error;   /* when set, brw_get_program() fails with PIPE_ERROR_BAD_INPUT */
+};
+
+
+void
+brw_save_label(struct brw_compile *c, unsigned label, GLuint position);
+
+void
+brw_save_call(struct brw_compile *c, unsigned label, GLuint call_pos);
+
+void
+brw_resolve_cals(struct brw_compile *c);
+
+
+
+/** Size in bytes of one element of the given BRW_REGISTER_TYPE_x, or 0
+ *  for a type not listed here.
+ */
+static INLINE int type_sz( GLuint type )
+{
+   /* 32-bit types */
+   if (type == BRW_REGISTER_TYPE_UD ||
+       type == BRW_REGISTER_TYPE_D ||
+       type == BRW_REGISTER_TYPE_F)
+      return 4;
+
+   /* 16-bit types */
+   if (type == BRW_REGISTER_TYPE_HF ||
+       type == BRW_REGISTER_TYPE_UW ||
+       type == BRW_REGISTER_TYPE_W)
+      return 2;
+
+   /* 8-bit types */
+   if (type == BRW_REGISTER_TYPE_UB ||
+       type == BRW_REGISTER_TYPE_B)
+      return 1;
+
+   return 0;
+}
+
+/**
+ * Construct a brw_reg.
+ * \param file  one of the BRW_x_REGISTER_FILE values
+ * \param nr  register number/index
+ * \param subnr  register sub number, in units of \p type elements
+ *               (it is converted to a byte offset here)
+ * \param type  one of BRW_REGISTER_TYPE_x
+ * \param vstride  one of BRW_VERTICAL_STRIDE_x
+ * \param width  one of BRW_WIDTH_x
+ * \param hstride  one of BRW_HORIZONTAL_STRIDE_x
+ * \param swizzle  one of BRW_SWIZZLE_x
+ * \param writemask  BRW_WRITEMASK_X/Y/Z/W bitfield
+ * \return the register with negate/abs cleared, direct addressing and
+ *         a zero indirect offset
+ */
+static INLINE struct brw_reg brw_reg( GLuint file,
+                                      GLuint nr,
+                                      GLuint subnr,
+                                      GLuint type,
+                                      GLuint vstride,
+                                      GLuint width,
+                                      GLuint hstride,
+                                      GLuint swizzle,
+                                      GLuint writemask )
+{
+   struct brw_reg reg;
+
+   /* Range-check the register number against the register FILE it
+    * lives in.  The test must key off 'file': 'type' holds a
+    * BRW_REGISTER_TYPE_x value, so comparing it with the
+    * BRW_x_REGISTER_FILE constants checks the wrong thing.
+    */
+   if (file == BRW_GENERAL_REGISTER_FILE)
+      assert(nr < BRW_MAX_GRF);
+   else if (file == BRW_MESSAGE_REGISTER_FILE)
+      assert(nr < BRW_MAX_MRF);
+   else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(nr <= BRW_ARF_IP);
+
+   reg.type = type;
+   reg.file = file;
+   reg.nr = nr;
+   reg.subnr = subnr * type_sz(type);   /* element index -> byte offset */
+   reg.negate = 0;
+   reg.abs = 0;
+   reg.vstride = vstride;
+   reg.width = width;
+   reg.hstride = hstride;
+   reg.address_mode = BRW_ADDRESS_DIRECT;
+   reg.pad0 = 0;
+
+   /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
+    * set swizzle and writemask to W, as the lower bits of subnr will
+    * be lost when converted to align16.  This is probably too much to
+    * keep track of as you'd want it adjusted by suboffset(), etc.
+    * Perhaps fix up when converting to align16?
+    */
+   reg.dw1.bits.swizzle = swizzle;
+   reg.dw1.bits.writemask = writemask;
+   reg.dw1.bits.indirect_offset = 0;
+   reg.dw1.bits.pad1 = 0;
+   return reg;
+}
+
+/** Float register spanning 16 channels (vstride 16, width 16, hstride 1) */
+static INLINE struct brw_reg brw_vec16_reg( GLuint file,
+                                            GLuint nr,
+                                            GLuint subnr )
+{
+   const GLuint type = BRW_REGISTER_TYPE_F;
+
+   return brw_reg(file, nr, subnr, type,
+                  BRW_VERTICAL_STRIDE_16, BRW_WIDTH_16, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+}
+
+/** Float register spanning 8 channels (vstride 8, width 8, hstride 1) */
+static INLINE struct brw_reg brw_vec8_reg( GLuint file,
+                                           GLuint nr,
+                                           GLuint subnr )
+{
+   const GLuint type = BRW_REGISTER_TYPE_F;
+
+   return brw_reg(file, nr, subnr, type,
+                  BRW_VERTICAL_STRIDE_8, BRW_WIDTH_8, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+}
+
+/** Float register spanning 4 channels (vstride 4, width 4, hstride 1) */
+static INLINE struct brw_reg brw_vec4_reg( GLuint file,
+                                           GLuint nr,
+                                           GLuint subnr )
+{
+   const GLuint type = BRW_REGISTER_TYPE_F;
+
+   return brw_reg(file, nr, subnr, type,
+                  BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+}
+
+/** Float register spanning 2 channels; swizzle XYXY, writemask XY */
+static INLINE struct brw_reg brw_vec2_reg( GLuint file,
+                                           GLuint nr,
+                                           GLuint subnr )
+{
+   const GLuint type = BRW_REGISTER_TYPE_F;
+
+   return brw_reg(file, nr, subnr, type,
+                  BRW_VERTICAL_STRIDE_2, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYXY, BRW_WRITEMASK_XY);
+}
+
+/** Scalar float register (strides 0, width 1); swizzle XXXX, writemask X */
+static INLINE struct brw_reg brw_vec1_reg( GLuint file,
+                                           GLuint nr,
+                                           GLuint subnr )
+{
+   const GLuint type = BRW_REGISTER_TYPE_F;
+
+   return brw_reg(file, nr, subnr, type,
+                  BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XXXX, BRW_WRITEMASK_X);
+}
+
+
+/** Return \p reg with its element type replaced; nothing else changes. */
+static INLINE struct brw_reg retype( struct brw_reg reg,
+                                     GLuint type )
+{
+   struct brw_reg result = reg;
+
+   result.type = type;
+   return result;
+}
+
+/** Advance the sub-register offset by \p delta elements of reg's type. */
+static INLINE struct brw_reg suboffset( struct brw_reg reg,
+                                        GLuint delta )
+{
+   const GLuint bytes = delta * type_sz(reg.type);
+
+   reg.subnr = reg.subnr + bytes;
+   return reg;
+}
+
+
+/** Advance the register number by \p delta whole registers. */
+static INLINE struct brw_reg offset( struct brw_reg reg,
+                                     GLuint delta )
+{
+   reg.nr = reg.nr + delta;
+   return reg;
+}
+
+
+/** Advance by \p bytes, carrying from the sub-register offset into the
+ *  register number at REG_SIZE boundaries.
+ */
+static INLINE struct brw_reg byte_offset( struct brw_reg reg,
+                                          GLuint bytes )
+{
+   const GLuint base = reg.nr * REG_SIZE + reg.subnr;
+   const GLuint moved = base + bytes;
+
+   reg.nr = moved / REG_SIZE;
+   reg.subnr = moved % REG_SIZE;
+   return reg;
+}
+   
+
+/** Unsigned-word register spanning 16 channels; \p subnr counts words */
+static INLINE struct brw_reg brw_uw16_reg( GLuint file,
+                                           GLuint nr,
+                                           GLuint subnr )
+{
+   struct brw_reg base = brw_vec16_reg(file, nr, 0);
+   struct brw_reg words = retype(base, BRW_REGISTER_TYPE_UW);
+
+   return suboffset(words, subnr);
+}
+
+/** Unsigned-word register spanning 8 channels; \p subnr counts words */
+static INLINE struct brw_reg brw_uw8_reg( GLuint file,
+                                          GLuint nr,
+                                          GLuint subnr )
+{
+   struct brw_reg base = brw_vec8_reg(file, nr, 0);
+   struct brw_reg words = retype(base, BRW_REGISTER_TYPE_UW);
+
+   return suboffset(words, subnr);
+}
+
+/** Scalar unsigned-word register; \p subnr counts words */
+static INLINE struct brw_reg brw_uw1_reg( GLuint file,
+                                          GLuint nr,
+                                          GLuint subnr )
+{
+   struct brw_reg base = brw_vec1_reg(file, nr, 0);
+   struct brw_reg words = retype(base, BRW_REGISTER_TYPE_UW);
+
+   return suboffset(words, subnr);
+}
+
+/** Immediate operand of the given type; the caller fills in dw1. */
+static INLINE struct brw_reg brw_imm_reg( GLuint type )
+{
+   const GLuint nr = 0, subnr = 0;
+   const GLuint swizzle = 0, writemask = 0;
+
+   return brw_reg(BRW_IMMEDIATE_VALUE, nr, subnr, type,
+                  BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  swizzle, writemask);
+}
+
+/** Float immediate */
+static INLINE struct brw_reg brw_imm_f( GLfloat f )
+{
+   struct brw_reg result = brw_imm_reg(BRW_REGISTER_TYPE_F);
+
+   result.dw1.f = f;
+   return result;
+}
+
+/** Signed 32-bit integer immediate */
+static INLINE struct brw_reg brw_imm_d( GLint d )
+{
+   struct brw_reg result = brw_imm_reg(BRW_REGISTER_TYPE_D);
+
+   result.dw1.d = d;
+   return result;
+}
+
+/** Unsigned 32-bit integer immediate */
+static INLINE struct brw_reg brw_imm_ud( GLuint ud )
+{
+   struct brw_reg result = brw_imm_reg(BRW_REGISTER_TYPE_UD);
+
+   result.dw1.ud = ud;
+   return result;
+}
+
+/**
+ * Construct ushort immediate register.
+ * The 16-bit value is replicated into both halves of the immediate
+ * dword.
+ */
+static INLINE struct brw_reg brw_imm_uw( GLushort uw )
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
+   /* Widen to unsigned first: shifting the int-promoted value left by
+    * 16 overflows a signed int whenever bit 15 of uw is set.
+    */
+   const GLuint lo = uw;
+   imm.dw1.ud = lo | (lo << 16);
+   return imm;
+}
+
+/**
+ * Construct short immediate register.
+ * The 16-bit pattern of \p w is replicated into both halves of the
+ * immediate dword.
+ */
+static INLINE struct brw_reg brw_imm_w( GLshort w )
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
+   /* Take the raw 16-bit pattern as an unsigned value.  Working on the
+    * sign-extended int instead would set the upper half to 0xffff for
+    * every negative w (rather than to a copy of w) and would left-shift
+    * a negative number.
+    */
+   const GLuint lo = (GLushort) w;
+   imm.dw1.ud = lo | (lo << 16);
+   return imm;
+}
+
+/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
+ * numbers alias with _V and _VF below:
+ */
+
+/** Packed-vector immediate of type V: eight signed half-byte values in \p v */
+static INLINE struct brw_reg brw_imm_v( GLuint v )
+{
+   struct brw_reg result = brw_imm_reg(BRW_REGISTER_TYPE_V);
+
+   /* region: vstride 0, width 8, hstride 1 */
+   result.hstride = BRW_HORIZONTAL_STRIDE_1;
+   result.width = BRW_WIDTH_8;
+   result.vstride = BRW_VERTICAL_STRIDE_0;
+
+   result.dw1.ud = v;
+   return result;
+}
+
+/** Packed-vector immediate of type VF: four 8-bit float values in \p v */
+static INLINE struct brw_reg brw_imm_vf( GLuint v )
+{
+   struct brw_reg result = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+
+   /* region: vstride 0, width 4, hstride 1 */
+   result.hstride = BRW_HORIZONTAL_STRIDE_1;
+   result.width = BRW_WIDTH_4;
+   result.vstride = BRW_VERTICAL_STRIDE_0;
+
+   result.dw1.ud = v;
+   return result;
+}
+
+#define VF_ZERO 0x0
+#define VF_ONE  0x30
+#define VF_NEG  (1<<7)
+
+/** VF immediate assembled from four separate byte values; v0 lands in
+ *  the lowest byte and v3 in the highest.
+ */
+static INLINE struct brw_reg brw_imm_vf4( GLuint v0, 
+                                          GLuint v1, 
+                                          GLuint v2,
+                                          GLuint v3)
+{
+   struct brw_reg result = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+   GLuint packed;
+
+   packed  = v0 << 0;
+   packed |= v1 << 8;
+   packed |= v2 << 16;
+   packed |= v3 << 24;
+
+   result.vstride = BRW_VERTICAL_STRIDE_0;
+   result.width = BRW_WIDTH_4;
+   result.hstride = BRW_HORIZONTAL_STRIDE_1;
+   result.dw1.ud = packed;
+   return result;
+}
+
+
+/** Byte address of \p reg within its register file, as a UW immediate. */
+static INLINE struct brw_reg brw_address( struct brw_reg reg )
+{
+   const GLuint byte_addr = reg.nr * REG_SIZE + reg.subnr;
+
+   return brw_imm_uw(byte_addr);
+}
+
+/* General-register-file shorthands for the constructors above. */
+
+/** Scalar float GRF */
+static INLINE struct brw_reg brw_vec1_grf( GLuint nr, GLuint subnr )
+{
+   const GLuint file = BRW_GENERAL_REGISTER_FILE;
+   return brw_vec1_reg(file, nr, subnr);
+}
+
+/** 2-channel float GRF */
+static INLINE struct brw_reg brw_vec2_grf( GLuint nr, GLuint subnr )
+{
+   const GLuint file = BRW_GENERAL_REGISTER_FILE;
+   return brw_vec2_reg(file, nr, subnr);
+}
+
+/** 4-channel float GRF */
+static INLINE struct brw_reg brw_vec4_grf( GLuint nr, GLuint subnr )
+{
+   const GLuint file = BRW_GENERAL_REGISTER_FILE;
+   return brw_vec4_reg(file, nr, subnr);
+}
+
+/** 8-channel float GRF */
+static INLINE struct brw_reg brw_vec8_grf( GLuint nr, GLuint subnr )
+{
+   const GLuint file = BRW_GENERAL_REGISTER_FILE;
+   return brw_vec8_reg(file, nr, subnr);
+}
+
+
+/** 8-channel unsigned-word GRF */
+static INLINE struct brw_reg brw_uw8_grf( GLuint nr, GLuint subnr )
+{
+   const GLuint file = BRW_GENERAL_REGISTER_FILE;
+   return brw_uw8_reg(file, nr, subnr);
+}
+
+/** 16-channel unsigned-word GRF */
+static INLINE struct brw_reg brw_uw16_grf( GLuint nr, GLuint subnr )
+{
+   const GLuint file = BRW_GENERAL_REGISTER_FILE;
+   return brw_uw16_reg(file, nr, subnr);
+}
+
+
+/** The null register (usually used for setting condition codes) */
+static INLINE struct brw_reg brw_null_reg( void )
+{
+   const GLuint file = BRW_ARCHITECTURE_REGISTER_FILE;
+   return brw_vec8_reg(file, BRW_ARF_NULL, 0);
+}
+
+/** Address register, sub-register \p subnr, as a scalar unsigned word */
+static INLINE struct brw_reg brw_address_reg( GLuint subnr )
+{
+   const GLuint file = BRW_ARCHITECTURE_REGISTER_FILE;
+   return brw_uw1_reg(file, BRW_ARF_ADDRESS, subnr);
+}
+
+/* If/else instructions break in align16 mode if writemask & swizzle
+ * aren't xyzw.  This goes against the convention for other scalar
+ * regs:
+ */
+static INLINE struct brw_reg brw_ip_reg( void )
+{
+   const GLuint file = BRW_ARCHITECTURE_REGISTER_FILE;
+   const GLuint vstride = BRW_VERTICAL_STRIDE_4;   /* ? */
+   const GLuint swizzle = BRW_SWIZZLE_XYZW;        /* NOTE! */
+   const GLuint writemask = BRW_WRITEMASK_XYZW;    /* NOTE! */
+
+   return brw_reg(file, BRW_ARF_IP, 0, BRW_REGISTER_TYPE_UD,
+                  vstride, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+                  swizzle, writemask);
+}
+
+/** Accumulator register as an 8-channel float */
+static INLINE struct brw_reg brw_acc_reg( void )
+{
+   const GLuint file = BRW_ARCHITECTURE_REGISTER_FILE;
+   return brw_vec8_reg(file, BRW_ARF_ACCUMULATOR, 0);
+}
+
+
+/** Flag register as a scalar unsigned word */
+static INLINE struct brw_reg brw_flag_reg( void )
+{
+   const GLuint file = BRW_ARCHITECTURE_REGISTER_FILE;
+   return brw_uw1_reg(file, BRW_ARF_FLAG, 0);
+}
+
+
+/** Mask register, sub-register \p subnr, as a scalar unsigned word */
+static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
+{
+   const GLuint file = BRW_ARCHITECTURE_REGISTER_FILE;
+   return brw_uw1_reg(file, BRW_ARF_MASK, subnr);
+}
+
+/** Message register \p nr as an 8-channel float; nr must be below BRW_MAX_MRF */
+static INLINE struct brw_reg brw_message_reg( GLuint nr )
+{
+   const GLuint file = BRW_MESSAGE_REGISTER_FILE;
+
+   assert(nr < BRW_MAX_MRF);
+   return brw_vec8_reg(file, nr, 0);
+}
+
+
+
+
+/* Map an element count (0, 1, 2, 4, 8, 16, 32) to the encoding 0..6
+ * used by stride(); anything else maps to 0.  Almost always called
+ * with a numeric constant, so it is kept trivially foldable at
+ * compile time:
+ */
+static INLINE GLuint cvt( GLuint val )
+{
+   if (val == 1)  return 1;
+   if (val == 2)  return 2;
+   if (val == 4)  return 3;
+   if (val == 8)  return 4;
+   if (val == 16) return 5;
+   if (val == 32) return 6;
+
+   /* 0 and every unlisted value */
+   return 0;
+}
+
+/** Return \p reg with its region replaced.  The three arguments are
+ *  plain element counts; they are encoded through cvt(), with width
+ *  stored as the encoded value minus one.
+ */
+static INLINE struct brw_reg stride( struct brw_reg reg,
+                                     GLuint vstride,
+                                     GLuint width,
+                                     GLuint hstride )
+{
+   const GLuint enc_vstride = cvt(vstride);
+   const GLuint enc_width = cvt(width) - 1;
+   const GLuint enc_hstride = cvt(hstride);
+
+   reg.vstride = enc_vstride;
+   reg.width = enc_width;
+   reg.hstride = enc_hstride;
+   return reg;
+}
+
+
+/* Region shorthands: vecN() re-regions a register as N consecutive
+ * elements, vec1() as a scalar.
+ */
+static INLINE struct brw_reg vec16( struct brw_reg reg )
+{
+   const GLuint n = 16;
+   return stride(reg, n, n, 1);
+}
+
+static INLINE struct brw_reg vec8( struct brw_reg reg )
+{
+   const GLuint n = 8;
+   return stride(reg, n, n, 1);
+}
+
+static INLINE struct brw_reg vec4( struct brw_reg reg )
+{
+   const GLuint n = 4;
+   return stride(reg, n, n, 1);
+}
+
+static INLINE struct brw_reg vec2( struct brw_reg reg )
+{
+   const GLuint n = 2;
+   return stride(reg, n, n, 1);
+}
+
+static INLINE struct brw_reg vec1( struct brw_reg reg )
+{
+   return stride(reg, 0, 1, 0);
+}
+
+
+/** Scalar view of element \p elt of \p reg, in reg's own type. */
+static INLINE struct brw_reg get_element( struct brw_reg reg, GLuint elt )
+{
+   struct brw_reg moved = suboffset(reg, elt);
+
+   return vec1(moved);
+}
+
+/** Scalar view of element \p elt of \p reg, reinterpreted as UD. */
+static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt )
+{
+   struct brw_reg as_ud = retype(reg, BRW_REGISTER_TYPE_UD);
+   struct brw_reg moved = suboffset(as_ud, elt);
+
+   return vec1(moved);
+}
+
+
+/** Compose a new swizzle on top of reg's existing one: each of x/y/z/w
+ *  selects one of the four channel selectors already stored in reg.
+ */
+static INLINE struct brw_reg brw_swizzle( struct brw_reg reg,
+                                          GLuint x,
+                                          GLuint y, 
+                                          GLuint z,
+                                          GLuint w)
+{
+   const GLuint old = reg.dw1.bits.swizzle;
+   const GLuint sx = BRW_GET_SWZ(old, x);
+   const GLuint sy = BRW_GET_SWZ(old, y);
+   const GLuint sz = BRW_GET_SWZ(old, z);
+   const GLuint sw = BRW_GET_SWZ(old, w);
+
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
+   return reg;
+}
+
+
+/** Replicate the single selector \p x into all four channels. */
+static INLINE struct brw_reg brw_swizzle1( struct brw_reg reg,
+                                           GLuint x )
+{
+   struct brw_reg result = brw_swizzle(reg, x, x, x, x);
+
+   return result;
+}
+
+/** Narrow reg's writemask: only channels also present in \p mask remain. */
+static INLINE struct brw_reg brw_writemask( struct brw_reg reg,
+                                            GLuint mask )
+{
+   const GLuint kept = reg.dw1.bits.writemask & mask;
+
+   reg.dw1.bits.writemask = kept;
+   return reg;
+}
+
+/** Replace reg's writemask outright. */
+static INLINE struct brw_reg brw_set_writemask( struct brw_reg reg,
+                                                GLuint mask )
+{
+   struct brw_reg result = reg;
+
+   result.dw1.bits.writemask = mask;
+   return result;
+}
+
+/** Toggle the negate modifier. */
+static INLINE struct brw_reg negate( struct brw_reg reg )
+{
+   reg.negate = !reg.negate;
+   return reg;
+}
+
+/** Turn on the absolute-value modifier. */
+static INLINE struct brw_reg brw_abs( struct brw_reg reg )
+{
+   struct brw_reg result = reg;
+
+   result.abs = 1;
+   return result;
+}
+
+/***********************************************************************
+ */
+/** 4-channel float register addressed through an address sub-register
+ *  (\p subnr) plus a signed \p offset.
+ */
+static INLINE struct brw_reg brw_vec4_indirect( GLuint subnr,
+                                                GLint offset )
+{
+   struct brw_reg result = brw_vec4_grf(0, 0);
+
+   result.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   result.subnr = subnr;
+   result.dw1.bits.indirect_offset = offset;
+   return result;
+}
+
+/** Scalar float register addressed through an address sub-register
+ *  (\p subnr) plus a signed \p offset.
+ */
+static INLINE struct brw_reg brw_vec1_indirect( GLuint subnr,
+                                                GLint offset )
+{
+   struct brw_reg result = brw_vec1_grf(0, 0);
+
+   result.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   result.subnr = subnr;
+   result.dw1.bits.indirect_offset = offset;
+   return result;
+}
+
+/* deref_<N><type>(): dereference an indirect pointer at ptr.addr_offset
+ * + offset, giving an N-channel register of the named type.
+ */
+static INLINE struct brw_reg deref_4f(struct brw_indirect ptr, GLint offset)
+{
+   const GLint total = ptr.addr_offset + offset;
+   return brw_vec4_indirect(ptr.addr_subnr, total);
+}
+
+static INLINE struct brw_reg deref_1f(struct brw_indirect ptr, GLint offset)
+{
+   const GLint total = ptr.addr_offset + offset;
+   return brw_vec1_indirect(ptr.addr_subnr, total);
+}
+
+static INLINE struct brw_reg deref_4b(struct brw_indirect ptr, GLint offset)
+{
+   struct brw_reg floats = deref_4f(ptr, offset);
+   return retype(floats, BRW_REGISTER_TYPE_B);
+}
+
+static INLINE struct brw_reg deref_1uw(struct brw_indirect ptr, GLint offset)
+{
+   struct brw_reg scalar = deref_1f(ptr, offset);
+   return retype(scalar, BRW_REGISTER_TYPE_UW);
+}
+
+static INLINE struct brw_reg deref_1d(struct brw_indirect ptr, GLint offset)
+{
+   struct brw_reg scalar = deref_1f(ptr, offset);
+   return retype(scalar, BRW_REGISTER_TYPE_D);
+}
+
+static INLINE struct brw_reg deref_1ud(struct brw_indirect ptr, GLint offset)
+{
+   struct brw_reg scalar = deref_1f(ptr, offset);
+   return retype(scalar, BRW_REGISTER_TYPE_UD);
+}
+
+/** The address sub-register that \p ptr is based on. */
+static INLINE struct brw_reg get_addr_reg(struct brw_indirect ptr)
+{
+   const GLuint subnr = ptr.addr_subnr;
+   return brw_address_reg(subnr);
+}
+
+/** Return \p ptr moved by \p offset. */
+static INLINE struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, GLint offset )
+{
+   ptr.addr_offset = ptr.addr_offset + offset;
+   return ptr;
+}
+
+/** Build an indirect pointer from an address sub-register number and a
+ *  starting offset.
+ */
+static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset )
+{
+   struct brw_indirect result;
+
+   result.pad = 0;
+   result.addr_offset = offset;
+   result.addr_subnr = addr_subnr;
+   return result;
+}
+
+/** True when both operands name the same register: same file and same
+ *  register number.  Sub-register, type and region are not compared.
+ */
+static INLINE GLboolean
+brw_same_reg(struct brw_reg r1, struct brw_reg r2)
+{
+   return (r1.file == r2.file) && (r1.nr == r2.nr);
+}
+
+/** Pointer to the store slot at index nr_insn. */
+static INLINE struct brw_instruction *current_insn( struct brw_compile *p)
+{
+   return p->store + p->nr_insn;
+}
+
+void brw_pop_insn_state( struct brw_compile *p );
+void brw_push_insn_state( struct brw_compile *p );
+void brw_set_mask_control( struct brw_compile *p, GLuint value );
+void brw_set_saturate( struct brw_compile *p, GLuint value );
+void brw_set_access_mode( struct brw_compile *p, GLuint access_mode );
+void brw_set_compression_control( struct brw_compile *p, GLboolean control );
+void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value );
+void brw_set_predicate_control( struct brw_compile *p, GLuint pc );
+void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional );
+
+void brw_init_compile( struct brw_context *, struct brw_compile *p );
+
+enum pipe_error brw_get_program( struct brw_compile *p, 
+                                 const GLuint **program,
+                                 GLuint *sz );
+
+
+/* Helpers for regular instructions:
+ */
+#define ALU1(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0);
+
+#define ALU2(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0,			\
+	      struct brw_reg src1);
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(JMPI)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDZ)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+#undef ALU1
+#undef ALU2
+
+
+
+/* Helpers for SEND instruction:
+ */
+void brw_urb_WRITE(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle);
+
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle);
+
+void brw_fb_WRITE(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLuint binding_table_index,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot);
+
+void brw_SAMPLE(struct brw_compile *p,
+		struct brw_reg dest,
+		GLuint msg_reg_nr,
+		struct brw_reg src0,
+		GLuint binding_table_index,
+		GLuint sampler,
+		GLuint writemask,
+		GLuint msg_type,
+		GLuint response_length,
+		GLuint msg_length,
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode);
+
+void brw_math_16( struct brw_compile *p,
+		  struct brw_reg dest,
+		  GLuint function,
+		  GLuint saturate,
+		  GLuint msg_reg_nr,
+		  struct brw_reg src,
+		  GLuint precision );
+
+void brw_math( struct brw_compile *p,
+	       struct brw_reg dest,
+	       GLuint function,
+	       GLuint saturate,
+	       GLuint msg_reg_nr,
+	       struct brw_reg src,
+	       GLuint data_type,
+	       GLuint precision );
+
+void brw_dp_READ_16( struct brw_compile *p,
+		     struct brw_reg dest,
+		     GLuint scratch_offset );
+
+void brw_dp_READ_4( struct brw_compile *p,
+                    struct brw_reg dest,
+                    GLboolean relAddr,
+                    GLuint location,
+                    GLuint bind_table_index );
+
+void brw_dp_READ_4_vs( struct brw_compile *p,
+                       struct brw_reg dest,
+                       GLuint oword,
+                       GLboolean relAddr,
+                       struct brw_reg addrReg,
+                       GLuint location,
+                       GLuint bind_table_index );
+
+void brw_dp_WRITE_16( struct brw_compile *p,
+		      struct brw_reg src,
+		      GLuint scratch_offset );
+
+/* If/else/endif.  Works by manipulating the execution flags on each
+ * channel.
+ */
+struct brw_instruction *brw_IF(struct brw_compile *p, 
+			       GLuint execute_size);
+
+struct brw_instruction *brw_ELSE(struct brw_compile *p, 
+				 struct brw_instruction *if_insn);
+
+void brw_ENDIF(struct brw_compile *p, 
+	       struct brw_instruction *if_or_else_insn);
+
+
+/* DO/WHILE loops:
+ */
+struct brw_instruction *brw_DO(struct brw_compile *p,
+			       GLuint execute_size);
+
+struct brw_instruction *brw_WHILE(struct brw_compile *p, 
+	       struct brw_instruction *patch_insn);
+
+struct brw_instruction *brw_BREAK(struct brw_compile *p);
+struct brw_instruction *brw_CONT(struct brw_compile *p);
+/* Forward jumps:
+ */
+void brw_land_fwd_jump(struct brw_compile *p, 
+		       struct brw_instruction *jmp_insn);
+
+
+
+void brw_NOP(struct brw_compile *p);
+
+/* Special case: there is never a destination, execution size will be
+ * taken from src0:
+ */
+void brw_CMP(struct brw_compile *p,
+	     struct brw_reg dest,
+	     GLuint conditional,
+	     struct brw_reg src0,
+	     struct brw_reg src1);
+
+void brw_print_reg( struct brw_reg reg );
+
+
+/*********************************************************************** 
+ * brw_eu_util.c:
+ */
+
+void brw_copy_indirect_to_indirect(struct brw_compile *p,
+				   struct brw_indirect dst_ptr,
+				   struct brw_indirect src_ptr,
+				   GLuint count);
+
+void brw_copy_from_indirect(struct brw_compile *p,
+			    struct brw_reg dst,
+			    struct brw_indirect ptr,
+			    GLuint count);
+
+void brw_copy4(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       GLuint count);
+
+void brw_copy8(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       GLuint count);
+
+void brw_math_invert( struct brw_compile *p, 
+		      struct brw_reg dst,
+		      struct brw_reg src);
+
+void brw_set_src1( struct brw_instruction *insn,
+                          struct brw_reg reg );
+#endif
diff --git a/src/gallium/drivers/i965/brw_eu_debug.c b/src/gallium/drivers/i965/brw_eu_debug.c
new file mode 100644
index 0000000000..5989f5a04e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu_debug.c
@@ -0,0 +1,94 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+#include "util/u_debug.h"
+
+#include "brw_eu.h"
+
+void brw_print_reg( struct brw_reg hwreg )
+{
+   static const char *file[] = {
+      "arf",
+      "grf",
+      "msg",
+      "imm"
+   };
+
+   static const char *type[] = {
+      "ud",
+      "d",
+      "uw",
+      "w",
+      "ub",
+      "vf",
+      "hf",
+      "f"
+   };
+
+   debug_printf("%s%s", 
+		hwreg.abs ? "abs/" : "",
+		hwreg.negate ? "-" : "");
+     
+   if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
+       hwreg.nr % 2 == 0 &&
+       hwreg.subnr == 0 &&
+       hwreg.vstride == BRW_VERTICAL_STRIDE_8 &&
+       hwreg.width == BRW_WIDTH_8 &&
+       hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
+       hwreg.type == BRW_REGISTER_TYPE_F) {
+      /* vector register */
+      debug_printf("vec%d", hwreg.nr);
+   }
+   else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
+	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
+	    hwreg.width == BRW_WIDTH_1 &&
+	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
+	    hwreg.type == BRW_REGISTER_TYPE_F) {      
+      /* "scalar" register */
+      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+   }
+   else if (hwreg.file == BRW_IMMEDIATE_VALUE) {
+      debug_printf("imm %f", hwreg.dw1.f);
+   }
+   else {
+      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
+		   file[hwreg.file],
+		   hwreg.nr,
+		   hwreg.subnr / type_sz(hwreg.type),
+		   hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0,
+		   1<<hwreg.width,
+		   hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0,		
+		   type[hwreg.type]);
+   }
+}
+
+
+
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
new file mode 100644
index 0000000000..00d8eaccbc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -0,0 +1,1433 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+     
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_debug.h"
+#include "brw_disasm.h"
+
+
+
+
+/***********************************************************************
+ * Internal helper for constructing instructions
+ */
+
+static void guess_execution_size( struct brw_instruction *insn,
+				  struct brw_reg reg )
+{
+   if (reg.width == BRW_WIDTH_8 && 
+       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) 
+      insn->header.execution_size = BRW_EXECUTE_16;
+   else
+      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
+}
+
+/* Encode 'dest' as the destination operand of 'insn', then set the execution size from it. */
+static void brw_set_dest( struct brw_instruction *insn,
+			  struct brw_reg dest )
+{
+   /* Bound the register number for every file except ARF (test dest.file, not dest.type): */
+   assert(dest.file == BRW_ARCHITECTURE_REGISTER_FILE || dest.nr < 128);
+
+   insn->bits1.da1.dest_reg_file = dest.file;
+   insn->bits1.da1.dest_reg_type = dest.type;
+   insn->bits1.da1.dest_address_mode = dest.address_mode;
+
+   if (dest.address_mode == BRW_ADDRESS_DIRECT) {   
+      insn->bits1.da1.dest_reg_nr = dest.nr;
+
+      if (insn->header.access_mode == BRW_ALIGN_1) {
+	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
+	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
+      }
+      else {
+	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
+	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
+      }
+   }
+   else {
+      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
+
+      /* These are different sizes in align1 vs align16:
+       */
+      if (insn->header.access_mode == BRW_ALIGN_1) {
+	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
+      }
+      else {
+	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+      }
+   }
+
+   /* NEW: Set the execution size based on dest.width and
+    * insn->compression_control:
+    */
+   guess_execution_size(insn, dest);
+}
+
+/* Encode 'reg' as source operand 0 of 'insn'; an immediate also fills bits3 and the src1 file/type. */
+static void brw_set_src0( struct brw_instruction *insn,
+                          struct brw_reg reg )
+{
+   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+   /* Bound the register number for every file except ARF (test reg.file, not reg.type): */
+   assert(reg.file == BRW_ARCHITECTURE_REGISTER_FILE || reg.nr < 128);
+
+   insn->bits1.da1.src0_reg_file = reg.file;
+   insn->bits1.da1.src0_reg_type = reg.type;
+   insn->bits2.da1.src0_abs = reg.abs;
+   insn->bits2.da1.src0_negate = reg.negate;
+   insn->bits2.da1.src0_address_mode = reg.address_mode;
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      insn->bits3.ud = reg.dw1.ud;
+   
+      /* Required to set some fields in src1 as well:
+       */
+      insn->bits1.da1.src1_reg_file = 0; /* arf */
+      insn->bits1.da1.src1_reg_type = reg.type;
+   }
+   else 
+   {
+      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
+	 if (insn->header.access_mode == BRW_ALIGN_1) {
+	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
+	    insn->bits2.da1.src0_reg_nr = reg.nr;
+	 }
+	 else {
+	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+	    insn->bits2.da16.src0_reg_nr = reg.nr;
+	 }
+      }
+      else {
+	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
+	 /* NOTE(review): align16 writes the offset to src0_subreg_nr, not an indirect_offset field -- confirm */
+	 if (insn->header.access_mode == BRW_ALIGN_1) {
+	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; 
+	 }
+	 else {
+	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
+	 }
+      }
+
+      if (insn->header.access_mode == BRW_ALIGN_1) {
+	 if (reg.width == BRW_WIDTH_1 && 
+	     insn->header.execution_size == BRW_EXECUTE_1) {
+	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
+	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
+	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
+	 }
+	 else {
+	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
+	    insn->bits2.da1.src0_width = reg.width;
+	    insn->bits2.da1.src0_vert_stride = reg.vstride;
+	 }
+      }
+      else {
+	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
+	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
+	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
+	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
+
+	 /* This is an oddity of the fact we're using the same
+	  * descriptions for registers in align_16 as align_1:
+	  */
+	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
+	 else
+	    insn->bits2.da16.src0_vert_stride = reg.vstride;
+      }
+   }
+}
+
+/* Encode 'reg' as source operand 1 of 'insn'; src0 must not be an immediate (asserted below). */
+void brw_set_src1( struct brw_instruction *insn,
+                   struct brw_reg reg )
+{
+   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+   /* Same bound as the dest/src0 setters: ARF register numbers are exempt from the limit. */
+   assert(reg.file == BRW_ARCHITECTURE_REGISTER_FILE || reg.nr < 128);
+
+   insn->bits1.da1.src1_reg_file = reg.file;
+   insn->bits1.da1.src1_reg_type = reg.type;
+   insn->bits3.da1.src1_abs = reg.abs;
+   insn->bits3.da1.src1_negate = reg.negate;
+
+   /* Only src1 can be immediate in two-argument instructions.
+    */
+   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      insn->bits3.ud = reg.dw1.ud;
+   }
+   else {
+      /* This is a hardware restriction, which may or may not be lifted
+       * in the future:
+       */
+      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
+      /*assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
+
+      if (insn->header.access_mode == BRW_ALIGN_1) {
+	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
+	 insn->bits3.da1.src1_reg_nr = reg.nr;
+      }
+      else {
+	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+	 insn->bits3.da16.src1_reg_nr = reg.nr;
+      }
+
+      if (insn->header.access_mode == BRW_ALIGN_1) {
+	 if (reg.width == BRW_WIDTH_1 && 
+	     insn->header.execution_size == BRW_EXECUTE_1) {
+	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
+	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
+	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
+	 }
+	 else {
+	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
+	    insn->bits3.da1.src1_width = reg.width;
+	    insn->bits3.da1.src1_vert_stride = reg.vstride;
+	 }
+      }
+      else {
+	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
+	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
+	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
+	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
+
+	 /* This is an oddity of the fact we're using the same
+	  * descriptions for registers in align_16 as align_1:
+	  */
+	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
+	 else
+	    insn->bits3.da16.src1_vert_stride = reg.vstride;
+      }
+   }
+}
+
+
+
+static void brw_set_math_message( struct brw_context *brw,
+				  struct brw_instruction *insn,
+				  GLuint msg_length,
+				  GLuint response_length,
+				  GLuint function,
+				  GLuint integer_type,
+				  GLboolean low_precision,
+				  GLboolean saturate,
+				  GLuint dataType )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.math_igdng.function = function;
+       insn->bits3.math_igdng.int_type = integer_type;
+       insn->bits3.math_igdng.precision = low_precision;
+       insn->bits3.math_igdng.saturate = saturate;
+       insn->bits3.math_igdng.data_type = dataType;
+       insn->bits3.math_igdng.snapshot = 0;
+       insn->bits3.math_igdng.header_present = 0;
+       insn->bits3.math_igdng.response_length = response_length;
+       insn->bits3.math_igdng.msg_length = msg_length;
+       insn->bits3.math_igdng.end_of_thread = 0;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
+       insn->bits2.send_igdng.end_of_thread = 0;
+   } else {
+       insn->bits3.math.function = function;
+       insn->bits3.math.int_type = integer_type;
+       insn->bits3.math.precision = low_precision;
+       insn->bits3.math.saturate = saturate;
+       insn->bits3.math.data_type = dataType;
+       insn->bits3.math.response_length = response_length;
+       insn->bits3.math.msg_length = msg_length;
+       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+       insn->bits3.math.end_of_thread = 0;
+   }
+}
+
+
+static void brw_set_ff_sync_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
+				 GLboolean allocate,
+				 GLboolean used,
+				 GLuint msg_length,
+				 GLuint response_length,
+				 GLboolean end_of_thread,
+				 GLboolean complete,
+				 GLuint offset,
+				 GLuint swizzle_control )
+{
+	brw_set_src1(insn, brw_imm_d(0));
+
+	insn->bits3.urb_igdng.opcode = 1;
+	insn->bits3.urb_igdng.offset = offset;
+	insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+	insn->bits3.urb_igdng.allocate = allocate;
+	insn->bits3.urb_igdng.used = used;
+	insn->bits3.urb_igdng.complete = complete;
+	insn->bits3.urb_igdng.header_present = 1;
+	insn->bits3.urb_igdng.response_length = response_length;
+	insn->bits3.urb_igdng.msg_length = msg_length;
+	insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+	insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+	insn->bits2.send_igdng.end_of_thread = end_of_thread;
+}
+
+static void brw_set_urb_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
+				 GLboolean allocate,
+				 GLboolean used,
+				 GLuint msg_length,
+				 GLuint response_length,
+				 GLboolean end_of_thread,
+				 GLboolean complete,
+				 GLuint offset,
+				 GLuint swizzle_control )
+{
+    brw_set_src1(insn, brw_imm_d(0));
+
+    if (BRW_IS_IGDNG(brw)) {
+        insn->bits3.urb_igdng.opcode = 0;	/* ? */
+        insn->bits3.urb_igdng.offset = offset;
+        insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+        insn->bits3.urb_igdng.allocate = allocate;
+        insn->bits3.urb_igdng.used = used;	/* ? */
+        insn->bits3.urb_igdng.complete = complete;
+        insn->bits3.urb_igdng.header_present = 1;
+        insn->bits3.urb_igdng.response_length = response_length;
+        insn->bits3.urb_igdng.msg_length = msg_length;
+        insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+        insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+        insn->bits2.send_igdng.end_of_thread = end_of_thread;
+    } else {
+        insn->bits3.urb.opcode = 0;	/* ? */
+        insn->bits3.urb.offset = offset;
+        insn->bits3.urb.swizzle_control = swizzle_control;
+        insn->bits3.urb.allocate = allocate;
+        insn->bits3.urb.used = used;	/* ? */
+        insn->bits3.urb.complete = complete;
+        insn->bits3.urb.response_length = response_length;
+        insn->bits3.urb.msg_length = msg_length;
+        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+        insn->bits3.urb.end_of_thread = end_of_thread;
+    }
+}
+
+static void brw_set_dp_write_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
+				      GLuint binding_table_index,
+				      GLuint msg_control,
+				      GLuint msg_type,
+				      GLuint msg_length,
+				      GLuint pixel_scoreboard_clear,
+				      GLuint response_length,
+				      GLuint end_of_thread )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_write_igdng.msg_control = msg_control;
+       insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write_igdng.msg_type = msg_type;
+       insn->bits3.dp_write_igdng.send_commit_msg = 0;
+       insn->bits3.dp_write_igdng.header_present = 1;
+       insn->bits3.dp_write_igdng.response_length = response_length;
+       insn->bits3.dp_write_igdng.msg_length = msg_length;
+       insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_write.binding_table_index = binding_table_index;
+       insn->bits3.dp_write.msg_control = msg_control;
+       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write.msg_type = msg_type;
+       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.response_length = response_length;
+       insn->bits3.dp_write.msg_length = msg_length;
+       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits3.dp_write.end_of_thread = end_of_thread;
+   }
+}
+
+static void brw_set_dp_read_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
+				      GLuint binding_table_index,
+				      GLuint msg_control,
+				      GLuint msg_type,
+				      GLuint target_cache,
+				      GLuint msg_length,
+				      GLuint response_length,
+				      GLuint end_of_thread )
+{
+   brw_set_src1(insn, brw_imm_d(0));
+
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_read_igdng.msg_control = msg_control;
+       insn->bits3.dp_read_igdng.msg_type = msg_type;
+       insn->bits3.dp_read_igdng.target_cache = target_cache;
+       insn->bits3.dp_read_igdng.header_present = 1;
+       insn->bits3.dp_read_igdng.response_length = response_length;
+       insn->bits3.dp_read_igdng.msg_length = msg_length;
+       insn->bits3.dp_read_igdng.pad1 = 0;
+       insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
+       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   }
+}
+
+static void brw_set_sampler_message(struct brw_context *brw,
+                                    struct brw_instruction *insn,
+                                    GLuint binding_table_index,
+                                    GLuint sampler,
+                                    GLuint msg_type,
+                                    GLuint response_length,
+                                    GLuint msg_length,
+                                    GLboolean eot,
+                                    GLuint header_present,
+                                    GLuint simd_mode)
+{
+   assert(eot == 0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   if (BRW_IS_IGDNG(brw)) {
+      insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
+      insn->bits3.sampler_igdng.sampler = sampler;
+      insn->bits3.sampler_igdng.msg_type = msg_type;
+      insn->bits3.sampler_igdng.simd_mode = simd_mode;
+      insn->bits3.sampler_igdng.header_present = header_present;
+      insn->bits3.sampler_igdng.response_length = response_length;
+      insn->bits3.sampler_igdng.msg_length = msg_length;
+      insn->bits3.sampler_igdng.end_of_thread = eot;
+      insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+      insn->bits2.send_igdng.end_of_thread = eot;
+   } else if (BRW_IS_G4X(brw)) {
+      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
+      insn->bits3.sampler_g4x.sampler = sampler;
+      insn->bits3.sampler_g4x.msg_type = msg_type;
+      insn->bits3.sampler_g4x.response_length = response_length;
+      insn->bits3.sampler_g4x.msg_length = msg_length;
+      insn->bits3.sampler_g4x.end_of_thread = eot;
+      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+   } else {
+      insn->bits3.sampler.binding_table_index = binding_table_index;
+      insn->bits3.sampler.sampler = sampler;
+      insn->bits3.sampler.msg_type = msg_type;
+      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+      insn->bits3.sampler.response_length = response_length;
+      insn->bits3.sampler.msg_length = msg_length;
+      insn->bits3.sampler.end_of_thread = eot;
+      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+   }
+}
+
+
+
+static struct brw_instruction *next_insn( struct brw_compile *p, 
+					  GLuint opcode )
+{
+   struct brw_instruction *insn;
+
+   if (0 && (BRW_DEBUG & DEBUG_DISASSEM))
+   {
+      if (p->nr_insn) 
+         brw_disasm_insn(stderr, &p->store[p->nr_insn-1]);
+   }
+
+   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
+
+   insn = &p->store[p->nr_insn++];
+   memcpy(insn, p->current, sizeof(*insn));
+
+   /* Reset this one-shot flag: 
+    */
+
+   if (p->current->header.destreg__conditionalmod) {
+      p->current->header.destreg__conditionalmod = 0;
+      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+   }
+
+   insn->header.opcode = opcode;
+   return insn;
+}
+
+
+static struct brw_instruction *brw_alu1( struct brw_compile *p,
+					 GLuint opcode,
+					 struct brw_reg dest,
+					 struct brw_reg src )
+{
+   struct brw_instruction *insn = next_insn(p, opcode);
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src);   
+   return insn;
+}
+
+static struct brw_instruction *brw_alu2(struct brw_compile *p,
+					GLuint opcode,
+					struct brw_reg dest,
+					struct brw_reg src0,
+					struct brw_reg src1 )
+{
+   struct brw_instruction *insn = next_insn(p, opcode);   
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, src1);
+   return insn;
+}
+
+
+/***********************************************************************
+ * Convenience routines.
+ */
+#define ALU1(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0)   			\
+{							\
+   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
+}
+
+#define ALU2(OP)					\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0,			\
+	      struct brw_reg src1)   			\
+{							\
+   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
+}
+
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDZ)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+
+
+
+void brw_NOP(struct brw_compile *p)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);   
+   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src1(insn, brw_imm_ud(0x0));
+}
+
+
+
+
+
+/***********************************************************************
+ * Comparisons, if/else/endif
+ */
+
+struct brw_instruction *brw_JMPI(struct brw_compile *p, 
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1)
+{
+   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
+   /* NOTE(review): raw 1 instead of a BRW_EXECUTE_* name -- confirm the intended size. */
+   insn->header.execution_size = 1;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+   /* Later instructions copy p->current, so they start out unpredicated: */
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+   return insn;
+}
+
+/* EU takes the value from the flag register and pushes it onto some
+ * sort of a stack (presumably merging with any flag value already on
+ * the stack).  Within an if block, the flags at the top of the stack
+ * control execution on each channel of the unit, eg. on each of the
+ * 16 pixel values in our wm programs.
+ *
+ * When the matching 'else' instruction is reached (presumably by
+ * countdown of the instruction count patched in by our ELSE/ENDIF
+ * functions), the relevant flags are inverted.
+ *
+ * When the matching 'endif' instruction is reached, the flags are
+ * popped off.  If the stack is now empty, normal execution resumes.
+ *
+ * No attempt is made to deal with stack overflow (14 elements?).
+ */
+struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
+{
+   struct brw_instruction *insn;
+
+   if (p->single_program_flow) {
+      assert(execute_size == BRW_EXECUTE_1);
+
+      insn = next_insn(p, BRW_OPCODE_ADD);
+      insn->header.predicate_inverse = 1;
+   } else {
+      insn = next_insn(p, BRW_OPCODE_IF);
+   }
+
+   /* Override the defaults for this instruction:
+    */
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.execution_size = execute_size;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+   if (!p->single_program_flow)
+       insn->header.thread_control = BRW_THREAD_SWITCH;
+
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+   return insn;
+}
+
+
+struct brw_instruction *brw_ELSE(struct brw_compile *p, 
+				 struct brw_instruction *if_insn)
+{
+   struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
+
+   if (p->single_program_flow) {
+      insn = next_insn(p, BRW_OPCODE_ADD);
+   } else {
+      insn = next_insn(p, BRW_OPCODE_ELSE);
+   }
+
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = if_insn->header.execution_size;
+   insn->header.mask_control = BRW_MASK_ENABLE;
+   if (!p->single_program_flow)
+       insn->header.thread_control = BRW_THREAD_SWITCH;
+
+   /* Patch the if instruction to point at this instruction.
+    */
+   if (p->single_program_flow) {
+      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
+
+      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
+   } else {
+      assert(if_insn->header.opcode == BRW_OPCODE_IF);
+
+      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
+      if_insn->bits3.if_else.pop_count = 0;
+      if_insn->bits3.if_else.pad0 = 0;
+   }
+
+   return insn;
+}
+
+void brw_ENDIF(struct brw_compile *p, 
+	       struct brw_instruction *patch_insn)
+{
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2; 
+ 
+   if (p->single_program_flow) {
+      /* In single program flow mode, there's no need to execute an ENDIF,
+       * since we don't need to do any stack operations, and if we're executing
+       * currently, we want to just continue executing.
+       */
+      struct brw_instruction *next = &p->store[p->nr_insn];
+
+      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
+
+      patch_insn->bits3.ud = (next - patch_insn) * 16;
+   } else {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
+
+      brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_src1(insn, brw_imm_d(0x0));
+
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = patch_insn->header.execution_size;
+      insn->header.mask_control = BRW_MASK_ENABLE;
+      insn->header.thread_control = BRW_THREAD_SWITCH;
+
+      assert(patch_insn->bits3.if_else.jump_count == 0);
+
+      /* Patch the if or else instructions to point at this or the next
+       * instruction respectively.
+       */
+      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
+	 /* Automagically turn it into an IFF:
+	  */
+	 patch_insn->header.opcode = BRW_OPCODE_IFF;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
+	 patch_insn->bits3.if_else.pop_count = 0;
+	 patch_insn->bits3.if_else.pad0 = 0;
+      } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
+	 patch_insn->bits3.if_else.pop_count = 1;
+	 patch_insn->bits3.if_else.pad0 = 0;
+      } else {
+	 assert(0);
+      }
+
+      /* Also pop item off the stack in the endif instruction:
+       */
+      insn->bits3.if_else.jump_count = 0;
+      insn->bits3.if_else.pop_count = 1;
+      insn->bits3.if_else.pad0 = 0;
+   }
+}
+
+struct brw_instruction *brw_BREAK(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+   insn = next_insn(p, BRW_OPCODE_BREAK);
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+   /* insn->header.mask_control = BRW_MASK_DISABLE; */
+   insn->bits3.if_else.pad0 = 0;
+   return insn;
+}
+
+struct brw_instruction *brw_CONT(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+   insn = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+   /* insn->header.mask_control = BRW_MASK_DISABLE; */
+   insn->bits3.if_else.pad0 = 0;
+   return insn;
+}
+
+/* DO/WHILE loop:
+ */
+struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
+{
+   if (p->single_program_flow) {
+      return &p->store[p->nr_insn];
+   } else {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
+
+      /* Override the defaults for this instruction:
+       */
+      brw_set_dest(insn, brw_null_reg());
+      brw_set_src0(insn, brw_null_reg());
+      brw_set_src1(insn, brw_null_reg());
+
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = execute_size;
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      /* insn->header.mask_control = BRW_MASK_ENABLE; */
+      /* insn->header.mask_control = BRW_MASK_DISABLE; */
+
+      return insn;
+   }
+}
+
+
+
+struct brw_instruction *brw_WHILE(struct brw_compile *p, 
+                                  struct brw_instruction *do_insn)
+{   /* Closes a loop: emits the instruction that jumps back to do_insn (from brw_DO) and returns it. */
+   struct brw_instruction *insn;
+   GLuint br = 1;   /* multiplier applied to jump_count; 2 on IGDNG */
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
+
+   if (p->single_program_flow)
+      insn = next_insn(p, BRW_OPCODE_ADD);   /* loop by adding an offset to IP (dest and src0 are ip below) */
+   else
+      insn = next_insn(p, BRW_OPCODE_WHILE);
+
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+
+   if (p->single_program_flow) {
+      insn->header.execution_size = BRW_EXECUTE_1;
+
+      insn->bits3.d = (do_insn - insn) * 16;   /* instruction distance * 16 (presumably bytes per instruction -- confirm); 'br' is not applied here */
+   } else {
+      insn->header.execution_size = do_insn->header.execution_size;
+
+      assert(do_insn->header.opcode == BRW_OPCODE_DO);
+      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
+      insn->bits3.if_else.pop_count = 0;
+      insn->bits3.if_else.pad0 = 0;
+   }
+
+/*    insn->header.mask_control = BRW_MASK_ENABLE; */
+
+   /* insn->header.mask_control = BRW_MASK_DISABLE; */
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;   /* clears predicate_control in p->current, not in insn */
+   return insn;
+}
+
+
+/* FORWARD JUMPS: patch the immediate of an already-emitted JMPI (jmp_insn)
+ * so that it targets p->store[p->nr_insn], the current end of the store. */
+void brw_land_fwd_jump(struct brw_compile *p, 
+		       struct brw_instruction *jmp_insn)
+{
+   struct brw_instruction *landing = &p->store[p->nr_insn];
+   GLuint jmpi = 1;   /* multiplier applied to the jump distance; 2 on IGDNG */
+
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
+   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
+
+   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);   /* -1: presumably the distance is relative to the instruction after the JMPI -- confirm */
+}
+
+
+
+/* Emit a CMP of src0 against src1 using the given conditional modifier.
+ * To integrate with the above, it makes sense that the comparison
+ * instruction should populate the flag register.  It might be simpler
+ * just to use the flag reg for most WM tasks? */
+void brw_CMP(struct brw_compile *p,
+	     struct brw_reg dest,
+	     GLuint conditional,
+	     struct brw_reg src0,
+	     struct brw_reg src1)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
+
+   insn->header.destreg__conditionalmod = conditional;
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, src1);
+
+/*    guess_execution_size(insn, src0); */
+
+
+   /* Make it so that future instructions will use the computed flag
+    * value until brw_set_predicate_control_flag_value() is called
+    * again.  
+    */
+   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+       dest.nr == 0) {   /* ARF register 0: presumably the null register, i.e. only the flag result is wanted -- confirm */
+      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+      p->flag_value = 0xff;
+   }
+}
+
+
+
+/***********************************************************************
+ * Helpers for the various SEND message types:
+ */
+
+/** Extended math function, float[8].
+ * Emits one SEND; data_type and precision are forwarded to brw_set_math_message(). */
+void brw_math( struct brw_compile *p,
+	       struct brw_reg dest,
+	       GLuint function,
+	       GLuint saturate,
+	       GLuint msg_reg_nr,
+	       struct brw_reg src,
+	       GLuint data_type,
+	       GLuint precision )
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;   /* POW uses a 2-register message */
+   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;   /* SINCOS returns 2 registers */
+
+   /* Example code doesn't set predicate_control for send
+    * instructions.
+    */
+   insn->header.predicate_control = 0; 
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src);
+   brw_set_math_message(p->brw,
+			insn, 
+			msg_length, response_length, 
+			function,
+			BRW_MATH_INTEGER_UNSIGNED,	/* always passed; not selectable by the caller */
+			precision,
+			saturate,
+			data_type);
+}
+
+/**
+ * Extended math function, float[16].
+ * Use 2 send instructions: the second targets dest+1 and msg_reg_nr+1.
+ */
+void brw_math_16( struct brw_compile *p,
+		  struct brw_reg dest,
+		  GLuint function,
+		  GLuint saturate,
+		  GLuint msg_reg_nr,
+		  struct brw_reg src,
+		  GLuint precision )
+{
+   struct brw_instruction *insn;
+   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;   /* POW uses a 2-register message */
+   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;   /* SINCOS returns 2 registers */
+
+   /* First instruction:
+    */
+   brw_push_insn_state(p);
+   brw_set_predicate_control_flag_value(p, 0xff);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src);
+   brw_set_math_message(p->brw,
+			insn, 
+			msg_length, response_length, 
+			function,
+			BRW_MATH_INTEGER_UNSIGNED,
+			precision,
+			saturate,
+			BRW_MATH_DATA_VECTOR);
+
+   /* Second instruction:
+    */
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
+   insn->header.destreg__conditionalmod = msg_reg_nr+1;
+
+   brw_set_dest(insn, offset(dest,1));
+   brw_set_src0(insn, src);	/* same src as the first SEND; only dest and message register advance */
+   brw_set_math_message(p->brw, 
+			insn, 
+			msg_length, response_length, 
+			function,
+			BRW_MATH_INTEGER_UNSIGNED,
+			precision,
+			saturate,
+			BRW_MATH_DATA_VECTOR);
+
+   brw_pop_insn_state(p);
+}
+
+
+/**
+ * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.  Note: overwrites GRF 0 element 2 with scratch_offset.
+ */
+void brw_dp_WRITE_16( struct brw_compile *p,
+		      struct brw_reg src,
+		      GLuint scratch_offset )
+{
+   GLuint msg_reg_nr = 1;   /* stored in destreg__conditionalmod of the SEND below */
+   {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+      /* set message header global offset field (reg 0, element 2) */
+      brw_MOV(p,
+	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
+	      brw_imm_d(scratch_offset));
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      GLuint msg_length = 3;   /* presumably 1 header + 2 data registers -- confirm */
+      struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = 0; /* XXX */
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+  
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+
+      brw_set_dp_write_message(p->brw,
+			       insn,
+			       255, /* binding table index (255=stateless) */
+			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
+			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
+			       msg_length,
+			       0, /* pixel scoreboard */
+			       0, /* response_length */
+			       0); /* eot */
+   }
+}
+
+
+/**
+ * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.  Note: overwrites GRF 0 element 2 with scratch_offset.
+ */
+void brw_dp_READ_16( struct brw_compile *p,
+		      struct brw_reg dest,
+		      GLuint scratch_offset )
+{
+   GLuint msg_reg_nr = 1;   /* stored in destreg__conditionalmod of the SEND below */
+   {
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+      /* set message header global offset field (reg 0, element 2) */
+      brw_MOV(p,
+	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
+	      brw_imm_d(scratch_offset));
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = 0; /* XXX */
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+  
+      brw_set_dest(insn, dest);	/* UW? */
+      brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));   /* GRF 0, whose element 2 was set above */
+
+      brw_set_dp_read_message(p->brw,
+			      insn,
+			      255, /* binding table index (255=stateless) */
+			      3,  /* msg_control (3 means 4 Owords) */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      1, /* target cache (render/scratch) */
+			      1, /* msg_length */
+			      2, /* response_length */
+			      0); /* eot */
+   }
+}
+
+
+/**
+ * Read a float[4] vector from the data port Data Cache (const buffer).
+ * Location (in buffer) should be a multiple of 16.
+ * Used for fetching shader constants.
+ * relAddr is intended to select an indirect fetch, but is currently ignored.
+ */
+void brw_dp_READ_4( struct brw_compile *p,
+                    struct brw_reg dest,
+                    GLboolean relAddr,
+                    GLuint location,
+                    GLuint bind_table_index )
+{
+   /* XXX: relAddr not implemented */
+   GLuint msg_reg_nr = 1;
+   {
+      struct brw_reg b;
+      brw_push_insn_state(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+      /* Setup MRF[1] with location/offset into const buffer */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      brw_MOV(p, b, brw_imm_ud(location));
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+  
+      /* cast dest to a uword[8] vector */
+      dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(p->brw,
+			      insn,
+			      bind_table_index,
+			      0,  /* msg_control (0 means 1 Oword) */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+/**
+ * Read float[4] constant(s) from VS constant buffer.
+ * For relative addressing, two float[4] constants will be read into 'dest'.
+ * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
+ */
+void brw_dp_READ_4_vs(struct brw_compile *p,
+                      struct brw_reg dest,
+                      GLuint oword,
+                      GLboolean relAddr,
+                      struct brw_reg addrReg,
+                      GLuint location,
+                      GLuint bind_table_index)
+{
+   GLuint msg_reg_nr = 1;
+
+   assert(oword < 2);   /* oword is passed as msg_control below: 0 = lower Oword, 1 = upper */
+   /*
+   printf("vs const read msg, location %u, msg_reg_nr %d\n",
+          location, msg_reg_nr);
+   */
+
+   /* Setup MRF[1] with location/offset into const buffer */
+   {
+      struct brw_reg b;
+
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /*b = get_element_ud(b, 2);*/
+      if (relAddr) {
+         brw_ADD(p, b, addrReg, brw_imm_ud(location));   /* offset = addrReg + location */
+      }
+      else {
+         brw_MOV(p, b, brw_imm_ud(location));   /* addrReg is unused on this path */
+      }
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+      /*insn->header.access_mode = BRW_ALIGN_16;*/
+  
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(p->brw,
+			      insn,
+			      bind_table_index,
+			      oword,  /* 0 = lower Oword, 1 = upper Oword */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+
+void brw_fb_WRITE(struct brw_compile *p,
+                  struct brw_reg dest,
+                  GLuint msg_reg_nr,
+                  struct brw_reg src0,
+                  GLuint binding_table_index,
+                  GLuint msg_length,
+                  GLuint response_length,
+                  GLboolean eot)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   /* Emits one SEND carrying a data-port render-target-write message. */
+   insn->header.predicate_control = 0; /* XXX */
+   insn->header.compression_control = BRW_COMPRESSION_NONE; 
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+  
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_dp_write_message(p->brw,
+			    insn,
+			    binding_table_index,
+			    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
+			    BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
+			    msg_length,
+			    1,	/* pixel scoreboard */
+			    response_length, 
+			    eot);
+}
+
+
+/**
+ * Texture sample instruction.  Emits nothing at all when writemask is zero.
+ * Note: the msg_type plus msg_length values determine exactly what kind
+ * of sampling operation is performed.  See volume 4, page 161 of docs.
+ */
+void brw_SAMPLE(struct brw_compile *p,
+		struct brw_reg dest,
+		GLuint msg_reg_nr,
+		struct brw_reg src0,
+		GLuint binding_table_index,
+		GLuint sampler,
+		GLuint writemask,
+		GLuint msg_type,
+		GLuint response_length,
+		GLuint msg_length,
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode)
+{
+   GLboolean need_stall = 0;
+   
+   if (writemask == 0) {
+      /*debug_printf("%s: zero writemask??\n", __FUNCTION__); */
+      return;
+   }
+   
+   /* Hardware doesn't do destination dependency checking on send
+    * instructions properly.  Add a workaround which generates the
+    * dependency by other means.  In practice it seems like this bug
+    * only crops up for texture samples, and only where registers are
+    * written by the send and then written again later without being
+    * read in between.  Luckily for us, we already track that
+    * information and use it to modify the writemask for the
+    * instruction, so that is a guide for whether a workaround is
+    * needed.
+    */
+   if (writemask != BRW_WRITEMASK_XYZW) {
+      GLuint dst_offset = 0;
+      GLuint i, newmask = 0, len = 0;
+
+      for (i = 0; i < 4; i++) {	/* skip the leading disabled channels */
+	 if (writemask & (1<<i))
+	    break;
+	 dst_offset += 2;	/* dest advances by 2 per skipped channel */
+      }
+      for (; i < 4; i++) {	/* collect the first contiguous run of enabled channels */
+	 if (!(writemask & (1<<i)))
+	    break;
+	 newmask |= 1<<i;
+	 len++;
+      }
+
+      if (newmask != writemask) {	/* enabled channels are not one contiguous run */
+	 need_stall = 1;
+         /* debug_printf("need stall %x %x\n", newmask , writemask); */
+      }
+      else {
+	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
+	 
+	 newmask = ~newmask & BRW_WRITEMASK_XYZW;	/* invert: the channels that are NOT wanted */
+
+	 brw_push_insn_state(p);
+
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+	 brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+	 brw_MOV(p, m1, brw_vec8_grf(0,0));	 
+  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); 
+
+	 brw_pop_insn_state(p);
+
+  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); 
+	 dest = offset(dest, dst_offset);
+	 response_length = len * 2;	/* 2 per enabled channel, matching dst_offset above */
+      }
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = 0; /* XXX */
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src0);
+      brw_set_sampler_message(p->brw, insn,
+			      binding_table_index,
+			      sampler,
+			      msg_type,
+			      response_length, 
+			      msg_length,
+			      eot,
+			      header_present,
+			      simd_mode);
+   }
+
+   if (need_stall) {
+      struct brw_reg reg = vec8(offset(dest, response_length-1));	/* last register of the response */
+
+      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
+       */
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, reg, reg);	      
+      brw_pop_insn_state(p);
+   }
+
+}
+
+/* Emit a SEND whose message descriptor is filled in by brw_set_urb_message().
+ * All these variables are pretty confusing - we might be better off
+ * using bitmasks and macros for this, in the old style.  Or perhaps
+ * just having the caller instantiate the fields in dword3 itself. */
+void brw_urb_WRITE(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(msg_length < BRW_MAX_MRF);
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_urb_message(p->brw,
+		       insn,
+		       allocate,
+		       used,
+		       msg_length,
+		       response_length, 
+		       eot, 
+		       writes_complete, 
+		       offset,
+		       swizzle);
+}
+
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   /* Same shape as brw_urb_WRITE(), but the descriptor comes from brw_set_ff_sync_message(). */
+   assert(msg_length < 16);   /* NOTE(review): brw_urb_WRITE() checks against BRW_MAX_MRF instead */
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_ff_sync_message(p->brw,
+		       insn,
+		       allocate,
+		       used,
+		       msg_length,
+		       response_length, 
+		       eot, 
+		       writes_complete, 
+		       offset,
+		       swizzle);
+}
diff --git a/src/gallium/drivers/i965/brw_eu_util.c b/src/gallium/drivers/i965/brw_eu_util.c
new file mode 100644
index 0000000000..5405cf17a4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu_util.c
@@ -0,0 +1,126 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+      
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+void brw_math_invert( struct brw_compile *p, 
+			     struct brw_reg dst,
+			     struct brw_reg src)
+{   /* Emits the INV extended-math function on src into dst via brw_math(). */
+   brw_math( p, 
+	     dst,
+	     BRW_MATH_FUNCTION_INV, 
+	     BRW_MATH_SATURATE_NONE,
+	     0,				/* msg_reg_nr */
+	     src,
+	     BRW_MATH_DATA_VECTOR,	/* data_type: brw_math() takes data_type before precision */
+	     BRW_MATH_PRECISION_FULL );	/* precision */
+}
+
+
+
+void brw_copy4(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       GLuint count)
+{   /* Emits 2*count MOVs copying count 32-byte units from src to dst, as vec4 halves. */
+   GLuint i;
+
+   dst = vec4(dst);
+   src = vec4(src);
+
+   for (i = 0; i < count; i++)
+   {
+      GLuint delta = i*32;   /* byte offset of unit i */
+      brw_MOV(p, byte_offset(dst, delta),    byte_offset(src, delta));
+      brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));   /* second 16-byte half */
+   }
+}
+
+
+void brw_copy8(struct brw_compile *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       GLuint count)
+{   /* Emits count MOVs, one vec8 MOV per 32-byte unit, copying from src to dst. */
+   GLuint i;
+
+   dst = vec8(dst);
+   src = vec8(src);
+
+   for (i = 0; i < count; i++)
+   {
+      GLuint delta = i*32;   /* byte offset of unit i */
+      brw_MOV(p, byte_offset(dst, delta),    byte_offset(src, delta));
+   }
+}
+
+
+void brw_copy_indirect_to_indirect(struct brw_compile *p,
+				   struct brw_indirect dst_ptr,
+				   struct brw_indirect src_ptr,
+				   GLuint count)
+{   /* Emits 2*count MOVs copying count 32-byte units between two indirectly addressed locations. */
+   GLuint i;
+
+   for (i = 0; i < count; i++)
+   {
+      GLuint delta = i*32;   /* byte offset of unit i, passed to deref_4f() */
+      brw_MOV(p, deref_4f(dst_ptr, delta),    deref_4f(src_ptr, delta));
+      brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));   /* second 16-byte half */
+   }
+}
+
+
+void brw_copy_from_indirect(struct brw_compile *p,
+			    struct brw_reg dst,
+			    struct brw_indirect ptr,
+			    GLuint count)
+{   /* Emits 2*count MOVs copying count 32-byte units from the indirect location ptr into dst. */
+   GLuint i;
+
+   dst = vec4(dst);
+
+   for (i = 0; i < count; i++)
+   {
+      GLuint delta = i*32;   /* byte offset of unit i */
+      brw_MOV(p, byte_offset(dst, delta),    deref_4f(ptr, delta));
+      brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));   /* second 16-byte half */
+   }
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
new file mode 100644
index 0000000000..06826635a8
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -0,0 +1,215 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+      
+#include "brw_batchbuffer.h"
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_state.h"
+#include "brw_gs.h"
+
+
+
+static enum pipe_error compile_gs_prog( struct brw_context *brw,
+                                        struct brw_gs_prog_key *key,
+                                        struct brw_winsys_buffer **bo_out )
+{
+   struct brw_gs_compile c;
+   enum pipe_error ret;
+   const GLuint *program;
+   GLuint program_size;
+
+   memset(&c, 0, sizeof(c));
+   /* Builds the GS program selected by key->primitive and uploads it through the program cache. */
+   c.key = *key;
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
+   /* Need to locate the two positions present in vertex + header.
+    * These are currently hardcoded:
+    */
+   c.nr_attrs = c.key.nr_attrs;
+
+   if (BRW_IS_IGDNG(brw))
+      c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+      c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
+   c.nr_bytes = c.nr_regs * REG_SIZE;
+
+   
+   /* Begin the compilation:
+    */
+   brw_init_compile(brw, &c.func);
+
+   c.func.single_program_flow = 1;
+
+   /* For some reason the thread is spawned with only 4 channels
+    * unmasked.  
+    */
+   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
+
+
+   /* Note that primitives which don't require a GS program have
+    * already been weeded out by this stage:
+    */
+   switch (key->primitive) {
+   case PIPE_PRIM_QUADS:
+      brw_gs_quads( &c ); 
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      brw_gs_quad_strip( &c );
+      break;
+   case PIPE_PRIM_LINE_LOOP:
+      brw_gs_lines( &c );
+      break;
+   case PIPE_PRIM_LINES:
+      if (key->hint_gs_always)
+	 brw_gs_lines( &c );
+      else {
+	 return PIPE_OK;	/* nothing compiled or uploaded; *bo_out is left untouched */
+      }
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      if (key->hint_gs_always)
+	 brw_gs_tris( &c );
+      else {
+	 return PIPE_OK;	/* nothing compiled or uploaded; *bo_out is left untouched */
+      }
+      break;
+   case PIPE_PRIM_POINTS:
+      if (key->hint_gs_always)
+	 brw_gs_points( &c );
+      else {
+	 return PIPE_OK;	/* nothing compiled or uploaded; *bo_out is left untouched */
+      }
+      break;
+   default:
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   /* get the program
+    */
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
+
+   /* Upload
+    */
+   ret = brw_upload_cache( &brw->cache, BRW_GS_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->gs.prog_data,
+                           bo_out );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+
+static const unsigned gs_prim[PIPE_PRIM_MAX] = {   /* indexed by brw->primitive in populate_key(); gives the primitive the GS key uses */
+   PIPE_PRIM_POINTS,	/* entries presumably follow the PIPE_PRIM_* enum order -- confirm against its definition */
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_LINE_LOOP,
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_QUADS,
+   PIPE_PRIM_QUAD_STRIP,
+   PIPE_PRIM_TRIANGLES	/* 10 initializers; any remaining elements up to PIPE_PRIM_MAX are zero-initialized */
+};
+
+static void populate_key( struct brw_context *brw,
+			  struct brw_gs_prog_key *key )
+{
+   const struct brw_fs_signature *sig = &brw->curr.fragment_shader->signature;
+
+   memset(key, 0, sizeof(*key));   /* zero everything, including 'pad', since the key is hashed/compared by size */
+
+   /* PIPE_NEW_FRAGMENT_SIGNATURE */
+   key->nr_attrs = sig->nr_inputs + 1;
+
+   /* BRW_NEW_PRIMITIVE */
+   key->primitive = gs_prim[brw->primitive];
+
+   key->hint_gs_always = 0;	/* debug code? */
+
+   key->need_gs_prog = (key->hint_gs_always ||
+			brw->primitive == PIPE_PRIM_QUADS ||
+			brw->primitive == PIPE_PRIM_QUAD_STRIP ||
+			brw->primitive == PIPE_PRIM_LINE_LOOP);   /* with hint_gs_always == 0, only these three need a GS program */
+}
+
+/* Make sure a GS program matching the current state is bound: build the key,
+ * update gs.prog_active, then look the program up in the cache or compile it. */
+static int prepare_gs_prog(struct brw_context *brw)
+{
+   struct brw_gs_prog_key key;
+   enum pipe_error ret;
+
+   /* Populate the key:
+    */
+   populate_key(brw, &key);
+
+   if (brw->gs.prog_active != key.need_gs_prog) {
+      brw->state.dirty.cache |= CACHE_NEW_GS_PROG;   /* flag the change whenever the GS is switched on or off */
+      brw->gs.prog_active = key.need_gs_prog;
+   }
+
+   if (!brw->gs.prog_active)
+      return PIPE_OK;   /* no GS program needed for this primitive */
+
+   if (brw_search_cache(&brw->cache, BRW_GS_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->gs.prog_data,
+                        &brw->gs.prog_bo))
+      return PIPE_OK;   /* cache hit */
+
+   ret = compile_gs_prog( brw, &key, &brw->gs.prog_bo );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+
+
+const struct brw_tracked_state brw_gs_prog = {   /* tracked-state entry whose prepare hook is prepare_gs_prog() */
+   .dirty = {
+      .mesa  = PIPE_NEW_FRAGMENT_SIGNATURE,	/* populate_key() reads the fragment shader signature */
+      .brw   = BRW_NEW_PRIMITIVE,		/* populate_key() reads brw->primitive */
+      .cache = 0,
+   },
+   .prepare = prepare_gs_prog
+};
diff --git a/src/gallium/drivers/i965/brw_gs.h b/src/gallium/drivers/i965/brw_gs.h
new file mode 100644
index 0000000000..6e616dcb87
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs.h
@@ -0,0 +1,76 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+
+#ifndef BRW_GS_H
+#define BRW_GS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+#define MAX_GS_VERTS (4)	     
+
+struct brw_gs_prog_key {	/* GS program cache key; the bitfields below total 32 bits */
+   GLuint nr_attrs:8;
+   GLuint primitive:4;		/* holds a PIPE_PRIM_* value */
+   GLuint hint_gs_always:1;
+   GLuint need_gs_prog:1;
+   GLuint pad:18;		/* explicit padding up to 32 bits */
+};
+
+struct brw_gs_compile {	/* working state for compiling one GS program */
+   struct brw_compile func;
+   struct brw_gs_prog_key key;
+   struct brw_gs_prog_data prog_data;
+   
+   struct {
+      struct brw_reg R0;
+      struct brw_reg vertex[MAX_GS_VERTS];
+   } reg;
+
+   /* 3 different ways of expressing vertex size:
+    */
+   GLuint nr_attrs;	/* number of attributes */
+   GLuint nr_regs;	/* size in registers */
+   GLuint nr_bytes;	/* size in bytes */
+   GLboolean need_ff_sync;
+};
+
+#define ATTR_SIZE  (4*4)	/* presumably bytes per attribute: 4 components of 4 bytes -- confirm */
+
+void brw_gs_quads( struct brw_gs_compile *c );	/* per-primitive GS program emitters */
+void brw_gs_quad_strip( struct brw_gs_compile *c );
+void brw_gs_tris( struct brw_gs_compile *c );
+void brw_gs_lines( struct brw_gs_compile *c );
+void brw_gs_points( struct brw_gs_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c
new file mode 100644
index 0000000000..9b58773b3b
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs_emit.c
@@ -0,0 +1,180 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+
+#include "brw_batchbuffer.h"
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_gs.h"
+
+static void brw_gs_alloc_regs( struct brw_gs_compile *c,
+			       GLuint nr_verts )
+{
+   GLuint grf, v;
+   /* The register layout is fixed: g0 is the R0 header (viewed as UD),
+    * followed by nr_verts vertices of c->nr_regs registers each.
+    */
+   c->reg.R0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   grf = 1;
+
+   for (v = 0; v < nr_verts; v++, grf += c->nr_regs)
+      c->reg.vertex[v] = brw_vec4_grf(grf, 0);
+
+   /* grf is now the number of registers assigned above; the URB read
+    * length is a single vertex's register count.
+    */
+   c->prog_data.total_grf = grf;
+   c->prog_data.urb_read_length = c->nr_regs;
+}
+
+
+static void brw_gs_emit_vue(struct brw_gs_compile *c,
+			    struct brw_reg vert,
+			    GLboolean last,
+			    GLuint header)
+{
+   struct brw_compile *p = &c->func;
+   const GLboolean more = !last;   /* further vertices follow this one */
+   struct brw_reg dest;
+
+   /* Patch this vertex's primitive type and start/end flags into
+    * element 2 of the R0 message header:
+    */
+   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+   /* Copy the vertex's c->nr_regs registers into m1 onwards: */
+   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);
+
+   /* Unlike brw_sf_emit.c, where successive writes build up a single
+    * URB entry, each write here creates a separate URB entry.  Every
+    * write but the last therefore allocates a new entry (one response
+    * register, written to R0); the last write has no response and
+    * sets EOT instead.
+    */
+   if (more)
+      dest = c->reg.R0;
+   else
+      dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UD);
+
+   brw_urb_WRITE(p, dest, 0, c->reg.R0,
+		 more,			/* allocate */
+		 1,			/* used */
+		 c->nr_regs + 1,	/* msg length */
+		 more ? 1 : 0,		/* response length */
+		 more ? 0 : 1,		/* eot */
+		 1, 0, BRW_URB_SWIZZLE_NONE);	/* writes_complete, urb offset, swizzle */
+}
+
+static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
+{
+   struct brw_compile *p = &c->func;
+
+   /* Store num_prim in element 1 of R0, then send FF_SYNC with R0 as
+    * both the destination and the source register.
+    */
+   brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
+   brw_ff_sync(p, c->reg.R0, 0, c->reg.R0,
+	       1,			/* presumably 'allocate', as in brw_urb_WRITE() - confirm */
+	       1,			/* used */
+	       1,			/* msg length */
+	       1,			/* response length */
+	       0,			/* eot */
+	       1,			/* write complete */
+	       0, BRW_URB_SWIZZLE_NONE);	/* urb offset, swizzle */
+}
+
+
+void brw_gs_quads( struct brw_gs_compile *c )
+{
+   const GLuint poly = _3DPRIM_POLYGON << 2;
+   brw_gs_alloc_regs(c, 4);
+   if (c->need_ff_sync)
+      brw_gs_ff_sync(c, 1);
+   /* Emit as a polygon for correct edgeflag behaviour.  The PV is
+    * vertex 3 for quads but vertex 0 for polygons, so start with 3:
+    */
+   brw_gs_emit_vue(c, c->reg.vertex[3], 0, poly | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[2], 1, poly | R02_PRIM_END);
+}
+
+void brw_gs_quad_strip( struct brw_gs_compile *c )
+{
+   const GLuint poly = _3DPRIM_POLYGON << 2;   /* vertices 2,3,0,1 go out as one polygon */
+   brw_gs_alloc_regs(c, 4);
+   if (c->need_ff_sync)
+      brw_gs_ff_sync(c, 1);
+   brw_gs_emit_vue(c, c->reg.vertex[2], 0, poly | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[3], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, poly);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 1, poly | R02_PRIM_END);
+}
+
+void brw_gs_tris( struct brw_gs_compile *c )
+{
+   const GLuint tri = _3DPRIM_TRILIST << 2;   /* three vertices, sent in payload order */
+   brw_gs_alloc_regs(c, 3);
+   if (c->need_ff_sync)
+      brw_gs_ff_sync(c, 1);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, tri | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 0, tri);
+   brw_gs_emit_vue(c, c->reg.vertex[2], 1, tri | R02_PRIM_END);
+}
+
+void brw_gs_lines( struct brw_gs_compile *c )
+{
+   const GLuint line = _3DPRIM_LINESTRIP << 2;   /* two vertices, sent as a line strip */
+   brw_gs_alloc_regs(c, 2);
+   if (c->need_ff_sync)
+      brw_gs_ff_sync(c, 1);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0, line | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 1, line | R02_PRIM_END);
+}
+
+void brw_gs_points( struct brw_gs_compile *c )
+{
+   const GLuint point = _3DPRIM_POINTLIST << 2;   /* single vertex: both start and end */
+   brw_gs_alloc_regs(c, 1);
+   if (c->need_ff_sync)
+      brw_gs_ff_sync(c, 1);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 1, point | R02_PRIM_START | R02_PRIM_END);
+}
+
+
+
+
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c
new file mode 100644
index 0000000000..b64ec286ce
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs_state.c
@@ -0,0 +1,169 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+#include "util/u_math.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+struct brw_gs_unit_key {
+   unsigned int total_grf;		/* 1 when no GS program is active */
+   unsigned int urb_entry_read_length;	/* 1 when no GS program is active */
+
+   unsigned int curbe_offset;		/* filled from brw->curbe.clip_start */
+
+   unsigned int nr_urb_entries, urb_size;	/* brw->urb.nr_gs_entries, brw->urb.vsize */
+   GLboolean prog_active;
+};
+
+static void
+gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
+{
+   /* Clear the whole key, padding included, before filling it in. */
+   memset(key, 0, sizeof(*key));
+   /* BRW_NEW_URB_FENCE */
+   key->urb_size = brw->urb.vsize;
+   key->nr_urb_entries = brw->urb.nr_gs_entries;
+
+   /* BRW_NEW_CURBE_OFFSETS */
+   key->curbe_offset = brw->curbe.clip_start;
+
+   /* CACHE_NEW_GS_PROG: with no active program, both values are 1. */
+   key->prog_active = brw->gs.prog_active;
+   if (!key->prog_active) {
+      key->urb_entry_read_length = 1;
+      key->total_grf = 1;
+   } else {
+      key->urb_entry_read_length = brw->gs.prog_data->urb_read_length;
+      key->total_grf = brw->gs.prog_data->total_grf;
+   }
+}
+
+static enum pipe_error
+gs_unit_create_from_key(struct brw_context *brw,
+                        struct brw_gs_unit_key *key,
+                        struct brw_winsys_reloc *reloc,
+                        unsigned nr_reloc,
+                        struct brw_winsys_buffer **bo_out)
+{
+   struct brw_gs_unit_state unit;
+   enum pipe_error err;
+
+   /* Start from an all-zero unit state. */
+   memset(&unit, 0, sizeof(unit));
+
+   /* thread0: the register count is expressed in blocks of 16 GRFs,
+    * minus one.  The kernel start pointer stays 0 here; thread0 is the
+    * target of the relocation set up in prepare_gs_unit().
+    */
+   unit.thread0.kernel_start_pointer = 0;
+   unit.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+
+   unit.thread1.single_program_flow = 1;
+   unit.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+   /* thread3: dispatch from g1; no constant URB entry is read. */
+   unit.thread3.dispatch_grf_start_reg = 1;
+   unit.thread3.const_urb_entry_read_length = 0;
+   unit.thread3.const_urb_entry_read_offset = 0;
+   unit.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   unit.thread3.urb_entry_read_offset = 0;
+
+   /* thread4: URB layout.  max_threads is 1 once at least 8 URB
+    * entries are available, otherwise 0.
+    */
+   unit.thread4.urb_entry_allocation_size = key->urb_size - 1;
+   unit.thread4.nr_urb_entries = key->nr_urb_entries;
+   unit.thread4.max_threads = (key->nr_urb_entries >= 8) ? 1 : 0;
+
+   if (BRW_IS_IGDNG(brw))
+      unit.thread4.rendering_enable = 1;
+   if (BRW_DEBUG & DEBUG_STATS)
+      unit.thread4.stats_enable = 1;
+
+   /* Hand the finished state to the cache, keyed on 'key'. */
+   err = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
+                          key, sizeof(*key),
+                          reloc, nr_reloc,
+                          &unit, sizeof(unit),
+                          NULL, NULL,
+                          bo_out);
+   return err ? err : PIPE_OK;
+}
+
+static enum pipe_error prepare_gs_unit(struct brw_context *brw)
+{
+   struct brw_winsys_reloc reloc[1];
+   struct brw_gs_unit_key key;
+   enum pipe_error ret;
+   unsigned nr_reloc = 0;
+
+   gs_unit_populate_key(brw, &key);
+
+   /* GS program relocation: only needed while a GS program is active.
+    * It targets thread0 of the unit state, refers to the program
+    * buffer, and passes grf_reg_count << 1 along with it.
+    */
+   if (key.prog_active) {
+      const unsigned grf_reg_count = align(key.total_grf, 16) / 16 - 1;
+
+      make_reloc(&reloc[nr_reloc++],
+                 BRW_USAGE_STATE,
+                 grf_reg_count << 1,
+                 offsetof(struct brw_gs_unit_state, thread0),
+                 brw->gs.prog_bo);
+   }
+
+   /* A cached unit matching this key and relocation list is reused. */
+   if (brw_search_cache(&brw->cache, BRW_GS_UNIT,
+                        &key, sizeof(key),
+                        reloc, nr_reloc,
+                        NULL,
+                        &brw->gs.state_bo))
+      return PIPE_OK;
+
+   /* Otherwise build a new unit state and upload it. */
+   ret = gs_unit_create_from_key(brw, &key, reloc, nr_reloc,
+                                 &brw->gs.state_bo);
+   return ret ? ret : PIPE_OK;
+}
+
+const struct brw_tracked_state brw_gs_unit = {
+   .dirty = {	/* same flags as the comments in gs_unit_populate_key() */
+      .mesa  = 0,
+      .brw   = (BRW_NEW_CURBE_OFFSETS |
+		BRW_NEW_URB_FENCE),
+      .cache = CACHE_NEW_GS_PROG
+   },
+   .prepare = prepare_gs_unit,	/* no .emit hook is set for this atom */
+};
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
new file mode 100644
index 0000000000..b5029ceb69
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -0,0 +1,513 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+ 
+
+
+#include "brw_debug.h"
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_screen.h"
+#include "brw_pipe_rast.h"
+
+
+
+
+
+/***********************************************************************
+ * Blend color
+ */
+
+static int upload_blend_constant_color(struct brw_context *brw)
+{
+   BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bcc);	/* packet filled in by brw_set_blend_color() */
+   return 0;
+}
+
+
+const struct brw_tracked_state brw_blend_constant_color = {
+   .dirty = {
+      .mesa = PIPE_NEW_BLEND_COLOR,	/* flag raised by brw_set_blend_color() */
+      .brw = 0,
+      .cache = 0
+   },
+   .emit = upload_blend_constant_color
+};
+
+/***********************************************************************
+ * Drawing rectangle - framebuffer dimensions
+ */
+static int upload_drawing_rect(struct brw_context *brw)
+{
+   BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);	/* four dwords follow */
+   OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
+   OUT_BATCH(0);	/* presumably the rectangle's minimum corner (0,0) - confirm */
+   OUT_BATCH(((brw->curr.fb.width - 1) & 0xffff) |	/* width - 1 in the low 16 bits */
+	    ((brw->curr.fb.height - 1) << 16));		/* height - 1 from bit 16 up (not masked) */
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+   return 0;
+}
+
+const struct brw_tracked_state brw_drawing_rect = {
+   .dirty = {
+      .mesa = PIPE_NEW_FRAMEBUFFER_DIMENSIONS,	/* packet reads brw->curr.fb.width/height */
+      .brw = 0,
+      .cache = 0
+   },
+   .emit = upload_drawing_rect
+};
+
+
+/***********************************************************************
+ * Binding table pointers
+ */
+
+static int prepare_binding_table_pointers(struct brw_context *brw)
+{
+   brw_add_validated_bo(brw, brw->vs.bind_bo);	/* may be NULL: the emit hook checks vs.bind_bo for NULL */
+   brw_add_validated_bo(brw, brw->wm.bind_bo);
+   return 0;
+}
+
+/**
+ * Upload the binding table pointers, which point to each stage's array of
+ * surface state pointers.
+ *
+ * The binding table pointers are relative to the surface state base address,
+ * which is 0.
+ */
+static int upload_binding_table_pointers(struct brw_context *brw)
+{
+   BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+   OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));	/* header: length field is total dwords - 2 */
+   if (brw->vs.bind_bo != NULL)
+      OUT_RELOC(brw->vs.bind_bo, 
+		BRW_USAGE_SAMPLER,
+		0); /* vs */
+   else
+      OUT_BATCH(0);	/* no VS binding table: emit a zero pointer */
+   OUT_BATCH(0); /* gs */
+   OUT_BATCH(0); /* clip */
+   OUT_BATCH(0); /* sf */
+   OUT_RELOC(brw->wm.bind_bo,	/* NOTE(review): not NULL-checked, unlike vs.bind_bo - confirm it is always set */
+	     BRW_USAGE_SAMPLER,
+	     0); /* wm/ps */
+   ADVANCE_BATCH();
+   return 0;
+}
+
+const struct brw_tracked_state brw_binding_table_pointers = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_BATCH,
+      .cache = CACHE_NEW_SURF_BIND,	/* presumably raised when vs/wm bind_bo change - confirm */
+   },
+   .prepare = prepare_binding_table_pointers,	/* registers both binding-table buffers */
+   .emit = upload_binding_table_pointers,
+};
+
+
+/**********************************************************************
+ * Upload pointers to the per-stage state.
+ *
+ * The state pointers in this packet are all relative to the general state
+ * base address set by CMD_STATE_BASE_ADDRESS, which is 0.
+ */
+static int upload_pipelined_state_pointers(struct brw_context *brw )
+{
+   BEGIN_BATCH(7, IGNORE_CLIPRECTS);
+   OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));	/* header: length field is total dwords - 2 */
+   OUT_RELOC(brw->vs.state_bo, 
+	     BRW_USAGE_STATE,
+	     0);
+   if (brw->gs.prog_active)
+      OUT_RELOC(brw->gs.state_bo, 
+		BRW_USAGE_STATE,
+		1);	/* presumably the low bit enables the GS unit - confirm */
+   else
+      OUT_BATCH(0);	/* no active GS program: zero pointer */
+   OUT_RELOC(brw->clip.state_bo, 
+	     BRW_USAGE_STATE,
+	     1);	/* presumably the low bit enables the clip unit - confirm */
+   OUT_RELOC(brw->sf.state_bo,
+	     BRW_USAGE_STATE,
+	     0);
+   OUT_RELOC(brw->wm.state_bo,
+	     BRW_USAGE_STATE,
+	     0);
+   OUT_RELOC(brw->cc.state_bo,
+	     BRW_USAGE_STATE,
+	     0);
+   ADVANCE_BATCH();
+
+   brw->state.dirty.brw |= BRW_NEW_PSP;	/* flag that the state pointers were re-emitted */
+   return 0;
+}
+
+
+static int prepare_psp_urb_cbs(struct brw_context *brw)
+{
+   brw_add_validated_bo(brw, brw->vs.state_bo);
+   brw_add_validated_bo(brw, brw->gs.state_bo);	/* added even though the emit hook only uses it when gs.prog_active */
+   brw_add_validated_bo(brw, brw->clip.state_bo);
+   brw_add_validated_bo(brw, brw->sf.state_bo);
+   brw_add_validated_bo(brw, brw->wm.state_bo);
+   brw_add_validated_bo(brw, brw->cc.state_bo);
+   return 0;
+}
+
+static int upload_psp_urb_cbs(struct brw_context *brw )
+{
+   int ret;
+
+   /* Emit the three packets strictly in this order, stopping at the
+    * first one that reports an error:
+    *   pipelined state pointers, URB fence, CS URB state.
+    */
+   ret = upload_pipelined_state_pointers(brw);
+
+   if (ret == 0)
+      ret = brw_upload_urb_fence(brw);
+
+   if (ret == 0)
+      ret = brw_upload_cs_urb_state(brw);
+
+   return ret;
+}
+
+const struct brw_tracked_state brw_psp_urb_cbs = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_URB_FENCE | BRW_NEW_BATCH,	/* the emit hook also calls brw_upload_urb_fence() */
+      .cache = (CACHE_NEW_VS_UNIT | 
+		CACHE_NEW_GS_UNIT | 
+		CACHE_NEW_GS_PROG | 	/* emit reads brw->gs.prog_active */
+		CACHE_NEW_CLIP_UNIT | 
+		CACHE_NEW_SF_UNIT | 
+		CACHE_NEW_WM_UNIT | 
+		CACHE_NEW_CC_UNIT)
+   },
+   .prepare = prepare_psp_urb_cbs,
+   .emit = upload_psp_urb_cbs,
+};
+
+
+/***********************************************************************
+ * Depth buffer 
+ */
+
+static int prepare_depthbuffer(struct brw_context *brw)
+{
+   /* Only a bound depth/stencil surface has a buffer to validate. */
+   if (brw->curr.fb.zsbuf != NULL) {
+      struct brw_winsys_buffer *bo = brw_surface(brw->curr.fb.zsbuf)->bo;
+      brw_add_validated_bo(brw, bo);
+   }
+   return 0;
+}
+
+static int emit_depthbuffer(struct brw_context *brw)
+{
+   struct pipe_surface *surface = brw->curr.fb.zsbuf;
+   unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;	/* packet dwords: one more on G4X/IGDNG */
+
+   if (surface == NULL) {	/* nothing bound: emit a null depth surface */
+      BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+      OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
+		(BRW_SURFACE_NULL << 29));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+         OUT_BATCH(0);	/* the sixth dword */
+
+      ADVANCE_BATCH();
+   } else {
+      struct brw_winsys_buffer *bo;
+      unsigned int format;
+      unsigned int pitch;
+      unsigned int cpp;	/* bytes per pixel for the chosen format */
+
+      switch (surface->format) {
+      case PIPE_FORMAT_Z16_UNORM:
+	 format = BRW_DEPTHFORMAT_D16_UNORM;
+	 cpp = 2;
+	 break;
+      case PIPE_FORMAT_Z24X8_UNORM:
+      case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+	 format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+	 cpp = 4;
+	 break;
+      case PIPE_FORMAT_Z32_FLOAT:
+	 format = BRW_DEPTHFORMAT_D32_FLOAT;
+	 cpp = 4;
+	 break;
+      default:
+	 assert(0);	/* any other format is rejected before the batch is touched */
+	 return PIPE_ERROR_BAD_INPUT;
+      }
+
+      bo = brw_surface(surface)->bo;
+      pitch = brw_surface(surface)->pitch;	/* presumably in pixels, since it is scaled by cpp below - confirm */
+
+      BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+      OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH(((pitch * cpp) - 1) |
+		(format << 18) |
+		(BRW_TILEWALK_YMAJOR << 26) |
+		((surface->layout != PIPE_SURFACE_LAYOUT_LINEAR) << 27) |	/* bit 27 set for any non-linear layout */
+		(BRW_SURFACE_2D << 29));
+      OUT_RELOC(bo,
+		BRW_USAGE_DEPTH_BUFFER,
+		surface->offset);
+      OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
+		((pitch - 1) << 6) |
+		((surface->height - 1) << 19));
+      OUT_BATCH(0);
+
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+         OUT_BATCH(0);	/* the sixth dword */
+
+      ADVANCE_BATCH();
+   }
+
+   return 0;
+}
+
+const struct brw_tracked_state brw_depthbuffer = {
+   .dirty = {
+      .mesa = PIPE_NEW_DEPTH_BUFFER,	/* both hooks read brw->curr.fb.zsbuf */
+      .brw = BRW_NEW_BATCH,
+      .cache = 0,
+   },
+   .prepare = prepare_depthbuffer,
+   .emit = emit_depthbuffer,
+};
+
+
+
+/***********************************************************************
+ * Polygon stipple packet
+ */
+
+static int upload_polygon_stipple(struct brw_context *brw)
+{
+   BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bps);	/* packet is taken as-is from brw->curr.bps */
+   return 0;
+}
+
+const struct brw_tracked_state brw_polygon_stipple = {
+   .dirty = {
+      .mesa = PIPE_NEW_POLYGON_STIPPLE,
+      .brw = 0,
+      .cache = 0
+   },
+   .emit = upload_polygon_stipple	/* emit only: no .prepare hook is set */
+};
+
+
+/***********************************************************************
+ * Line stipple packet
+ */
+
+static int upload_line_stipple(struct brw_context *brw)
+{
+   const struct brw_line_stipple *bls = &brw->curr.rast->bls;	/* packet stored in the bound rasterizer state */
+   if (bls->header.opcode) {	/* a zero opcode means there is nothing to emit */
+      BRW_CACHED_BATCH_STRUCT(brw, bls);
+   }
+   return 0;
+}
+
+const struct brw_tracked_state brw_line_stipple = {
+   .dirty = {
+      .mesa = PIPE_NEW_RAST,	/* the packet lives in brw->curr.rast */
+      .brw = 0,
+      .cache = 0
+   },
+   .emit = upload_line_stipple
+};
+
+
+/***********************************************************************
+ * Misc invariant state packets
+ */
+
+static int upload_invarient_state( struct brw_context *brw )
+{
+   {
+      /* 0x61040000  Pipeline Select */
+      /*     PipelineSelect            : 0 */
+      struct brw_pipeline_select ps;
+
+      memset(&ps, 0, sizeof(ps));
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))	/* the opcode differs between chip generations */
+	 ps.header.opcode = CMD_PIPELINE_SELECT_GM45;
+      else
+	 ps.header.opcode = CMD_PIPELINE_SELECT_965;
+      ps.header.pipeline_select = 0;
+      BRW_BATCH_STRUCT(brw, &ps);
+   }
+
+   {
+      struct brw_global_depth_offset_clamp gdo;
+      memset(&gdo, 0, sizeof(gdo));
+
+      /* Disable depth offset clamping. 
+       */
+      gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
+      gdo.header.length = sizeof(gdo)/4 - 2;	/* packet size in dwords, minus 2 */
+      gdo.depth_offset_clamp = 0.0;
+
+      BRW_BATCH_STRUCT(brw, &gdo);
+   }
+
+
+   /* 0x61020000  System Instruction Pointer */
+   {
+      struct brw_system_instruction_pointer sip;
+      memset(&sip, 0, sizeof(sip));
+
+      sip.header.opcode = CMD_STATE_INSN_POINTER;
+      sip.header.length = 0;
+      sip.bits0.pad = 0;
+      sip.bits0.system_instruction_pointer = 0;
+      BRW_BATCH_STRUCT(brw, &sip);
+   }
+
+   /* VF Statistics */
+   {
+      struct brw_vf_statistics vfs;
+      memset(&vfs, 0, sizeof(vfs));
+
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) 
+	 vfs.opcode = CMD_VF_STATISTICS_GM45;
+      else 
+	 vfs.opcode = CMD_VF_STATISTICS_965;
+
+      if (BRW_DEBUG & DEBUG_STATS)	/* statistics are enabled only with the stats debug flag */
+	 vfs.statistics_enable = 1; 
+
+      BRW_BATCH_STRUCT(brw, &vfs);
+   }
+   
+   if (!BRW_IS_965(brw))	/* this packet is skipped when BRW_IS_965() is true */
+   {
+      struct brw_aa_line_parameters balp;
+
+      /* use legacy aa line coverage computation */
+      memset(&balp, 0, sizeof(balp));
+      balp.header.opcode = CMD_AA_LINE_PARAMETERS;
+      balp.header.length = sizeof(balp) / 4 - 2;
+   
+      BRW_BATCH_STRUCT(brw, &balp);
+   }
+
+   {
+      struct brw_polygon_stipple_offset bpso;
+      
+      /* This is invariant state in gallium:
+       */
+      memset(&bpso, 0, sizeof(bpso));
+      bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
+      bpso.header.length = sizeof(bpso)/4-2;
+      bpso.bits0.y_offset = 0;
+      bpso.bits0.x_offset = 0;
+
+      BRW_BATCH_STRUCT(brw, &bpso);
+   }
+   
+   return 0;
+}
+
+const struct brw_tracked_state brw_invarient_state = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_CONTEXT,	/* the only dirty flag: no pipe or cache flags are listed */
+      .cache = 0
+   },
+   .emit = upload_invarient_state
+};
+
+
+/***********************************************************************
+ * State base address 
+ */
+
+/**
+ * Define the base addresses which some state is referenced from.
+ *
+ * This allows us to avoid having to emit relocations in many places for
+ * cached state, and instead emit pointers inside of large, mostly-static
+ * state pools.  This comes at the expense of memory, and more expensive cache
+ * misses.
+ */
+static int upload_state_base_address( struct brw_context *brw )
+{
+   /* Output the structure (brw_state_base_address) directly to the
+    * batchbuffer, so we can emit relocations inline.
+    */	/* NOTE(review): neither branch below emits a relocation; every dword is the literal 1 */
+   if (BRW_IS_IGDNG(brw)) {	/* IGDNG packet: 8 dwords, with an instruction base and bound */
+       BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* Instruction base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       OUT_BATCH(1); /* Instruction access upper bound */
+       ADVANCE_BATCH();
+   } else {	/* other chips: 6 dwords */
+       BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       ADVANCE_BATCH();
+   }
+   return 0;
+}
+
+const struct brw_tracked_state brw_state_base_address = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_CONTEXT,	/* same single flag as brw_invarient_state */
+      .cache = 0,
+   },
+   .emit = upload_state_base_address
+};
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
new file mode 100644
index 0000000000..21f786f871
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -0,0 +1,208 @@
+
+#include "util/u_memory.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+static unsigned translate_logicop(unsigned logicop)	/* returns unsigned, like the other translate_*() helpers */
+{
+   switch (logicop) {	/* PIPE_LOGICOP_x -> BRW_LOGICOPFUNCTION_x of the same name */
+   case PIPE_LOGICOP_CLEAR:
+      return BRW_LOGICOPFUNCTION_CLEAR;
+   case PIPE_LOGICOP_AND:
+      return BRW_LOGICOPFUNCTION_AND;
+   case PIPE_LOGICOP_AND_REVERSE:
+      return BRW_LOGICOPFUNCTION_AND_REVERSE;
+   case PIPE_LOGICOP_COPY:
+      return BRW_LOGICOPFUNCTION_COPY;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      return BRW_LOGICOPFUNCTION_COPY_INVERTED;
+   case PIPE_LOGICOP_AND_INVERTED:
+      return BRW_LOGICOPFUNCTION_AND_INVERTED;
+   case PIPE_LOGICOP_NOOP:
+      return BRW_LOGICOPFUNCTION_NOOP;
+   case PIPE_LOGICOP_XOR:
+      return BRW_LOGICOPFUNCTION_XOR;
+   case PIPE_LOGICOP_OR:
+      return BRW_LOGICOPFUNCTION_OR;
+   case PIPE_LOGICOP_OR_INVERTED:
+      return BRW_LOGICOPFUNCTION_OR_INVERTED;
+   case PIPE_LOGICOP_NOR:
+      return BRW_LOGICOPFUNCTION_NOR;
+   case PIPE_LOGICOP_EQUIV:
+      return BRW_LOGICOPFUNCTION_EQUIV;
+   case PIPE_LOGICOP_INVERT:
+      return BRW_LOGICOPFUNCTION_INVERT;
+   case PIPE_LOGICOP_OR_REVERSE:
+      return BRW_LOGICOPFUNCTION_OR_REVERSE;
+   case PIPE_LOGICOP_NAND:
+      return BRW_LOGICOPFUNCTION_NAND;
+   case PIPE_LOGICOP_SET:
+      return BRW_LOGICOPFUNCTION_SET;
+   default:
+      assert(0);	/* unknown logic op: assert, then fall back to SET */
+      return BRW_LOGICOPFUNCTION_SET;
+   }
+}
+
+
+static unsigned translate_blend_equation( unsigned mode )
+{
+   /* One-to-one mapping from the PIPE_BLEND_x equation to the
+    * BRW_BLENDFUNCTION_x value of the same name.
+    */
+
+   switch (mode) {
+   case PIPE_BLEND_ADD:              return BRW_BLENDFUNCTION_ADD;
+   case PIPE_BLEND_SUBTRACT:         return BRW_BLENDFUNCTION_SUBTRACT;
+   case PIPE_BLEND_REVERSE_SUBTRACT: return BRW_BLENDFUNCTION_REVERSE_SUBTRACT;
+   case PIPE_BLEND_MIN:              return BRW_BLENDFUNCTION_MIN;
+   case PIPE_BLEND_MAX:              return BRW_BLENDFUNCTION_MAX;
+   default:
+      /* Unknown equation: assert, then fall back to ADD. */
+      assert(0);
+      return BRW_BLENDFUNCTION_ADD;
+   }
+}
+
+static unsigned translate_blend_factor( unsigned factor )
+{
+   /* One-to-one mapping from a PIPE_BLENDFACTOR_x value to the
+    * BRW_BLENDFACTOR_x value of the same name.  The cases are grouped
+    * by the operand they refer to.
+    */
+
+   switch (factor) {
+   /* fixed values */
+   case PIPE_BLENDFACTOR_ZERO:               return BRW_BLENDFACTOR_ZERO;
+   case PIPE_BLENDFACTOR_ONE:                return BRW_BLENDFACTOR_ONE;
+
+   /* source */
+   case PIPE_BLENDFACTOR_SRC_COLOR:          return BRW_BLENDFACTOR_SRC_COLOR;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      return BRW_BLENDFACTOR_INV_SRC_COLOR;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:          return BRW_BLENDFACTOR_SRC_ALPHA;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      return BRW_BLENDFACTOR_INV_SRC_ALPHA;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE;
+
+   /* destination */
+   case PIPE_BLENDFACTOR_DST_COLOR:          return BRW_BLENDFACTOR_DST_COLOR;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:      return BRW_BLENDFACTOR_INV_DST_COLOR;
+   case PIPE_BLENDFACTOR_DST_ALPHA:          return BRW_BLENDFACTOR_DST_ALPHA;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      return BRW_BLENDFACTOR_INV_DST_ALPHA;
+
+   /* blend constant */
+   case PIPE_BLENDFACTOR_CONST_COLOR:        return BRW_BLENDFACTOR_CONST_COLOR;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    return BRW_BLENDFACTOR_INV_CONST_COLOR;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:        return BRW_BLENDFACTOR_CONST_ALPHA;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    return BRW_BLENDFACTOR_INV_CONST_ALPHA;
+
+   default:
+      /* Unknown factor: assert, then fall back to
+       * ZERO. */
+      assert(0);
+      return BRW_BLENDFACTOR_ZERO;
+   }
+}
+
+static void *brw_create_blend_state( struct pipe_context *pipe,
+				     const struct pipe_blend_state *templ )
+{
+   struct brw_blend_state *blend = CALLOC_STRUCT(brw_blend_state);
+   if (!blend)
+      return NULL;
+
+   /* Only rt[0] of the template is consulted; logic op wins over blending. */
+   if (templ->logicop_enable) {
+      blend->cc5.logicop_func = translate_logicop(templ->logicop_func);
+      blend->cc2.logicop_enable = 1;
+   }
+   else if (templ->rt[0].blend_enable) {
+      blend->cc6.blend_function = translate_blend_equation(templ->rt[0].rgb_func);
+      blend->cc6.src_blend_factor = translate_blend_factor(templ->rt[0].rgb_src_factor);
+      blend->cc6.dest_blend_factor = translate_blend_factor(templ->rt[0].rgb_dst_factor);
+
+      blend->cc5.ia_blend_function = translate_blend_equation(templ->rt[0].alpha_func);
+      blend->cc5.ia_src_blend_factor = translate_blend_factor(templ->rt[0].alpha_src_factor);
+      blend->cc5.ia_dest_blend_factor = translate_blend_factor(templ->rt[0].alpha_dst_factor);
+
+      /* Independent alpha blending is only switched on when the alpha
+       * equation actually differs from the RGB one.
+       */
+      blend->cc3.ia_blend_enable =
+	 !(blend->cc6.blend_function == blend->cc5.ia_blend_function &&
+	   blend->cc6.src_blend_factor == blend->cc5.ia_src_blend_factor &&
+	   blend->cc6.dest_blend_factor == blend->cc5.ia_dest_blend_factor);
+      blend->cc3.blend_enable = 1;
+
+      /* Per-surface blend enable simply follows the global state: */
+      blend->ss0.color_blend = 1;
+   }
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      blend->cc5.statistics_enable = 1;
+   blend->cc5.dither_enable = templ->dither;
+
+   /* Per-surface colour mask: a channel missing from the mask is write-disabled. */
+   blend->ss0.writedisable_red   = !(templ->rt[0].colormask & PIPE_MASK_R);
+   blend->ss0.writedisable_green = !(templ->rt[0].colormask & PIPE_MASK_G);
+   blend->ss0.writedisable_blue  = !(templ->rt[0].colormask & PIPE_MASK_B);
+   blend->ss0.writedisable_alpha = !(templ->rt[0].colormask & PIPE_MASK_A);
+
+   return (void *)blend;
+}
+
+static void brw_bind_blend_state(struct pipe_context *pipe,
+				 void *cso)
+{
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.mesa |= PIPE_NEW_BLEND;	/* mark blend state dirty */
+   ctx->curr.blend = (const struct brw_blend_state *)cso;	/* remember the bound CSO */
+}
+
+static void brw_delete_blend_state(struct pipe_context *pipe,
+				  void *cso)
+{
+   /* Context looked up inside assert(): no unused local when assert is compiled out. */
+   assert((const void *)cso != (const void *)brw_context(pipe)->curr.blend);
+   FREE(cso);	/* the CSO must not be the currently bound one */
+}
+
+
+static void brw_set_blend_color(struct pipe_context *pipe,
+				const struct pipe_blend_color *blend_color)
+{
+   struct brw_context *brw = brw_context(pipe);
+   unsigned chan;
+
+   /* Copy the four colour channels into the cached packet, then flag
+    * the blend colour as dirty. */
+   for (chan = 0; chan < 4; chan++)
+      brw->curr.bcc.blend_constant_color[chan] = blend_color->color[chan];
+
+   brw->state.dirty.mesa |= PIPE_NEW_BLEND_COLOR;
+}
+
+
+void brw_pipe_blend_init( struct brw_context *brw )
+{
+   struct brw_blend_constant_color *packet = &brw->curr.bcc;
+
+   /* Install the blend entry points on brw->base. */
+   brw->base.create_blend_state = brw_create_blend_state;
+   brw->base.bind_blend_state = brw_bind_blend_state;
+   brw->base.delete_blend_state = brw_delete_blend_state;
+   brw->base.set_blend_color = brw_set_blend_color;
+
+   /* Pre-build the constant colour packet: everything is zeroed, then
+    * the header is filled in (length = size in dwords, minus 2). */
+   memset(packet, 0, sizeof(*packet));
+   packet->header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   packet->header.length = sizeof(*packet)/4-2;
+}
+
+void brw_pipe_blend_cleanup( struct brw_context *brw )
+{	/* intentionally empty: brw_pipe_blend_init() only fills in fields of *brw */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_clear.c b/src/gallium/drivers/i965/brw_pipe_clear.c
new file mode 100644
index 0000000000..d5cff338a6
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_clear.c
@@ -0,0 +1,270 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_pack_color.h"
+#include "util/u_math.h"
+
+#include "pipe/p_state.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_screen.h"
+#include "brw_context.h"
+
+#define MASK16 0xffff
+#define MASK24 0xffffff
+
+
+/**
+ * Fill the whole of 'surface' with 'value' using an XY_COLOR_BLT command.
+ * 'rgba_mask' is OR'ed into the command dword.  Surfaces with zero width
+ * or height emit nothing.  Only cpp == 4 and cpp == 2 are handled.
+ * Every path returns 0.
+ */
+static enum pipe_error
+try_clear( struct brw_context *brw,
+           struct brw_surface *surface,
+           unsigned value,
+           unsigned rgba_mask)
+{
+   const int left = 0, top = 0;
+   int right = surface->base.width;
+   int bottom = surface->base.height;
+   int pitch = surface->pitch;
+   const int cpp = surface->cpp;
+   uint32_t cmd = XY_COLOR_BLT_CMD | rgba_mask;
+   uint32_t br13 = 0xf0 << 16;
+
+   if (right == 0 || bottom == 0)
+      return 0;
+
+   debug_printf("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+                __FUNCTION__,
+                (void *)surface->bo, pitch * cpp,
+                surface->base.offset,
+                left, top, right - left, bottom - top);
+
+   /* The colour-depth bits of BR13 follow the bytes per pixel. */
+   if (cpp != 4) {
+      assert(cpp == 2);
+      br13 |= BR13_565;
+   }
+   else {
+      br13 |= BR13_8888;
+   }
+
+   switch (surface->tiling) {
+   case BRW_TILING_Y:
+      /* XXX: nasty hack for clearing depth buffers */
+      right = pitch;
+      break;
+   case BRW_TILING_X:
+      cmd |= XY_DST_TILED;
+      pitch /= 4;
+      break;
+   default:
+      break;
+   }
+
+   br13 |= (pitch * cpp);
+
+   /* Six dwords: command, BR13, top-left, bottom-right, destination, value. */
+   BEGIN_BATCH(6, 0);
+   OUT_BATCH(cmd);
+   OUT_BATCH(br13);
+   OUT_BATCH((top << 16) | left);
+   OUT_BATCH((bottom << 16) | right);
+   OUT_RELOC(surface->bo,
+             BRW_USAGE_BLIT_DEST,
+             surface->base.offset);
+   OUT_BATCH(value);
+   ADVANCE_BATCH();
+
+   return 0;
+}
+
+
+
+
+/* Pack 'rgba' into the surface's format and blit-fill the whole surface,
+ * writing RGB and alpha; a failed attempt is retried once after a flush. */
+static void color_clear(struct brw_context *brw,
+                        struct brw_surface *bsurface,
+                        const float *rgba )
+{
+   const unsigned write_all = XY_BLT_WRITE_RGB | XY_BLT_WRITE_ALPHA;
+   union util_color packed;
+   enum pipe_error ret;
+
+   util_pack_color( rgba, bsurface->base.format, &packed );
+   if (bsurface->cpp == 2)
+      packed.ui |= packed.ui << 16;   /* replicate the 16-bit value into both halves */
+
+   ret = try_clear( brw, bsurface, packed.ui, write_all );
+   if (ret == 0)
+      return;
+
+   brw_context_flush( brw );
+   ret = try_clear( brw, bsurface, packed.ui, write_all );
+   assert( ret == 0 );
+}
+
+/* Build the packed clear value and blit write mask for a depth/stencil
+ * surface from 'depth', 'stencil' and 'clear_flags', then blit-fill the
+ * surface.  Formats other than Z32_FLOAT, Z24X8, Z24S8 and Z16 assert and
+ * return without clearing.  A failed blit is retried once after a context
+ * flush.
+ */
+static void zstencil_clear(struct brw_context *brw,
+                           struct brw_surface *bsurface,
+                           unsigned clear_flags,
+                           double depth,
+                           unsigned stencil )
+{
+   enum pipe_error ret;
+   unsigned packed;
+   unsigned write_mask = 0;
+   union fi bits;
+
+   /* A depth clear enables the blit's RGB write bits. */
+   if (clear_flags & PIPE_CLEAR_DEPTH)
+      write_mask |= XY_BLT_WRITE_RGB;
+
+   switch (bsurface->base.format) {
+   case PIPE_FORMAT_Z32_FLOAT:
+      bits.f = (float)depth;
+      packed = bits.ui;
+      write_mask |= XY_BLT_WRITE_ALPHA;
+      break;
+
+   case PIPE_FORMAT_Z24X8_UNORM:
+      packed = ((unsigned)(depth * MASK24) & MASK24) | (stencil << 24);
+      write_mask |= XY_BLT_WRITE_ALPHA;
+      break;
+
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+      packed = ((unsigned)(depth * MASK24) & MASK24) | (stencil << 24);
+      /* Stencil sits in the top byte; write it only for a stencil clear. */
+      if (clear_flags & PIPE_CLEAR_STENCIL)
+         write_mask |= XY_BLT_WRITE_ALPHA;
+      break;
+
+   case PIPE_FORMAT_Z16_UNORM:
+      packed = ((unsigned)(depth * MASK16) & MASK16);
+      packed |= packed << 16;   /* replicate the 16-bit value into both halves */
+      write_mask |= XY_BLT_WRITE_ALPHA;
+      break;
+
+   default:
+      assert(0);
+      return;
+   }
+
+   ret = try_clear( brw, bsurface, packed, write_mask );
+   if (ret == 0)
+      return;
+
+   /* Retry once after flushing the context. */
+   brw_context_flush( brw );
+   ret = try_clear( brw, bsurface, packed, write_mask );
+   assert( ret == 0 );
+}
+
+
+
+/**
+ * pipe_context clear hook: clear the currently bound framebuffer surfaces
+ * selected by 'buffers'.  Colour buffers are filled with 'rgba', the zsbuf
+ * with 'depth' and 'stencil'.  No masking, no scissor (clear entire buffer).
+ */
+static void brw_clear(struct pipe_context *pipe,
+                      unsigned buffers,
+                      const float *rgba,
+                      double depth,
+                      unsigned stencil)
+{
+   struct brw_context *brw = brw_context( pipe );
+   const unsigned zs_flags = buffers & PIPE_CLEAR_DEPTHSTENCIL;
+   int i;
+
+   if (buffers & PIPE_CLEAR_COLOR) {
+      /* Every bound colour buffer gets the same rgba value. */
+      for (i = 0; i < brw->curr.fb.nr_cbufs; i++) {
+         color_clear( brw, brw_surface(brw->curr.fb.cbufs[i]), rgba );
+      }
+   }
+
+   /* Depth/stencil is skipped when no zsbuf is bound. */
+   if (zs_flags && brw->curr.fb.zsbuf) {
+      zstencil_clear( brw,
+                      brw_surface(brw->curr.fb.zsbuf),
+                      zs_flags,
+                      depth, stencil );
+   }
+}
+
+/* Clear colour surface 'dst' to 'rgba'.
+ * XXX should respect region: dstx/dsty/width/height are not used and the
+ * surface is cleared through color_clear().
+ */
+static void brw_clear_render_target(struct pipe_context *pipe,
+                                    struct pipe_surface *dst,
+                                    const float *rgba,
+                                    unsigned dstx, unsigned dsty,
+                                    unsigned width, unsigned height)
+{
+   struct brw_context *ctx = brw_context( pipe );
+   color_clear( ctx, brw_surface(dst), rgba );
+}
+
+/* Clear depth/stencil surface 'dst' according to 'clear_flags'.
+ * XXX should respect region: dstx/dsty/width/height are not used and the
+ * surface is cleared through zstencil_clear().
+ */
+static void brw_clear_depth_stencil(struct pipe_context *pipe,
+                                    struct pipe_surface *dst,
+                                    unsigned clear_flags,
+                                    double depth,
+                                    unsigned stencil,
+                                    unsigned dstx, unsigned dsty,
+                                    unsigned width, unsigned height)
+{
+   struct brw_context *ctx = brw_context( pipe );
+
+   zstencil_clear( ctx, brw_surface(dst), clear_flags, depth, stencil );
+}
+
+void brw_pipe_clear_init( struct brw_context *brw )
+{  /* Install this file's clear entry points in the pipe_context. */
+   brw->base.clear_depth_stencil = brw_clear_depth_stencil;
+   brw->base.clear_render_target = brw_clear_render_target;
+   brw->base.clear = brw_clear;
+}
+
+
+void brw_pipe_clear_cleanup( struct brw_context *brw )
+{  /* No-op: the body is empty, nothing is released here. */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
new file mode 100644
index 0000000000..31c2c343d8
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -0,0 +1,187 @@
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+/* XXX: Fixme - include this to get IZ_ defines
+ */
+#include "brw_wm.h"
+
+/* Map a PIPE_FUNC_* comparison to the hardware BRW_COMPAREFUNCTION_* code.
+ * Values outside the PIPE_FUNC_* set trip an assert; when asserts are
+ * compiled out they translate to BRW_COMPAREFUNCTION_ALWAYS.
+ */
+static unsigned brw_translate_compare_func(unsigned func)
+{
+   switch (func) {
+   case PIPE_FUNC_NEVER:    return BRW_COMPAREFUNCTION_NEVER;
+   case PIPE_FUNC_LESS:     return BRW_COMPAREFUNCTION_LESS;
+   case PIPE_FUNC_LEQUAL:   return BRW_COMPAREFUNCTION_LEQUAL;
+   case PIPE_FUNC_GREATER:  return BRW_COMPAREFUNCTION_GREATER;
+   case PIPE_FUNC_GEQUAL:   return BRW_COMPAREFUNCTION_GEQUAL;
+   case PIPE_FUNC_NOTEQUAL: return BRW_COMPAREFUNCTION_NOTEQUAL;
+   case PIPE_FUNC_EQUAL:    return BRW_COMPAREFUNCTION_EQUAL;
+   case PIPE_FUNC_ALWAYS:   return BRW_COMPAREFUNCTION_ALWAYS;
+   default:
+      break;
+   }
+
+   /* Not a recognised PIPE_FUNC_* value: complain, then fall back to
+    * ALWAYS. */
+   assert(0);
+   return BRW_COMPAREFUNCTION_ALWAYS;
+}
+
+/* Map a PIPE_STENCIL_OP_* value to the hardware BRW_STENCILOP_* code.
+ * Note that gallium's INCR/DECR map to the saturating hardware ops and
+ * INCR_WRAP/DECR_WRAP to the plain ones.  Unknown values assert.
+ */
+static unsigned translate_stencil_op(unsigned op)
+{
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:      return BRW_STENCILOP_KEEP;
+   case PIPE_STENCIL_OP_ZERO:      return BRW_STENCILOP_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:   return BRW_STENCILOP_REPLACE;
+   case PIPE_STENCIL_OP_INCR:      return BRW_STENCILOP_INCRSAT;
+   case PIPE_STENCIL_OP_DECR:      return BRW_STENCILOP_DECRSAT;
+   case PIPE_STENCIL_OP_INCR_WRAP: return BRW_STENCILOP_INCR;
+   case PIPE_STENCIL_OP_DECR_WRAP: return BRW_STENCILOP_DECR;
+   case PIPE_STENCIL_OP_INVERT:    return BRW_STENCILOP_INVERT;
+   default:
+      break;
+   }
+
+   /* Not a recognised PIPE_STENCIL_OP_* value: complain, then fall back
+    * to ZERO. */
+   assert(0);
+   return BRW_STENCILOP_ZERO;
+}
+
+/* Translate the gallium depth/stencil/alpha template into the cc0..cc7
+ * fields of 'zstencil'.  Fields belonging to a disabled feature are not
+ * written, so whatever the caller put there remains.
+ */
+static void create_bcc_state( struct brw_depth_stencil_state *zstencil,
+                              const struct pipe_depth_stencil_alpha_state *templ )
+{
+   /* Depth test. */
+   if (templ->depth.enabled) {
+      zstencil->cc2.depth_test = 1;
+      zstencil->cc2.depth_test_function = brw_translate_compare_func(templ->depth.func);
+      zstencil->cc2.depth_write_enable = templ->depth.writemask;
+   }
+
+   /* Alpha test, with the reference value converted to an unsigned byte. */
+   if (templ->alpha.enabled) {
+      zstencil->cc3.alpha_test = 1;
+      zstencil->cc3.alpha_test_func = brw_translate_compare_func(templ->alpha.func);
+      zstencil->cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+      zstencil->cc7.alpha_ref.ub[0] = float_to_ubyte(templ->alpha.ref_value);
+   }
+
+   /* Stencil: nothing more to do unless the front face is enabled. */
+   if (!templ->stencil[0].enabled)
+      return;
+
+   zstencil->cc0.stencil_enable = 1;
+   zstencil->cc0.stencil_func = brw_translate_compare_func(templ->stencil[0].func);
+   zstencil->cc0.stencil_fail_op = translate_stencil_op(templ->stencil[0].fail_op);
+   zstencil->cc0.stencil_pass_depth_fail_op = translate_stencil_op(templ->stencil[0].zfail_op);
+   zstencil->cc0.stencil_pass_depth_pass_op = translate_stencil_op(templ->stencil[0].zpass_op);
+   zstencil->cc1.stencil_write_mask = templ->stencil[0].writemask;
+   zstencil->cc1.stencil_test_mask = templ->stencil[0].valuemask;
+
+   /* Back-face state is only looked at when the front face is enabled. */
+   if (templ->stencil[1].enabled) {
+      zstencil->cc0.bf_stencil_enable = 1;
+      zstencil->cc0.bf_stencil_func = brw_translate_compare_func(templ->stencil[1].func);
+      zstencil->cc0.bf_stencil_fail_op = translate_stencil_op(templ->stencil[1].fail_op);
+      zstencil->cc0.bf_stencil_pass_depth_fail_op = translate_stencil_op(templ->stencil[1].zfail_op);
+      zstencil->cc0.bf_stencil_pass_depth_pass_op = translate_stencil_op(templ->stencil[1].zpass_op);
+      zstencil->cc2.bf_stencil_write_mask = templ->stencil[1].writemask;
+      zstencil->cc2.bf_stencil_test_mask = templ->stencil[1].valuemask;
+   }
+
+   zstencil->cc0.stencil_write_enable = (zstencil->cc1.stencil_write_mask ||
+                                         zstencil->cc2.bf_stencil_write_mask);
+}
+
+/* Derive the IZ_* lookup bits from the alpha, depth and stencil enables
+ * already stored in 'zstencil' and OR them into zstencil->iz_lookup. */
+static void create_wm_iz_state( struct brw_depth_stencil_state *zstencil )
+{
+   unsigned bits = 0;
+   if (zstencil->cc0.stencil_write_enable)
+      bits |= IZ_STENCIL_WRITE_ENABLE_BIT;
+   if (zstencil->cc0.stencil_enable)
+      bits |= IZ_STENCIL_TEST_ENABLE_BIT;
+   if (zstencil->cc2.depth_write_enable)
+      bits |= IZ_DEPTH_WRITE_ENABLE_BIT;
+   if (zstencil->cc2.depth_test)
+      bits |= IZ_DEPTH_TEST_ENABLE_BIT;
+   if (zstencil->cc3.alpha_test)
+      bits |= IZ_PS_KILL_ALPHATEST_BIT;
+
+   zstencil->iz_lookup |= bits;
+}
+
+
+/* Create a depth/stencil/alpha CSO from 'templ'; returns NULL if allocation fails. */
+static void *brw_create_depth_stencil_state( struct pipe_context *pipe,
+                                const struct pipe_depth_stencil_alpha_state *templ )
+{
+   struct brw_depth_stencil_state *zstencil = CALLOC_STRUCT(brw_depth_stencil_state);
+   if (zstencil == NULL)   /* the helpers below dereference it unconditionally */
+      return NULL;
+   create_bcc_state( zstencil, templ );   /* cc0..cc7 fields from the template */
+   create_wm_iz_state( zstencil );        /* IZ lookup bits derived from those fields */
+   return (void *)zstencil;
+}
+
+
+/* Make 'cso' the current depth/stencil/alpha state and mark it dirty. */
+static void brw_bind_depth_stencil_state(struct pipe_context *pipe, void *cso)
+{
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.mesa |= PIPE_NEW_DEPTH_STENCIL_ALPHA;
+   ctx->curr.zstencil = (const struct brw_depth_stencil_state *)cso;
+}
+
+/* Free a depth/stencil/alpha CSO; asserts it is not the currently bound one. */
+static void brw_delete_depth_stencil_state(struct pipe_context *pipe, void *cso)
+{
+   struct brw_context *brw = brw_context(pipe);
+   assert((const void *)cso != (const void *)brw->curr.zstencil);
+   FREE(cso);
+}
+
+/* Record front [0] / back [1] stencil refs; sets PIPE_NEW_DEPTH_STENCIL_ALPHA. */
+static void brw_set_stencil_ref(struct pipe_context *pipe,
+                                const struct pipe_stencil_ref *stencil_ref)
+{
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.mesa |= PIPE_NEW_DEPTH_STENCIL_ALPHA;
+   ctx->curr.cc1_stencil_ref.bf_stencil_ref = stencil_ref->ref_value[1];
+   ctx->curr.cc1_stencil_ref.stencil_ref = stencil_ref->ref_value[0];
+}
+
+static void
+brw_set_sample_mask(struct pipe_context *pipe,
+                    unsigned sample_mask)
+{  /* No-op: 'sample_mask' is accepted and ignored. */
+}
+
+void brw_pipe_depth_stencil_init( struct brw_context *brw )
+{  /* Install the depth/stencil/alpha CSO hooks and related setters. */
+   brw->base.create_depth_stencil_alpha_state = brw_create_depth_stencil_state;
+   brw->base.bind_depth_stencil_alpha_state = brw_bind_depth_stencil_state;
+   brw->base.delete_depth_stencil_alpha_state = brw_delete_depth_stencil_state;
+   brw->base.set_stencil_ref = brw_set_stencil_ref;
+   brw->base.set_sample_mask = brw_set_sample_mask;
+}
+
+void brw_pipe_depth_stencil_cleanup( struct brw_context *brw )
+{  /* No-op: the body is empty, nothing is released here. */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
new file mode 100644
index 0000000000..a90b7c73f6
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -0,0 +1,84 @@
+#include "util/u_math.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "brw_context.h"
+
+/**
+ * Update the context's copy of the framebuffer state, taking references on
+ * changed surfaces and setting a dirty flag only for what actually changed.
+ */
+static void brw_set_framebuffer_state( struct pipe_context *pipe,
+                                       const struct pipe_framebuffer_state *fb )
+{
+   struct brw_context *brw = brw_context(pipe);
+   const unsigned nr_cbufs = MIN2(BRW_MAX_DRAW_BUFFERS, fb->nr_cbufs);
+   unsigned i;
+
+   /* Dimensions: */
+   if (brw->curr.fb.width != fb->width ||
+       brw->curr.fb.height != fb->height) {
+      brw->curr.fb.width = fb->width;
+      brw->curr.fb.height = fb->height;
+      brw->state.dirty.mesa |= PIPE_NEW_FRAMEBUFFER_DIMENSIONS;
+   }
+
+   /* Z/Stencil: */
+   if (brw->curr.fb.zsbuf != fb->zsbuf) {
+      pipe_surface_reference(&brw->curr.fb.zsbuf, fb->zsbuf);
+      brw->state.dirty.mesa |= PIPE_NEW_DEPTH_BUFFER;
+   }
+
+   /* Color buffers: every slot is compared, not just the first nr_cbufs. */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      if (brw->curr.fb.cbufs[i] != fb->cbufs[i]) {
+         brw->state.dirty.mesa |= PIPE_NEW_COLOR_BUFFERS;
+         pipe_surface_reference(&brw->curr.fb.cbufs[i], fb->cbufs[i]);
+      }
+   }
+
+   /* Compare the clamped count, so an oversized request is not seen as a change each call. */
+   if (brw->curr.fb.nr_cbufs != nr_cbufs) {
+      brw->curr.fb.nr_cbufs = nr_cbufs;
+      brw->state.dirty.mesa |= PIPE_NEW_NR_CBUFS;
+   }
+}
+
+
+/* Store the viewport and derive the depth range (translate[2] -/+ scale[2])
+ * kept in curr.ccv; marks PIPE_NEW_VIEWPORT dirty.
+ */
+static void brw_set_viewport_state( struct pipe_context *pipe,
+                                    const struct pipe_viewport_state *viewport )
+{
+   struct brw_context *ctx = brw_context(pipe);
+
+   ctx->state.dirty.mesa |= PIPE_NEW_VIEWPORT;
+   ctx->curr.viewport = *viewport;
+   ctx->curr.ccv.max_depth = viewport->scale[2] *  1.0 + viewport->translate[2];
+   ctx->curr.ccv.min_depth = viewport->scale[2] * -1.0 + viewport->translate[2];
+
+   if (0)   /* debugging aid: compiled, never executed */
+      debug_printf("%s depth range %f .. %f\n", __FUNCTION__,
+                   ctx->curr.ccv.min_depth, ctx->curr.ccv.max_depth);
+}
+
+
+void brw_pipe_framebuffer_init( struct brw_context *brw )
+{  /* Install the framebuffer and viewport state setters. */
+   brw->base.set_viewport_state = brw_set_viewport_state;
+   brw->base.set_framebuffer_state = brw_set_framebuffer_state;
+}
+
+/* Drop the context's references on every colour buffer, then on the zsbuf. */
+void brw_pipe_framebuffer_cleanup( struct brw_context *brw )
+{
+   unsigned slot = 0;
+
+   while (slot < PIPE_MAX_COLOR_BUFS) {
+      pipe_surface_reference(&brw->curr.fb.cbufs[slot], NULL);
+      slot++;
+   }
+   pipe_surface_reference(&brw->curr.fb.zsbuf, NULL);
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
new file mode 100644
index 0000000000..0ae1a6be9e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -0,0 +1,57 @@
+
+#include "brw_context.h"
+#include "brw_batchbuffer.h"
+
+#include "util/u_upload_mgr.h"
+
+
+
+
+/* All batchbuffer flushes must go through this function.
+ * (query end, upload flush, batch flush, then all state marked dirty) */
+void brw_context_flush( struct brw_context *brw )
+{
+   /* Emit the closing PS_DEPTH_COUNT write for an open query pair before
+    * the batch is flushed; this returns early when no pair is open.
+    */
+   brw_emit_query_end(brw);
+
+   /* Move to the end of the current upload buffer so that we'll force choosing
+    * a new buffer next time.
+    */
+   u_upload_flush( brw->vb.upload_vertex );
+   u_upload_flush( brw->vb.upload_index );
+
+   _brw_batchbuffer_flush( brw->batch, __FILE__, __LINE__ );
+
+   /* Mark all context state as needing to be re-emitted.
+    * This is probably not as severe as on 915, since almost all of our state
+    * is just in referenced buffers.
+    */
+   brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+   brw->state.dirty.mesa |= ~0;
+   brw->state.dirty.brw |= ~0;   /* sets every bit, BRW_NEW_CONTEXT included */
+   brw->state.dirty.cache |= ~0;
+
+   brw->curbe.need_new_bo = GL_TRUE;
+}
+
+/* Flush hook: 'flags' is ignored and no fence is created (*fence = NULL). */
+static void
+brw_flush( struct pipe_context *pipe, unsigned flags,
+           struct pipe_fence_handle **fence )
+{
+   brw_context_flush( brw_context( pipe ) );
+   if (fence != NULL)
+      *fence = NULL;
+}
+
+void brw_pipe_flush_init( struct brw_context *brw )
+{  /* Install brw_flush as the pipe_context flush hook. */
+   brw->base.flush = brw_flush;
+}
+
+
+void brw_pipe_flush_cleanup( struct brw_context *brw )
+{  /* No-op: the body is empty, nothing is released here. */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_misc.c b/src/gallium/drivers/i965/brw_pipe_misc.c
new file mode 100644
index 0000000000..3035907807
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_misc.c
@@ -0,0 +1,54 @@
+
+#include "brw_context.h"
+#include "brw_structs.h"
+#include "brw_defines.h"
+
+/* Rebuild the cached polygon-stipple packet from 'stip' (32 words, copied
+ * as-is) and mark PIPE_NEW_POLYGON_STIPPLE dirty. */
+static void brw_set_polygon_stipple( struct pipe_context *pipe,
+                                     const struct pipe_poly_stipple *stip )
+{
+   struct brw_context *ctx = brw_context(pipe);
+   struct brw_polygon_stipple *packet = &ctx->curr.bps;
+   GLuint row = 0;
+
+   memset(packet, 0, sizeof *packet);
+   packet->header.length = sizeof *packet/4-2;
+   packet->header.opcode = CMD_POLY_STIPPLE_PATTERN;
+   for (; row < 32; row++)
+      packet->stipple[row] = stip->stipple[row]; /* don't invert */
+   ctx->state.dirty.mesa |= PIPE_NEW_POLYGON_STIPPLE;
+}
+
+
+/* Copy the scissor state into the context and mark PIPE_NEW_SCISSOR dirty. */
+static void brw_set_scissor_state( struct pipe_context *pipe,
+                                   const struct pipe_scissor_state *scissor )
+{
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.mesa |= PIPE_NEW_SCISSOR;
+   ctx->curr.scissor = *scissor;
+}
+
+
+/* Copy the clip state into curr.ucp and mark PIPE_NEW_CLIP dirty. */
+static void brw_set_clip_state( struct pipe_context *pipe,
+                                const struct pipe_clip_state *clip )
+{
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.mesa |= PIPE_NEW_CLIP;
+   ctx->curr.ucp = *clip;
+}
+
+
+void brw_pipe_misc_init( struct brw_context *brw )
+{  /* Install the polygon-stipple, scissor and clip state setters. */
+   brw->base.set_clip_state = brw_set_clip_state;
+   brw->base.set_scissor_state = brw_set_scissor_state;
+   brw->base.set_polygon_stipple = brw_set_polygon_stipple;
+}
+
+
+void brw_pipe_misc_cleanup( struct brw_context *brw )
+{  /* No-op: the body is empty, nothing is released here. */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
new file mode 100644
index 0000000000..0745254c3c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file support for ARB_query_object
+ *
+ * ARB_query_object is implemented by using the PIPE_CONTROL command to stall
+ * execution on the completion of previous depth tests, and write the
+ * current PS_DEPTH_COUNT to a buffer object.
+ *
+ * We use before and after counts when drawing during a query so that
+ * we don't pick up other clients' query data in ours.  To reduce overhead,
+ * a single BO is used to record the query data for all active queries at
+ * once.  This also gives us a simple bound on how much batchbuffer space is
+ * required for handling queries, so that we can be sure that we won't
+ * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
+ */
+#include "util/u_simple_list.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+#include "brw_reg.h"
+
+/** Totals the results for this query from its BO, if it still has one.
+ * Returns FALSE without touching *vresult when the BO is busy and 'wait'
+ * is FALSE, or when mapping fails; otherwise stores the running total as a
+ * uint64_t in *vresult and returns TRUE.  The BO is released once read.
+ */
+static boolean
+brw_query_get_result(struct pipe_context *pipe,
+                     struct pipe_query *q,
+                     boolean wait,
+                     void *vresult)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+   uint64_t *counters;
+   int slot;
+
+   if (query->bo != NULL) {
+      if (brw->sws->bo_is_busy(query->bo) && !wait)
+         return FALSE;
+      counters = bo_map_read(brw->sws, query->bo);
+      if (counters == NULL)
+         return FALSE;
+
+      /* Each index owns a (begin, end) pair of counter snapshots. */
+      for (slot = query->first_index; slot <= query->last_index; slot++)
+         query->result += counters[slot * 2 + 1] - counters[slot * 2];
+
+      brw->sws->bo_unmap(query->bo);
+      bo_reference(&query->bo, NULL);
+   }
+
+   *(uint64_t *)vresult = query->result;
+   return TRUE;
+}
+
+/* Allocate a query object with CALLOC_STRUCT.  Only
+ * PIPE_QUERY_OCCLUSION_COUNTER is supported; any other type, or an
+ * allocation failure, yields NULL. */
+static struct pipe_query *
+brw_query_create(struct pipe_context *pipe, unsigned type )
+{
+   struct brw_query_object *obj;
+
+   if (type != PIPE_QUERY_OCCLUSION_COUNTER)
+      return NULL;
+
+   /* A failed allocation is NULL already, so the cast result can be
+    * returned directly. */
+   obj = CALLOC_STRUCT( brw_query_object );
+   return (struct pipe_query *)obj;
+}
+
+/* Release the query's BO reference (if any) and free the object. */
+static void
+brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_query_object *obj = (struct brw_query_object *)q;
+   bo_reference(&obj->bo, NULL);
+   FREE(obj);
+}
+
+/* Start a query: reset its tracking state, link it into the context's
+ * active list, bump stats_wm and mark PIPE_NEW_QUERY dirty. */
+static void
+brw_query_begin(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_query_object *obj = (struct brw_query_object *)q;
+   struct brw_context *ctx = brw_context(pipe);
+
+   bo_reference(&obj->bo, NULL);
+   obj->last_index = -1;
+   obj->first_index = -1;
+   obj->result = 0;
+   insert_at_head(&ctx->query.active_head, obj);
+   ctx->query.stats_wm += 1;
+   ctx->state.dirty.mesa |= PIPE_NEW_QUERY;
+}
+
+/* Finish a query: flush pending counter writes if it used a BO, then
+ * unlink it from the active list and mark PIPE_NEW_QUERY dirty. */
+static void
+brw_query_end(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct brw_query_object *obj = (struct brw_query_object *)q;
+   struct brw_context *ctx = brw_context(pipe);
+
+   if (obj->bo != NULL) {
+      /* Flush the batchbuffer in case it has writes to our query BO, and
+       * make later queries use a new BO so further rendering doesn't
+       * delay the collection of our results. */
+      brw_emit_query_end(ctx);
+      brw_context_flush( ctx );
+      bo_reference(&ctx->query.bo, NULL);
+   }
+
+   remove_from_list(obj);
+   ctx->query.stats_wm -= 1;
+   ctx->state.dirty.mesa |= PIPE_NEW_QUERY;
+}
+
+/***********************************************************************
+ * Internal functions and callbacks to implement queries 
+ */
+
+/** Called to set up the query BO and account for its aperture space.
+ * With no active query this is a no-op returning PIPE_OK.  A new 4096-byte
+ * BO is allocated when none exists or the next counter pair would not fit;
+ * a failed allocation is returned to the caller.
+ */
+enum pipe_error
+brw_prepare_query_begin(struct brw_context *brw)
+{
+   enum pipe_error err;
+
+   if (!is_empty_list(&brw->query.active_head)) {
+      /* The BO is 4096 bytes of uint64_t counters, two per query index. */
+      if (brw->query.bo == NULL ||
+          brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
+         err = brw->sws->bo_alloc(brw->sws, BRW_BUFFER_TYPE_QUERY, 4096, 1,
+                                  &brw->query.bo);
+         if (err)
+            return err;
+         brw->query.index = 0;
+      }
+
+      brw_add_validated_bo(brw, brw->query.bo);
+   }
+
+   return PIPE_OK;
+}
+
+/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT.
+ * Writes it to slot index*2 of the query BO and attaches that BO to every
+ * active query. */
+void
+brw_emit_query_begin(struct brw_context *brw)
+{
+   struct brw_query_object *query;
+
+   /* Skip if we're not doing any queries, or we've emitted the start. */
+   if (brw->query.active || is_empty_list(&brw->query.active_head))
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+             PIPE_CONTROL_DEPTH_STALL |
+             PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   /* This object could be mapped cacheable, but we don't have an exposed
+    * mechanism to support that.  Since it's going uncached, tell GEM that
+    * we're writing to it.  The usual clflush should be all that's required
+    * to pick up the results.
+    */
+   OUT_RELOC(brw->query.bo,
+             BRW_USAGE_QUERY_RESULT,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             ((brw->query.index * 2) * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+
+   foreach(query, &brw->query.active_head) {
+      if (query->bo != brw->query.bo) {
+         uint64_t tmp;
+         /* Propagate the results from this buffer to all of the active
+          * queries, as the bo is going away.  Wait for it: a non-blocking
+          * read of a busy BO would drop those samples from the total.
+          */
+         if (query->bo != NULL)
+            brw_query_get_result( &brw->base, (struct pipe_query *)query,
+                                  TRUE, &tmp );
+
+         bo_reference( &query->bo, brw->query.bo );
+         query->first_index = brw->query.index;
+      }
+      query->last_index = brw->query.index;
+   }
+   brw->query.active = GL_TRUE;
+}
+
+/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT.  No-op
+ * unless a begin count is outstanding; afterwards the index advances. */
+void
+brw_emit_query_end(struct brw_context *brw)
+{
+   if (brw->query.active) {
+      BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+                PIPE_CONTROL_DEPTH_STALL |
+                PIPE_CONTROL_WRITE_DEPTH_COUNT);
+      /* The end count lands in slot index*2 + 1, after the begin count. */
+      OUT_RELOC(brw->query.bo,
+                BRW_USAGE_QUERY_RESULT,
+                PIPE_CONTROL_GLOBAL_GTT_WRITE |
+                ((brw->query.index * 2 + 1) * sizeof(uint64_t)));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+      brw->query.index += 1;
+      brw->query.active = GL_FALSE;
+   }
+}
+
+void brw_pipe_query_init( struct brw_context *brw )
+{  /* Install the query create/destroy/begin/end/result hooks. */
+   brw->base.get_query_result = brw_query_get_result;
+   brw->base.end_query = brw_query_end;
+   brw->base.begin_query = brw_query_begin;
+   brw->base.destroy_query = brw_query_destroy;
+   brw->base.create_query = brw_query_create;
+}
+
+
+void brw_pipe_query_cleanup( struct brw_context *brw )
+{
+   /* Drop the context's reference on the shared query BO so it is not leaked. */
+   bo_reference(&brw->query.bo, NULL);
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
new file mode 100644
index 0000000000..4c1a6d7dcd
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -0,0 +1,188 @@
+
+#include "util/u_memory.h"
+#include "pipe/p_defines.h"
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_pipe_rast.h"
+#include "brw_wm.h"
+
+
+/* Map a PIPE_POLYGON_MODE_* value to the clipper's CLIP_* fill mode;
+ * unknown values assert and fall back to CLIP_FILL. */
+static unsigned translate_fill( unsigned fill )
+{
+   if (fill == PIPE_POLYGON_MODE_FILL)
+      return CLIP_FILL;
+   if (fill == PIPE_POLYGON_MODE_LINE)
+      return CLIP_LINE;
+   if (fill == PIPE_POLYGON_MODE_POINT)
+      return CLIP_POINT;
+
+   assert(0);
+   return CLIP_FILL;
+}
+
+
+/* Calculates the key for triangle-mode clipping.  Non-triangle
+ * clipping keys use much less information and are computed on the
+ * fly.
+ *
+ * The key is zeroed first, so anything not set below stays 0.  'rast' is
+ * accepted but not consulted.
+ */
+static void
+calculate_clip_key_rast( const struct brw_context *brw,
+                         const struct pipe_rasterizer_state *templ,
+                         const struct brw_rasterizer_state *rast,
+                         struct brw_clip_prog_key *key)
+{
+   unsigned front_fill = CLIP_CULL;
+   unsigned back_fill = CLIP_CULL;
+
+   memset(key, 0, sizeof *key);
+
+   /* The starting clip mode depends on the chipset. */
+   key->clip_mode = brw->chipset.is_igdng ? BRW_CLIPMODE_KERNEL_CLIP
+                                          : BRW_CLIPMODE_NORMAL;
+
+   key->do_flat_shading = templ->flatshade;
+
+   /* Culling both faces rejects everything; the rest of the key stays
+    * zero. */
+   if (templ->cull_face == PIPE_FACE_FRONT_AND_BACK) {
+      key->clip_mode = BRW_CLIPMODE_REJECT_ALL;
+      return;
+   }
+
+   /* A face that is not culled uses its own polygon mode. */
+   if (!(templ->cull_face & PIPE_FACE_FRONT))
+      front_fill = translate_fill(templ->fill_front);
+
+   if (!(templ->cull_face & PIPE_FACE_BACK))
+      back_fill = translate_fill(templ->fill_back);
+
+   /* front_ccw decides which winding counts as the front face. */
+   if (templ->front_ccw) {
+      key->fill_ccw = front_fill;
+      key->fill_cw = back_fill;
+   }
+   else {
+      key->fill_cw = front_fill;
+      key->fill_ccw = back_fill;
+   }
+
+   /* Any line or point fill mode switches to the unfilled path. */
+   if (key->fill_cw == CLIP_LINE || key->fill_cw == CLIP_POINT ||
+       key->fill_ccw == CLIP_LINE || key->fill_ccw == CLIP_POINT) {
+      key->do_unfilled = 1;
+      key->clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+   }
+
+   /* Clockwise winding: copy the offset_* flag matching its fill mode. */
+   if (key->fill_cw == CLIP_POINT)
+      key->offset_cw = templ->offset_point;
+   else if (key->fill_cw == CLIP_LINE)
+      key->offset_cw = templ->offset_line;
+   else if (key->fill_cw == CLIP_FILL)
+      key->offset_cw = templ->offset_tri;
+
+   /* Counter-clockwise winding: likewise. */
+   if (key->fill_ccw == CLIP_POINT)
+      key->offset_ccw = templ->offset_point;
+   else if (key->fill_ccw == CLIP_LINE)
+      key->offset_ccw = templ->offset_line;
+   else if (key->fill_ccw == CLIP_FILL)
+      key->offset_ccw = templ->offset_tri;
+
+   /* With two-sided lighting, copy_bfc_* is set for each winding that is
+    * not culled.
+    */
+   if (templ->light_twoside) {
+      key->copy_bfc_cw = (key->fill_cw != CLIP_CULL);
+      key->copy_bfc_ccw = (key->fill_ccw != CLIP_CULL);
+   }
+}
+
+
+/* Fill the line-stipple packet: pattern, factor + 1, and its inverse * (1<<13). */
+static void
+calculate_line_stipple_rast( const struct pipe_rasterizer_state *templ,
+                             struct brw_line_stipple *bls )
+{
+   GLfloat inverse = 1.0f / (templ->line_stipple_factor + 1);
+   GLint inverse_fixed = inverse * (1<<13);
+   bls->bits1.inverse_repeat_count = inverse_fixed;
+   bls->bits1.repeat_count = templ->line_stipple_factor + 1;
+   bls->bits0.pattern = templ->line_stipple_pattern;
+   bls->header.length = sizeof(*bls)/4 - 2;
+   bls->header.opcode = CMD_LINE_STIPPLE_PATTERN;
+}
+
+/* Create a rasterizer CSO: keeps a copy of 'templ' and precomputes the clip
+ * key, the line-stipple packet (only when stippling is enabled) and the
+ * unfilled_aa_line value used for the WM IZ table lookup.
+ * Returns NULL if the allocation fails.
+ */
+static void *brw_create_rasterizer_state( struct pipe_context *pipe,
+                                          const struct pipe_rasterizer_state *templ )
+{
+   struct brw_context *ctx = brw_context(pipe);
+   struct brw_rasterizer_state *cso = CALLOC_STRUCT(brw_rasterizer_state);
+   unsigned nr_line_faces = 0;
+
+   if (cso == NULL)
+      return NULL;
+
+   cso->templ = *templ;
+   calculate_clip_key_rast( ctx, templ, cso, &cso->clip_key );
+
+   if (templ->line_stipple_enable)
+      calculate_line_stipple_rast( templ, &cso->bls );
+
+   /* Calculate lookup value for WM IZ table: with line smoothing on, it
+    * depends on how many faces are drawn as lines.
+    */
+   if (templ->fill_front == PIPE_POLYGON_MODE_LINE)
+      nr_line_faces++;
+   if (templ->fill_back == PIPE_POLYGON_MODE_LINE)
+      nr_line_faces++;
+
+   if (!templ->line_smooth || nr_line_faces == 0)
+      cso->unfilled_aa_line = AA_NEVER;
+   else if (nr_line_faces == 2)
+      cso->unfilled_aa_line = AA_ALWAYS;
+   else
+      cso->unfilled_aa_line = AA_SOMETIMES;
+
+   return (void *)cso;
+}
+
+
+/* Make 'cso' the current rasterizer state and set PIPE_NEW_RAST in dirty.mesa. */
+static void brw_bind_rasterizer_state(struct pipe_context *pipe, void *cso)
+{
+   struct brw_context *ctx = brw_context(pipe);
+   ctx->state.dirty.mesa |= PIPE_NEW_RAST;
+   ctx->curr.rast = (const struct brw_rasterizer_state *)cso;
+}
+
+/* Free a rasterizer CSO; asserts that it is not the currently bound one. */
+static void brw_delete_rasterizer_state(struct pipe_context *pipe, void *cso)
+{
+   struct brw_context *brw = brw_context(pipe);
+   assert((const void *)cso != (const void *)brw->curr.rast);
+   FREE(cso);
+}
+
+
+
+void brw_pipe_rast_init( struct brw_context *brw )
+{  /* Install the rasterizer CSO create/bind/delete hooks. */
+   brw->base.delete_rasterizer_state = brw_delete_rasterizer_state;
+   brw->base.bind_rasterizer_state = brw_bind_rasterizer_state;
+   brw->base.create_rasterizer_state = brw_create_rasterizer_state;
+}
+
+void brw_pipe_rast_cleanup( struct brw_context *brw )
+{  /* No-op: the body is empty, nothing is released here. */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.h b/src/gallium/drivers/i965/brw_pipe_rast.h
new file mode 100644
index 0000000000..9354f01e18
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.h
@@ -0,0 +1,16 @@
+#ifndef BRW_PIPE_RAST_H
+#define BRW_PIPE_RAST_H
+
+#include "brw_clip.h"
+
+/* Driver-side rasterizer CSO: a copy of the gallium template plus
+ * values derived from it once, at creation time.
+ */
+struct brw_rasterizer_state {
+   struct pipe_rasterizer_state templ; /* verbatim copy of the template; for draw module */
+
+   /* Precalculated hardware state:
+    */
+   struct brw_clip_prog_key clip_key;  /* filled by calculate_clip_key_rast() */
+   struct brw_line_stipple bls;        /* only computed when line_stipple_enable is set */
+   unsigned unfilled_aa_line;          /* AA_NEVER / AA_SOMETIMES / AA_ALWAYS, for the WM IZ table lookup */
+};
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
new file mode 100644
index 0000000000..3fe753ec42
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -0,0 +1,259 @@
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+
+
+/**
+ * Map a PIPE_TEX_WRAP_x mode to the matching BRW_TEXCOORDMODE_x value.
+ *
+ * The brw (and related graphics cores) do not support GL_CLAMP.  The
+ * Intel drivers for "other operating systems" implement GL_CLAMP as
+ * GL_CLAMP_TO_EDGE, so the same is done here.  All three mirror-clamp
+ * variants collapse to MIRROR_ONCE, and unrecognised values fall back
+ * to plain wrapping.
+ */
+static GLuint translate_wrap_mode( unsigned wrap )
+{
+   GLuint mode;
+
+   switch( wrap ) {
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      mode = BRW_TEXCOORDMODE_CLAMP;
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      mode = BRW_TEXCOORDMODE_MIRROR;
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      mode = BRW_TEXCOORDMODE_MIRROR_ONCE;
+      break;
+
+   case PIPE_TEX_WRAP_REPEAT:
+   default:
+      mode = BRW_TEXCOORDMODE_WRAP;
+      break;
+   }
+
+   return mode;
+}
+
+/**
+ * Map a PIPE_TEX_FILTER_x min/mag filter to BRW_MAPFILTER_x.
+ * Unknown values assert and fall back to nearest filtering.
+ */
+static GLuint translate_img_filter( unsigned filter )
+{
+   if (filter == PIPE_TEX_FILTER_NEAREST)
+      return BRW_MAPFILTER_NEAREST;
+
+   if (filter == PIPE_TEX_FILTER_LINEAR)
+      return BRW_MAPFILTER_LINEAR;
+
+   assert(0);
+   return BRW_MAPFILTER_NEAREST;
+}
+
+/**
+ * Map a PIPE_TEX_MIPFILTER_x value to BRW_MIPFILTER_x.
+ * Unknown values assert and fall back to no mip filtering.
+ */
+static GLuint translate_mip_filter( unsigned filter )
+{
+   GLuint hw_filter = BRW_MIPFILTER_NONE;
+
+   switch (filter) {
+   case PIPE_TEX_MIPFILTER_NONE:
+      break;
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      hw_filter = BRW_MIPFILTER_NEAREST;
+      break;
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      hw_filter = BRW_MIPFILTER_LINEAR;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   return hw_filter;
+}
+
+/**
+ * Map a PIPE_FUNC_x shadow comparison to a BRW_COMPAREFUNCTION_x value.
+ *
+ * XXX: not sure why there are special translations for the shadow tex
+ * compare functions.  Every function is swapped with a counterpart
+ * (NEVER <-> ALWAYS, LESS <-> LEQUAL, GREATER <-> GEQUAL,
+ * EQUAL <-> NOTEQUAL); in particular ALWAYS is translated to NEVER.
+ * Is this a hardware issue?  Does i965 really suffer from this?
+ *
+ * Unknown values assert and yield NEVER.
+ */
+static GLuint translate_shadow_compare_func( unsigned func )
+{
+   GLuint hw_func;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:    hw_func = BRW_COMPAREFUNCTION_ALWAYS;   break;
+   case PIPE_FUNC_LESS:     hw_func = BRW_COMPAREFUNCTION_LEQUAL;   break;
+   case PIPE_FUNC_LEQUAL:   hw_func = BRW_COMPAREFUNCTION_LESS;     break;
+   case PIPE_FUNC_GREATER:  hw_func = BRW_COMPAREFUNCTION_GEQUAL;   break;
+   case PIPE_FUNC_GEQUAL:   hw_func = BRW_COMPAREFUNCTION_GREATER;  break;
+   case PIPE_FUNC_NOTEQUAL: hw_func = BRW_COMPAREFUNCTION_EQUAL;    break;
+   case PIPE_FUNC_EQUAL:    hw_func = BRW_COMPAREFUNCTION_NOTEQUAL; break;
+   case PIPE_FUNC_ALWAYS:   hw_func = BRW_COMPAREFUNCTION_NEVER;    break;
+   default:
+      assert(0);
+      hw_func = BRW_COMPAREFUNCTION_NEVER;
+      break;
+   }
+
+   return hw_func;
+}
+
+
+
+
+/**
+ * Create a sampler CSO by translating a gallium sampler template into
+ * the bitfields of a struct brw_sampler.
+ *
+ * \param pipe      context the CSO is created on (not otherwise used here)
+ * \param template  gallium sampler description
+ * \return  the new, zero-initialised-then-filled brw_sampler, or NULL
+ *          when the allocation fails
+ */
+static void *
+brw_create_sampler_state( struct pipe_context *pipe,
+                          const struct pipe_sampler_state *template )
+{
+   struct brw_sampler *sampler = CALLOC_STRUCT(brw_sampler);
+
+   /* Out of memory: report failure instead of writing through NULL.
+    */
+   if (sampler == NULL)
+      return NULL;
+
+   sampler->ss0.min_filter = translate_img_filter( template->min_img_filter );
+   sampler->ss0.mag_filter = translate_img_filter( template->mag_img_filter );
+   sampler->ss0.mip_filter = translate_mip_filter( template->min_mip_filter );
+
+
+   /* XXX: anisotropy logic slightly changed: 
+    * any max_anisotropy above 1 overrides both image filters.
+    */
+   if (template->max_anisotropy > 1) {
+      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; 
+      sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
+
+      sampler->ss3.max_aniso = MIN2((template->max_anisotropy - 2) / 2,
+                                    BRW_ANISORATIO_16);
+   }
+
+   sampler->ss1.r_wrap_mode = translate_wrap_mode(template->wrap_r);
+   sampler->ss1.s_wrap_mode = translate_wrap_mode(template->wrap_s);
+   sampler->ss1.t_wrap_mode = translate_wrap_mode(template->wrap_t);
+
+   /* Set LOD bias: clamped to [-16, 15], signed fixed point with
+    * 6 fractional bits.
+    */
+   sampler->ss0.lod_bias = 
+      util_signed_fixed(CLAMP(template->lod_bias, -16, 15), 6);
+
+
+   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+   /* Set shadow function: 
+    */
+   if (template->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+
+      /* Shadowing is "enabled" by emitting a particular sampler
+       * message (sample_c).  So need to recompile WM program when
+       * shadow comparison is enabled on each/any texture unit.
+       */
+      sampler->ss0.shadow_function =
+	 translate_shadow_compare_func(template->compare_func);
+   }
+
+   /* Set BaseMipLevel, MaxLOD, MinLOD: 
+    * both LOD limits are clamped to [0, 13], unsigned fixed point
+    * with 6 fractional bits.
+    */
+   sampler->ss0.base_level = 
+      util_unsigned_fixed(0, 1);
+
+   sampler->ss1.max_lod = 
+      util_unsigned_fixed(CLAMP(template->max_lod, 0, 13), 6);
+
+   sampler->ss1.min_lod = 
+      util_unsigned_fixed(CLAMP(template->min_lod, 0, 13), 6);
+
+   return (void *)sampler;
+}
+
+/**
+ * Bind the first 'num' fragment sampler CSOs, clear any slots that
+ * were bound before but are beyond 'num', and set PIPE_NEW_SAMPLERS.
+ */
+static void brw_bind_sampler_state(struct pipe_context *pipe,
+                                   unsigned num, void **sampler)
+{
+   struct brw_context *brw = brw_context(pipe);
+   int slot = 0;
+
+   /* Install the new bindings. */
+   while (slot < num) {
+      brw->curr.sampler[slot] = sampler[slot];
+      slot++;
+   }
+
+   /* Drop stale bindings left over from a larger previous set. */
+   for (slot = num; slot < brw->curr.num_samplers; slot++) {
+      brw->curr.sampler[slot] = NULL;
+   }
+
+   brw->curr.num_samplers = num;
+   brw->state.dirty.mesa |= PIPE_NEW_SAMPLERS;
+}
+
+/**
+ * Free a sampler CSO created by brw_create_sampler_state().
+ */
+static void brw_delete_sampler_state(struct pipe_context *pipe,
+                                     void *cso)
+{
+   (void) pipe;
+   FREE(cso);
+}
+
+/**
+ * Take references on the first 'num' fragment sampler views, release
+ * any previously bound views beyond 'num', and set
+ * PIPE_NEW_BOUND_TEXTURES.
+ */
+static void brw_set_fragment_sampler_views(struct pipe_context *pipe,
+                                           unsigned num,
+                                           struct pipe_sampler_view **views)
+{
+   struct brw_context *brw = brw_context(pipe);
+   int slot = 0;
+
+   /* Reference the incoming views (releases whatever was there). */
+   while (slot < num) {
+      pipe_sampler_view_reference(&brw->curr.fragment_sampler_views[slot],
+                                  views[slot]);
+      slot++;
+   }
+
+   /* Unreference slots that are no longer in use. */
+   for (slot = num; slot < brw->curr.num_fragment_sampler_views; slot++) {
+      pipe_sampler_view_reference(&brw->curr.fragment_sampler_views[slot],
+                                  NULL);
+   }
+
+   brw->curr.num_fragment_sampler_views = num;
+   brw->state.dirty.mesa |= PIPE_NEW_BOUND_TEXTURES;
+}
+
+/**
+ * Vertex sampler views: intentionally a no-op stub, the arguments are
+ * ignored.
+ */
+static void brw_set_vertex_sampler_views(struct pipe_context *pipe,
+                                         unsigned num,
+                                         struct pipe_sampler_view **views)
+{
+   (void) pipe;
+   (void) num;
+   (void) views;
+}
+
+/**
+ * Vertex sampler states: intentionally a no-op stub, the arguments are
+ * ignored.
+ */
+static void brw_bind_vertex_sampler_state(struct pipe_context *pipe,
+                                          unsigned num, void **sampler)
+{
+   (void) pipe;
+   (void) num;
+   (void) sampler;
+}
+
+
+/**
+ * Create a sampler view: a copy of the template with a fresh reference
+ * count of one, its own reference on 'texture', and 'pipe' recorded as
+ * the owning context.
+ *
+ * \return the new view, or NULL when the allocation fails
+ */
+static struct pipe_sampler_view *
+brw_create_sampler_view(struct pipe_context *pipe,
+                        struct pipe_resource *texture,
+                        const struct pipe_sampler_view *templ)
+{
+   struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
+
+   if (!view)
+      return NULL;
+
+   *view = *templ;
+   view->reference.count = 1;
+
+   /* The struct copy brought along the template's texture pointer
+    * without a reference; reset it before taking our own.
+    */
+   view->texture = NULL;
+   pipe_resource_reference(&view->texture, texture);
+
+   view->context = pipe;
+   return view;
+}
+
+
+/**
+ * Destroy a sampler view: drop its texture reference, then free it.
+ */
+static void
+brw_sampler_view_destroy(struct pipe_context *pipe,
+                         struct pipe_sampler_view *view)
+{
+   (void) pipe;
+
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+
+/**
+ * Install the sampler-state and sampler-view entry points on the
+ * context.  The vertex-stage hooks point at no-op stubs.
+ */
+void brw_pipe_sampler_init( struct brw_context *brw )
+{
+   /* Sampler CSO lifetime */
+   brw->base.create_sampler_state = brw_create_sampler_state;
+   brw->base.delete_sampler_state = brw_delete_sampler_state;
+
+   /* Sampler view lifetime */
+   brw->base.create_sampler_view = brw_create_sampler_view;
+   brw->base.sampler_view_destroy = brw_sampler_view_destroy;
+
+   /* Fragment stage bindings */
+   brw->base.bind_fragment_sampler_states = brw_bind_sampler_state;
+   brw->base.set_fragment_sampler_views = brw_set_fragment_sampler_views;
+
+   /* Vertex stage bindings */
+   brw->base.bind_vertex_sampler_states = brw_bind_vertex_sampler_state;
+   brw->base.set_vertex_sampler_views = brw_set_vertex_sampler_views;
+}
+
+/**
+ * Counterpart of brw_pipe_sampler_init().  Nothing is released here.
+ */
+void brw_pipe_sampler_cleanup( struct brw_context *brw )
+{
+   (void) brw;
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
new file mode 100644
index 0000000000..d9bee96c11
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -0,0 +1,303 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+  
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+
+#include "brw_context.h"
+#include "brw_wm.h"
+
+
+/**
+ * Determine if the given shader uses complex features such as flow
+ * conditionals, loops, subroutines.
+ *
+ * Returns non-zero when any of the listed opcodes occurs at least once
+ * according to the scanned shader info.
+ */
+static GLboolean has_flow_control(const struct tgsi_shader_info *info)
+{
+   static const unsigned flow_opcodes[] = {
+      TGSI_OPCODE_ARL,
+      TGSI_OPCODE_IF,
+      TGSI_OPCODE_ENDIF,    /* redundant - IF */
+      TGSI_OPCODE_CAL,
+      TGSI_OPCODE_BRK,      /* redundant - BGNLOOP */
+      TGSI_OPCODE_RET,      /* redundant - CAL */
+      TGSI_OPCODE_BGNLOOP
+   };
+   unsigned k;
+
+   for (k = 0; k < sizeof flow_opcodes / sizeof flow_opcodes[0]; k++) {
+      if (info->opcode_count[flow_opcodes[k]] > 0)
+         return 1;
+   }
+
+   return 0;
+}
+
+
+/**
+ * Collect the immediates that appear in a token stream before its
+ * first instruction.
+ *
+ * Allocates imm->data with room for info->immediate_count vec4s and
+ * stores each immediate as four floats; components the immediate does
+ * not supply are taken from (0, 0, 0, 1).  imm->nr receives the number
+ * of immediates stored.  The caller owns imm->data and must FREE it.
+ *
+ * If the allocation fails, imm->data is NULL and imm->nr is 0; the
+ * function has no way to report the error to its caller.
+ *
+ * \param tokens  token stream to parse
+ * \param info    scan results for the same stream (supplies the count)
+ * \param imm     output
+ */
+static void scan_immediates(const struct tgsi_token *tokens,
+                            const struct tgsi_shader_info *info,
+                            struct brw_immediate_data *imm)
+{
+   struct tgsi_parse_context parse;
+   boolean done = FALSE;
+
+   imm->nr = 0;
+   imm->data = MALLOC(info->immediate_count * 4 * sizeof(float));
+
+   /* Without a buffer there is nowhere to store anything; bail out
+    * rather than write through a NULL pointer.
+    */
+   if (imm->data == NULL)
+      return;
+
+   tgsi_parse_init( &parse, tokens );
+   while (!tgsi_parse_end_of_tokens( &parse ) && !done) {
+      tgsi_parse_token( &parse );
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE: {
+	 static const float id[4] = {0,0,0,1};
+	 const float *value = &parse.FullToken.FullImmediate.u[0].Float;
+	 unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+         unsigned i;
+
+	 /* Each slot holds exactly four floats; never write past it. */
+	 if (size > 4)
+	    size = 4;
+
+	 for (i = 0; i < size; i++)
+	    imm->data[imm->nr][i] = value[i];
+
+	 /* Pad the missing components from (0,0,0,1). */
+	 for (; i < 4; i++)
+	    imm->data[imm->nr][i] = id[i];
+         
+         imm->nr++;
+	 break;
+      }
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+	 /* Only the tokens ahead of the first instruction are scanned. */
+	 done = TRUE;
+	 break;
+      }
+   }
+}
+
+
+/**
+ * Bind a fragment shader.  Rebinding the current shader is a no-op.
+ * PIPE_NEW_FRAGMENT_SIGNATURE is additionally raised when either the
+ * old or the new shader is NULL or when their signatures differ.
+ */
+static void brw_bind_fs_state( struct pipe_context *pipe, void *prog )
+{
+   struct brw_fragment_shader *fs = (struct brw_fragment_shader *)prog;
+   struct brw_context *brw = brw_context(pipe);
+   int signature_changed;
+   
+   if (fs == brw->curr.fragment_shader)
+      return;
+
+   if (brw->curr.fragment_shader == NULL || fs == NULL) {
+      signature_changed = 1;
+   }
+   else {
+      /* Compare only the used part of the signature. */
+      signature_changed =
+         memcmp(&brw->curr.fragment_shader->signature, &fs->signature,
+                brw_fs_signature_size(&fs->signature)) != 0;
+   }
+
+   if (signature_changed)
+      brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_SIGNATURE;
+
+   brw->curr.fragment_shader = fs;
+   brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_SHADER;
+}
+
+/**
+ * Bind a vertex shader and set PIPE_NEW_VERTEX_SHADER.  Unlike the
+ * fragment path there is no same-shader early out.
+ */
+static void brw_bind_vs_state( struct pipe_context *pipe, void *prog )
+{
+   struct brw_vertex_shader *vs = (struct brw_vertex_shader *)prog;
+   struct brw_context *brw = brw_context(pipe);
+
+   brw->curr.vertex_shader = vs;
+   brw->state.dirty.mesa |= PIPE_NEW_VERTEX_SHADER;
+}
+
+
+
+/**
+ * Create a fragment shader CSO.
+ *
+ * Duplicates the TGSI tokens, scans them, collects the immediates and
+ * derives the input signature plus a few flags used later
+ * (has_flow_control, uses_depth, iz_lookup bits).
+ *
+ * \param pipe    context the shader is created on
+ * \param shader  gallium shader description (tokens are copied)
+ * \return the new brw_fragment_shader, or NULL on allocation failure
+ */
+static void *brw_create_fs_state( struct pipe_context *pipe,
+				  const struct pipe_shader_state *shader )
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_fragment_shader *fs;
+   int i;
+
+   fs = CALLOC_STRUCT(brw_fragment_shader);
+   if (fs == NULL)
+      return NULL;
+
+   /* Duplicate tokens, scan shader
+    */
+   fs->tokens = tgsi_dup_tokens(shader->tokens);
+   if (fs->tokens == NULL)
+      goto fail;
+
+   tgsi_scan_shader(fs->tokens, &fs->info);
+   scan_immediates(fs->tokens, &fs->info, &fs->immediates);
+
+   /* has_flow_control() reads fs->info.opcode_count, so it must run
+    * after tgsi_scan_shader() has filled fs->info.  Querying the
+    * still-zeroed struct would always report "no flow control".
+    */
+   fs->id = brw->program_id++;
+   fs->has_flow_control = has_flow_control(&fs->info);
+
+   /* Record interpolation and semantics of every input. */
+   fs->signature.nr_inputs = fs->info.num_inputs;
+   for (i = 0; i < fs->info.num_inputs; i++) {
+      fs->signature.input[i].interp = fs->info.input_interpolate[i];
+      fs->signature.input[i].semantic = fs->info.input_semantic_name[i];
+      fs->signature.input[i].semantic_index = fs->info.input_semantic_index[i];
+   }
+
+   /* An input with POSITION semantics marks the shader as using depth. */
+   for (i = 0; i < fs->info.num_inputs; i++)
+      if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION)
+	 fs->uses_depth = 1;
+
+   if (fs->info.uses_kill)
+      fs->iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+   if (fs->info.writes_z)
+      fs->iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+
+   return (void *)fs;
+
+fail:
+   FREE(fs);
+   return NULL;
+}
+
+
+/**
+ * Create a vertex shader CSO: copy and scan the tokens, collect the
+ * immediates, and record which output slot (if any) carries position,
+ * front/back colors and the edge flag.
+ *
+ * \return the new brw_vertex_shader, or NULL on allocation failure
+ */
+static void *brw_create_vs_state( struct pipe_context *pipe,
+				  const struct pipe_shader_state *shader )
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_vertex_shader *vs = CALLOC_STRUCT(brw_vertex_shader);
+   unsigned slot;
+
+   if (vs == NULL)
+      return NULL;
+
+   /* Duplicate tokens, scan shader
+    */
+   vs->tokens = tgsi_dup_tokens(shader->tokens);
+   if (vs->tokens == NULL) {
+      FREE(vs);
+      return NULL;
+   }
+
+   tgsi_scan_shader(vs->tokens, &vs->info);
+   scan_immediates(vs->tokens, &vs->info, &vs->immediates);
+
+   vs->id = brw->program_id++;
+   vs->has_flow_control = has_flow_control(&vs->info);
+
+   /* Until proven otherwise, none of the special outputs exist. */
+   vs->output_hpos     = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_color0   = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_color1   = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_bfc0     = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_bfc1     = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_edgeflag = BRW_OUTPUT_NOT_PRESENT;
+
+   for (slot = 0; slot < vs->info.num_outputs; slot++) {
+      /* Semantic index 0 selects the primary color; anything else is
+       * treated as the secondary one.
+       */
+      const int primary = (vs->info.output_semantic_index[slot] == 0);
+
+      switch (vs->info.output_semantic_name[slot]) {
+      case TGSI_SEMANTIC_POSITION:
+         vs->output_hpos = slot;
+         break;
+      case TGSI_SEMANTIC_COLOR:
+         if (primary)
+            vs->output_color0 = slot;
+         else
+            vs->output_color1 = slot;
+         break;
+      case TGSI_SEMANTIC_BCOLOR:
+         if (primary)
+            vs->output_bfc0 = slot;
+         else
+            vs->output_bfc1 = slot;
+         break;
+      case TGSI_SEMANTIC_EDGEFLAG:
+         vs->output_edgeflag = slot;
+         break;
+      }
+   }
+
+   return (void *)vs;
+}
+
+
+/**
+ * Destroy a fragment shader CSO: drop the constant buffer reference
+ * and free everything brw_create_fs_state() allocated (immediate
+ * storage, token copy, and the shader struct itself).
+ */
+static void brw_delete_fs_state( struct pipe_context *pipe, void *prog )
+{
+   struct brw_fragment_shader *fs = (struct brw_fragment_shader *)prog;
+
+   bo_reference(&fs->const_buffer, NULL);
+
+   /* scan_immediates() MALLOCs this buffer at creation time; it was
+    * previously never released.
+    */
+   FREE( fs->immediates.data );
+   FREE( (void *)fs->tokens );
+   FREE( fs );
+}
+
+
+/**
+ * Destroy a vertex shader CSO: free the immediate storage, the token
+ * copy and the shader struct allocated by brw_create_vs_state().
+ */
+static void brw_delete_vs_state( struct pipe_context *pipe, void *prog )
+{
+   /* The object was created as a brw_vertex_shader; access it through
+    * that type so the field offsets are the right ones.
+    */
+   struct brw_vertex_shader *vs = (struct brw_vertex_shader *)prog;
+
+   /* Delete draw shader
+    */
+   FREE( vs->immediates.data );   /* MALLOCed by scan_immediates() */
+   FREE( (void *)vs->tokens );
+   FREE( vs );
+}
+
+
+/**
+ * Bind constant buffer 0 of a shader stage.  PIPE_SHADER_FRAGMENT
+ * updates the fragment constants; every other stage value is treated
+ * as the vertex stage.  The matching dirty bit is set in both cases.
+ */
+static void brw_set_constant_buffer(struct pipe_context *pipe,
+                                     uint shader, uint index,
+                                     struct pipe_resource *buf)
+{
+   struct brw_context *brw = brw_context(pipe);
+   const int is_fragment = (shader == PIPE_SHADER_FRAGMENT);
+   struct pipe_resource **binding = is_fragment
+      ? &brw->curr.fragment_constants
+      : &brw->curr.vertex_constants;
+
+   /* Only a single constant buffer per stage is handled. */
+   assert(index == 0);
+
+   pipe_resource_reference( binding, buf );
+
+   if (is_fragment)
+      brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_CONSTANTS;
+   else
+      brw->state.dirty.mesa |= PIPE_NEW_VERTEX_CONSTANTS;
+}
+
+
+/**
+ * Install the shader CSO and constant buffer entry points on the
+ * context.
+ */
+void brw_pipe_shader_init( struct brw_context *brw )
+{
+   /* Fragment shader CSOs */
+   brw->base.create_fs_state = brw_create_fs_state;
+   brw->base.bind_fs_state = brw_bind_fs_state;
+   brw->base.delete_fs_state = brw_delete_fs_state;
+
+   /* Vertex shader CSOs */
+   brw->base.create_vs_state = brw_create_vs_state;
+   brw->base.bind_vs_state = brw_bind_vs_state;
+   brw->base.delete_vs_state = brw_delete_vs_state;
+
+   /* Constants for both stages */
+   brw->base.set_constant_buffer = brw_set_constant_buffer;
+}
+
+/**
+ * Counterpart of brw_pipe_shader_init(): drop the references held on
+ * the bound fragment and vertex constant buffers.
+ */
+void brw_pipe_shader_cleanup( struct brw_context *brw )
+{
+   struct pipe_resource **fragment = &brw->curr.fragment_constants;
+   struct pipe_resource **vertex = &brw->curr.vertex_constants;
+
+   pipe_resource_reference( fragment, NULL );
+   pipe_resource_reference( vertex, NULL );
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_vertex.c b/src/gallium/drivers/i965/brw_pipe_vertex.c
new file mode 100644
index 0000000000..4a120a51da
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_vertex.c
@@ -0,0 +1,302 @@
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_structs.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+
+static unsigned brw_translate_surface_format( unsigned id )
+{
+   switch (id) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return BRW_SURFACEFORMAT_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return BRW_SURFACEFORMAT_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return BRW_SURFACEFORMAT_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return BRW_SURFACEFORMAT_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return BRW_SURFACEFORMAT_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return BRW_SURFACEFORMAT_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return BRW_SURFACEFORMAT_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return BRW_SURFACEFORMAT_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return BRW_SURFACEFORMAT_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return BRW_SURFACEFORMAT_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return BRW_SURFACEFORMAT_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return BRW_SURFACEFORMAT_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return BRW_SURFACEFORMAT_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return BRW_SURFACEFORMAT_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
+
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+/**
+ * Fill a vertex element packet from an array of gallium vertex
+ * elements.
+ *
+ * With count == 0 a single pad element producing (0, 0, 0, 1.0) is
+ * written instead.  Otherwise one element is written per attribute;
+ * components the source format does not provide are stored as 0 for
+ * x/y/z and as 1.0 for w.
+ */
+static void brw_translate_vertex_elements(struct brw_context *brw,
+                                          struct brw_vertex_element_packet *brw_velems,
+                                          const struct pipe_vertex_element *attribs,
+                                          unsigned count)
+{
+   unsigned elt;
+
+   brw_velems->header.opcode = CMD_VERTEX_ELEMENT;
+
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), emit a single pad
+    * VERTEX_ELEMENT struct and bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (count == 0) {
+      brw_velems->header.length = 1;
+
+      brw_velems->ve[0].ve0.src_offset = 0;
+      brw_velems->ve[0].ve0.src_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+      brw_velems->ve[0].ve0.valid = 1;
+      brw_velems->ve[0].ve0.vertex_buffer_index = 0;
+
+      brw_velems->ve[0].ve1.dst_offset = 0;
+      brw_velems->ve[0].ve1.vfcomponent0 = BRW_VE1_COMPONENT_STORE_0;
+      brw_velems->ve[0].ve1.vfcomponent1 = BRW_VE1_COMPONENT_STORE_0;
+      brw_velems->ve[0].ve1.vfcomponent2 = BRW_VE1_COMPONENT_STORE_0;
+      brw_velems->ve[0].ve1.vfcomponent3 = BRW_VE1_COMPONENT_STORE_1_FLT;
+      return;
+   }
+
+   /* Now emit vertex element (VEP) state packets.
+    */
+   brw_velems->header.length = (1 + count * 2) - 2;
+
+   for (elt = 0; elt < count; elt++) {
+      const struct pipe_vertex_element *src = &attribs[elt];
+      unsigned nr_components = util_format_get_nr_components(src->src_format);
+      uint32_t hw_format = brw_translate_surface_format( src->src_format );
+
+      /* Start from "take every component from the source", then
+       * replace the ones the format lacks.
+       */
+      uint32_t store_x = BRW_VE1_COMPONENT_STORE_SRC;
+      uint32_t store_y = BRW_VE1_COMPONENT_STORE_SRC;
+      uint32_t store_z = BRW_VE1_COMPONENT_STORE_SRC;
+      uint32_t store_w = BRW_VE1_COMPONENT_STORE_SRC;
+
+      if (nr_components < 1)
+         store_x = BRW_VE1_COMPONENT_STORE_0;
+      if (nr_components < 2)
+         store_y = BRW_VE1_COMPONENT_STORE_0;
+      if (nr_components < 3)
+         store_z = BRW_VE1_COMPONENT_STORE_0;
+      if (nr_components < 4)
+         store_w = BRW_VE1_COMPONENT_STORE_1_FLT;
+
+      brw_velems->ve[elt].ve0.src_offset = src->src_offset;
+      brw_velems->ve[elt].ve0.src_format = hw_format;
+      brw_velems->ve[elt].ve0.valid = 1;
+      brw_velems->ve[elt].ve0.vertex_buffer_index = src->vertex_buffer_index;
+
+      brw_velems->ve[elt].ve1.vfcomponent0 = store_x;
+      brw_velems->ve[elt].ve1.vfcomponent1 = store_y;
+      brw_velems->ve[elt].ve1.vfcomponent2 = store_z;
+      brw_velems->ve[elt].ve1.vfcomponent3 = store_w;
+
+      /* IGDNG always gets a zero destination offset; other chips get
+       * four units per element.
+       */
+      brw_velems->ve[elt].ve1.dst_offset = BRW_IS_IGDNG(brw) ? 0 : elt * 4;
+   }
+}
+
+/**
+ * Create a vertex elements CSO: a vertex element packet pre-translated
+ * from the gallium vertex element array.
+ *
+ * \param pipe     context the CSO is created on
+ * \param count    number of entries in attribs; at most BRW_VEP_MAX
+ * \param attribs  gallium vertex elements
+ * \return the new packet, or NULL if count is out of range or the
+ *         allocation fails
+ */
+static void* brw_create_vertex_elements_state( struct pipe_context *pipe,
+                                               unsigned count,
+                                               const struct pipe_vertex_element *attribs )
+{
+   /* note: for the brw_swtnl.c code (if ever we need draw fallback) we'd also need
+      to store the original data */
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_vertex_element_packet *velems;
+
+   assert(count <= BRW_VEP_MAX);
+
+   /* The assert vanishes in release builds; refuse oversized requests
+    * there too rather than let the translation run past the packet.
+    */
+   if (count > BRW_VEP_MAX)
+      return NULL;
+
+   /* Zero-initialise: the translation only writes some of the packet's
+    * fields, and the remaining bits must not carry heap garbage.
+    */
+   velems = CALLOC_STRUCT(brw_vertex_element_packet);
+   if (velems) {
+      brw_translate_vertex_elements(brw, velems, attribs, count);
+   }
+   return velems;
+}
+
+/**
+ * Bind a vertex elements CSO and set PIPE_NEW_VERTEX_ELEMENT.
+ */
+static void brw_bind_vertex_elements_state(struct pipe_context *pipe,
+                                           void *velems)
+{
+   struct brw_context *brw = brw_context(pipe);
+
+   brw->curr.velems = (struct brw_vertex_element_packet *) velems;
+   brw->state.dirty.mesa |= PIPE_NEW_VERTEX_ELEMENT;
+}
+
+/**
+ * Free a vertex elements CSO.
+ */
+static void brw_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   (void) pipe;
+   FREE( velems );
+}
+
+
+/**
+ * Replace the set of bound vertex buffers.
+ *
+ * Returns early when the new set is byte-for-byte identical to the
+ * current one.  Otherwise the buffer references are moved over, slots
+ * beyond 'count' are released, the remaining per-buffer fields are
+ * copied and PIPE_NEW_VERTEX_BUFFER is set.
+ */
+static void brw_set_vertex_buffers(struct pipe_context *pipe,
+                                   unsigned count,
+                                   const struct pipe_vertex_buffer *buffers)
+{
+   struct brw_context *brw = brw_context(pipe);
+   unsigned slot;
+
+   /* Check for no change */
+   if (count == brw->curr.num_vertex_buffers) {
+      if (memcmp(brw->curr.vertex_buffer,
+                 buffers,
+                 count * sizeof buffers[0]) == 0)
+         return;
+   }
+
+   /* Adjust refcounts: take the new buffers ... */
+   slot = 0;
+   while (slot < count) {
+      pipe_resource_reference(&brw->curr.vertex_buffer[slot].buffer,
+                              buffers[slot].buffer);
+      slot++;
+   }
+
+   /* ... and let go of the slots that fell off the end. */
+   while (slot < brw->curr.num_vertex_buffers) {
+      pipe_resource_reference(&brw->curr.vertex_buffer[slot].buffer,
+                              NULL);
+      slot++;
+   }
+
+   /* Copy remaining data */
+   memcpy(brw->curr.vertex_buffer, buffers, count * sizeof buffers[0]);
+   brw->curr.num_vertex_buffers = count;
+
+   brw->state.dirty.mesa |= PIPE_NEW_VERTEX_BUFFER;
+}
+
+
+/**
+ * Install the vertex buffer and vertex elements entry points on the
+ * context.
+ */
+void 
+brw_pipe_vertex_init( struct brw_context *brw )
+{
+   /* Vertex elements CSO */
+   brw->base.create_vertex_elements_state = brw_create_vertex_elements_state;
+   brw->base.bind_vertex_elements_state = brw_bind_vertex_elements_state;
+   brw->base.delete_vertex_elements_state = brw_delete_vertex_elements_state;
+
+   /* Vertex buffers */
+   brw->base.set_vertex_buffers = brw_set_vertex_buffers;
+}
+
+
+/**
+ * Counterpart of brw_pipe_vertex_init(): release the references that
+ * brw_set_vertex_buffers() took on the bound vertex buffers.
+ */
+void 
+brw_pipe_vertex_cleanup( struct brw_context *brw )
+{
+   unsigned i;
+
+   /* Release bound pipe vertex_buffers.  Each pointer is reset to
+    * NULL by the unreference, so running this twice is harmless.
+    */
+   for (i = 0; i < brw->curr.num_vertex_buffers; i++)
+      pipe_resource_reference(&brw->curr.vertex_buffer[i].buffer, NULL);
+
+   brw->curr.num_vertex_buffers = 0;
+
+   /* Release some other stuff
+    */
+#if 0
+   for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+      bo_reference(&brw->vb.inputs[i].bo, NULL);
+      brw->vb.inputs[i].bo = NULL;
+   }
+#endif
+}
diff --git a/src/gallium/drivers/i965/brw_reg.h b/src/gallium/drivers/i965/brw_reg.h
new file mode 100644
index 0000000000..ba10f9d5df
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_reg.h
@@ -0,0 +1,115 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+/* Command type, placed at bit 29 and up. */
+#define CMD_MI                          (0x0 << 29)
+#define CMD_2D                          (0x2 << 29)
+#define CMD_3D                          (0x3 << 29)
+
+/* MI commands: the operation code is placed at bit 23. */
+#define MI_NOOP                         (CMD_MI | 0)
+#define MI_BATCH_BUFFER_END             (CMD_MI | (0xA << 23))
+#define MI_FLUSH                        (CMD_MI | (4 << 23))
+
+/* 3D command header for the drawing rectangle packet. */
+#define _3DSTATE_DRAWRECT_INFO_I965     (CMD_3D | (3 << 27) | (1 << 24) | 0x2)
+
+/** @{
+ *
+ * PIPE_CONTROL operation, a combination MI_FLUSH and register write with
+ * additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL           (CMD_3D | (3 << 27) | (2 << 24) | 2)
+
+/* Two-bit write selector at bit 14 (values 0..3). */
+#define PIPE_CONTROL_NO_WRITE           (0 << 14)
+#define PIPE_CONTROL_WRITE_IMMEDIATE    (1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT  (2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP    (3 << 14)
+
+/* Single-bit flags. */
+#define PIPE_CONTROL_DEPTH_STALL        (1 << 13)
+#define PIPE_CONTROL_WRITE_FLUSH        (1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH  (1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE   (1 << 8)
+
+/* Address space selector at bit 2. */
+#define PIPE_CONTROL_PPGTT_WRITE        (0 << 2)
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE   (1 << 2)
+
+/** @} */
+
+/* 2D (blitter) command headers: opcode at bit 22, low bits as given. */
+#define XY_SETUP_BLT_CMD        (CMD_2D | (0x01 << 22) | 6)
+#define XY_COLOR_BLT_CMD        (CMD_2D | (0x50 << 22) | 4)
+#define XY_SRC_COPY_BLT_CMD     (CMD_2D | (0x53 << 22) | 6)
+
+/* BR00 */
+#define XY_BLT_WRITE_ALPHA      (1 << 21)
+#define XY_BLT_WRITE_RGB        (1 << 20)
+#define XY_SRC_TILED            (1 << 15)
+#define XY_DST_TILED            (1 << 11)
+
+/* BR13 */
+#define BR13_565                (0x1 << 24)
+#define BR13_8888               (0x3 << 24)
+
+/* Fence / tiling layout selectors. */
+#define FENCE_LINEAR            0
+#define FENCE_XMAJOR            1
+#define FENCE_YMAJOR            2
+
+
+
+/* PCI IDs
+ *
+ * PCI device IDs of the chips named by each macro, grouped by name
+ * prefix.
+ */
+
+/* I965 / I946 */
+#define PCI_CHIP_I965_G         0x29A2
+#define PCI_CHIP_I965_Q         0x2992
+#define PCI_CHIP_I965_G_1       0x2982
+#define PCI_CHIP_I946_GZ        0x2972
+#define PCI_CHIP_I965_GM        0x2A02
+#define PCI_CHIP_I965_GME       0x2A12
+
+/* GM45 */
+#define PCI_CHIP_GM45_GM        0x2A42
+
+/* IGD_E / Q45 / G45 / G41 / B43 */
+#define PCI_CHIP_IGD_E_G        0x2E02
+#define PCI_CHIP_Q45_G          0x2E12
+#define PCI_CHIP_G45_G          0x2E22
+#define PCI_CHIP_G41_G          0x2E32
+#define PCI_CHIP_B43_G          0x2E42
+
+/* ILD / ILM */
+#define PCI_CHIP_ILD_G          0x0042
+#define PCI_CHIP_ILM_G          0x0046
+
+/* Chip identification packed into 32 bits of bitfields: the PCI
+ * device id plus one flag per chip family.
+ */
+struct brw_chipset {
+   unsigned pci_id:16;    /* one of the PCI_CHIP_x values, presumably -- set elsewhere */
+   unsigned is_965:1;
+   unsigned is_igdng:1;
+   unsigned is_g4x:1;
+   unsigned pad:13;       /* brings the bitfield total to 32 bits */
+};
+
+
+/* XXX: hacks
+ *
+ * Fixed stand-in values for vertex result slots.
+ */
+#define VERT_RESULT_HPOS        0       /* not always true */
+#define VERT_RESULT_PSIZ        127     /* disabled */
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_resource.c b/src/gallium/drivers/i965/brw_resource.c
new file mode 100644
index 0000000000..3b61ffbd03
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_resource.c
@@ -0,0 +1,52 @@
+#include "util/u_debug.h"
+#include "util/u_surface.h"
+
+#include "brw_resource.h"
+#include "brw_context.h"
+#include "brw_screen.h"
+
+/* Screen hook: create a resource, picking the buffer or texture path by target. */
+static struct pipe_resource *
+brw_resource_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template)
+{
+   /* Everything that is not a PIPE_BUFFER is handled as a texture. */
+   if (template->target != PIPE_BUFFER)
+      return brw_texture_create(screen, template);
+
+   return brw_buffer_create(screen, template);
+}
+/* Screen hook: wrap a winsys handle as a resource; buffers cannot be imported. */
+static struct pipe_resource *
+brw_resource_from_handle(struct pipe_screen * screen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle)
+{
+   /* Only textures have a from-handle path; a buffer request yields NULL. */
+   return (template->target == PIPE_BUFFER)
+      ? NULL
+      : brw_texture_from_handle(screen, template, whandle);
+}
+
+/* Install the shared u_transfer_*_vtbl helpers and util_resource_copy_region on the context. */
+void
+brw_init_resource_functions(struct brw_context *brw )
+{
+   brw->base.resource_copy_region = util_resource_copy_region;
+   brw->base.transfer_inline_write = u_transfer_inline_write_vtbl;
+   brw->base.transfer_flush_region = u_transfer_flush_region_vtbl;
+   brw->base.get_transfer = u_get_transfer_vtbl;
+   brw->base.transfer_destroy = u_transfer_destroy_vtbl;
+   brw->base.transfer_map = u_transfer_map_vtbl;
+   brw->base.transfer_unmap = u_transfer_unmap_vtbl;
+}
+/* Install the screen-level resource entry points; create/from_handle dispatch on target. */
+void
+brw_init_screen_resource_functions(struct brw_screen *is)
+{
+   is->base.user_buffer_create = brw_user_buffer_create;
+   is->base.resource_from_handle = brw_resource_from_handle;
+   is->base.resource_create = brw_resource_create;
+   is->base.resource_destroy = u_resource_destroy_vtbl;
+   is->base.resource_get_handle = u_resource_get_handle_vtbl;
+}
diff --git a/src/gallium/drivers/i965/brw_resource.h b/src/gallium/drivers/i965/brw_resource.h
new file mode 100644
index 0000000000..78defb37b2
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_resource.h
@@ -0,0 +1,152 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_RESOURCE_H
+#define BRW_RESOURCE_H
+
+struct brw_screen;
+
+#include "util/u_transfer.h"
+#include "util/u_debug.h"
+
+#include "brw_screen.h"		/* for brw_surface */
+
+struct brw_context;
+struct brw_screen;
+
+/* Buffer resource: a u_resource backed by a winsys BO or by caller-supplied memory. */
+struct brw_buffer {
+   struct u_resource b;   /* kept first: brw_buffer() casts a pipe_resource pointer to this */
+
+   /* One of either bo or user_buffer will be non-null, depending on
+    * whether this is a hardware or user buffer.
+    */
+   struct brw_winsys_buffer *bo;
+   void *user_buffer;
+
+   /* Mapped pointer??  NOTE(review): nothing in brw_resource_buffer.c reads or
+    * writes this field -- confirm whether it is still needed. */
+   void *ptr;
+};
+
+#define BRW_MAX_TEXTURE_2D_LEVELS 11  /* max 1024x1024 */
+#define BRW_MAX_TEXTURE_3D_LEVELS  8  /* max 128x128x128 */
+
+
+/* Texture resource: u_resource header, backing BO, surface state and per-level layout. */
+struct brw_texture {
+   struct u_resource b;   /* kept first: brw_texture() casts a pipe_resource pointer to this */
+   struct brw_winsys_buffer *bo;
+   struct brw_surface_state ss;   /* filled in by the create / from_handle paths */
+
+   unsigned *image_offset[BRW_MAX_TEXTURE_2D_LEVELS];   /* indexed [level][face or z slice] */
+   unsigned nr_images[BRW_MAX_TEXTURE_2D_LEVELS];
+   unsigned level_offset[BRW_MAX_TEXTURE_2D_LEVELS];
+
+   boolean compressed;   /* set from util_format_is_s3tc() */
+   unsigned brw_target;
+   unsigned pitch;   /* in cpp-sized units; the byte stride is pitch * cpp */
+   unsigned tiling;   /* a BRW_TILING_* value */
+   unsigned cpp;   /* util_format_get_blocksize() of the format */
+   unsigned total_height;
+
+   struct brw_surface views[2];   /* list heads, set up with make_empty_list() */
+};
+
+
+void brw_init_screen_resource_functions(struct brw_screen *is);
+void brw_init_resource_functions(struct brw_context *brw );
+
+extern struct u_resource_vtbl brw_buffer_vtbl;
+extern struct u_resource_vtbl brw_texture_vtbl;
+/* Checked downcast to brw_texture: the assert compares the vtbl against brw_texture_vtbl. */
+static INLINE struct brw_texture *brw_texture( struct pipe_resource *resource )
+{
+   struct brw_texture *cast = (struct brw_texture *)resource;
+   assert(cast->b.vtbl == &brw_texture_vtbl);
+   return cast;
+}
+/* Checked downcast to brw_buffer: the assert compares the vtbl against brw_buffer_vtbl. */
+static INLINE struct brw_buffer *brw_buffer( struct pipe_resource *resource )
+{
+   struct brw_buffer *buf = (struct brw_buffer *)resource;
+   assert(buf->b.vtbl == &brw_buffer_vtbl);
+   return buf;
+}
+
+struct pipe_resource *
+brw_texture_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template);
+
+struct pipe_resource *
+brw_texture_from_handle(struct pipe_screen * screen,
+			const struct pipe_resource *template,
+			struct winsys_handle *whandle);
+
+/* Wrap caller-owned memory as a buffer resource; `bind` becomes the resource's bind flags. */
+struct pipe_resource *
+brw_user_buffer_create(struct pipe_screen *screen,
+                       void *ptr,
+                       unsigned bytes,
+                       unsigned bind);
+
+struct pipe_resource *
+brw_buffer_create(struct pipe_screen *screen,
+		   const struct pipe_resource *template);
+
+
+/*
+boolean
+brw_is_format_supported( struct pipe_screen *screen,
+			 enum pipe_format format,
+			 enum pipe_texture_target target,
+			 unsigned sample_count,
+			 unsigned tex_usage,
+			 unsigned geom_flags );
+*/
+
+/* Pipe buffer helper: TRUE when buf wraps caller memory (user_buffer is set). */
+static INLINE boolean
+brw_buffer_is_user_buffer( const struct pipe_resource *buf )
+{
+   const struct brw_buffer *wrapped = (const struct brw_buffer *)buf;
+   return wrapped->user_buffer != NULL;
+}
+
+
+/***********************************************************************
+ * Internal functions 
+ */
+GLboolean brw_texture_layout(struct brw_screen *brw_screen,
+			     struct brw_texture *tex );
+
+void brw_update_texture( struct brw_screen *brw_screen,
+			 struct brw_texture *tex );
+
+
+
+#endif /* BRW_RESOURCE_H */
diff --git a/src/gallium/drivers/i965/brw_resource_buffer.c b/src/gallium/drivers/i965/brw_resource_buffer.c
new file mode 100644
index 0000000000..5f9e8a87c9
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_resource_buffer.c
@@ -0,0 +1,201 @@
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "brw_resource.h"
+#include "brw_context.h"
+#include "brw_batchbuffer.h"
+#include "brw_winsys.h"
+/* vtbl get_handle for buffers: unconditionally reports failure (FALSE); no handle is written. */
+static boolean
+brw_buffer_get_handle(struct pipe_screen *screen,
+		      struct pipe_resource *resource,
+		      struct winsys_handle *handle)
+{
+   return FALSE;
+}
+
+/* vtbl resource_destroy: reset the BO pointer through bo_reference(), then free the wrapper. */
+static void
+brw_buffer_destroy(struct pipe_screen *screen,
+		    struct pipe_resource *resource)
+{
+   struct brw_buffer *doomed = brw_buffer( resource );
+
+   bo_reference(&doomed->bo, NULL);   /* bo is already NULL for user buffers */
+   FREE(doomed);
+}
+
+/* vtbl transfer_map: return a CPU pointer to byte transfer->box.x of the buffer, or NULL. */
+static void *
+brw_buffer_transfer_map( struct pipe_context *pipe,
+			 struct pipe_transfer *transfer)
+{
+   struct brw_screen *bscreen = brw_screen(pipe->screen);
+   struct brw_winsys_screen *sws = bscreen->sws;
+   struct brw_buffer *buf = brw_buffer(transfer->resource);
+   unsigned offset = transfer->box.x;
+   unsigned length = transfer->box.width;
+   unsigned usage = transfer->usage;
+   uint8_t *map;
+
+   if (buf->user_buffer)
+      map = buf->user_buffer;   /* caller memory: no winsys mapping involved */
+   else
+      map = sws->bo_map( buf->bo,
+			 BRW_DATA_OTHER,
+			 offset,
+			 length,
+			 (usage & PIPE_TRANSFER_WRITE) ? TRUE : FALSE,
+			 (usage & PIPE_TRANSFER_DISCARD) ? TRUE : FALSE,
+			 (usage & PIPE_TRANSFER_FLUSH_EXPLICIT) ? TRUE : FALSE);
+   /* A failed bo_map() yields NULL; do not turn that into a bogus NULL + offset pointer. */
+   return map ? map + offset : NULL;
+}
+
+/* vtbl transfer_flush_region: hand the range (box->x, box->width) of a BO-backed
+ * buffer to the winsys bo_flush_range() hook. */
+static void
+brw_buffer_transfer_flush_region( struct pipe_context *pipe,
+				  struct pipe_transfer *transfer,
+				  const struct pipe_box *box)
+{
+   struct brw_winsys_screen *sws = brw_screen(pipe->screen)->sws;
+   struct brw_buffer *buf = brw_buffer(transfer->resource);
+   const unsigned start = box->x;
+   const unsigned count = box->width;
+
+   /* User buffers have no BO behind them, so there is nothing to flush. */
+   if (buf->user_buffer == NULL) {
+      sws->bo_flush_range( buf->bo,
+                           start,
+                           count );
+   }
+}
+
+/* vtbl transfer_unmap: undo bo_map() for BO-backed buffers; nothing to do without a BO. */
+static void
+brw_buffer_transfer_unmap( struct pipe_context *pipe,
+			   struct pipe_transfer *transfer)
+{
+   struct brw_winsys_screen *sws = brw_screen(pipe->screen)->sws;
+   struct brw_buffer *buf = brw_buffer( transfer->resource );
+
+   if (buf->bo == NULL)
+      return;   /* user buffer: transfer_map never called bo_map() for it */
+   sws->bo_unmap(buf->bo);
+}
+
+/* vtbl is_resource_referenced: does the current batch reference this buffer's BO? */
+static unsigned brw_buffer_is_referenced( struct pipe_context *pipe,
+					 struct pipe_resource *resource,
+					 unsigned face,
+					 unsigned level)
+{
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_winsys_buffer *batch_bo = brw->batch->buf;
+   struct brw_buffer *buf = brw_buffer(resource);
+
+   /* A buffer without a BO, or whose BO the batch does not reference, is idle;
+    * otherwise conservatively report both a read and a write reference. */
+   if (buf->bo != NULL &&
+       brw_screen(pipe->screen)->sws->bo_references( batch_bo, buf->bo ))
+      return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+
+   return PIPE_UNREFERENCED;
+}
+
+/* Dispatch table for buffer resources; entries are positional, one per slot named at right. */
+struct u_resource_vtbl brw_buffer_vtbl = 
+{
+   brw_buffer_get_handle,	     /* get_handle */
+   brw_buffer_destroy,		     /* resource_destroy */
+   brw_buffer_is_referenced,	     /* is_resource_referenced */
+   u_default_get_transfer,	     /* get_transfer */
+   u_default_transfer_destroy,	     /* transfer_destroy */
+   brw_buffer_transfer_map,	     /* transfer_map */
+   brw_buffer_transfer_flush_region,  /* transfer_flush_region */
+   brw_buffer_transfer_unmap,	     /* transfer_unmap */
+   u_default_transfer_inline_write   /* transfer_inline_write */
+};
+
+/* Create a BO-backed buffer resource from `template`; the winsys buffer type follows the
+ * vertex/index/constant bind flags.  Returns NULL if the wrapper or the BO cannot be allocated. */
+struct pipe_resource *
+brw_buffer_create(struct pipe_screen *screen,
+		  const struct pipe_resource *template)
+{
+   struct brw_screen *bscreen = brw_screen(screen);
+   struct brw_winsys_screen *sws = bscreen->sws;
+   struct brw_buffer *buf;
+   unsigned buffer_type;
+   enum pipe_error ret;
+
+   buf = CALLOC_STRUCT(brw_buffer);
+   if (!buf)
+      return NULL;
+
+   buf->b.b = *template;
+   buf->b.vtbl = &brw_buffer_vtbl;
+   pipe_reference_init(&buf->b.b.reference, 1);
+   buf->b.b.screen = screen;
+
+   switch (template->bind & (PIPE_BIND_VERTEX_BUFFER |
+			      PIPE_BIND_INDEX_BUFFER |
+			      PIPE_BIND_CONSTANT_BUFFER))
+   {
+   case PIPE_BIND_VERTEX_BUFFER:
+   case PIPE_BIND_INDEX_BUFFER:
+   case (PIPE_BIND_VERTEX_BUFFER|PIPE_BIND_INDEX_BUFFER):
+      buffer_type = BRW_BUFFER_TYPE_VERTEX;
+      break;
+   case PIPE_BIND_CONSTANT_BUFFER:
+      buffer_type = BRW_BUFFER_TYPE_SHADER_CONSTANTS;
+      break;
+   default:   /* none of the three flags, or constant mixed with vertex/index */
+      buffer_type = BRW_BUFFER_TYPE_GENERIC;
+      break;
+   }
+
+   ret = sws->bo_alloc( sws, buffer_type,
+                        template->width0,
+			64,	/* alignment */
+                        &buf->bo );
+   if (ret != PIPE_OK) {
+      FREE(buf);   /* do not leak the wrapper when the BO allocation fails */
+      return NULL;
+   }
+   return &buf->b.b;
+}
+
+/* Wrap caller-owned memory in a buffer resource without allocating a BO.
+ * The pointer is stored, not copied; `bytes` is recorded as width0. */
+struct pipe_resource *
+brw_user_buffer_create(struct pipe_screen *screen,
+                       void *ptr,
+                       unsigned bytes,
+		       unsigned bind)
+{
+   struct brw_buffer *wrapper = CALLOC_STRUCT(brw_buffer);
+   struct pipe_resource *res;
+
+   if (wrapper == NULL)
+      return NULL;
+
+   wrapper->b.vtbl = &brw_buffer_vtbl;
+   wrapper->user_buffer = ptr;
+   res = &wrapper->b.b;
+   pipe_reference_init(&res->reference, 1);
+   res->screen = screen;
+   res->format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   res->usage = PIPE_USAGE_IMMUTABLE;
+   res->bind = bind;
+   res->width0 = bytes;
+   res->height0 = 1;
+   res->depth0 = 1;
+   return res;
+}
diff --git a/src/gallium/drivers/i965/brw_resource_texture.c b/src/gallium/drivers/i965/brw_resource_texture.c
new file mode 100644
index 0000000000..ffd0f38672
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_resource_texture.c
@@ -0,0 +1,603 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_format.h"
+
+#include "brw_screen.h"
+#include "brw_defines.h"
+#include "brw_structs.h"
+#include "brw_winsys.h"
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_resource.h"
+
+
+/**
+ * Subclass of pipe_transfer.  NOTE(review): brw_texture_get_transfer() allocates a plain
+ * pipe_transfer, so `offset` must not be touched on transfers created there. */
+struct brw_transfer
+{
+   struct pipe_transfer base;
+
+   unsigned offset;
+};
+/* Unchecked downcast to brw_transfer; only valid if the object was allocated as one. */
+static INLINE struct brw_transfer *
+brw_transfer(struct pipe_transfer *transfer)
+{
+   return (struct brw_transfer *)transfer;
+}
+
+/* Map a gallium PIPE_TEXTURE_* target onto the matching BRW_SURFACE_* surface type. */
+static GLuint translate_tex_target( unsigned target )
+{
+   GLuint surface_type = BRW_SURFACE_1D;   /* also the fallback for unknown targets */
+   switch (target) {
+   case PIPE_TEXTURE_1D:
+      break;
+   case PIPE_TEXTURE_2D:
+      surface_type = BRW_SURFACE_2D;
+      break;
+   case PIPE_TEXTURE_3D:
+      surface_type = BRW_SURFACE_3D;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      surface_type = BRW_SURFACE_CUBE;
+      break;
+   default:
+      assert(0);   /* unexpected target; if the assert is compiled out, 1D is returned */
+   }
+   return surface_type;
+}
+
+/* Map a pipe_format to a BRW_SURFACEFORMAT_* value; BRW_SURFACEFORMAT_INVALID if unmapped. */
+static GLuint translate_tex_format( enum pipe_format pf )
+{
+   switch( pf ) {
+   case PIPE_FORMAT_L8_UNORM:
+      return BRW_SURFACEFORMAT_L8_UNORM;
+
+   case PIPE_FORMAT_I8_UNORM:
+      return BRW_SURFACEFORMAT_I8_UNORM;
+
+   case PIPE_FORMAT_A8_UNORM:
+      return BRW_SURFACEFORMAT_A8_UNORM; 
+
+   case PIPE_FORMAT_L16_UNORM:
+      return BRW_SURFACEFORMAT_L16_UNORM;
+
+      /* XXX: Add these to gallium
+   case PIPE_FORMAT_I16_UNORM:
+      return BRW_SURFACEFORMAT_I16_UNORM;
+
+   case PIPE_FORMAT_A16_UNORM:
+      return BRW_SURFACEFORMAT_A16_UNORM; 
+      */
+
+   case PIPE_FORMAT_L8A8_UNORM:
+      return BRW_SURFACEFORMAT_L8A8_UNORM;
+
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      return BRW_SURFACEFORMAT_B5G6R5_UNORM;
+
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      return BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
+
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      return BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
+
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;   /* NOTE(review): RGBX for a BGRX source, unlike the BGRA case below -- confirm */
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+   /*
+    * Video formats
+    */
+
+   case PIPE_FORMAT_YUYV:
+      return BRW_SURFACEFORMAT_YCRCB_NORMAL;
+
+   case PIPE_FORMAT_UYVY:
+      return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
+
+   /*
+    * Compressed formats.
+    */
+      /* XXX: Add FXT to gallium?
+   case PIPE_FORMAT_FXT1_RGBA:
+      return BRW_SURFACEFORMAT_FXT1;
+      */
+
+   case PIPE_FORMAT_DXT1_RGB:
+       return BRW_SURFACEFORMAT_DXT1_RGB;
+
+   case PIPE_FORMAT_DXT1_RGBA:
+       return BRW_SURFACEFORMAT_BC1_UNORM;
+       
+   case PIPE_FORMAT_DXT3_RGBA:
+       return BRW_SURFACEFORMAT_BC2_UNORM;
+       
+   case PIPE_FORMAT_DXT5_RGBA:
+       return BRW_SURFACEFORMAT_BC3_UNORM;
+
+   /*
+    * sRGB formats
+    */
+
+   case PIPE_FORMAT_A8B8G8R8_SRGB:
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;   /* NOTE(review): channel order differs from the pipe format name -- confirm */
+
+   case PIPE_FORMAT_L8A8_SRGB:
+      return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
+
+   case PIPE_FORMAT_L8_SRGB:
+      return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
+
+   case PIPE_FORMAT_DXT1_SRGB:
+      return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
+
+   /*
+    * Depth formats (each is mapped onto an intensity surface format)
+    */
+
+   case PIPE_FORMAT_Z16_UNORM:
+         return BRW_SURFACEFORMAT_I16_UNORM;
+
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+   case PIPE_FORMAT_Z24X8_UNORM:
+         return BRW_SURFACEFORMAT_I24X8_UNORM;
+
+   case PIPE_FORMAT_Z32_FLOAT:
+         return BRW_SURFACEFORMAT_I32_FLOAT;
+
+      /* XXX: presumably for bump mapping.  Add this to mesa state
+       * tracker?
+       *
+       * XXX: Add flipped versions of these formats to Gallium.
+       */
+   case PIPE_FORMAT_R8G8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+   default:
+      return BRW_SURFACEFORMAT_INVALID;   /* no mapping in this table for the pipe format */
+   }
+}
+
+/* vtbl get_handle: export the texture's BO through the winsys, passing pitch * cpp as stride. */
+static boolean
+brw_texture_get_handle(struct pipe_screen *screen,
+                       struct pipe_resource *texture,
+                       struct winsys_handle *whandle)
+{
+   struct brw_winsys_screen *sws = brw_screen(screen)->sws;
+   struct brw_texture *tex = brw_texture(texture);
+
+   /* Success is reported only when the winsys hook returns PIPE_OK. */
+   if (sws->bo_get_handle(tex->bo, whandle, tex->pitch * tex->cpp) != PIPE_OK)
+      return FALSE;
+   return TRUE;
+}
+
+
+/* vtbl resource_destroy: reset the BO pointer through bo_reference(), then free the texture. */
+static void brw_texture_destroy(struct pipe_screen *screen,
+				struct pipe_resource *pt)
+{
+   struct brw_texture *victim = brw_texture(pt);
+   bo_reference(&victim->bo, NULL);
+   FREE(victim);   /* same pointer as pt: brw_texture() is only a cast */
+}
+
+
+
+/* vtbl is_resource_referenced: report whether the current batch references the
+ * texture's BO or the BO of any view surface for the given face/level.  Any hit is
+ * reported as both a read and a write reference. */
+static unsigned brw_texture_is_referenced( struct pipe_context *pipe,
+					   struct pipe_resource *texture,
+					   unsigned face,
+					   unsigned level )
+{
+   const unsigned busy = PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_screen *bscreen = brw_screen(pipe->screen);
+   struct brw_winsys_buffer *batch_bo = brw->batch->buf;
+   struct brw_texture *tex = brw_texture(texture);
+   struct brw_surface *surf;
+   int list;
+
+   /* XXX: this is subject to false positives if the underlying
+    * texture BO is referenced, we can't tell whether the sub-region
+    * we care about participates in that.
+    */
+   if (bscreen->sws->bo_references( batch_bo, tex->bo ))
+      return busy;
+
+   /* Walk both view lists looking for a surface of this face/level whose
+    * BO differs from the texture's own (that one was tested above) and is
+    * referenced by the batch.
+    */
+   for (list = 0; list < 2; list++) {
+      foreach (surf, &tex->views[list]) {
+         if (surf->bo != tex->bo &&
+             surf->id.bits.face == face &&
+             surf->id.bits.level == level &&
+             bscreen->sws->bo_references( batch_bo, surf->bo))
+            return busy;
+      }
+   }
+
+   return PIPE_UNREFERENCED;
+}
+
+
+/*
+ * Transfer functions
+ */
+
+/* vtbl get_transfer: allocate a pipe_transfer describing `box` of subresource `sr`.
+ * The stride is pitch * cpp of the texture; returns NULL when the allocation fails. */
+static struct pipe_transfer *
+brw_texture_get_transfer(struct pipe_context *context,
+			  struct pipe_resource *resource,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box)
+{
+   struct brw_texture *tex = brw_texture(resource);
+   struct pipe_transfer *xfer = CALLOC_STRUCT(pipe_transfer);
+
+   if (xfer != NULL) {
+      xfer->stride = tex->pitch * tex->cpp;
+      xfer->box = *box;
+      xfer->usage = usage;
+      xfer->sr = sr;
+      xfer->resource = resource;
+   }
+   return xfer;
+}
+
+/* vtbl transfer_map: map the whole texture BO and return the address of the
+ * transfer's box origin inside the selected image; NULL if the map fails. */
+static void *
+brw_texture_transfer_map(struct pipe_context *pipe,
+                 struct pipe_transfer *transfer)
+{
+   struct pipe_resource *resource = transfer->resource;
+   struct brw_texture *tex = brw_texture(transfer->resource);
+   struct brw_winsys_screen *sws = brw_screen(pipe->screen)->sws;
+   const struct pipe_subresource sr = transfer->sr;
+   const struct pipe_box *box = &transfer->box;
+   const enum pipe_format format = resource->format;
+   const boolean for_write = (transfer->usage & PIPE_TRANSFER_WRITE) ? TRUE : FALSE;
+   unsigned image_start;
+   char *base;
+
+   /* Pick the image within the mip level: cube face, 3D slice, or image 0. */
+   switch (resource->target) {
+   case PIPE_TEXTURE_CUBE:
+      image_start = tex->image_offset[sr.level][sr.face];
+      break;
+   case PIPE_TEXTURE_3D:
+      image_start = tex->image_offset[sr.level][box->z];
+      break;
+   default:
+      image_start = tex->image_offset[sr.level][0];
+      assert(sr.face == 0);
+      assert(box->z == 0);
+      break;
+   }
+
+   /* The last two bo_map() flags are always FALSE for texture maps. */
+   base = sws->bo_map(tex->bo, BRW_DATA_OTHER, 0, tex->bo->size,
+                      for_write, FALSE, FALSE);
+   if (base == NULL)
+      return NULL;
+
+   /* Step down whole block rows, then across whole blocks, to reach the box origin. */
+   return base + image_start +
+      box->y / util_format_get_blockheight(format) * transfer->stride +
+      box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+}
+/* vtbl transfer_unmap: call the winsys bo_unmap() hook on the texture's BO. */
+static void
+brw_texture_transfer_unmap(struct pipe_context *pipe,
+                   struct pipe_transfer *transfer)
+{
+   struct brw_winsys_screen *winsys = brw_screen(pipe->screen)->sws;
+   struct brw_winsys_buffer *bo = brw_texture(transfer->resource)->bo;
+
+   winsys->bo_unmap(bo);
+}
+
+
+
+
+/* Dispatch table for texture resources; entries are positional, one per slot named at right. */
+struct u_resource_vtbl brw_texture_vtbl = 
+{
+   brw_texture_get_handle,	      /* get_handle */
+   brw_texture_destroy,	      /* resource_destroy */
+   brw_texture_is_referenced,	      /* is_resource_referenced */
+   brw_texture_get_transfer,	      /* get_transfer */
+   u_default_transfer_destroy,	      /* transfer_destroy */
+   brw_texture_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   brw_texture_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+
+
+
+
+/* Create a texture: choose tiling, lay out the mip tree, allocate the BO, fill surface state. */
+struct pipe_resource *
+brw_texture_create( struct pipe_screen *screen,
+		    const struct pipe_resource *template )
+{  
+   struct brw_screen *bscreen = brw_screen(screen);
+   struct brw_texture *tex;
+   enum brw_buffer_type buffer_type;
+   enum pipe_error ret;
+   GLuint format;
+   
+   tex = CALLOC_STRUCT(brw_texture);
+   if (tex == NULL)
+      return NULL;
+
+   tex->b.b = *template;
+   tex->b.vtbl = &brw_texture_vtbl;
+   pipe_reference_init(&tex->b.b.reference, 1);
+   tex->b.b.screen = screen;
+
+   /* XXX: compressed textures need special treatment here
+    */
+   tex->cpp = util_format_get_blocksize(tex->b.b.format);
+   tex->compressed = util_format_is_s3tc(tex->b.b.format);
+
+   make_empty_list(&tex->views[0]);
+   make_empty_list(&tex->views[1]);
+
+   /* XXX: No tiling with compressed textures??
+    */
+   if (tex->compressed == 0 &&
+       !bscreen->no_tiling) 
+   {
+      if (bscreen->chipset.is_965 &&
+	  util_format_is_depth_or_stencil(template->format))
+	 tex->tiling = BRW_TILING_Y;
+      else
+	 tex->tiling = BRW_TILING_X;
+   } 
+   else {
+      tex->tiling = BRW_TILING_NONE;
+   }
+
+   /* brw_texture_layout() is expected to fill in pitch and total_height, which size the BO. */
+   if (!brw_texture_layout( bscreen, tex ))
+      goto fail;
+
+   /* Scanout or shared textures get the SCANOUT winsys buffer type, all others TEXTURE. */
+   if (template->bind & (PIPE_BIND_SCANOUT |
+                           PIPE_BIND_SHARED)) {
+      buffer_type = BRW_BUFFER_TYPE_SCANOUT;
+   }
+   else {
+      buffer_type = BRW_BUFFER_TYPE_TEXTURE;
+   }
+
+   ret = bscreen->sws->bo_alloc( bscreen->sws,
+                                 buffer_type,
+                                 tex->pitch * tex->total_height * tex->cpp,
+                                 64,
+                                 &tex->bo );
+   if (ret)
+      goto fail;
+
+   tex->ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   tex->ss.ss0.surface_type = translate_tex_target(tex->b.b.target);
+
+   format = translate_tex_format(tex->b.b.format);
+   assert(format != BRW_SURFACEFORMAT_INVALID);   /* NOTE(review): an unmapped format is only caught by this assert */
+   tex->ss.ss0.surface_format = format;
+
+   /* This is ok for all textures with channel width 8bit or less:
+    */
+/*    tex->ss.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
+
+
+   /* XXX: what happens when tex->bo->offset changes???
+    */
+   tex->ss.ss1.base_addr = 0; /* reloc */
+   tex->ss.ss2.mip_count = tex->b.b.last_level;
+   tex->ss.ss2.width = tex->b.b.width0 - 1;   /* width, height, depth and pitch are stored minus one */
+   tex->ss.ss2.height = tex->b.b.height0 - 1;
+
+   switch (tex->tiling) {
+   case BRW_TILING_NONE:
+      tex->ss.ss3.tiled_surface = 0;
+      tex->ss.ss3.tile_walk = 0;
+      break;
+   case BRW_TILING_X:
+      tex->ss.ss3.tiled_surface = 1;
+      tex->ss.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
+      break;
+   case BRW_TILING_Y:
+      tex->ss.ss3.tiled_surface = 1;
+      tex->ss.ss3.tile_walk = BRW_TILEWALK_YMAJOR;
+      break;
+   }
+
+   tex->ss.ss3.pitch = (tex->pitch * tex->cpp) - 1;   /* byte pitch, minus one */
+   tex->ss.ss3.depth = tex->b.b.depth0 - 1;
+
+   tex->ss.ss4.min_lod = 0;
+ 
+   if (tex->b.b.target == PIPE_TEXTURE_CUBE) {   /* cube maps enable all six faces */
+      tex->ss.ss0.cube_pos_x = 1;
+      tex->ss.ss0.cube_pos_y = 1;
+      tex->ss.ss0.cube_pos_z = 1;
+      tex->ss.ss0.cube_neg_x = 1;
+      tex->ss.ss0.cube_neg_y = 1;
+      tex->ss.ss0.cube_neg_z = 1;
+   }
+
+   return &tex->b.b;
+
+fail:
+   bo_reference(&tex->bo, NULL);   /* tex->bo is still NULL here when layout failed */
+   FREE(tex);
+   return NULL;
+}
+
+/* Wrap the winsys buffer named by `whandle` as a texture resource.  Only 2D, single-level,
+ * depth-1, non-S3TC templates are accepted.  Returns NULL on failure; once bo_from_handle()
+ * has succeeded, every failure path releases the buffer reference it handed us. */
+struct pipe_resource *
+brw_texture_from_handle(struct pipe_screen *screen,
+                        const struct pipe_resource *template,
+                        struct winsys_handle *whandle)
+{
+   struct brw_screen *bscreen = brw_screen(screen);
+   struct brw_texture *tex;
+   struct brw_winsys_buffer *buffer = NULL;
+   unsigned tiling;
+   unsigned pitch;
+   GLuint format;
+
+   if (template->target != PIPE_TEXTURE_2D ||
+       template->last_level != 0 ||
+       template->depth0 != 1)
+      return NULL;
+
+   if (util_format_is_s3tc(template->format))
+      return NULL;
+
+   tex = CALLOC_STRUCT(brw_texture);
+   if (!tex)
+      return NULL;
+
+   if (bscreen->sws->bo_from_handle(bscreen->sws, whandle, &pitch, &tiling, &buffer) != PIPE_OK)
+      goto fail;
+
+   tex->b.b = *template;
+   tex->b.vtbl = &brw_texture_vtbl;
+   pipe_reference_init(&tex->b.b.reference, 1);
+   tex->b.b.screen = screen;
+
+   /* XXX: cpp vs. blocksize */
+   tex->cpp = util_format_get_blocksize(tex->b.b.format);
+   tex->tiling = tiling;
+
+   make_empty_list(&tex->views[0]);
+   make_empty_list(&tex->views[1]);
+
+   /* bo_from_handle() succeeded, so failures from here on must release `buffer`. */
+   if (!brw_texture_layout(bscreen, tex))
+      goto fail_bo;
+
+   /* XXX Maybe some more checks? */
+   if ((pitch / tex->cpp) < tex->pitch)
+      goto fail_bo;
+
+   tex->pitch = pitch / tex->cpp;   /* adopt the imported pitch, in cpp-sized units */
+   tex->bo = buffer;   /* the reference now belongs to the texture */
+
+   /* fix this warning */
+#if 0
+   if (tex->size > buffer->size)
+      goto fail_bo;
+#endif
+
+   tex->ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   tex->ss.ss0.surface_type = translate_tex_target(tex->b.b.target);
+
+   format = translate_tex_format(tex->b.b.format);
+   assert(format != BRW_SURFACEFORMAT_INVALID);
+   tex->ss.ss0.surface_format = format;
+
+   /* This is ok for all textures with channel width 8bit or less: */
+/*    tex->ss.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
+
+   /* XXX: what happens when tex->bo->offset changes??? */
+   tex->ss.ss1.base_addr = 0; /* reloc */
+   tex->ss.ss2.mip_count = tex->b.b.last_level;
+   tex->ss.ss2.width = tex->b.b.width0 - 1;
+   tex->ss.ss2.height = tex->b.b.height0 - 1;
+
+   switch (tex->tiling) {
+   case BRW_TILING_NONE:
+      tex->ss.ss3.tiled_surface = 0;
+      tex->ss.ss3.tile_walk = 0;
+      break;
+   case BRW_TILING_X:
+      tex->ss.ss3.tiled_surface = 1;
+      tex->ss.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
+      break;
+   case BRW_TILING_Y:
+      tex->ss.ss3.tiled_surface = 1;
+      tex->ss.ss3.tile_walk = BRW_TILEWALK_YMAJOR;
+      break;
+   }
+
+   tex->ss.ss3.pitch = (tex->pitch * tex->cpp) - 1;
+   tex->ss.ss3.depth = tex->b.b.depth0 - 1;
+
+   tex->ss.ss4.min_lod = 0;
+
+   return &tex->b.b;
+
+fail_bo:
+   bo_reference(&buffer, NULL);   /* previously leaked on layout / pitch failure */
+fail:
+   FREE(tex);
+   return NULL;
+}
+
+/* Compiled out: would report a format as supported iff translate_tex_format() maps it. */
+#if 0
+boolean brw_is_format_supported( struct pipe_screen *screen,
+				 enum pipe_format format,
+				 enum pipe_texture_target target,
+				 unsigned sample_count,
+				 unsigned tex_usage,
+				 unsigned geom_flags )
+{
+   return translate_tex_format(format) != BRW_SURFACEFORMAT_INVALID;
+}
+#endif
diff --git a/src/gallium/drivers/i965/brw_resource_texture_layout.c b/src/gallium/drivers/i965/brw_resource_texture_layout.c
new file mode 100644
index 0000000000..2187bdd82c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_resource_texture_layout.c
@@ -0,0 +1,414 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+
+#include "pipe/p_format.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "brw_resource.h"
+#include "brw_debug.h"
+#include "brw_winsys.h"
+
+/* Code to layout images in a mipmap tree for i965.
+ */
+
+/* Round a pitch given in texels up to the byte alignment required by the
+ * texture's tiling mode.  Compressed textures are returned unchanged.
+ */
+static int 
+brw_tex_pitch_align (struct brw_texture *tex,
+		     int pitch)
+{
+   int byte_align;
+
+   if (tex->compressed)
+      return pitch;
+
+   if (tex->tiling == BRW_TILING_X) {
+      byte_align = 512;
+   }
+   else if (tex->tiling == BRW_TILING_Y) {
+      byte_align = 128;
+   }
+   else {
+      /* XXX: Untiled pitch alignment of 64 bytes for now to allow
+       * render-to-texture to work in all cases. This should
+       * probably be replaced at some point by some scheme to only
+       * do this when really necessary, for example standalone
+       * render target views.
+       */
+      byte_align = 64;
+   }
+
+   pitch = align(pitch * tex->cpp, byte_align);
+   return pitch / tex->cpp;
+}
+
+
+/* Alignment unit (w x h) for a format: 4x4 for DXT formats, else 4x2. */
+static void 
+brw_tex_alignment_unit(enum pipe_format pf, 
+		       GLuint *w, GLuint *h)
+{
+    *w = 4;
+    *h = 2;
+
+    switch (pf) {
+    case PIPE_FORMAT_DXT1_RGB:
+    case PIPE_FORMAT_DXT1_RGBA:
+    case PIPE_FORMAT_DXT3_RGBA:
+    case PIPE_FORMAT_DXT5_RGBA:
+    case PIPE_FORMAT_DXT1_SRGB:
+    case PIPE_FORMAT_DXT1_SRGBA:
+    case PIPE_FORMAT_DXT3_SRGBA:
+    case PIPE_FORMAT_DXT5_SRGBA:
+        *h = 4;
+        break;
+    default:
+        break;
+    }
+}
+
+/* Record the start offset of a mip level and allocate its image offset table. */
+static void 
+brw_tex_set_level_info(struct brw_texture *tex,
+		       GLuint level,
+		       GLuint nr_images,
+		       GLuint x, GLuint y,
+		       GLuint w, GLuint h, GLuint d)
+{
+   assert(tex->image_offset[level] == NULL);
+   assert(nr_images >= 1);
+
+   /* Compute the offset before the debug print so it logs the real value. */
+   tex->level_offset[level] = (x + y * tex->pitch) * tex->cpp;
+   tex->nr_images[level] = nr_images;
+
+   if (BRW_DEBUG & DEBUG_TEXTURE)
+      debug_printf("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+		   level, w, h, d, x, y, tex->level_offset[level]);
+
+   tex->image_offset[level] = MALLOC(nr_images * sizeof(GLuint));
+   tex->image_offset[level][0] = 0;
+}
+
+/* Set the offset of image 'img' within mip level 'level'. */
+static void
+brw_tex_set_image_offset(struct brw_texture *tex,
+			 GLuint level, GLuint img,
+			 GLuint x, GLuint y, 
+			 GLuint offset)
+{
+   assert((x == 0 && y == 0) || img != 0 || level != 0);
+   assert(img < tex->nr_images[level]);
+
+   /* Store first so the debug print reports the new offset, not an unset slot. */
+   tex->image_offset[level][img] = (x + y * tex->pitch) * tex->cpp + offset;
+   if (BRW_DEBUG & DEBUG_TEXTURE)
+      debug_printf("%s level %d img %d pos %d,%d image_offset %x\n",
+		   __FUNCTION__, level, img, x, y, 
+		   tex->image_offset[level][img]);
+}
+
+/* Place level 0 at the origin, level 1 below it, and levels 2 and up in a
+ * column to the right of level 1.  Sets tex->pitch and tex->total_height. */
+static void brw_layout_2d( struct brw_texture *tex )
+{
+   GLuint align_h = 2, align_w = 4;
+   GLuint level;
+   GLuint x = 0;
+   GLuint y = 0;
+   GLuint width = tex->b.b.width0;
+   GLuint height = tex->b.b.height0;
+
+   tex->pitch = tex->b.b.width0;
+   brw_tex_alignment_unit(tex->b.b.format, &align_w, &align_h);
+
+   if (tex->compressed) {
+       tex->pitch = align(tex->b.b.width0, align_w);
+   }
+
+   /* May need to adjust pitch to accommodate the placement of
+    * the 2nd mipmap.  This occurs when the alignment
+    * constraints of mipmap placement push the right edge of the
+    * 2nd mipmap out past the width of its parent.
+    */
+   if (tex->b.b.last_level > 0) {
+       GLuint mip1_width;
+
+       if (tex->compressed) {
+          mip1_width = (align(u_minify(tex->b.b.width0, 1), align_w) + 
+                        align(u_minify(tex->b.b.width0, 2), align_w));
+       } else {
+          mip1_width = (align(u_minify(tex->b.b.width0, 1), align_w) + 
+                        u_minify(tex->b.b.width0, 2));
+       }
+
+       if (mip1_width > tex->pitch) {
+           tex->pitch = mip1_width;
+       }
+   }
+
+   /* Pitch must be a whole number of dwords, even though we
+    * express it in texels.
+    */
+   tex->pitch = brw_tex_pitch_align (tex, tex->pitch);
+   tex->total_height = 0;
+
+   for ( level = 0 ; level <= tex->b.b.last_level ; level++ ) {
+      GLuint img_height;
+
+      brw_tex_set_level_info(tex, level, 1, x, y, width, height, 1);
+
+      if (tex->compressed)
+	 img_height = MAX2(1, height/4);
+      else
+	 img_height = align(height, align_h);
+
+
+      /* Because the images are packed better, the final offset
+       * might not be the maximal one:
+       */
+      tex->total_height = MAX2(tex->total_height, y + img_height);
+
+      /* Layout_below: step right after second mipmap.
+       */
+      if (level == 1) {
+	 x += align(width, align_w);
+      }
+      else {
+	 y += img_height;
+      }
+
+      width  = u_minify(width, 1);
+      height = u_minify(height, 1);
+   }
+}
+
+/* Cube map layout (IGDNG path): six faces per level, consecutive faces qpitch apart. */
+static boolean 
+brw_layout_cubemap_idgng( struct brw_texture *tex )
+{
+   GLuint align_h = 2, align_w = 4;
+   GLuint level;
+   GLuint x = 0;
+   GLuint y = 0;
+   GLuint width = tex->b.b.width0;
+   GLuint height = tex->b.b.height0;
+   GLuint qpitch = 0;
+   GLuint y_pitch = 0;
+
+   tex->pitch = tex->b.b.width0;
+   brw_tex_alignment_unit(tex->b.b.format, &align_w, &align_h);
+   y_pitch = align(height, align_h);
+
+   if (tex->compressed) {
+      tex->pitch = align(tex->b.b.width0, align_w);
+   }
+
+   if (tex->b.b.last_level != 0) {
+      GLuint mip1_width;
+
+      if (tex->compressed) {
+	 mip1_width = (align(u_minify(tex->b.b.width0, 1), align_w) +
+		       align(u_minify(tex->b.b.width0, 2), align_w));
+      } else {
+	 mip1_width = (align(u_minify(tex->b.b.width0, 1), align_w) +
+		       u_minify(tex->b.b.width0, 2));
+      }
+
+      if (mip1_width > tex->pitch) {
+	 tex->pitch = mip1_width;
+      }
+   }
+
+   tex->pitch = brw_tex_pitch_align(tex, tex->pitch);
+   /* qpitch is the offset added per face index when placing images below. */
+   if (tex->compressed) {
+      qpitch = ((y_pitch + 
+		 align(u_minify(y_pitch, 1), align_h) +
+		 11 * align_h) / 4) * tex->pitch * tex->cpp;
+
+      tex->total_height = ((y_pitch + 
+			    align(u_minify(y_pitch, 1), align_h) + 
+			    11 * align_h) / 4) * 6;
+   } else {
+      qpitch = (y_pitch + 
+		align(u_minify(y_pitch, 1), align_h) + 
+		11 * align_h) * tex->pitch * tex->cpp;
+
+      tex->total_height = (y_pitch +
+			   align(u_minify(y_pitch, 1), align_h) +
+			   11 * align_h) * 6;
+   }
+
+   for (level = 0; level <= tex->b.b.last_level; level++) {
+      GLuint img_height;
+      GLuint nr_images = 6;
+      GLuint q = 0;
+
+      brw_tex_set_level_info(tex, level, nr_images, x, y, width, height, 1);
+
+      for (q = 0; q < nr_images; q++)
+	 brw_tex_set_image_offset(tex, level, q, x, y, q * qpitch);
+
+      if (tex->compressed)
+	 img_height = MAX2(1, height/4);
+      else
+	 img_height = align(height, align_h);
+
+      if (level == 1) {
+	 x += align(width, align_w);
+      }
+      else {
+	 y += img_height;
+      }
+
+      width  = u_minify(width, 1);
+      height = u_minify(height, 1);
+   }
+
+   return TRUE;
+}
+
+/* Layout for 3D textures and non-IGDNG cube maps: levels stacked, images packed in rows. */
+static boolean
+brw_layout_3d_cube( struct brw_texture *tex )
+{
+   GLuint width  = tex->b.b.width0;
+   GLuint height = tex->b.b.height0;
+   GLuint depth = tex->b.b.depth0;
+   GLuint pack_x_pitch, pack_x_nr;
+   GLuint pack_y_pitch;
+   GLuint level;
+   GLuint align_h = 2;
+   GLuint align_w = 4;
+
+   tex->total_height = 0;
+   brw_tex_alignment_unit(tex->b.b.format, &align_w, &align_h);
+
+   if (tex->compressed) {
+      tex->pitch = align(width, align_w);
+      pack_y_pitch = (height + 3) / 4;
+   } else {
+      tex->pitch = brw_tex_pitch_align(tex, tex->b.b.width0);
+      pack_y_pitch = align(tex->b.b.height0, align_h);
+   }
+
+   pack_x_pitch = width;
+   pack_x_nr = 1;
+
+   for (level = 0 ; level <= tex->b.b.last_level ; level++) {
+      GLuint nr_images = tex->b.b.target == PIPE_TEXTURE_3D ? depth : 6;
+      GLint x = 0;
+      GLint y = 0;
+      GLint q, j;
+
+      brw_tex_set_level_info(tex, level, nr_images,
+				   0, tex->total_height,
+				   width, height, depth);
+
+      for (q = 0; q < nr_images;) {
+	 for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
+	    brw_tex_set_image_offset(tex, level, q, x, y, 0);
+	    x += pack_x_pitch;
+	 }
+
+	 x = 0;
+	 y += pack_y_pitch;
+      }
+
+      /* This level used y rows; the next level starts at the new total_height. */
+      tex->total_height += y;
+      width  = u_minify(width, 1);
+      height = u_minify(height, 1);
+      depth  = u_minify(depth, 1);
+
+      if (tex->compressed) {
+	 pack_y_pitch = (height + 3) / 4;
+
+	 if (pack_x_pitch > align(width, align_w)) {
+	    pack_x_pitch = align(width, align_w);
+	    pack_x_nr <<= 1;
+	 }
+      } else {
+	 if (pack_x_pitch > 4) {
+	    pack_x_pitch >>= 1;
+	    pack_x_nr <<= 1;
+	    assert(pack_x_pitch * pack_x_nr <= tex->pitch);
+	 }
+
+	 if (pack_y_pitch > 2) {
+	    pack_y_pitch >>= 1;
+	    pack_y_pitch = align(pack_y_pitch, align_h);
+	 }
+      }
+   }
+
+   /* The 965's sampler lays cachelines out according to how accesses
+    * in the texture surfaces run, so they may be "vertical" through
+    * memory.  As a result, the docs say in Surface Padding Requirements:
+    * Sampling Engine Surfaces that two extra rows of padding are required.
+    */
+   if (tex->b.b.target == PIPE_TEXTURE_CUBE)
+      tex->total_height += 2;
+
+   return TRUE;
+}
+
+/* Choose and run the layout routine for the texture's target.  Always
+ * returns GL_TRUE; the helpers' boolean results are not checked. */
+GLboolean brw_texture_layout(struct brw_screen *brw_screen,
+			     struct brw_texture *tex )
+{
+   switch (tex->b.b.target) {
+   case PIPE_TEXTURE_CUBE:
+      if (brw_screen->chipset.is_igdng)
+	 brw_layout_cubemap_idgng( tex );
+      else
+	 brw_layout_3d_cube( tex );
+      break;
+	    
+   case PIPE_TEXTURE_3D:
+      brw_layout_3d_cube( tex );
+      break;
+
+   default:
+      brw_layout_2d( tex );
+      break;
+   }
+
+   if (BRW_DEBUG & DEBUG_TEXTURE)
+      debug_printf("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		   tex->pitch,
+		   tex->total_height,
+		   tex->cpp,
+		   tex->pitch * tex->total_height * tex->cpp );
+
+   return GL_TRUE;
+}
diff --git a/src/gallium/drivers/i965/brw_screen.c b/src/gallium/drivers/i965/brw_screen.c
new file mode 100644
index 0000000000..50a446db91
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.c
@@ -0,0 +1,426 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+
+#include "brw_reg.h"
+#include "brw_context.h"
+#include "brw_screen.h"
+#include "brw_winsys.h"
+#include "brw_debug.h"
+#include "brw_resource.h"
+
+#ifdef DEBUG
+static const struct debug_named_value debug_names[] = {
+   { "tex",   DEBUG_TEXTURE, NULL },
+   { "state", DEBUG_STATE, NULL },
+   { "ioctl", DEBUG_IOCTL, NULL },
+   { "blit",  DEBUG_BLIT, NULL },
+   { "curbe", DEBUG_CURBE, NULL },
+   { "fall",  DEBUG_FALLBACKS, NULL },
+   { "verb",  DEBUG_VERBOSE, NULL },
+   { "bat",   DEBUG_BATCH, NULL },
+   { "pix",   DEBUG_PIXEL, NULL },
+   { "wins",  DEBUG_WINSYS, NULL },
+   { "min",   DEBUG_MIN_URB, NULL },
+   { "dis",   DEBUG_DISASSEM, NULL },
+   { "sync",  DEBUG_SYNC, NULL },
+   { "prim",  DEBUG_PRIMS, NULL },
+   { "vert",  DEBUG_VERTS, NULL },
+   { "dma",   DEBUG_DMA, NULL },
+   { "san",   DEBUG_SANITY, NULL },
+   { "sleep", DEBUG_SLEEP, NULL },
+   { "stats", DEBUG_STATS, NULL },
+   { "sing",  DEBUG_SINGLE_THREAD, NULL },
+   { "thre",  DEBUG_SINGLE_THREAD, NULL },
+   { "wm",    DEBUG_WM, NULL },
+   { "urb",   DEBUG_URB, NULL },
+   { "vs",    DEBUG_VS, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+
+static const struct debug_named_value dump_names[] = {
+   { "asm",   DUMP_ASM, NULL },
+   { "state", DUMP_STATE, NULL },
+   { "batch", DUMP_BATCH, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+
+int BRW_DEBUG = 0;
+int BRW_DUMP = 0;
+
+#endif
+
+
+/*
+ * Probe functions
+ */
+
+
+static const char *
+brw_get_vendor(struct pipe_screen *screen)
+{
+   return "VMware, Inc.";
+}
+
+static const char *
+brw_get_name(struct pipe_screen *screen)
+{
+   static char buffer[128];
+   const char *chipset;
+
+   switch (brw_screen(screen)->chipset.pci_id) {
+   case PCI_CHIP_I965_G:
+      chipset = "I965_G";
+      break;
+   case PCI_CHIP_I965_Q:
+      chipset = "I965_Q";
+      break;
+   case PCI_CHIP_I965_G_1:
+      chipset = "I965_G_1";
+      break;
+   case PCI_CHIP_I946_GZ:
+      chipset = "I946_GZ";
+      break;
+   case PCI_CHIP_I965_GM:
+      chipset = "I965_GM";
+      break;
+   case PCI_CHIP_I965_GME:
+      chipset = "I965_GME";
+      break;
+   case PCI_CHIP_GM45_GM:
+      chipset = "GM45_GM";
+      break;
+   case PCI_CHIP_IGD_E_G:
+      chipset = "IGD_E_G";
+      break;
+   case PCI_CHIP_Q45_G:
+      chipset = "Q45_G";
+      break;
+   case PCI_CHIP_G45_G:
+      chipset = "G45_G";
+      break;
+   case PCI_CHIP_G41_G:
+      chipset = "G41_G";
+      break;
+   case PCI_CHIP_B43_G:
+      chipset = "B43_G";
+      break;
+   case PCI_CHIP_ILD_G:
+      chipset = "ILD_G";
+      break;
+   case PCI_CHIP_ILM_G:
+      chipset = "ILM_G";
+      break;
+   default:
+      chipset = "unknown";
+      break;
+   }
+
+   util_snprintf(buffer, sizeof(buffer), "i965 (chipset: %s)", chipset);
+   return buffer;
+}
+
+static int
+brw_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return 8;
+   case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+      return 8;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return 16; /* XXX correct? */
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_GLSL:
+      return 0;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 0;
+   case PIPE_CAP_POINT_SPRITE:
+      return 0;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 1;
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 0;
+   case PIPE_CAP_TIMER_QUERY:
+      return 0;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return BRW_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return BRW_MAX_TEXTURE_3D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return BRW_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 0;
+   case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+      /* disable for now */
+      return 0;
+   default:
+      return 0;
+   }
+}
+
+static float
+brw_get_paramf(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      return 7.5;
+
+   case PIPE_CAP_MAX_POINT_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      return 255.0;
+
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      return 4.0;
+
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      return 16.0;
+
+   default:
+      return 0;
+   }
+}
+
+static boolean
+brw_is_format_supported(struct pipe_screen *screen,
+                         enum pipe_format format,
+                         enum pipe_texture_target target,
+                         unsigned sample_count,
+                         unsigned tex_usage,
+                         unsigned geom_flags)
+{
+   static const enum pipe_format tex_supported[] = {
+      PIPE_FORMAT_L8_UNORM,
+      PIPE_FORMAT_I8_UNORM,
+      PIPE_FORMAT_A8_UNORM,
+      PIPE_FORMAT_L16_UNORM,
+      /*PIPE_FORMAT_I16_UNORM,*/
+      /*PIPE_FORMAT_A16_UNORM,*/
+      PIPE_FORMAT_L8A8_UNORM,
+      PIPE_FORMAT_B5G6R5_UNORM,
+      PIPE_FORMAT_B5G5R5A1_UNORM,
+      PIPE_FORMAT_B4G4R4A4_UNORM,
+      PIPE_FORMAT_B8G8R8X8_UNORM,
+      PIPE_FORMAT_B8G8R8A8_UNORM,
+      /* video */
+      PIPE_FORMAT_UYVY,
+      PIPE_FORMAT_YUYV,
+      /* compressed */
+      /*PIPE_FORMAT_FXT1_RGBA,*/
+      PIPE_FORMAT_DXT1_RGB,
+      PIPE_FORMAT_DXT1_RGBA,
+      PIPE_FORMAT_DXT3_RGBA,
+      PIPE_FORMAT_DXT5_RGBA,
+      /* sRGB */
+      PIPE_FORMAT_A8B8G8R8_SRGB,
+      PIPE_FORMAT_L8A8_SRGB,
+      PIPE_FORMAT_L8_SRGB,
+      PIPE_FORMAT_DXT1_SRGB,
+      /* depth */
+      PIPE_FORMAT_Z32_FLOAT,
+      PIPE_FORMAT_Z24X8_UNORM,
+      PIPE_FORMAT_Z24_UNORM_S8_USCALED,
+      PIPE_FORMAT_Z16_UNORM,
+      /* signed */
+      PIPE_FORMAT_R8G8_SNORM,
+      PIPE_FORMAT_R8G8B8A8_SNORM,
+      PIPE_FORMAT_NONE  /* list terminator */
+   };
+   static const enum pipe_format render_supported[] = {
+      PIPE_FORMAT_B8G8R8X8_UNORM,
+      PIPE_FORMAT_B8G8R8A8_UNORM,
+      PIPE_FORMAT_B5G6R5_UNORM,
+      PIPE_FORMAT_NONE  /* list terminator */
+   };
+   static const enum pipe_format depth_supported[] = {
+      PIPE_FORMAT_Z32_FLOAT,
+      PIPE_FORMAT_Z24X8_UNORM,
+      PIPE_FORMAT_Z24_UNORM_S8_USCALED,
+      PIPE_FORMAT_Z16_UNORM,
+      PIPE_FORMAT_NONE  /* list terminator */
+   };
+   const enum pipe_format *list;
+   uint i;
+
+   if (sample_count > 1)
+      return FALSE;
+
+   if (tex_usage & PIPE_BIND_DEPTH_STENCIL)
+      list = depth_supported;
+   else if (tex_usage & PIPE_BIND_RENDER_TARGET)
+      list = render_supported;
+   else
+      list = tex_supported;
+
+   for (i = 0; list[i] != PIPE_FORMAT_NONE; i++) {
+      if (list[i] == format)
+         return TRUE;
+   }
+
+   return FALSE;
+}
+
+
+/*
+ * Fence functions
+ */
+
+
+static void
+brw_fence_reference(struct pipe_screen *screen,
+                     struct pipe_fence_handle **ptr,
+                     struct pipe_fence_handle *fence)
+{
+}
+
+static int
+brw_fence_signalled(struct pipe_screen *screen,
+                     struct pipe_fence_handle *fence,
+                     unsigned flags)
+{
+   return 0;                    /* XXX shouldn't this be a boolean? */
+}
+
+static int
+brw_fence_finish(struct pipe_screen *screen,
+                 struct pipe_fence_handle *fence,
+                 unsigned flags)
+{
+   return 0;
+}
+
+
+/*
+ * Generic functions
+ */
+
+
+static void
+brw_destroy_screen(struct pipe_screen *screen)
+{
+   struct brw_screen *bscreen = brw_screen(screen);
+
+   if (bscreen->sws)
+      bscreen->sws->destroy(bscreen->sws);
+
+   FREE(bscreen);
+}
+
+/**
+ * Create a brw_screen for pci_id; NULL on unknown id or allocation failure.
+ */
+struct pipe_screen *
+brw_create_screen(struct brw_winsys_screen *sws, uint pci_id)
+{
+   struct brw_screen *bscreen;
+   struct brw_chipset chipset;
+
+#ifdef DEBUG
+   BRW_DEBUG = debug_get_flags_option("BRW_DEBUG", debug_names, 0);
+   BRW_DEBUG |= debug_get_flags_option("INTEL_DEBUG", debug_names, 0);
+   BRW_DEBUG |= DEBUG_STATS | DEBUG_MIN_URB | DEBUG_WM;
+   /* NOTE(review): the line above forces three flags on in every DEBUG build. */
+   BRW_DUMP = debug_get_flags_option("BRW_DUMP", dump_names, 0);
+#endif
+
+   memset(&chipset, 0, sizeof chipset);
+
+   chipset.pci_id = pci_id;
+
+   switch (pci_id) {
+   case PCI_CHIP_I965_G:
+   case PCI_CHIP_I965_Q:
+   case PCI_CHIP_I965_G_1:
+   case PCI_CHIP_I946_GZ:
+   case PCI_CHIP_I965_GM:
+   case PCI_CHIP_I965_GME:
+      chipset.is_965 = TRUE;
+      break;
+
+   case PCI_CHIP_GM45_GM:
+   case PCI_CHIP_IGD_E_G:
+   case PCI_CHIP_Q45_G:
+   case PCI_CHIP_G45_G:
+   case PCI_CHIP_G41_G:
+   case PCI_CHIP_B43_G:
+      chipset.is_g4x = TRUE;
+      break;
+
+   case PCI_CHIP_ILD_G:
+   case PCI_CHIP_ILM_G:
+      chipset.is_igdng = TRUE;
+      break;
+
+   default:
+      debug_printf("%s: unknown pci id 0x%x, cannot create screen\n", 
+                   __FUNCTION__, pci_id);
+      return NULL;
+   }
+
+   /* Known chipset: allocate the screen and fill in its callbacks. */
+   bscreen = CALLOC_STRUCT(brw_screen);
+   if (!bscreen)
+      return NULL;
+
+   bscreen->chipset = chipset;
+   bscreen->sws = sws;
+   bscreen->base.winsys = NULL;
+   bscreen->base.destroy = brw_destroy_screen;
+   bscreen->base.get_name = brw_get_name;
+   bscreen->base.get_vendor = brw_get_vendor;
+   bscreen->base.get_param = brw_get_param;
+   bscreen->base.get_paramf = brw_get_paramf;
+   bscreen->base.is_format_supported = brw_is_format_supported;
+   bscreen->base.context_create = brw_create_context;
+   bscreen->base.fence_reference = brw_fence_reference;
+   bscreen->base.fence_signalled = brw_fence_signalled;
+   bscreen->base.fence_finish = brw_fence_finish;
+
+   brw_init_screen_resource_functions(bscreen);
+   brw_screen_tex_surface_init(bscreen);
+
+   /* NOTE(review): looks like this tests only whether BRW_NO_TILING is set, not its value - confirm. */
+   bscreen->no_tiling = debug_get_option("BRW_NO_TILING", FALSE) != NULL;
+
+   return &bscreen->base;
+}
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
new file mode 100644
index 0000000000..522a3bf899
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -0,0 +1,102 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_SCREEN_H
+#define BRW_SCREEN_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+
+#include "brw_reg.h"
+#include "brw_structs.h"
+
+struct brw_winsys_screen;
+
+
+/**
+ * Subclass of pipe_screen
+ */
+struct brw_screen
+{
+   struct pipe_screen base;
+   struct brw_chipset chipset;
+   struct brw_winsys_screen *sws;
+   boolean no_tiling;
+};
+
+
+
+union brw_surface_id {
+   struct {
+      unsigned face:3;
+      unsigned zslice:13;
+      unsigned level:16;
+   } bits;
+   unsigned value;
+};
+
+
+struct brw_surface
+{
+   struct pipe_surface base;
+   
+   union brw_surface_id id;
+   unsigned cpp;
+   unsigned pitch;
+   unsigned draw_offset;
+   unsigned tiling;
+
+   struct brw_surface_state ss;
+   struct brw_winsys_buffer *bo;
+   struct brw_surface *next, *prev;
+};
+
+
+
+/*
+ * Cast wrappers
+ */
+static INLINE struct brw_screen *
+brw_screen(struct pipe_screen *pscreen)
+{
+   return (struct brw_screen *) pscreen;
+}
+
+
+static INLINE struct brw_surface *
+brw_surface(struct pipe_surface *surface)
+{
+   return (struct brw_surface *)surface;
+}
+
+unsigned
+brw_surface_pitch( const struct pipe_surface *surface );
+
+void brw_screen_tex_surface_init( struct brw_screen *brw_screen );
+
+
+#endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
new file mode 100644
index 0000000000..f288fdbcd3
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -0,0 +1,261 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_math.h"
+
+#include "pipe/p_screen.h"
+#include "brw_screen.h"
+#include "brw_defines.h"
+#include "brw_resource.h"
+#include "brw_winsys.h"
+
+enum {
+   BRW_VIEW_LINEAR,
+   BRW_VIEW_IN_PLACE
+};
+
+/* Decide whether a surface needs a separate linear view; currently always false. */
+static boolean need_linear_view( struct brw_screen *brw_screen,
+				 struct brw_texture *brw_texture,
+				 union brw_surface_id id,
+				 unsigned usage )
+{
+#if 0
+   /* XXX: what about IDGNG?
+    */
+   if (!BRW_IS_G4X(brw->brw_screen->pci_id))
+   {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+      /* The original gen4 hardware couldn't set up WM surfaces pointing
+       * at an offset within a tile, which can happen when rendering to
+       * anything but the base level of a texture or the +X face/0 depth.
+       * This was fixed with the 4 Series hardware.
+       *
+       * For these original chips, you would have to make the depth and
+       * color destination surfaces include information on the texture
+       * type, LOD, face, and various limits to use them as a destination.
+       *
+       * This is easy in Gallium as surfaces are all backed by
+       * textures, but there's also a nasty requirement that the depth
+       * and the color surfaces all be of the same LOD, which is
+       * harder to get around as we can't look at a surface in
+       * isolation and decide if it's legal.
+       *
+       * Instead, end up being pessimistic and say that for i965,
+       * ... ??
+       */
+      if (brw_tex->tiling != I915_TILING_NONE &&
+	  (brw_tex_image_offset(brw_tex, face, level, zslize) & 4095)) {
+	 if (BRW_DEBUG & DEBUG_VIEW)
+	    debug_printf("%s: need surface view for non-aligned tex image\n",
+			 __FUNCTION__);
+	 return GL_TRUE;
+      }
+   }
+#endif
+
+   /* Tiled 3d textures don't have subsets that look like 2d surfaces.
+    * NOTE(review): nothing here actually tests for that case yet.
+    */
+   /* Everything else should be fine to render to in-place:
+    */
+   return GL_FALSE;
+}
+
+/* Look at all texture views and figure out if any of them need to be
+ * back-copied into the texture for sampling
+ */
+void brw_update_texture( struct brw_screen *brw_screen,
+			 struct brw_texture *tex )
+{
+   /* currently nothing to do */
+}
+
+
+/* Create a new surface with linear layout to serve as a render-target
+ * where it would be illegal (perhaps due to tiling constraints) to do
+ * this in-place.
+ * 
+ * Currently not implemented, not sure if it's needed.
+ */
+static struct brw_surface *create_linear_view( struct brw_screen *brw_screen,
+					       struct brw_texture *tex,
+					       union brw_surface_id id,
+					       unsigned usage )
+{
+   return NULL;
+}
+
+
+/* Create a pipe_surface that just points directly into the existing
+ * texture's storage.  Returns NULL if CALLOC_STRUCT fails.
+ */
+static struct brw_surface *create_in_place_view( struct brw_screen *brw_screen,
+						  struct brw_texture *tex,
+						  union brw_surface_id id,
+						  unsigned usage )
+{
+   struct brw_surface *surface;
+
+   surface = CALLOC_STRUCT(brw_surface);
+   if (surface == NULL)
+      return NULL;
+
+   pipe_reference_init(&surface->base.reference, 1);
+
+   /* XXX: ignoring render-to-slice-of-3d-texture
+    */
+   assert(id.bits.zslice == 0);
+
+   surface->base.format = tex->b.b.format;
+   surface->base.width = u_minify(tex->b.b.width0, id.bits.level);
+   surface->base.height = u_minify(tex->b.b.height0, id.bits.level);
+   surface->base.offset = tex->image_offset[id.bits.level][id.bits.face];
+   surface->base.usage = usage;
+   surface->base.zslice = id.bits.zslice;
+   surface->base.face = id.bits.face;
+   surface->base.level = id.bits.level;
+   surface->id = id;
+   surface->cpp = tex->cpp;
+   surface->pitch = tex->pitch;
+   surface->tiling = tex->tiling;
+
+   bo_reference( &surface->bo, tex->bo );
+   pipe_resource_reference( &surface->base.texture, &tex->b.b );
+
+   surface->ss.ss0.surface_format = tex->ss.ss0.surface_format;
+   surface->ss.ss0.surface_type = BRW_SURFACE_2D;
+   /* Untiled: base_addr is the image offset; tiled: rounded down to a 4096 multiple. */
+   if (tex->tiling == BRW_TILING_NONE) {
+      surface->ss.ss1.base_addr = surface->base.offset;
+   } else {
+      uint32_t tile_offset = surface->base.offset % 4096;
+
+      surface->ss.ss1.base_addr = surface->base.offset - tile_offset;
+
+      if (brw_screen->chipset.is_g4x) {
+	 if (tex->tiling == BRW_TILING_X) {
+	    /* Note that the low bits of these fields are missing, so
+	     * there's the possibility of getting in trouble.
+	     */
+	    surface->ss.ss5.x_offset = (tile_offset % 512) / tex->cpp / 4;
+	    surface->ss.ss5.y_offset = tile_offset / 512 / 2;
+	 } else {
+	    surface->ss.ss5.x_offset = (tile_offset % 128) / tex->cpp / 4;
+	    surface->ss.ss5.y_offset = tile_offset / 128 / 2;
+	 }
+      }
+      else {
+	 assert(tile_offset == 0);
+      }
+   }
+
+#if 0
+   if (region_bo != NULL)
+      surface->ss.ss1.base_addr += region_bo->offset; /* reloc */
+#endif
+
+   surface->ss.ss2.width = surface->base.width - 1;
+   surface->ss.ss2.height = surface->base.height - 1;
+   surface->ss.ss3.tiled_surface = tex->ss.ss3.tiled_surface;
+   surface->ss.ss3.tile_walk = tex->ss.ss3.tile_walk;
+   surface->ss.ss3.pitch = tex->ss.ss3.pitch;
+
+   return surface;
+}
+
+/* Get a surface which is a view into a texture.  An existing view with the
+ * same face/level/zslice is reused; returns NULL if a new one can't be made.
+ */
+static struct pipe_surface *brw_get_tex_surface(struct pipe_screen *screen,
+						struct pipe_resource *pt,
+						unsigned face, unsigned level,
+						unsigned zslice,
+						unsigned usage )
+{
+   struct brw_texture *tex = brw_texture(pt);
+   struct brw_screen *bscreen = brw_screen(screen);
+   struct brw_surface *surface;
+   union brw_surface_id id;
+   int type;
+
+   id.bits.face = face;
+   id.bits.level = level;
+   id.bits.zslice = zslice;
+
+   if (need_linear_view(bscreen, tex, id, usage)) 
+      type = BRW_VIEW_LINEAR;
+   else
+      type = BRW_VIEW_IN_PLACE;
+
+   foreach (surface, &tex->views[type]) {
+      if (id.value == surface->id.value)
+	 return &surface->base;
+   }
+
+   if (type == BRW_VIEW_LINEAR)
+      surface = create_linear_view( bscreen, tex, id, usage );
+   else
+      surface = create_in_place_view( bscreen, tex, id, usage );
+
+   /* Creation can fail (create_linear_view always does): never link NULL. */
+   if (surface == NULL)
+      return NULL;
+
+   insert_at_head( &tex->views[type], surface );
+   return &surface->base;
+}
+
+
+static void brw_tex_surface_destroy( struct pipe_surface *surf )
+{
+   struct brw_surface *surface = brw_surface(surf);
+
+   /* Unreference texture, shared buffer:
+    */
+   remove_from_list(surface);
+   bo_reference(&surface->bo, NULL);
+   pipe_resource_reference( &surface->base.texture, NULL );
+
+
+   FREE(surface);
+}
+
+
+void brw_screen_tex_surface_init( struct brw_screen *brw_screen )
+{
+   brw_screen->base.get_tex_surface = brw_get_tex_surface;
+   brw_screen->base.tex_surface_destroy = brw_tex_surface_destroy;
+}
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
new file mode 100644
index 0000000000..5abf3848ab
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -0,0 +1,216 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+  
+#include "pipe/p_state.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_pipe_rast.h"
+#include "brw_eu.h"
+#include "brw_sf.h"
+#include "brw_state.h"
+
+/* Generate the SF program selected by *key and hand it to the program
+ * cache.  bo_out is passed straight through to brw_upload_cache().
+ */
+static enum pipe_error compile_sf_prog( struct brw_context *brw,
+                                        struct brw_sf_prog_key *key,
+                                        struct brw_winsys_buffer **bo_out )
+{
+   struct brw_sf_compile c;
+   const GLuint *program;
+   GLuint program_size;
+   enum pipe_error ret;
+
+   /* Begin the compilation from a zeroed compile state:
+    */
+   memset(&c, 0, sizeof(c));
+   brw_init_compile(brw, &c.func);
+
+   /* Attributes are packed two to a register, hence the rounding up:
+    */
+   c.key = *key;
+   c.nr_attrs = c.key.nr_attrs;
+   c.nr_setup_attrs = c.key.nr_attrs;
+   c.nr_attr_regs = (c.nr_attrs + 1) / 2;
+   c.nr_setup_regs = (c.nr_setup_attrs + 1) / 2;
+
+   c.prog_data.urb_read_length = c.nr_attr_regs;
+   c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
+
+   /* XXX: should be able to set nr_setup_attrs to nr_attrs-1 -- but
+    * breaks vp-tris.c
+    */
+   if (c.nr_attrs != 1) {
+      /* Which primitive?  Or all three?
+       */
+      switch (key->primitive) {
+      case SF_POINTS:
+         c.nr_verts = 1;
+         if (key->do_point_sprite)
+            brw_emit_point_sprite_setup( &c, GL_TRUE );
+         else
+            brw_emit_point_setup( &c, GL_TRUE );
+         break;
+      case SF_LINES:
+         c.nr_verts = 2;
+         brw_emit_line_setup( &c, GL_TRUE );
+         break;
+      case SF_TRIANGLES:
+         c.nr_verts = 3;
+         brw_emit_tri_setup( &c, GL_TRUE );
+         break;
+      case SF_UNFILLED_TRIS:
+         c.nr_verts = 3;
+         brw_emit_anyprim_setup( &c );
+         break;
+      default:
+         assert(0);
+         return PIPE_ERROR_BAD_INPUT;
+      }
+   }
+   else {
+      /* Special case when there are no attributes to setup. */
+      c.nr_verts = 0;
+      brw_emit_null_setup( &c );
+   }
+
+   /* Fetch the generated instructions:
+    */
+   ret = brw_get_program(&c.func, &program, &program_size);
+   if (ret)
+      return ret;
+
+   /* Upload, using the program key as the cache key:
+    */
+   ret = brw_upload_cache( &brw->cache, BRW_SF_PROG,
+                           &c.key, sizeof(c.key),
+                           NULL, 0,
+                           program, program_size,
+                           &c.prog_data,
+                           &brw->sf.prog_data,
+                           bo_out);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+
+/* Calculate interpolants for triangle and line rasterization: build the
+ * SF program key from current state and fetch (or compile) the program.
+ */
+static enum pipe_error upload_sf_prog(struct brw_context *brw)
+{
+   const struct brw_fs_signature *sig = &brw->curr.fragment_shader->signature;
+   const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
+   struct brw_sf_prog_key key;
+   enum pipe_error ret;
+   unsigned i;
+
+   memset(&key, 0, sizeof(key));
+
+   /* Populate the key, noting state dependencies:
+    */
+
+   /* PIPE_NEW_FRAGMENT_SIGNATURE
+    * XXX: Add one to account for the position input. */
+   key.nr_attrs = sig->nr_inputs + 1;
+
+   /* XXX: why is position required to be linear?  why do we care
+    * about it at all?
+    */
+   key.linear_attrs = 1;        /* position -- but why? */
+
+   /* Input i is bit i+1; use 1u, as a signed 1 << 31 is undefined. */
+   for (i = 0; i < sig->nr_inputs; i++) {
+      switch (sig->input[i].interp) {
+      case TGSI_INTERPOLATE_CONSTANT:
+         break;
+      case TGSI_INTERPOLATE_LINEAR:
+         key.linear_attrs |= 1u << (i+1);
+         break;
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+         key.persp_attrs |= 1u << (i+1);
+         break;
+      }
+   }
+
+   /* BRW_NEW_REDUCED_PRIMITIVE (anything else leaves SF_POINTS, i.e. 0) */
+   switch (brw->reduced_primitive) {
+   case PIPE_PRIM_TRIANGLES:
+      /* PIPE_NEW_RAST
+       */
+      if (rast->fill_front != PIPE_POLYGON_MODE_FILL ||
+	  rast->fill_back != PIPE_POLYGON_MODE_FILL)
+	 key.primitive = SF_UNFILLED_TRIS;
+      else
+	 key.primitive = SF_TRIANGLES;
+      break;
+   case PIPE_PRIM_LINES:
+      key.primitive = SF_LINES;
+      break;
+   case PIPE_PRIM_POINTS:
+      key.primitive = SF_POINTS;
+      break;
+   }
+
+   key.do_point_sprite = rast->sprite_coord_enable ? 1 : 0;
+   key.sprite_origin_lower_left = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
+   key.point_coord_replace_attrs = rast->sprite_coord_enable;
+   key.do_flat_shading = rast->flatshade;
+   key.do_twoside_color = rast->light_twoside;
+
+   if (key.do_twoside_color) {
+      key.frontface_ccw = rast->front_ccw;
+   }
+
+   if (brw_search_cache(&brw->cache, BRW_SF_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->sf.prog_data,
+                        &brw->sf.prog_bo))
+      return PIPE_OK;
+
+   ret = compile_sf_prog( brw, &key, &brw->sf.prog_bo );
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+
+/* SF program state atom: the dirty flags mirror what upload_sf_prog reads. */
+const struct brw_tracked_state brw_sf_prog = {
+   .dirty = {
+      .mesa  = (PIPE_NEW_RAST | PIPE_NEW_FRAGMENT_SIGNATURE),
+      .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
+      .cache = 0
+   },
+   .prepare = upload_sf_prog
+};
+
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
new file mode 100644
index 0000000000..a895c7d2f6
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -0,0 +1,122 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+   
+
+#ifndef BRW_SF_H
+#define BRW_SF_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+/* Values for brw_sf_prog_key.primitive, which is only two bits wide. */
+#define SF_POINTS    0
+#define SF_LINES     1
+#define SF_TRIANGLES 2
+#define SF_UNFILLED_TRIS   3
+
+struct brw_sf_prog_key {
+   /* Bitmasks of perspective and linear interpolated inputs, 0..nr.  As
+    * filled in by upload_sf_prog: bit 0 is position, bit i+1 is input i.
+    */
+   GLuint persp_attrs:32;
+   GLuint linear_attrs:32;
+   GLuint point_coord_replace_attrs:32;   /* rast->sprite_coord_enable */
+
+   GLuint nr_attrs:8;                     /* fragment inputs + 1 for position */
+   GLuint primitive:2;                    /* one of the SF_* values */
+   GLuint do_twoside_color:1;
+   GLuint do_flat_shading:1;
+   GLuint frontface_ccw:1;                /* only filled in when do_twoside_color */
+   GLuint do_point_sprite:1;
+   GLuint sprite_origin_lower_left:1;
+   GLuint pad:17;                         /* 8 + 2 + 5 flag bits + 17 = 32 bits */
+   /* Color attribute slots; 0 means absent.  upload_sf_prog leaves them 0. */
+   GLuint attr_col0:8;
+   GLuint attr_col1:8;
+   GLuint attr_bfc0:8;
+   GLuint attr_bfc1:8;
+};
+
+struct brw_sf_point_tex {   /* NOTE(review): unused in brw_sf.c / brw_sf_emit.c */
+   GLboolean CoordReplace;	
+};
+
+struct brw_sf_compile {
+   struct brw_compile func;
+   struct brw_sf_prog_key key;
+   struct brw_sf_prog_data prog_data;
+   /* Values computed by the fixed function unit (assigned in alloc_regs): */
+   struct brw_reg pv;
+   struct brw_reg det;
+   struct brw_reg dx0;
+   struct brw_reg dx2;
+   struct brw_reg dy0;
+   struct brw_reg dy2;
+
+   /* z and 1/w passed in separately:
+    */
+   struct brw_reg z[3];
+   struct brw_reg inv_w[3];
+   
+   /* The vertices:
+    */
+   struct brw_reg vert[3];
+
+    /* Temporaries, allocated after last vertex reg.
+    */
+   struct brw_reg inv_det;
+   struct brw_reg a1_sub_a0;
+   struct brw_reg a2_sub_a0;
+   struct brw_reg tmp;
+   /* Message registers m1..m3 holding the coefficients sent to the URB: */
+   struct brw_reg m1Cx;
+   struct brw_reg m2Cy;
+   struct brw_reg m3C0;
+
+   GLuint nr_verts;          /* 1, 2 or 3; 0 for the null setup */
+   GLuint nr_attrs;
+   GLuint nr_attr_regs;      /* (nr_attrs + 1) / 2 */
+   GLuint nr_setup_attrs;
+   GLuint nr_setup_regs;     /* (nr_setup_attrs + 1) / 2 */
+
+   GLuint point_coord_replace_mask;   /* NOTE(review): unused in brw_sf_emit.c */
+};
+
+/* Emitters in brw_sf_emit.c; 'allocate' says whether alloc_regs() is run. */
+void brw_emit_null_setup( struct brw_sf_compile *c );
+void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_anyprim_setup( struct brw_sf_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
new file mode 100644
index 0000000000..497634ec9e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -0,0 +1,764 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+   
+
+#include "brw_batchbuffer.h"
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_sf.h"
+
+/* Register region of attribute 'attr' of a vertex: two vec4s per register. */
+static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
+				    struct brw_reg vert,
+				    GLuint attr)
+{
+   const GLuint reg_nr = vert.nr + (attr >> 1);
+   const GLuint subreg = (attr & 1) ? 4 : 0;
+
+   return brw_vec4_grf(reg_nr, subreg);
+}
+
+
+/***********************************************************************
+ * Twoside lighting
+ * Overwrite a vertex's front colors with its back-face colors, for each
+ * color pair where both attribute slots are non-zero.
+ */
+static void copy_bfc( struct brw_sf_compile *c,
+		      struct brw_reg vert )
+{
+   const struct brw_sf_prog_key *key = &c->key;
+
+   if (key->attr_col0 && key->attr_bfc0)
+      brw_MOV(&c->func, get_vert_attr(c, vert, key->attr_col0),
+	      get_vert_attr(c, vert, key->attr_bfc0));
+
+   if (key->attr_col1 && key->attr_bfc1)
+      brw_MOV(&c->func, get_vert_attr(c, vert, key->attr_col1),
+	      get_vert_attr(c, vert, key->attr_bfc1));
+}
+
+/* Emit: if det compares as back-facing, run copy_bfc() on every vertex. */
+static void do_twoside_color( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_insn;
+   GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;
+
+   /* Already done in clip program:
+    */
+   if (c->key.primitive == SF_UNFILLED_TRIS)
+      return;
+
+   /* XXX: What happens if BFC isn't present?  This could only happen
+    * for user-supplied vertex programs, as t_vp_build.c always does
+    * the right thing.
+    */
+   if (!(c->key.attr_col0 && c->key.attr_bfc0) &&
+       !(c->key.attr_col1 && c->key.attr_bfc1))
+      return;
+   
+   /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order
+    * to get all channels active inside the IF.  In the clipping code
+    * we run with NoMask, so it's not an option and we can use
+    * BRW_EXECUTE_1 for all comparisons.
+    */
+   brw_push_insn_state(p);
+   brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
+   if_insn = brw_IF(p, BRW_EXECUTE_4); 
+   {
+      switch (c->nr_verts) {
+      case 3: copy_bfc(c, c->vert[2]);	/* fall through */
+      case 2: copy_bfc(c, c->vert[1]);	/* fall through */
+      case 1: copy_bfc(c, c->vert[0]);
+      }
+   }
+   brw_ENDIF(p, if_insn);
+   brw_pop_insn_state(p);
+}
+
+
+
+/***********************************************************************
+ * Flat shading
+ */
+/* NOTE(review): this mask is not referenced anywhere else in this file. */
+#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \
+                                 (1<<VERT_RESULT_COL1))
+/* Copy each color attribute with a non-zero slot from vertex src to dst. */
+static void copy_colors( struct brw_sf_compile *c,
+		     struct brw_reg dst,
+		     struct brw_reg src)
+{
+   const struct brw_sf_prog_key *key = &c->key;
+   struct brw_compile *p = &c->func;
+
+   if (key->attr_col0) {
+      brw_MOV(p, get_vert_attr(c, dst, key->attr_col0),
+	      get_vert_attr(c, src, key->attr_col0));
+   }
+
+   if (key->attr_col1) {
+      brw_MOV(p, get_vert_attr(c, dst, key->attr_col1),
+	      get_vert_attr(c, src, key->attr_col1));
+   }
+}
+
+
+
+/* Need to use a computed jump to copy flatshaded attributes as the
+ * vertices are ordered according to y-coordinate before reaching this
+ * point, so the PV could be anywhere.
+ */
+static void do_flatshade_triangle( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg ip = brw_ip_reg();
+   GLuint jmpi = 1;	/* scale applied to every jump distance; 2 on IGDNG */
+   GLuint nr = 0;	/* number of MOVs each copy_colors() call emits */
+
+   if (c->key.attr_col0)
+      nr++;
+
+   if (c->key.attr_col1)
+      nr++;
+
+   if (nr == 0)
+      return;
+
+   /* Already done in clip program:
+    */
+   if (c->key.primitive == SF_UNFILLED_TRIS)
+      return;
+
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
+   brw_push_insn_state(p);
+   /* Skip pv blocks, each 2*nr MOVs plus one JMPI, to reach the PV's code: */
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
+   brw_JMPI(p, ip, ip, c->pv);
+   /* pv == 0: spread vert[0]'s colors, then jump over both other blocks. */
+   copy_colors(c, c->vert[1], c->vert[0]);
+   copy_colors(c, c->vert[2], c->vert[0]);
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1)));
+   /* pv == 1: spread vert[1]'s colors, then jump over the last block. */
+   copy_colors(c, c->vert[0], c->vert[1]);
+   copy_colors(c, c->vert[2], c->vert[1]);
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2));
+   /* pv == 2: spread vert[2]'s colors and fall out of the bottom. */
+   copy_colors(c, c->vert[0], c->vert[2]);
+   copy_colors(c, c->vert[1], c->vert[2]);
+
+   brw_pop_insn_state(p);
+}
+
+/* Two-vertex flat shading: a computed jump picks which vertex is copied. */
+static void do_flatshade_line( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg ip = brw_ip_reg();
+   GLuint jmpi = 1;	/* scale applied to every jump distance; 2 on IGDNG */
+   GLuint nr = 0;	/* number of MOVs each copy_colors() call emits */
+
+   if (c->key.attr_col0)
+      nr++;
+
+   if (c->key.attr_col1)
+      nr++;
+
+   if (nr == 0)
+      return;
+
+   /* Already done in clip program: 
+    */
+   if (c->key.primitive == SF_UNFILLED_TRIS)
+      return;
+
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
+   brw_push_insn_state(p);
+   /* Skip pv blocks; the first is nr MOVs plus the JMPI that ends it: */
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
+   brw_JMPI(p, ip, ip, c->pv);
+   copy_colors(c, c->vert[1], c->vert[0]);
+
+   brw_JMPI(p, ip, ip, brw_imm_ud(jmpi*nr));	/* jump over the second block */
+   copy_colors(c, c->vert[0], c->vert[1]);
+
+   brw_pop_insn_state(p);
+}
+
+	
+
+/***********************************************************************
+ * Triangle setup.
+ */
+
+/* Assign hardware registers to every value the SF program touches:
+ * the payload in g1/g2, one block of registers per vertex from g3 on,
+ * temporaries after the last vertex, and m1..m3 for the outputs.
+ */
+static void alloc_regs( struct brw_sf_compile *c )
+{
+   GLuint next_grf = 3;
+   GLuint v;
+
+   /* Values computed by fixed function unit, all in g1:
+    */
+   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
+   c->det = brw_vec1_grf(1, 2);
+   c->dx0 = brw_vec1_grf(1, 3);
+   c->dx2 = brw_vec1_grf(1, 4);
+   c->dy0 = brw_vec1_grf(1, 5);
+   c->dy2 = brw_vec1_grf(1, 6);
+
+   /* z and 1/w are passed in separately, interleaved per vertex in g2:
+    */
+   for (v = 0; v < 3; v++) {
+      c->z[v]     = brw_vec1_grf(2, 2 * v);
+      c->inv_w[v] = brw_vec1_grf(2, 2 * v + 1);
+   }
+
+   /* The vertices, nr_attr_regs registers apiece:
+    */
+   for (v = 0; v < c->nr_verts; v++) {
+      c->vert[v] = brw_vec8_grf(next_grf, 0);
+      next_grf += c->nr_attr_regs;
+   }
+
+   /* Temporaries, allocated after last vertex reg.
+    */
+   c->inv_det   = brw_vec1_grf(next_grf + 0, 0);
+   c->a1_sub_a0 = brw_vec8_grf(next_grf + 1, 0);
+   c->a2_sub_a0 = brw_vec8_grf(next_grf + 2, 0);
+   c->tmp       = brw_vec8_grf(next_grf + 3, 0);
+   next_grf += 4;
+
+   /* Note grf allocation: */
+   c->prog_data.total_grf = next_grf;
+
+   /* Outputs of this program - interpolation coefficients for
+    * rasterization:
+    */
+   c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
+   c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
+   c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
+}
+
+/* Copy each vertex's payload z and 1/w into elements 2..3 of the vertex. */
+static void copy_z_inv_w( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   GLuint v;
+
+   brw_push_insn_state(p);
+   /* z[v] and inv_w[v] sit side by side, so one 2-wide MOV moves both. */
+   for (v = 0; v < c->nr_verts; v++) {
+      struct brw_reg dst = vec2(suboffset(c->vert[v], 2));
+      brw_MOV(p, dst, vec2(c->z[v]));
+   }
+
+   brw_pop_insn_state(p);
+}
+
+/* Emit the reciprocal of the determinant: c->inv_det = INV(c->det). */
+static void invert_det( struct brw_sf_compile *c)
+{
+   struct brw_compile *p = &c->func;
+
+   /* Looks like we invert all 8 elements just to get 1/det in
+    * position 2 !?!
+    */
+   brw_math(p, c->inv_det,
+	    BRW_MATH_FUNCTION_INV,
+	    BRW_MATH_SATURATE_NONE,
+	    0,
+	    c->det,
+	    BRW_MATH_DATA_SCALAR,
+	    BRW_MATH_PRECISION_FULL);
+}
+
+
+/* Two attributes packed into a wide register.  Figure out if either
+ * or both of them need linear/perspective interpolation.  Constant
+ * regs are left as-is.
+ * Outputs are predicate flag values: 0x0f covers the first attribute
+ * of the pair, 0xf0 the second.  Returns true for the last setup reg.
+ */
+static GLboolean calculate_masks( struct brw_sf_compile *c,
+				  GLuint reg,
+				  GLushort *pc,
+				  GLushort *pc_persp,
+				  GLushort *pc_linear)
+{
+   const GLuint attr0 = reg * 2;
+   const GLuint attr1 = attr0 + 1;
+   const GLuint persp_mask = c->key.persp_attrs;
+   const GLuint linear_mask = (c->key.persp_attrs | c->key.linear_attrs);
+
+   *pc_persp = 0;
+   *pc_linear = 0;
+   *pc = 0xf;
+
+   /* Shift an unsigned 1: "1 << 31" overflows a signed int (undefined). */
+   if (persp_mask & (1u << attr0))
+      *pc_persp = 0xf;
+   if (linear_mask & (1u << attr0))
+      *pc_linear = 0xf;
+
+   /* Maybe only process one attribute on the final round: */
+   if (attr1 < c->nr_setup_attrs) {
+      *pc |= 0xf0;
+      if (persp_mask & (1u << attr1))
+	 *pc_persp |= 0xf0;
+      if (linear_mask & (1u << attr1))
+	 *pc_linear |= 0xf0;
+   }
+
+   return reg == c->nr_setup_regs - 1;
+}
+
+/* Emit a program that is a single one-register URB write with EOT set. */
+void brw_emit_null_setup( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+
+   /* m0 is implicitly copied from r0 in the send instruction:
+    */	 
+   brw_urb_WRITE(p, 
+                 brw_null_reg(),
+                 0,
+                 brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
+                 0, 	/* allocate */
+                 1,	/* used */
+                 1, 	/* msg len */
+                 0,	/* response len */
+                 1,	/* eot */
+                 1, 	/* writes complete */
+                 0,	/* offset */
+                 BRW_URB_SWIZZLE_TRANSPOSE); 
+}
+/* Triangle setup; 'allocate' selects whether alloc_regs() is run first. */
+void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+   c->nr_verts = 3;
+
+   if (allocate)
+      alloc_regs(c);
+
+   invert_det(c);
+   copy_z_inv_w(c);
+
+   if (c->key.do_twoside_color) 
+      do_twoside_color(c);
+
+   if (c->key.do_flat_shading)
+      do_flatshade_triangle(c);
+      
+   /* One pass, ending in one URB write, per register (attribute pair): */
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      /* Pair of incoming attributes:
+       */
+      struct brw_reg a0 = offset(c->vert[0], i);
+      struct brw_reg a1 = offset(c->vert[1], i);
+      struct brw_reg a2 = offset(c->vert[2], i);
+      GLushort pc, pc_persp, pc_linear;
+      GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+      /* Scale perspective attributes by each vertex's 1/w: */
+      if (pc_persp)
+      {
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+	 brw_MUL(p, a1, a1, c->inv_w[1]);
+	 brw_MUL(p, a2, a2, c->inv_w[2]);
+      }
+      
+      
+      /* Calculate coefficients for interpolated values:
+       */      
+      if (pc_linear)
+      {
+	 brw_set_predicate_control_flag_value(p, pc_linear);
+
+	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
+	 brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
+
+	 /* calculate dA/dx -- the MUL to null presumably seeds the
+	  * accumulator that the following MAC adds to (TODO confirm) */
+	 brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
+	 brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
+	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
+		
+	 /* calculate dA/dy
+	  */
+	 brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
+	 brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
+	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
+      }
+
+      {
+	 brw_set_predicate_control_flag_value(p, pc); 
+	 /* start point for interpolation
+	  */
+	 brw_MOV(p, c->m3C0, a0);
+      
+	 /* Copy m0..m3 to URB.  m0 is implicitly copied from r0 in
+	  * the send instruction:
+	  */	 
+	 brw_urb_WRITE(p, 
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
+		       0, 	/* allocate */
+		       1,	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last,	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
+      }
+   }
+}
+
+/* Line setup: the two-vertex counterpart of brw_emit_tri_setup().
+ * 'allocate' selects whether alloc_regs() is run first. */
+void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+
+   c->nr_verts = 2;
+
+   if (allocate)
+      alloc_regs(c);
+
+   invert_det(c);
+   copy_z_inv_w(c);
+
+   if (c->key.do_flat_shading)
+      do_flatshade_line(c);
+
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      /* Pair of incoming attributes:
+       */
+      struct brw_reg a0 = offset(c->vert[0], i);
+      struct brw_reg a1 = offset(c->vert[1], i);
+      GLushort pc, pc_persp, pc_linear;
+      GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+      if (pc_persp)
+      {
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+	 brw_MUL(p, a1, a1, c->inv_w[1]);
+      }
+
+      /* Calculate coefficients for position, color:
+       */
+      if (pc_linear) {
+	 brw_set_predicate_control_flag_value(p, pc_linear); 
+
+	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
+	 /* m1Cx = (a1 - a0) * dx0 * inv_det */
+ 	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0); 
+	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
+	 /* m2Cy = (a1 - a0) * dy0 * inv_det */
+	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
+	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
+      }
+
+      {
+	 brw_set_predicate_control_flag_value(p, pc); 
+
+	 /* start point for interpolation
+	  */
+	 brw_MOV(p, c->m3C0, a0);
+
+	 /* Copy m0..m3 to URB. 
+	  */
+	 brw_urb_WRITE(p, 
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0),
+		       0, 	/* allocate */
+		       1, 	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last, 	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* urb destination offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE); 
+      }
+   } 
+}
+/* Point sprite setup.  Attributes flagged in point_coord_replace_attrs get
+ * generated sprite coordinates; the rest are passed through as constants.
+ */
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+   c->nr_verts = 1;
+
+   if (allocate)
+      alloc_regs(c);
+
+   copy_z_inv_w(c);
+
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      /* XXX: only seems to check point_coord_replace_attrs for every
+       * second attribute?!? */
+      boolean coord_replace = !!(c->key.point_coord_replace_attrs & (1u<<(2*i)));
+      struct brw_reg a0 = offset(c->vert[0], i);
+      GLushort pc, pc_persp, pc_linear;
+      GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+      /* Only passed-through attributes need the 1/w scaling (as in
+       * brw_emit_point_setup); a replaced coordinate never reads a0 again.
+       */
+      if (pc_persp && !coord_replace) {
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+      }
+
+      if (coord_replace) {
+	 /* Calculate 1.0/PointWidth */
+	 brw_math(&c->func,
+		  c->tmp,
+		  BRW_MATH_FUNCTION_INV,
+		  BRW_MATH_SATURATE_NONE,
+		  0,
+		  c->dx0,
+		  BRW_MATH_DATA_SCALAR,
+		  BRW_MATH_PRECISION_FULL);
+
+	 if (c->key.sprite_origin_lower_left) {
+	    brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	    brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
+	    brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	 }
+	 else {
+	    brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	    brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	 }
+      }
+      else {
+	 brw_MOV(p, c->m1Cx, brw_imm_ud(0));
+	 brw_MOV(p, c->m2Cy, brw_imm_ud(0));
+      }
+
+      {
+	 brw_set_predicate_control_flag_value(p, pc);
+	 if (coord_replace) {
+	    if (c->key.sprite_origin_lower_left) {
+	       brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
+	       brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
+	    }
+	    else {
+	       brw_MOV(p, c->m3C0, brw_imm_f(0.0));
+	    }
+	 }
+	 else {
+	    brw_MOV(p, c->m3C0, a0); /* constant value */
+	 }
+
+	 /* Copy m0..m3 to URB. */
+	 brw_urb_WRITE(p,
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0),
+		       0, 	/* allocate */
+		       1,	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last, 	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* urb destination offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+}
+
+/* Points setup - several simplifications as all attributes are
+ * constant across the face of the point (point sprites excluded!)
+ * 'allocate' selects whether alloc_regs() is run first.
+ */
+void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+   c->nr_verts = 1;
+   
+   if (allocate)
+      alloc_regs(c);
+
+   copy_z_inv_w(c);
+
+   brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero delta, set once before the loop */
+   brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero delta, set once before the loop */
+
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      struct brw_reg a0 = offset(c->vert[0], i);
+      GLushort pc, pc_persp, pc_linear;
+      GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+            
+      if (pc_persp)
+      {				
+	 /* This seems odd as the values are all constant, but the
+	  * fragment shader will be expecting it:
+	  */
+	 brw_set_predicate_control_flag_value(p, pc_persp);
+	 brw_MUL(p, a0, a0, c->inv_w[0]);
+      }
+
+      /* The delta values are always zero, just send the starting
+       * coordinate.  Again, this is to fit in with the interpolation
+       * code in the fragment shader.
+       */
+      {
+	 brw_set_predicate_control_flag_value(p, pc); 
+
+	 brw_MOV(p, c->m3C0, a0); /* constant value */
+
+	 /* Copy m0..m3 to URB. 
+	  */
+	 brw_urb_WRITE(p, 
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0),
+		       0, 	/* allocate */
+		       1,	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last, 	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* urb destination offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+}
+/* One program for any primitive: test the payload, run the matching setup. */
+void brw_emit_anyprim_setup( struct brw_sf_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg ip = brw_ip_reg();
+   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
+   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0); 
+   struct brw_reg primmask;
+   struct brw_instruction *jmp;
+   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+   
+   GLuint saveflag;
+
+   c->nr_verts = 3;
+   alloc_regs(c);
+
+   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
+   /* primmask = 1 << payload_prim, kept in element 0 of the tmp register: */
+   brw_MOV(p, primmask, brw_imm_ud(1));
+   brw_SHL(p, primmask, primmask, payload_prim);
+   /* No triangle-type bit set?  Presumably the JMPI then skips the block: */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
+					       (1<<_3DPRIM_TRISTRIP) |
+					       (1<<_3DPRIM_TRIFAN) |
+					       (1<<_3DPRIM_TRISTRIP_REVERSE) |
+					       (1<<_3DPRIM_POLYGON) |
+					       (1<<_3DPRIM_RECTLIST) |
+					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+   {
+      saveflag = p->flag_value;
+      brw_push_insn_state(p); 
+      brw_emit_tri_setup( c, GL_FALSE );
+      brw_pop_insn_state(p);
+      p->flag_value = saveflag;
+      /* note - thread killed in subroutine, so must
+       * restore the flag which is changed when building
+       * the subroutine. fix #13240
+       */
+   }
+   brw_land_fwd_jump(p, jmp);
+   /* Same test for the line primitive types: */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
+					       (1<<_3DPRIM_LINESTRIP) |
+					       (1<<_3DPRIM_LINELOOP) |
+					       (1<<_3DPRIM_LINESTRIP_CONT) |
+					       (1<<_3DPRIM_LINESTRIP_BF) |
+					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+   {
+      saveflag = p->flag_value;
+      brw_push_insn_state(p); 
+      brw_emit_line_setup( c, GL_FALSE );
+      brw_pop_insn_state(p);
+      p->flag_value = saveflag;
+      /* note - thread killed in subroutine */
+   }
+   brw_land_fwd_jump(p, jmp); 
+   /* Same test on the payload's sprite-point enable bit: */
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+   {
+      saveflag = p->flag_value;
+      brw_push_insn_state(p); 
+      brw_emit_point_sprite_setup( c, GL_FALSE );
+      brw_pop_insn_state(p);
+      p->flag_value = saveflag;
+   }
+   brw_land_fwd_jump(p, jmp); 
+   /* Nothing above matched: plain point setup. */
+   brw_emit_point_setup( c, GL_FALSE );
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
new file mode 100644
index 0000000000..6c299a86b4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -0,0 +1,331 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+   
+#include "util/u_math.h"
+
+#include "pipe/p_state.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
+
+/**
+ * Build the SF viewport state (viewport scale/translate plus the scissor
+ * rectangle) from the current context state and obtain a buffer for it
+ * from the state cache.
+ *
+ * Returns PIPE_OK on success, otherwise the error reported by
+ * brw_cache_data().
+ */
+static enum pipe_error upload_sf_vp(struct brw_context *brw)
+{
+   /* PIPE_NEW_VIEWPORT, PIPE_NEW_SCISSOR */
+   const struct pipe_viewport_state *viewport = &brw->curr.viewport;
+   const struct pipe_scissor_state *sc = &brw->curr.scissor;
+   struct brw_sf_viewport sfv;
+   enum pipe_error err;
+
+   /* Zero everything first: the whole struct is used as cache key data. */
+   memset(&sfv, 0, sizeof(sfv));
+
+   /* Scissor rectangle.  The max values are stored as (max - 1); the
+    * original author marked these with "?".
+    */
+   sfv.scissor.xmin = sc->minx;
+   sfv.scissor.ymin = sc->miny;
+   sfv.scissor.xmax = sc->maxx - 1; /* ? */
+   sfv.scissor.ymax = sc->maxy - 1; /* ? */
+
+   /* Viewport transform: scale terms, then translation terms. */
+   sfv.viewport.m00 = viewport->scale[0];
+   sfv.viewport.m11 = viewport->scale[1];
+   sfv.viewport.m22 = viewport->scale[2];
+   sfv.viewport.m30 = viewport->translate[0];
+   sfv.viewport.m31 = viewport->translate[1];
+   sfv.viewport.m32 = viewport->translate[2];
+
+   err = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0,
+                         &brw->sf.vp_bo );
+
+   return err ? err : PIPE_OK;
+}
+
+/* Tracked-state atom for the SF viewport: upload_sf_vp() is its prepare
+ * hook, keyed on the viewport and scissor dirty bits.
+ */
+const struct brw_tracked_state brw_sf_vp = {
+   .prepare = upload_sf_vp,
+   .dirty = {
+      .mesa  = PIPE_NEW_VIEWPORT | PIPE_NEW_SCISSOR,
+      .brw   = 0,
+      .cache = 0
+   }
+};
+
+/**
+ * Cache key for the SF unit state.
+ *
+ * sf_unit_populate_key() zeroes the whole struct before filling it in, and
+ * the key is then hashed and compared as raw bytes by the state cache, so
+ * every member (and any padding) participates in the lookup.
+ */
+struct brw_sf_unit_key {
+   /* From the SF program data (CACHE_NEW_SF_PROG) */
+   unsigned int total_grf;
+   unsigned int urb_entry_read_length;
+   /* From the URB layout (BRW_NEW_URB_FENCE) */
+   unsigned int nr_urb_entries, urb_size, sfsize;
+   
+   /* From the rasterizer state (PIPE_NEW_RAST) */
+   unsigned scissor:1;
+   unsigned line_smooth:1;
+   unsigned point_sprite:1;        /* set when sprite_coord_enable is non-zero */
+   unsigned point_attenuated:1;    /* copied from point_size_per_vertex */
+   unsigned front_ccw:1;
+   unsigned cull_face:2;           /* switched on as a PIPE_FACE_x value */
+   unsigned flatshade_first:1;
+   unsigned gl_rasterization_rules:1;
+   unsigned line_last_pixel_enable:1;
+   float line_width;
+   float point_size;
+};
+
+/**
+ * Fill in the SF unit cache key from the current context state.
+ *
+ * The key is cleared first so that unused bits and padding are always
+ * zero when the key is later hashed and compared.
+ */
+static void
+sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
+{
+   const struct pipe_rasterizer_state *rs = &brw->curr.rast->templ;
+
+   memset(key, 0, sizeof(*key));
+
+   /* CACHE_NEW_SF_PROG */
+   key->urb_entry_read_length = brw->sf.prog_data->urb_read_length;
+   key->total_grf = brw->sf.prog_data->total_grf;
+
+   /* BRW_NEW_URB_FENCE */
+   key->sfsize = brw->urb.sfsize;
+   key->urb_size = brw->urb.vsize;
+   key->nr_urb_entries = brw->urb.nr_sf_entries;
+
+   /* PIPE_NEW_RAST: culling and winding */
+   key->cull_face = rs->cull_face;
+   key->front_ccw = rs->front_ccw;
+   key->scissor = rs->scissor;
+
+   /* PIPE_NEW_RAST: lines */
+   key->line_width = rs->line_width;
+   key->line_smooth = rs->line_smooth;
+   key->line_last_pixel_enable = rs->line_last_pixel;
+
+   /* PIPE_NEW_RAST: points */
+   key->point_size = rs->point_size;
+   key->point_attenuated = rs->point_size_per_vertex;
+   key->point_sprite = (rs->sprite_coord_enable != 0);
+
+   /* PIPE_NEW_RAST: miscellaneous */
+   key->gl_rasterization_rules = rs->gl_rasterization_rules;
+   key->flatshade_first = rs->flatshade_first;
+}
+
+/**
+ * Build a brw_sf_unit_state from the given key and upload it through the
+ * state cache as a new BRW_SF_UNIT entry.
+ *
+ * \param brw     context; used for the chipset check and the cache
+ * \param key     populated SF unit key
+ * \param reloc   array of two relocations passed on to brw_upload_cache()
+ * \param bo_out  receives the buffer holding the uploaded state
+ *
+ * \return PIPE_OK on success, otherwise the error from brw_upload_cache().
+ */
+static enum pipe_error
+sf_unit_create_from_key(struct brw_context *brw,
+                        struct brw_sf_unit_key *key,
+                        struct brw_winsys_reloc *reloc,
+                        struct brw_winsys_buffer **bo_out)
+{
+   struct brw_sf_unit_state sf;
+   enum pipe_error ret;
+   int chipset_max_threads;
+   memset(&sf, 0, sizeof(sf));
+
+   /* total_grf rounded up to a multiple of 16, expressed in blocks of 16,
+    * minus one.
+    */
+   sf.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+   /* reloc: left at zero here; upload_sf_unit() sets up a relocation at
+    * the offset of thread0.
+    */
+   sf.thread0.kernel_start_pointer = 0;
+
+   sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+   sf.thread3.dispatch_grf_start_reg = 3;
+
+   if (BRW_IS_IGDNG(brw))
+       sf.thread3.urb_entry_read_offset = 3;
+   else
+       sf.thread3.urb_entry_read_offset = 1;
+
+   sf.thread3.urb_entry_read_length = key->urb_entry_read_length;
+
+   sf.thread4.nr_urb_entries = key->nr_urb_entries;
+   sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
+
+   /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or 
+    * 48(IGDNG) threads 
+    */
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 48;
+   else
+      chipset_max_threads = 24;
+
+   /* Never request more threads than there are URB entries; the field
+    * holds the count minus one.
+    */
+   sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
+
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      sf.thread4.max_threads = 0;
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      sf.thread4.stats_enable = 1;
+
+   /* CACHE_NEW_SF_VP */
+   /* reloc: left at zero here; upload_sf_unit() sets up a relocation at
+    * the offset of sf5.
+    */
+   sf.sf5.sf_viewport_state_offset = 0;
+
+   sf.sf5.viewport_transform = 1;
+
+   if (key->scissor)
+      sf.sf6.scissor = 1;
+
+   if (key->front_ccw)
+      sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
+   else
+      sf.sf5.front_winding = BRW_FRONTWINDING_CW;
+
+   /* Translate the gallium cull face into the hardware cull mode; an
+    * unexpected value asserts and falls back to no culling.
+    */
+   switch (key->cull_face) {
+   case PIPE_FACE_FRONT:
+      sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
+      break;
+   case PIPE_FACE_BACK:
+      sf.sf6.cull_mode = BRW_CULLMODE_BACK;
+      break;
+   case PIPE_FACE_FRONT_AND_BACK:
+      sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
+      break;
+   case PIPE_FACE_NONE:
+      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+      break;
+   default:
+      assert(0);
+      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+      break;
+   }
+
+   /* _NEW_LINE */
+   /* XXX use ctx->Const.Min/MaxLineWidth here */
+   /* Width is clamped to [1, 5] and doubled -- presumably the field has
+    * one fractional bit; TODO confirm against the hardware docs.
+    */
+   sf.sf6.line_width = CLAMP(key->line_width, 1.0, 5.0) * (1<<1);
+
+   sf.sf6.line_endcap_aa_region_width = 1;
+   /* Without line smoothing, an encoded width of 2 or less (i.e. a
+    * requested width of at most 1.0) is replaced by 0.  Note this reads
+    * back the bitfield written just above.
+    */
+   if (key->line_smooth)
+      sf.sf6.aa_enable = 1;
+   else if (sf.sf6.line_width <= 0x2)
+       sf.sf6.line_width = 0;
+
+   /* XXX: gl_rasterization_rules?  something else?
+    *
+    * NOTE(review): three consecutive stores to the same field -- only
+    * the last one (the literal 1) takes effect; the two named-constant
+    * stores before it are dead.
+    */
+   sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
+   sf.sf6.point_rast_rule = BRW_RASTRULE_LOWER_RIGHT;
+   sf.sf6.point_rast_rule = 1;
+
+   /* XXX clamp max depends on AA vs. non-AA */
+
+   /* _NEW_POINT */
+   sf.sf7.sprite_point = key->point_sprite;
+   /* Size is rounded, clamped to [1, 255] and multiplied by 8 --
+    * presumably three fractional bits; TODO confirm.
+    */
+   sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
+   sf.sf7.use_point_size_state = !key->point_attenuated;
+   sf.sf7.aa_line_distance_mode = 0;
+
+   /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
+    * ("pv" is presumably the provoking-vertex index, selected from
+    * flatshade_first.)
+    */
+   if (!key->flatshade_first) {
+      sf.sf7.trifan_pv = 2;
+      sf.sf7.linestrip_pv = 1;
+      sf.sf7.tristrip_pv = 2;
+   } else {
+      sf.sf7.trifan_pv = 1;
+      sf.sf7.linestrip_pv = 0;
+      sf.sf7.tristrip_pv = 0;
+   }
+
+   sf.sf7.line_last_pixel_enable = key->line_last_pixel_enable;
+
+   /* Set bias for OpenGL rasterization rules:
+    */
+   if (key->gl_rasterization_rules) {
+      sf.sf6.dest_org_vbias = 0x8;
+      sf.sf6.dest_org_hbias = 0x8;
+   }
+   else {
+      sf.sf6.dest_org_vbias = 0x0;
+      sf.sf6.dest_org_hbias = 0x0;
+   }
+
+   /* Upload the state; the key and both relocations identify the entry,
+    * and no aux data is stored with it.
+    */
+   ret = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
+                          key, sizeof(*key),
+                          reloc, 2,
+                          &sf, sizeof(sf),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
+
+   
+   return PIPE_OK;
+}
+
+/**
+ * Prepare hook for the SF unit state: look the state up in the cache by
+ * key and relocations, and build and upload it on a miss.
+ *
+ * Returns PIPE_OK on a cache hit or successful upload, otherwise the
+ * error from sf_unit_create_from_key().
+ */
+static enum pipe_error upload_sf_unit( struct brw_context *brw )
+{
+   struct brw_winsys_reloc reloc[2];
+   struct brw_sf_unit_key key;
+   unsigned grf_blocks;
+   unsigned vp_transform;
+   unsigned winding;
+   enum pipe_error err;
+
+   sf_unit_populate_key(brw, &key);
+
+   /* XXX: these duplicate values computed in sf_unit_create_from_key();
+    * ideally they would be pre-calculated as part of the key.
+    */
+   grf_blocks = (align(key.total_grf, 16) / 16 - 1);
+   vp_transform = 1;
+   if (key.front_ccw)
+      winding = BRW_FRONTWINDING_CCW;
+   else
+      winding = BRW_FRONTWINDING_CW;
+
+   /* Relocation 0: SF program, placed at the thread0 dword. */
+   make_reloc(&reloc[0],
+              BRW_USAGE_STATE,
+              grf_blocks << 1,
+              offsetof(struct brw_sf_unit_state, thread0),
+              brw->sf.prog_bo);
+
+   /* Relocation 1: SF viewport, placed at the sf5 dword. */
+   make_reloc(&reloc[1],
+              BRW_USAGE_STATE,
+              winding | (vp_transform << 1),
+              offsetof(struct brw_sf_unit_state, sf5),
+              brw->sf.vp_bo);
+
+   /* Nothing more to do if an identical unit state is already cached. */
+   if (brw_search_cache(&brw->cache, BRW_SF_UNIT,
+                        &key, sizeof(key),
+                        reloc, 2,
+                        NULL,
+                        &brw->sf.state_bo))
+      return PIPE_OK;
+
+   err = sf_unit_create_from_key(brw, &key, reloc, &brw->sf.state_bo);
+
+   return err ? err : PIPE_OK;
+}
+
+/* Tracked-state atom for the SF unit: upload_sf_unit() is its prepare
+ * hook, keyed on rasterizer state, the URB fence and the SF
+ * viewport/program cache entries.
+ */
+const struct brw_tracked_state brw_sf_unit = {
+   .prepare = upload_sf_unit,
+   .dirty = {
+      .mesa  = PIPE_NEW_RAST,
+      .brw   = BRW_NEW_URB_FENCE,
+      .cache = CACHE_NEW_SF_VP | CACHE_NEW_SF_PROG
+   },
+};
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
new file mode 100644
index 0000000000..d2bbd0123d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -0,0 +1,174 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+    
+
+#ifndef BRW_STATE_H
+#define BRW_STATE_H
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+
+/**
+ * Append a buffer to the context's list of validated buffers, taking a
+ * reference on it.  A NULL buffer is ignored.
+ *
+ * The list has a fixed capacity.  Overflow is a programming error and
+ * still trips the assertion in debug builds; in release builds, where
+ * assert() compiles away, the buffer is not recorded instead of writing
+ * past the end of the array.
+ */
+static INLINE void
+brw_add_validated_bo(struct brw_context *brw, struct brw_winsys_buffer *bo)
+{
+   assert(brw->state.validated_bo_count < Elements(brw->state.validated_bos));
+
+   if (bo == NULL)
+      return;
+
+   /* Release-build guard against overrunning validated_bos[]. */
+   if (brw->state.validated_bo_count >= Elements(brw->state.validated_bos))
+      return;
+
+   bo_reference( &brw->state.validated_bos[brw->state.validated_bo_count++],
+                 bo );
+}
+
+/* State atoms, defined in their respective brw_*.c files.
+ *
+ * These must be declared 'extern': without it every file including this
+ * header gets its own tentative definition of each object, which only
+ * links when the toolchain merges common symbols and fails with
+ * "multiple definition" errors under -fno-common.
+ */
+extern const struct brw_tracked_state brw_blend_constant_color;
+extern const struct brw_tracked_state brw_cc_unit;
+extern const struct brw_tracked_state brw_cc_vp;
+extern const struct brw_tracked_state brw_clip_prog;
+extern const struct brw_tracked_state brw_clip_unit;
+extern const struct brw_tracked_state brw_curbe_buffer;
+extern const struct brw_tracked_state brw_curbe_offsets;
+extern const struct brw_tracked_state brw_invarient_state;
+extern const struct brw_tracked_state brw_gs_prog;
+extern const struct brw_tracked_state brw_gs_unit;
+extern const struct brw_tracked_state brw_line_stipple;
+extern const struct brw_tracked_state brw_aa_line_parameters;
+extern const struct brw_tracked_state brw_pipelined_state_pointers;
+extern const struct brw_tracked_state brw_binding_table_pointers;
+extern const struct brw_tracked_state brw_depthbuffer;
+extern const struct brw_tracked_state brw_polygon_stipple;
+extern const struct brw_tracked_state brw_program_parameters;
+extern const struct brw_tracked_state brw_recalculate_urb_fence;
+extern const struct brw_tracked_state brw_sf_prog;
+extern const struct brw_tracked_state brw_sf_unit;
+extern const struct brw_tracked_state brw_sf_vp;
+extern const struct brw_tracked_state brw_state_base_address;
+extern const struct brw_tracked_state brw_urb_fence;
+extern const struct brw_tracked_state brw_vertex_state;
+extern const struct brw_tracked_state brw_vs_surfaces;
+extern const struct brw_tracked_state brw_vs_prog;
+extern const struct brw_tracked_state brw_vs_unit;
+extern const struct brw_tracked_state brw_wm_input_sizes;
+extern const struct brw_tracked_state brw_wm_prog;
+extern const struct brw_tracked_state brw_wm_samplers;
+extern const struct brw_tracked_state brw_wm_constant_surface;
+extern const struct brw_tracked_state brw_wm_surfaces;
+extern const struct brw_tracked_state brw_wm_unit;
+
+extern const struct brw_tracked_state brw_psp_urb_cbs;
+
+extern const struct brw_tracked_state brw_pipe_control;
+
+extern const struct brw_tracked_state brw_drawing_rect;
+extern const struct brw_tracked_state brw_indices;
+extern const struct brw_tracked_state brw_vertices;
+extern const struct brw_tracked_state brw_index_buffer;
+
+
+/***********************************************************************
+ * brw_state.c
+ */
+int brw_validate_state(struct brw_context *brw);
+int brw_upload_state(struct brw_context *brw);
+void brw_init_state(struct brw_context *brw);
+void brw_destroy_state(struct brw_context *brw);
+
+/***********************************************************************
+ * brw_state_cache.c
+ */
+
+/* Look up (or create) a cache entry using the data itself as the key.
+ * The size used is the cache_id's registered key size.
+ */
+enum pipe_error brw_cache_data(struct brw_cache *cache,
+                               enum brw_cache_id cache_id,
+                               const void *data,
+                               struct brw_winsys_reloc *relocs,
+                               GLuint nr_relocs,
+                               struct brw_winsys_buffer **bo_out );
+
+/* As brw_cache_data(), but with an explicit data size.  On a hit the
+ * existing buffer is referenced into *bo_out; on a miss the data is
+ * uploaded as a new entry.
+ */
+enum pipe_error brw_cache_data_sz(struct brw_cache *cache,
+                                  enum brw_cache_id cache_id,
+                                  const void *data,
+                                  GLuint data_size,
+                                  struct brw_winsys_reloc *relocs,
+                                  GLuint nr_relocs,
+                                  struct brw_winsys_buffer **bo_out);
+
+/* Unconditionally create a new cache entry for (key, relocs), allocate a
+ * buffer for it and upload 'data' into that buffer.  If aux_return is
+ * non-NULL it receives a pointer to the entry's copy of the aux data.
+ */
+enum pipe_error brw_upload_cache( struct brw_cache *cache,
+                                  enum brw_cache_id cache_id,
+                                  const void *key,
+                                  GLuint key_sz,
+                                  struct brw_winsys_reloc *relocs,
+                                  GLuint nr_relocs,
+                                  const void *data,
+                                  GLuint data_sz,
+                                  const void *aux,
+                                  void *aux_return ,
+                                  struct brw_winsys_buffer **bo_out);
+
+/* Probe the cache for (cache_id, key, relocs).  Returns TRUE on a hit,
+ * in which case *bo_out references the cached buffer and, if aux_return
+ * is non-NULL, it receives a pointer to the entry's aux data.  Returns
+ * FALSE on a miss and leaves the outputs untouched.
+ */
+boolean brw_search_cache( struct brw_cache *cache,
+                          enum brw_cache_id cache_id,
+                          const void *key,
+                          GLuint key_size,
+                          struct brw_winsys_reloc *relocs,
+                          GLuint nr_relocs,
+                          void *aux_return,
+                          struct brw_winsys_buffer **bo_out);
+
+void brw_state_cache_check_size( struct brw_context *brw );
+
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
+void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo);
+
+/***********************************************************************
+ * brw_state_batch.c
+ */
+/* Emit the struct pointed to by 's' straight into the batch buffer. */
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+/* Emit the struct pointed to by 's' via brw_cached_batch_struct(). */
+#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
+
+/* Emits 'data' to the batch unless an identical packet with the same
+ * opcode was the last one remembered.  Returns GL_TRUE when the data was
+ * emitted, GL_FALSE when it was skipped as a duplicate.
+ */
+GLboolean brw_cached_batch_struct( struct brw_context *brw,
+				   const void *data,
+				   GLuint sz );
+/* Both of these free every remembered packet. */
+void brw_destroy_batch_cache( struct brw_context *brw );
+void brw_clear_batch_cache( struct brw_context *brw );
+
+/***********************************************************************
+ * brw_wm_surface_state.c 
+ */
+
+/***********************************************************************
+ * brw_state_debug.c
+ */
+void brw_update_dirty_counts( unsigned mesa,
+			      unsigned brw,
+			      unsigned cache );
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
new file mode 100644
index 0000000000..ce5ed0a9ed
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -0,0 +1,98 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+     
+
+
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+
+
+
+/* A small cache that aims to prevent identical commands being issued
+ * repeatedly: one copy of the most recently emitted packet is remembered
+ * per opcode, and a packet that is byte-identical to the remembered copy
+ * is not emitted again.
+ *
+ * Returns GL_TRUE if the packet was written to the batch buffer, GL_FALSE
+ * if it was dropped as a duplicate.
+ *
+ * Allocation failures never prevent emission.  The packet is simply not
+ * remembered, and a stale entry for the same opcode is discarded so that
+ * it cannot produce a false match later.
+ */
+GLboolean brw_cached_batch_struct( struct brw_context *brw,
+				   const void *data,
+				   GLuint sz )
+{
+   struct brw_cached_batch_item **link = &brw->cached_batch_items;
+   struct brw_cached_batch_item *item = brw->cached_batch_items;
+   const struct header *newheader = (const struct header *)data;
+
+   if (brw->flags.always_emit_state) {
+      brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
+      return GL_TRUE;
+   }
+
+   /* Find the entry for this opcode, remembering the link that points at
+    * it so that it can be unlinked if necessary.
+    */
+   while (item && item->header->opcode != newheader->opcode) {
+      link = &item->next;
+      item = item->next;
+   }
+
+   if (item) {
+      if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
+	 return GL_FALSE;
+
+      if (item->sz != sz) {
+	 /* Allocate the replacement before releasing the old storage. */
+	 struct header *resized = MALLOC(sz);
+
+	 if (resized) {
+	    FREE(item->header);
+	    item->header = resized;
+	    item->sz = sz;
+	 }
+	 else {
+	    /* Cannot hold the new packet: drop the now-stale entry. */
+	    *link = item->next;
+	    FREE(item->header);
+	    FREE(item);
+	    item = NULL;
+	 }
+      }
+   }
+   else {
+      item = CALLOC_STRUCT(brw_cached_batch_item);
+      if (item) {
+	 item->header = MALLOC(sz);
+	 if (item->header) {
+	    item->sz = sz;
+	    item->next = brw->cached_batch_items;
+	    brw->cached_batch_items = item;
+	 }
+	 else {
+	    FREE(item);
+	    item = NULL;
+	 }
+      }
+   }
+
+   /* Remember the packet (when there is somewhere to keep it) and emit. */
+   if (item)
+      memcpy(item->header, newheader, sz);
+
+   brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
+   return GL_TRUE;
+}
+
+/* Free every remembered packet and leave the list empty. */
+void brw_clear_batch_cache( struct brw_context *brw )
+{
+   struct brw_cached_batch_item *cur;
+   struct brw_cached_batch_item *following;
+
+   for (cur = brw->cached_batch_items; cur != NULL; cur = following) {
+      /* Grab the successor before the node is released. */
+      following = cur->next;
+
+      FREE((void *)cur->header);
+      FREE(cur);
+   }
+
+   brw->cached_batch_items = NULL;
+}
+
+/* Tear down the batch cache; this is just brw_clear_batch_cache(). */
+void brw_destroy_batch_cache( struct brw_context *brw )
+{
+   brw_clear_batch_cache(brw);
+}
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
new file mode 100644
index 0000000000..c911f3997d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -0,0 +1,617 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+/** @file brw_state_cache.c
+ *
+ * This file implements a simple static state cache for 965.  The consumers
+ * can query the hash table of state using a cache_id, opaque key data,
+ * and list of buffers that will be used in relocations, and receive the
+ * corresponding state buffer object of state (plus associated auxiliary
+ * data) in return.
+ *
+ * The inner workings are a simple hash table based on a CRC of the key data.
+ * The cache_id and relocation target buffers associated with the state
+ * buffer are included as auxiliary key data, but are not part of the hash
+ * value (this should be fixed, but will likely be fixed instead by making
+ * consumers use structured keys).
+ *
+ * Replacement is not implemented.  Instead, when the cache gets too big, at
+ * a safe point (unlock) we throw out all of the cache data and let it
+ * regenerate for the next rendering operation.
+ *
+ * The reloc structs need to be included as key data, otherwise the
+ * non-unique values stuffed in the offset in key data through
+ * brw_cache_data() may result in successful probe for state buffers
+ * even when the buffer being referenced doesn't match.  The result would be
+ * that the same state cache entry is used twice for different buffers,
+ * only one of the two buffers referenced gets put into the offset, and the
+ * incorrect program is run for the other instance.
+ */
+#include "util/u_memory.h"
+
+#include "brw_debug.h"
+#include "brw_state.h"
+
+/* XXX: Fixme - have to include these to get the sizes of the prog_key
+ * structs:
+ */
+#include "brw_wm.h"
+#include "brw_vs.h"
+#include "brw_clip.h"
+#include "brw_sf.h"
+#include "brw_gs.h"
+
+
+/**
+ * Hash the key bytes followed by the raw relocation structs.
+ *
+ * Both are consumed as 32-bit words: each word is XORed into the hash,
+ * which is then rotated left by five bits.  key_size must be a multiple
+ * of four.
+ */
+static GLuint
+hash_key(const void *key, GLuint key_size,
+         struct brw_winsys_reloc *relocs, GLuint nr_relocs)
+{
+   const GLuint *word;
+   GLuint nr_words;
+   GLuint reloc_bytes;
+   GLuint h = 0;
+
+   assert(key_size % 4 == 0);
+
+   /* I'm sure this can be improved on:
+    */
+   word = (const GLuint *)key;
+   for (nr_words = key_size / 4; nr_words > 0; nr_words--) {
+      h ^= *word++;
+      h = (h << 5) | (h >> 27);
+   }
+
+   /* Include the relocation entries (and so the BO pointers) as well */
+   reloc_bytes = nr_relocs * sizeof(struct brw_winsys_reloc);
+   word = (const GLuint *)relocs;
+   for (nr_words = reloc_bytes / 4; nr_words > 0; nr_words--) {
+      h ^= *word++;
+      h = (h << 5) | (h >> 27);
+   }
+
+   return h;
+}
+
+
+/**
+ * Record 'bo' as the most recently chosen buffer for cache_id.  When it
+ * differs from the previous choice, the matching cache dirty bit is set.
+ */
+static void
+update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
+		  struct brw_winsys_buffer *bo)
+{
+   if (cache->last_bo[cache_id] != bo) {
+      bo_reference( &cache->last_bo[cache_id], bo );
+
+      cache->brw->state.dirty.cache |= 1 << cache_id;
+   }
+}
+
+
+/**
+ * Walk the bucket selected by 'hash' and return the first item whose
+ * cache id, hash, key bytes and relocation entries all match, or NULL
+ * when there is none.
+ */
+static struct brw_cache_item *
+search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
+	     GLuint hash, const void *key, GLuint key_size,
+	     struct brw_winsys_reloc *relocs, GLuint nr_relocs)
+{
+   struct brw_cache_item *entry;
+
+#if 0
+   int bucketcount = 0;
+
+   for (entry = cache->items[hash % cache->size]; entry; entry = entry->next)
+      bucketcount++;
+
+   debug_printf("bucket %d/%d = %d/%d items\n", hash % cache->size,
+	   cache->size, bucketcount, cache->n_items);
+#endif
+
+   for (entry = cache->items[hash % cache->size]; entry; entry = entry->next) {
+      /* Cheap scalar checks first, byte comparisons last. */
+      if (entry->cache_id != cache_id || entry->hash != hash)
+	 continue;
+
+      if (entry->key_size != key_size ||
+	  memcmp(entry->key, key, key_size) != 0)
+	 continue;
+
+      if (entry->nr_relocs != nr_relocs ||
+	  memcmp(entry->relocs, relocs, nr_relocs * sizeof *relocs) != 0)
+	 continue;
+
+      return entry;
+   }
+
+   return NULL;
+}
+
+
+/**
+ * Grow the hash table to three times its current size and redistribute
+ * every item into its new bucket.
+ *
+ * If the larger table cannot be allocated the cache is left exactly as it
+ * was.  Lookups remain correct because buckets are always selected with
+ * the current cache->size; the chains just stay longer than intended.
+ */
+static void
+rehash(struct brw_cache *cache)
+{
+   struct brw_cache_item **items;
+   struct brw_cache_item *c, *next;
+   GLuint size, i;
+
+   size = cache->size * 3;
+   items = (struct brw_cache_item**) CALLOC(size, sizeof(*items));
+   if (items == NULL)
+      return;
+
+   for (i = 0; i < cache->size; i++)
+      for (c = cache->items[i]; c; c = next) {
+	 /* Save the successor: c->next is about to be rewritten. */
+	 next = c->next;
+	 c->next = items[c->hash % size];
+	 items[c->hash % size] = c;
+      }
+
+   FREE(cache->items);
+   cache->items = items;
+   cache->size = size;
+}
+
+
+/**
+ * Probe the cache for an entry matching cache_id, key and relocs.
+ *
+ * On a hit, *bo_out is made to reference the cached buffer, the buffer is
+ * recorded as the last one chosen for cache_id, and TRUE is returned.  If
+ * aux_return is non-NULL it receives a pointer to the memory directly
+ * after the entry's key, which is where its aux data is stored.
+ *
+ * On a miss FALSE is returned and the outputs are left untouched.
+ */
+boolean
+brw_search_cache(struct brw_cache *cache,
+                 enum brw_cache_id cache_id,
+                 const void *key,
+                 GLuint key_size,
+                 struct brw_winsys_reloc *relocs, 
+		 GLuint nr_relocs,
+                 void *aux_return,
+                 struct brw_winsys_buffer **bo_out)
+{
+   const GLuint hash = hash_key(key, key_size, relocs, nr_relocs);
+   struct brw_cache_item *hit;
+
+   hit = search_cache(cache, cache_id, hash, key, key_size,
+		      relocs, nr_relocs);
+   if (hit == NULL)
+      return FALSE;
+
+   if (aux_return)
+      *(void **)aux_return = (void *)((char *)hit->key + hit->key_size);
+
+   update_cache_last(cache, cache_id, hit->bo);
+   bo_reference(bo_out, hit->bo);
+
+   return TRUE;
+}
+
+
+/**
+ * Create a new cache entry for (cache_id, key, relocs): allocate a buffer,
+ * upload 'data' into it and record the entry in the hash table.
+ *
+ * \param key, key_size        opaque key bytes identifying the state
+ * \param relocs, nr_relocs    relocations; part of the lookup key and also
+ *                             handed to the winsys with the upload
+ * \param data, data_size      contents of the new buffer
+ * \param aux                  aux data of the size registered for cache_id;
+ *                             may be NULL, in which case the entry's aux
+ *                             area is zero filled
+ * \param aux_return           if non-NULL, receives a pointer to the
+ *                             entry's own copy of the aux data
+ * \param bo_out               receives the newly allocated buffer
+ *
+ * \return PIPE_OK on success, the winsys error if allocation or upload of
+ *         the buffer fails, or PIPE_ERROR_OUT_OF_MEMORY if the cache entry
+ *         cannot be allocated.
+ *
+ * The data is uploaded before the entry becomes visible in the cache, so
+ * a failed upload can never be returned by a later lookup.
+ */
+enum pipe_error
+brw_upload_cache( struct brw_cache *cache,
+		  enum brw_cache_id cache_id,
+		  const void *key,
+		  GLuint key_size,
+		  struct brw_winsys_reloc *relocs,
+		  GLuint nr_relocs,
+		  const void *data,
+		  GLuint data_size,
+		  const void *aux,
+		  void *aux_return,
+                  struct brw_winsys_buffer **bo_out)
+{
+   struct brw_cache_item *item;
+   GLuint hash = hash_key(key, key_size, relocs, nr_relocs);
+   GLuint relocs_size = nr_relocs * sizeof relocs[0];
+   GLuint aux_size = cache->aux_size[cache_id];
+   GLuint tmp_size = key_size + aux_size + relocs_size;
+   enum pipe_error ret;
+   char *tmp;
+   GLuint i;
+
+   /* Create the buffer object to contain the data.  For now, use a
+    * single buffer type to describe all cached state atoms.  Later,
+    * may want to take advantage of hardware distinctions between
+    * these various entities.
+    */
+   ret = cache->sws->bo_alloc(cache->sws,
+                              cache->buffer_type,
+                              data_size, 1 << 6, 
+                              bo_out);
+   if (ret)
+      return ret;
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("upload %s: %d bytes to cache id %d\n",
+		   cache->name[cache_id],
+		   data_size, cache_id);
+
+   /* Copy data to the buffer.  Do this before touching the hash table so
+    * that a failure leaves no entry behind.
+    */
+   ret = cache->sws->bo_subdata(*bo_out,
+                                cache_id,
+                                0, data_size, data,
+                                relocs, nr_relocs);
+   if (ret)
+      return ret;
+
+   item = CALLOC_STRUCT(brw_cache_item);
+   if (item == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Set up the memory containing the key, aux_data, and relocs */
+   tmp = MALLOC(tmp_size);
+   if (tmp == NULL && tmp_size != 0) {
+      FREE(item);
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   if (key_size)
+      memcpy(tmp, key, key_size);
+
+   if (aux_size) {
+      if (aux)
+         memcpy(tmp + key_size, aux, aux_size);
+      else
+         memset(tmp + key_size, 0, aux_size);
+   }
+
+   if (relocs_size)
+      memcpy(tmp + key_size + aux_size, relocs, relocs_size);
+
+   /* The stored relocation copies hold a reference on their targets. */
+   for (i = 0; i < nr_relocs; i++) {
+      p_atomic_inc(&relocs[i].bo->reference.count);
+   }
+
+   item->cache_id = cache_id;
+   item->key = tmp;
+   item->hash = hash;
+   item->key_size = key_size;
+   item->relocs = (struct brw_winsys_reloc *)(tmp + key_size + aux_size);
+   item->nr_relocs = nr_relocs;
+   bo_reference( &item->bo, *bo_out );
+   item->data_size = data_size;
+
+   if (cache->n_items > cache->size * 1.5)
+      rehash(cache);
+
+   /* Pick the bucket only after a possible rehash changed cache->size. */
+   hash %= cache->size;
+   item->next = cache->items[hash];
+   cache->items[hash] = item;
+   cache->n_items++;
+
+   if (aux_return) {
+      assert(cache->aux_size[cache_id]);
+      *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+   }
+
+   update_cache_last(cache, cache_id, item->bo);
+
+   return PIPE_OK;
+}
+
+
+/**
+ * Cache 'data' using the data bytes themselves as the key.
+ *
+ * On a hit the existing buffer is referenced into *bo_out; on a miss the
+ * data is uploaded as a new entry with no aux data.
+ *
+ * This doesn't really work with aux data.  Use search/upload instead
+ */
+enum pipe_error
+brw_cache_data_sz(struct brw_cache *cache,
+		  enum brw_cache_id cache_id,
+		  const void *data,
+		  GLuint data_size,
+		  struct brw_winsys_reloc *relocs,
+		  GLuint nr_relocs,
+                  struct brw_winsys_buffer **bo_out)
+{
+   const GLuint hash = hash_key(data, data_size, relocs, nr_relocs);
+   struct brw_cache_item *hit;
+
+   hit = search_cache(cache, cache_id, hash, data, data_size,
+		      relocs, nr_relocs);
+
+   if (hit == NULL) {
+      /* Miss: the data serves as both key and buffer contents. */
+      return brw_upload_cache(cache, cache_id,
+                              data, data_size,
+                              relocs, nr_relocs,
+                              data, data_size,
+                              NULL, NULL,
+                              bo_out);
+   }
+
+   update_cache_last(cache, cache_id, hit->bo);
+   bo_reference(bo_out, hit->bo);
+
+   return PIPE_OK;
+}
+
+
+/**
+ * Convenience wrapper around brw_cache_data_sz() that takes the data size
+ * from the key size registered for cache_id.
+ *
+ * If nr_relocs is nonzero, brw_search_cache()/brw_upload_cache() would be
+ * better to use, as the potentially changing offsets in the data-used-as-key
+ * will result in excessive cache misses.
+ * 
+ * XXX: above is no longer true -- can we remove some code?
+ */
+enum pipe_error
+brw_cache_data(struct brw_cache *cache,
+	       enum brw_cache_id cache_id,
+	       const void *data,
+	       struct brw_winsys_reloc *relocs,
+	       GLuint nr_relocs,
+               struct brw_winsys_buffer **bo_out)
+{
+   const GLuint canonical_size = cache->key_size[cache_id];
+
+   return brw_cache_data_sz(cache, cache_id,
+                            data, canonical_size,
+                            relocs, nr_relocs,
+                            bo_out);
+}
+
+/**
+ * Register the per-id metadata of a cache id: its key size, its aux data
+ * size and a private copy of its name.
+ */
+static void
+brw_init_cache_id(struct brw_cache *cache,
+                  const char *name,
+                  enum brw_cache_id id,
+                  GLuint key_size,
+                  GLuint aux_size)
+{
+   cache->key_size[id] = key_size;
+   cache->aux_size[id] = aux_size;
+
+   /* The cache keeps its own duplicate of the name string. */
+   cache->name[id] = strdup(name);
+}
+
+
+/**
+ * Set up brw->cache, the cache whose buffer type is
+ * BRW_BUFFER_TYPE_GENERAL_STATE.
+ *
+ * Links the cache back to its context and winsys, allocates an empty
+ * 7-bucket hash table and registers the name, key size and aux size of
+ * every cache id kept in this cache.  The *_PROG ids carry a prog_data
+ * struct as aux data; every other id registers an aux size of 0.
+ *
+ * The bucket array holds pointers to items, so it is sized in units of
+ * sizeof(struct brw_cache_item *) instead of the (larger) item struct.
+ *
+ * NOTE(review): the CALLOC() result is not checked and this function has
+ * no way to report failure to its caller.
+ */
+static void
+brw_init_general_state_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->cache;
+
+   cache->brw = brw;
+   cache->sws = brw->sws;
+
+   cache->buffer_type = BRW_BUFFER_TYPE_GENERAL_STATE;
+
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      CALLOC(cache->size, sizeof(struct brw_cache_item *));
+
+   brw_init_cache_id(cache, "CC_VP", BRW_CC_VP,
+                     sizeof(struct brw_cc_viewport),
+                     0);
+
+   brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT,
+                     sizeof(struct brw_cc_unit_state),
+                     0);
+
+   brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG,
+                     sizeof(struct brw_wm_prog_key),
+                     sizeof(struct brw_wm_prog_data));
+
+   brw_init_cache_id(cache, "SAMPLER_DEFAULT_COLOR", BRW_SAMPLER_DEFAULT_COLOR,
+                     sizeof(struct brw_sampler_default_color),
+                     0);
+
+   brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER,
+                     0,		/* variable key/data size */
+                     0);
+
+   brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT,
+                     sizeof(struct brw_wm_unit_state),
+                     0);
+
+   brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG,
+                     sizeof(struct brw_sf_prog_key),
+                     sizeof(struct brw_sf_prog_data));
+
+   brw_init_cache_id(cache, "SF_VP", BRW_SF_VP,
+                     sizeof(struct brw_sf_viewport),
+                     0);
+
+   brw_init_cache_id(cache, "SF_UNIT", BRW_SF_UNIT,
+                     sizeof(struct brw_sf_unit_state),
+                     0);
+
+   brw_init_cache_id(cache, "VS_UNIT", BRW_VS_UNIT,
+                     sizeof(struct brw_vs_unit_state),
+                     0);
+
+   brw_init_cache_id(cache, "VS_PROG", BRW_VS_PROG,
+                     sizeof(struct brw_vs_prog_key),
+                     sizeof(struct brw_vs_prog_data));
+
+   brw_init_cache_id(cache, "CLIP_UNIT", BRW_CLIP_UNIT,
+                     sizeof(struct brw_clip_unit_state),
+                     0);
+
+   brw_init_cache_id(cache, "CLIP_PROG", BRW_CLIP_PROG,
+                     sizeof(struct brw_clip_prog_key),
+                     sizeof(struct brw_clip_prog_data));
+
+   brw_init_cache_id(cache, "GS_UNIT", BRW_GS_UNIT,
+                     sizeof(struct brw_gs_unit_state),
+                     0);
+
+   brw_init_cache_id(cache, "GS_PROG", BRW_GS_PROG,
+                     sizeof(struct brw_gs_prog_key),
+                     sizeof(struct brw_gs_prog_data));
+}
+
+
+/**
+ * Set up brw->surface_cache, the cache whose buffer type is
+ * BRW_BUFFER_TYPE_SURFACE_STATE.
+ *
+ * Links the cache back to its context and winsys, allocates an empty
+ * 7-bucket hash table and registers the two cache ids kept here:
+ * SS_SURFACE (key size = sizeof(struct brw_surface_state)) and
+ * SS_SURF_BIND (key size registered as 0).  Neither has aux data.
+ *
+ * The bucket array holds pointers to items, so it is sized in units of
+ * sizeof(struct brw_cache_item *) instead of the (larger) item struct.
+ *
+ * NOTE(review): the CALLOC() result is not checked and this function has
+ * no way to report failure to its caller.
+ */
+static void
+brw_init_surface_state_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->surface_cache;
+
+   cache->brw = brw;
+   cache->sws = brw->sws;
+
+   cache->buffer_type = BRW_BUFFER_TYPE_SURFACE_STATE;
+
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      CALLOC(cache->size, sizeof(struct brw_cache_item *));
+
+   brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE,
+                     sizeof(struct brw_surface_state),
+                     0);
+
+   brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND,
+                     0,
+                     0);
+}
+
+
+/**
+ * Initialise both state caches of a context: the general state cache
+ * (brw->cache) and the surface state cache (brw->surface_cache).
+ */
+void
+brw_init_caches(struct brw_context *brw)
+{
+   brw_init_general_state_cache(brw);
+   brw_init_surface_state_cache(brw);
+}
+
+
+/**
+ * Throw away every item held by 'cache' and flag all state as dirty.
+ *
+ * Each bucket chain is walked; for every item the references to its
+ * relocation buffers and to its own buffer are dropped, then its key and
+ * the item itself are freed.  The bucket array stays allocated but ends up
+ * holding only NULL, and n_items is reset to zero.
+ *
+ * Whichever cache is passed in, brw->curbe.last_buf is also freed and all
+ * bits of the mesa, brw and cache dirty words are set.
+ */
+static void
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
+{
+   GLuint bucket;
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
+
+   for (bucket = 0; bucket < cache->size; bucket++) {
+      struct brw_cache_item *item = cache->items[bucket];
+
+      while (item) {
+         /* Remember the successor before the item is freed. */
+         struct brw_cache_item *following = item->next;
+         int r;
+
+         for (r = 0; r < item->nr_relocs; r++)
+            bo_reference(&item->relocs[r].bo, NULL);
+
+         bo_reference(&item->bo, NULL);
+         FREE((void *)item->key);
+         FREE(item);
+
+         item = following;
+      }
+
+      cache->items[bucket] = NULL;
+   }
+
+   cache->n_items = 0;
+
+   if (brw->curbe.last_buf) {
+      FREE(brw->curbe.last_buf);
+      brw->curbe.last_buf = NULL;
+   }
+
+   /* Everything has to be regenerated after a flush of the cache. */
+   brw->state.dirty.cache |= ~0;
+   brw->state.dirty.brw |= ~0;
+   brw->state.dirty.mesa |= ~0;
+}
+
+/* Remove every cache entry that points to the given bo.
+ *
+ * An entry is removed when cache->sws->bo_references(entry->bo, bo)
+ * reports true.  This releases memory for reuse earlier when a buffer is
+ * known to be dead, at the cost of walking the entire hash table.
+ */
+void
+brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
+{
+   GLuint bucket;
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
+
+   for (bucket = 0; bucket < cache->size; bucket++) {
+      /* 'link' is the pointer that currently leads to the examined item,
+       * so unlinking is a single store through it.
+       */
+      struct brw_cache_item **link = &cache->items[bucket];
+
+      while (*link) {
+         struct brw_cache_item *item = *link;
+         int r;
+
+         if (!cache->sws->bo_references(item->bo, bo)) {
+            link = &item->next;
+            continue;
+         }
+
+         /* Unlink first, then release everything the item owns. */
+         *link = item->next;
+
+         for (r = 0; r < item->nr_relocs; r++)
+            bo_reference(&item->relocs[r].bo, NULL);
+
+         bo_reference(&item->bo, NULL);
+
+         FREE((void *)item->key);
+         FREE(item);
+         cache->n_items--;
+      }
+   }
+}
+
+/**
+ * Flush a state cache once it holds more than 1000 items.
+ *
+ * The general and the surface cache are checked independently; each one
+ * that is over the limit is emptied with brw_clear_cache().
+ */
+void
+brw_state_cache_check_size(struct brw_context *brw)
+{
+   struct brw_cache *general = &brw->cache;
+   struct brw_cache *surface = &brw->surface_cache;
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s (n_items=%d)\n", __FUNCTION__, general->n_items);
+
+   /* un-tuned guess.  We've got around 20 state objects for a total of around
+    * 32k, so 1000 of them is around 1.5MB.
+    */
+   if (general->n_items > 1000)
+      brw_clear_cache(brw, general);
+
+   if (surface->n_items > 1000)
+      brw_clear_cache(brw, surface);
+}
+
+
+/**
+ * Release everything owned by one cache.
+ *
+ * All items are dropped through brw_clear_cache() (which also marks all
+ * state dirty), then for every cache id the last_bo reference is released
+ * and the name string is freed, and finally the bucket array is freed.
+ *
+ * Every pointer that is freed here is also reset (name[i], items) and
+ * size is set to 0, so no dangling pointers are left in the cache struct.
+ */
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
+{
+   GLuint i;
+
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
+
+   brw_clear_cache(brw, cache);
+   for (i = 0; i < BRW_MAX_CACHE; i++) {
+      bo_reference(&cache->last_bo[i], NULL);
+      FREE(cache->name[i]);
+      /* Do not leave a stale pointer to the freed name behind. */
+      cache->name[i] = NULL;
+   }
+   FREE(cache->items);
+   cache->items = NULL;
+   cache->size = 0;
+}
+
+
+/**
+ * Tear down both state caches of a context: the general state cache
+ * (brw->cache) followed by the surface state cache (brw->surface_cache).
+ */
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+   brw_destroy_cache(brw, &brw->cache);
+   brw_destroy_cache(brw, &brw->surface_cache);
+}
diff --git a/src/gallium/drivers/i965/brw_state_debug.c b/src/gallium/drivers/i965/brw_state_debug.c
new file mode 100644
index 0000000000..049c278c93
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_debug.c
@@ -0,0 +1,153 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+      
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+
+
+/**
+ * One row of a dirty-flag statistics table.
+ *
+ * 'name' only ever points at string literals produced by DEFINE_BIT(), so
+ * it is declared const.  A row with bit == 0 terminates a table.
+ */
+struct dirty_bit_map {
+   uint32_t bit;        /* flag value this row counts */
+   const char *name;    /* stringified flag identifier, for printing */
+   uint32_t count;      /* number of times the flag was seen set */
+};
+
+/* Build a table row from a flag identifier: value, its spelling, count 0. */
+#define DEFINE_BIT(name) {name, #name, 0}
+
+/* Counters for the PIPE_NEW_* flags.  brw_update_dirty_counts() matches this
+ * table against its 'mesa' argument.  The all-zero row ends the table.
+ */
+static struct dirty_bit_map mesa_bits[] = {
+   DEFINE_BIT(PIPE_NEW_DEPTH_STENCIL_ALPHA),
+   DEFINE_BIT(PIPE_NEW_RAST),
+   DEFINE_BIT(PIPE_NEW_BLEND),
+   DEFINE_BIT(PIPE_NEW_VIEWPORT),
+   DEFINE_BIT(PIPE_NEW_SAMPLERS),
+   DEFINE_BIT(PIPE_NEW_VERTEX_BUFFER),
+   DEFINE_BIT(PIPE_NEW_VERTEX_ELEMENT),
+   DEFINE_BIT(PIPE_NEW_FRAGMENT_SHADER),
+   DEFINE_BIT(PIPE_NEW_VERTEX_SHADER),
+   DEFINE_BIT(PIPE_NEW_FRAGMENT_CONSTANTS),
+   DEFINE_BIT(PIPE_NEW_VERTEX_CONSTANTS),
+   DEFINE_BIT(PIPE_NEW_CLIP),
+   DEFINE_BIT(PIPE_NEW_INDEX_BUFFER),
+   DEFINE_BIT(PIPE_NEW_INDEX_RANGE),
+   DEFINE_BIT(PIPE_NEW_BLEND_COLOR),
+   DEFINE_BIT(PIPE_NEW_POLYGON_STIPPLE),
+   DEFINE_BIT(PIPE_NEW_FRAMEBUFFER_DIMENSIONS),
+   DEFINE_BIT(PIPE_NEW_DEPTH_BUFFER),
+   DEFINE_BIT(PIPE_NEW_COLOR_BUFFERS),
+   DEFINE_BIT(PIPE_NEW_QUERY),
+   DEFINE_BIT(PIPE_NEW_SCISSOR),
+   DEFINE_BIT(PIPE_NEW_BOUND_TEXTURES),
+   DEFINE_BIT(PIPE_NEW_NR_CBUFS),
+   {0, 0, 0}
+};
+
+/* Counters for the BRW_NEW_* flags.  brw_update_dirty_counts() matches this
+ * table against its 'brw' argument.  The all-zero row ends the table.
+ */
+static struct dirty_bit_map brw_bits[] = {
+   DEFINE_BIT(BRW_NEW_URB_FENCE),
+   DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
+   DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM),
+   DEFINE_BIT(BRW_NEW_INPUT_DIMENSIONS),
+   DEFINE_BIT(BRW_NEW_CURBE_OFFSETS),
+   DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE),
+   DEFINE_BIT(BRW_NEW_PRIMITIVE),
+   DEFINE_BIT(BRW_NEW_CONTEXT),
+   DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
+   DEFINE_BIT(BRW_NEW_PSP),
+   DEFINE_BIT(BRW_NEW_WM_SURFACES),
+   DEFINE_BIT(BRW_NEW_xxx),
+   DEFINE_BIT(BRW_NEW_INDICES),
+   {0, 0, 0}
+};
+
+/* Counters for the CACHE_NEW_* flags.  brw_update_dirty_counts() matches
+ * this table against its 'cache' argument.  The all-zero row ends the table.
+ */
+static struct dirty_bit_map cache_bits[] = {
+   DEFINE_BIT(CACHE_NEW_CC_VP),
+   DEFINE_BIT(CACHE_NEW_CC_UNIT),
+   DEFINE_BIT(CACHE_NEW_WM_PROG),
+   DEFINE_BIT(CACHE_NEW_SAMPLER_DEFAULT_COLOR),
+   DEFINE_BIT(CACHE_NEW_SAMPLER),
+   DEFINE_BIT(CACHE_NEW_WM_UNIT),
+   DEFINE_BIT(CACHE_NEW_SF_PROG),
+   DEFINE_BIT(CACHE_NEW_SF_VP),
+   DEFINE_BIT(CACHE_NEW_SF_UNIT),
+   DEFINE_BIT(CACHE_NEW_VS_UNIT),
+   DEFINE_BIT(CACHE_NEW_VS_PROG),
+   DEFINE_BIT(CACHE_NEW_GS_UNIT),
+   DEFINE_BIT(CACHE_NEW_GS_PROG),
+   DEFINE_BIT(CACHE_NEW_CLIP_VP),
+   DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
+   DEFINE_BIT(CACHE_NEW_CLIP_PROG),
+   DEFINE_BIT(CACHE_NEW_SURFACE),
+   DEFINE_BIT(CACHE_NEW_SURF_BIND),
+   {0, 0, 0}
+};
+
+
+/**
+ * Bump the counter of every table row whose flag is set in 'bits'.
+ *
+ * The walk ends at the first row with bit == 0, or after 32 rows,
+ * whichever comes first.
+ */
+static void
+brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
+{
+   int idx = 0;
+
+   while (idx < 32 && bit_map[idx].bit != 0) {
+      if (bit_map[idx].bit & bits)
+         bit_map[idx].count++;
+
+      idx++;
+   }
+}
+
+/**
+ * Print one line per table row: flag value, accumulated count and name.
+ *
+ * The walk ends at the first row with bit == 0, or after 32 rows,
+ * whichever comes first.  The 'bits' argument is not used.
+ */
+static void
+brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
+{
+   int idx;
+
+   for (idx = 0; idx < 32 && bit_map[idx].bit != 0; idx++) {
+      const struct dirty_bit_map *row = &bit_map[idx];
+
+      debug_printf("0x%08x: %12d (%s)\n",
+                   row->bit, row->count, row->name);
+   }
+}
+
+/**
+ * Account one set of dirty flags in the three statistics tables and
+ * periodically dump the tables.
+ *
+ * A function-local static call counter decides when to print: the tables
+ * are dumped on the first call and then on every 1000th call after it.
+ */
+void
+brw_update_dirty_counts( unsigned mesa,
+                         unsigned brw,
+                         unsigned cache )
+{
+   static int dirty_count = 0;
+
+   brw_update_dirty_count(mesa_bits, mesa);
+   brw_update_dirty_count(brw_bits, brw);
+   brw_update_dirty_count(cache_bits, cache);
+
+   /* The counter advances on every call, whether or not we print. */
+   if (dirty_count++ % 1000 != 0)
+      return;
+
+   brw_print_dirty_count(mesa_bits, mesa);
+   brw_print_dirty_count(brw_bits, brw);
+   brw_print_dirty_count(cache_bits, cache);
+   debug_printf("\n");
+}
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
new file mode 100644
index 0000000000..f8b91eff81
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -0,0 +1,270 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+       
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+#include "brw_debug.h"
+
+/* Ordered list of all tracked state atoms.
+ *
+ * brw_validate_state() runs the prepare hooks and brw_upload_state() runs
+ * the emit hooks by walking this array from first to last entry, so the
+ * position of an atom matters; the inline comments record the known
+ * ordering constraints.
+ */
+const struct brw_tracked_state *atoms[] =
+{
+/*   &brw_wm_input_sizes, */
+   &brw_vs_prog,
+   &brw_gs_prog, 
+   &brw_clip_prog, 
+   &brw_sf_prog,
+   &brw_wm_prog,
+
+   /* Once all the programs are done, we know how large urb entry
+    * sizes need to be and can decide if we need to change the urb
+    * layout.
+    */
+   &brw_curbe_offsets,
+   &brw_recalculate_urb_fence,
+
+   &brw_cc_vp,
+   &brw_cc_unit,
+
+   &brw_vs_surfaces,		/* must do before unit */
+   /*&brw_wm_constant_surface,*/	/* must do before wm surfaces/bind bo */
+   &brw_wm_surfaces,		/* must do before samplers and unit */
+   &brw_wm_samplers,
+
+   &brw_wm_unit,
+   &brw_sf_vp,
+   &brw_sf_unit,
+   &brw_vs_unit,		/* always required, enabled or not */
+   &brw_clip_unit,
+   &brw_gs_unit,  
+
+   /* Command packets:
+    */
+   &brw_invarient_state,
+   &brw_state_base_address,
+
+   &brw_binding_table_pointers,
+   &brw_blend_constant_color,
+
+   &brw_depthbuffer,
+   &brw_polygon_stipple,
+   &brw_line_stipple,
+
+   &brw_psp_urb_cbs,
+
+   &brw_drawing_rect,
+   &brw_indices,
+   &brw_index_buffer,
+   &brw_vertices,
+
+   &brw_curbe_buffer
+};
+
+
+/* Initialise the state-tracking side of a context; currently this only
+ * sets up the state caches.
+ */
+void brw_init_state( struct brw_context *brw )
+{
+   brw_init_caches(brw);
+}
+
+
+/* Tear down the state-tracking side of a context: the state caches first,
+ * then the batch cache.
+ */
+void brw_destroy_state( struct brw_context *brw )
+{
+   brw_destroy_caches(brw);
+   brw_destroy_batch_cache(brw);
+}
+
+/***********************************************************************
+ */
+
+/* Report whether two flag sets overlap: nonzero if 'a' and 'b' share at
+ * least one set bit in the mesa, brw or cache word, zero otherwise.
+ */
+static GLboolean check_state( const struct brw_state_flags *a,
+                              const struct brw_state_flags *b )
+{
+   if (a->mesa & b->mesa)
+      return 1;
+
+   if (a->brw & b->brw)
+      return 1;
+
+   return (a->cache & b->cache) != 0;
+}
+
+/* Merge the flags of 'b' into 'a': each of the three words of 'a' becomes
+ * the bitwise OR of itself and the matching word of 'b'.
+ */
+static void accumulate_state( struct brw_state_flags *a,
+                              const struct brw_state_flags *b )
+{
+   a->cache |= b->cache;
+   a->brw   |= b->brw;
+   a->mesa  |= b->mesa;
+}
+
+
+/* Store in 'result' the flags that differ between 'a' and 'b': every word
+ * of 'result' is the bitwise XOR of the matching words of 'a' and 'b'.
+ */
+static void xor_states( struct brw_state_flags *result,
+                        const struct brw_state_flags *a,
+                        const struct brw_state_flags *b )
+{
+   result->cache = a->cache ^ b->cache;
+   result->brw   = a->brw ^ b->brw;
+   result->mesa  = a->mesa ^ b->mesa;
+}
+
+/* Drop the references held in brw->state.validated_bos[] from the last
+ * round of validation and reset the list to empty.
+ */
+static void
+brw_clear_validated_bos(struct brw_context *brw)
+{
+   int slot = 0;
+
+   while (slot < brw->state.validated_bo_count) {
+      bo_reference(&brw->state.validated_bos[slot], NULL);
+      slot++;
+   }
+
+   brw->state.validated_bo_count = 0;
+}
+
+
+/***********************************************************************
+ * Prepare all state:
+ *
+ * Resets the validated-bo list (re-adding the batch buffer), then runs the
+ * prepare hook of every atom whose dirty mask overlaps the pending dirty
+ * flags, in atoms[] order.  The first nonzero result from a prepare hook
+ * is returned immediately; otherwise 0 is returned.  Nothing is done
+ * beyond the bo-list reset when no dirty flag is set.
+ */
+enum pipe_error brw_validate_state( struct brw_context *brw )
+{
+   struct brw_state_flags *dirty = &brw->state.dirty;
+   GLuint idx;
+   int ret;
+
+   brw_clear_validated_bos(brw);
+   brw_add_validated_bo(brw, brw->batch->buf);
+
+   if (brw->flags.always_emit_state) {
+      dirty->mesa |= ~0;
+      dirty->brw |= ~0;
+      dirty->cache |= ~0;
+   }
+
+   /* Nothing pending: no atom needs to be looked at. */
+   if (!dirty->mesa && !dirty->cache && !dirty->brw)
+      return 0;
+
+   if (dirty->brw & BRW_NEW_CONTEXT)
+      brw_clear_batch_cache(brw);
+
+   /* do prepare stage for all atoms */
+   for (idx = 0; idx < Elements(atoms); idx++) {
+      const struct brw_tracked_state *atom = atoms[idx];
+
+      if (!check_state(dirty, &atom->dirty))
+         continue;
+
+      if (!atom->prepare)
+         continue;
+
+      ret = atom->prepare(brw);
+      if (ret)
+         return ret;
+   }
+
+   /* Make sure that the textures which are referenced by the current
+    * brw fragment program are actually present/valid.
+    * If this fails, we can experience GPU lock-ups.
+    */
+   if (brw->curr.fragment_shader) {
+      const struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+
+      assert(fp->info.file_max[TGSI_FILE_SAMPLER] < (int)brw->curr.num_samplers);
+      /*assert(fp->info.texture_max <= brw->curr.num_textures);*/
+   }
+
+   return 0;
+}
+
+
+/* Emit all dirty state.
+ *
+ * Resets the validated-bo list, then walks atoms[] in order and calls the
+ * emit hook (if any) of every atom whose dirty mask overlaps the pending
+ * dirty flags.  The first nonzero result from an emit hook is returned
+ * immediately, in which case the dirty flags are left untouched.  On
+ * success the dirty flags are zeroed and 0 is returned.
+ *
+ * When BRW_DEBUG is nonzero a checking variant of the loop is used, see
+ * the comments below.
+ */
+enum pipe_error brw_upload_state(struct brw_context *brw)
+{
+   struct brw_state_flags *state = &brw->state.dirty;
+   int ret;
+   int i;
+
+   brw_clear_validated_bos(brw);
+
+   if (BRW_DEBUG) {
+      /* Debug version which enforces various sanity checks on the
+       * state flags which are generated and checked to help ensure
+       * state atoms are ordered correctly in the list.
+       */
+      struct brw_state_flags examined, prev;      
+      memset(&examined, 0, sizeof(examined));
+      prev = *state;
+
+      for (i = 0; i < Elements(atoms); i++) {
+	 const struct brw_tracked_state *atom = atoms[i];
+	 struct brw_state_flags generated;
+
+	 /* Every atom must listen to at least one dirty flag. */
+	 assert(atom->dirty.mesa ||
+		atom->dirty.brw ||
+		atom->dirty.cache);
+
+	 if (check_state(state, &atom->dirty)) {
+	    if (atom->emit) {
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
+	    }
+	 }
+
+	 /* 'examined' now covers this atom and all atoms before it. */
+	 accumulate_state(&examined, &atom->dirty);
+
+	 /* generated = (prev ^ state)
+	  * if (examined & generated)
+	  *     fail;
+	  */
+	 xor_states(&generated, &prev, state);
+	 assert(!check_state(&examined, &generated));
+	 prev = *state;
+      }
+   }
+   else {
+      /* Plain version: same walk without the ordering checks. */
+      for (i = 0; i < Elements(atoms); i++) {	 
+	 const struct brw_tracked_state *atom = atoms[i];
+
+	 if (check_state(state, &atom->dirty)) {
+	    if (atom->emit) {
+	       ret = atom->emit( brw );
+	       if (ret)
+		  return ret;
+	    }
+	 }
+      }
+   }
+
+   /* Feed the dirty-flag statistics before the flags are cleared. */
+   if (BRW_DEBUG & DEBUG_STATE) {
+      brw_update_dirty_counts( state->mesa, 
+			       state->brw,
+			       state->cache );
+   }
+   
+   /* Clear dirty flags:
+    */
+   memset(state, 0, sizeof(*state));
+   return 0;
+}
diff --git a/src/gallium/drivers/i965/brw_structs.h b/src/gallium/drivers/i965/brw_structs.h
new file mode 100644
index 0000000000..e97ddeb5e1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs.h
@@ -0,0 +1,1576 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#ifndef BRW_STRUCTS_H
+#define BRW_STRUCTS_H
+
+#include "brw_types.h"
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/** Number of message register file registers */
+#define BRW_MAX_MRF 16
+
+
+/* Command packets:
+ */
+/** Generic command header: two 16-bit bit-fields, length and opcode. */
+struct header 
+{
+   GLuint length:16; 
+   GLuint opcode:16; 
+};
+
+
+/**
+ * Overlay of struct header with a plain GLuint, so the same header can be
+ * accessed field by field (bits) or as one value (dword).
+ */
+union header_union
+{
+   struct header bits;
+   GLuint dword;
+};
+
+/**
+ * Command packet layout: a header of bit-fields (widths sum to 32), a
+ * destination-address group (widths sum to 32) and two plain GLuint words.
+ */
+struct brw_3d_control
+{   
+   struct 
+   {
+      GLuint length:8;
+      GLuint notify_enable:1;
+      GLuint pad:3;
+      GLuint wc_flush_enable:1; 
+      GLuint depth_stall_enable:1; 
+      GLuint operation:2; 
+      GLuint opcode:16; 
+   } header;
+   
+   struct
+   {
+      GLuint pad:2;
+      GLuint dest_addr_type:1; 
+      GLuint dest_addr:29; 
+   } dest;
+   
+   GLuint dword2;   
+   GLuint dword3;   
+};
+
+
+/**
+ * Command packet layout: a header of bit-fields (widths sum to 32) that
+ * includes the topology and an 'indexed' bit, followed by five plain
+ * GLuint words with vertex/instance counts and start locations.
+ */
+struct brw_3d_primitive
+{
+   struct
+   {
+      GLuint length:8; 
+      GLuint pad:2;
+      GLuint topology:5; 
+      GLuint indexed:1; 
+      GLuint opcode:16; 
+   } header;
+
+   GLuint verts_per_instance;  
+   GLuint start_vert_location;  
+   GLuint instance_count;  
+   GLuint start_instance_location;  
+   GLuint base_vert_location;  
+};
+
+/* These seem to be passed around as function args, so it works out
+ * better to keep them as #defines:
+ *
+ * Each value is a distinct single bit, so the flags can be OR-ed together.
+ * NOTE(review): presumably these are the values of the 4-bit 'flags' field
+ * of struct brw_mi_flush -- confirm against the users.
+ */
+#define BRW_FLUSH_READ_CACHE           0x1
+#define BRW_FLUSH_STATE_CACHE          0x2
+#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4
+#define BRW_FLUSH_SNAPSHOT_COUNTERS    0x8
+
+/** Single-word command: 4 flag bits, 12 pad bits and a 16-bit opcode. */
+struct brw_mi_flush
+{
+   GLuint flags:4;
+   GLuint pad:12;
+   GLuint opcode:16;
+};
+
+/** Single-word command: one enable bit, 15 pad bits and a 16-bit opcode. */
+struct brw_vf_statistics
+{
+   GLuint statistics_enable:1;
+   GLuint pad:15;
+   GLuint opcode:16;
+};
+
+
+
+/**
+ * Command packet layout: a generic header followed by one plain GLuint per
+ * stage name (vs, gs, clp, sf, wm).
+ */
+struct brw_binding_table_pointers
+{
+   struct header header;
+   GLuint vs; 
+   GLuint gs; 
+   GLuint clp; 
+   GLuint sf; 
+   GLuint wm; 
+};
+
+
+/** Command packet layout: a generic header followed by four floats. */
+struct brw_blend_constant_color
+{
+   struct header header;
+   GLfloat blend_constant_color[4];  
+};
+
+
+/**
+ * Command packet layout with five words: header, dword1, the base address
+ * word, dword3 and dword4.  Words made of bit-fields are unions, so each
+ * can be written through 'bits' or as a whole through 'dword'; the
+ * bit-field widths of every such word sum to 32.
+ */
+struct brw_depthbuffer
+{
+   union header_union header;
+   
+   union {
+      struct {
+	 GLuint pitch:18; 
+	 GLuint format:3; 
+	 GLuint pad:2;
+	 GLuint software_tiled_rendering_mode:2;
+	 GLuint depth_offset_disable:1; 
+	 GLuint tile_walk:1; 
+	 GLuint tiled_surface:1; 
+	 GLuint pad2:1;
+	 GLuint surface_type:3; 
+      } bits;
+      GLuint dword;
+   } dword1;
+   
+   GLuint dword2_base_addr; 
+ 
+   union {
+      struct {
+	 GLuint pad:1;
+	 GLuint mipmap_layout:1; 
+	 GLuint lod:4; 
+	 GLuint width:13; 
+	 GLuint height:13; 
+      } bits;
+      GLuint dword;
+   } dword3;
+
+   union {
+      struct {
+	 GLuint pad:10;
+	 GLuint min_array_element:11; 
+	 GLuint depth:11; 
+      } bits;
+      GLuint dword;
+   } dword4;
+};
+
+/**
+ * Variant of the depth buffer command packet with six words: the first
+ * five have the same members as struct brw_depthbuffer, and dword5 adds
+ * 16-bit xoffset and yoffset fields.
+ */
+struct brw_depthbuffer_g4x
+{
+   union header_union header;
+   
+   union {
+      struct {
+	 GLuint pitch:18; 
+	 GLuint format:3; 
+	 GLuint pad:2;
+	 GLuint software_tiled_rendering_mode:2;
+	 GLuint depth_offset_disable:1; 
+	 GLuint tile_walk:1; 
+	 GLuint tiled_surface:1; 
+	 GLuint pad2:1;
+	 GLuint surface_type:3; 
+      } bits;
+      GLuint dword;
+   } dword1;
+   
+   GLuint dword2_base_addr; 
+ 
+   union {
+      struct {
+	 GLuint pad:1;
+	 GLuint mipmap_layout:1; 
+	 GLuint lod:4; 
+	 GLuint width:13; 
+	 GLuint height:13; 
+      } bits;
+      GLuint dword;
+   } dword3;
+
+   union {
+      struct {
+	 GLuint pad:10;
+	 GLuint min_array_element:11; 
+	 GLuint depth:11; 
+      } bits;
+      GLuint dword;
+   } dword4;
+
+   union {
+      struct {
+         GLuint xoffset:16;
+         GLuint yoffset:16;
+      } bits;
+      GLuint dword;
+   } dword5;   /* NEW in Integrated Graphics Device */
+};
+
+/**
+ * Command packet layout: a generic header followed by six 16-bit fields,
+ * in pairs: (xmin, ymin), (xmax, ymax) and (xorg, yorg).
+ */
+struct brw_drawrect
+{
+   struct header header;
+   GLuint xmin:16; 
+   GLuint ymin:16; 
+   GLuint xmax:16; 
+   GLuint ymax:16; 
+   GLuint xorg:16;  
+   GLuint yorg:16;  
+};
+
+
+
+
+/** Command packet layout: a generic header followed by one float. */
+struct brw_global_depth_offset_clamp
+{
+   struct header header;
+   GLfloat depth_offset_clamp;  
+};
+
+/**
+ * Command packet layout: a header that can be accessed as bit-fields
+ * (widths sum to 32) or as a whole dword, followed by the buffer_start and
+ * buffer_end words.
+ */
+struct brw_indexbuffer
+{   
+   union {
+      struct
+      {
+	 GLuint length:8; 
+	 GLuint index_format:2; 
+	 GLuint cut_index_enable:1; 
+	 GLuint pad:5; 
+	 GLuint opcode:16; 
+      } bits;
+      GLuint dword;
+
+   } header;
+
+   GLuint buffer_start; 
+   GLuint buffer_end; 
+};
+
+/* NEW in Integrated Graphics Device
+ *
+ * Command packet layout: a generic header followed by two words, each
+ * made of two 8-bit coverage parameters separated by 8-bit pads.
+ */
+struct brw_aa_line_parameters
+{
+   struct header header;
+
+   struct {
+      GLuint aa_coverage_scope:8;
+      GLuint pad0:8;
+      GLuint aa_coverage_bias:8;
+      GLuint pad1:8;
+   } bits0;
+
+   struct {
+      GLuint aa_coverage_endcap_slope:8;
+      GLuint pad0:8;
+      GLuint aa_coverage_endcap_bias:8;
+      GLuint pad1:8;
+   } bits1;
+};
+
+/**
+ * Command packet layout: a generic header, a word holding a 16-bit
+ * pattern, and a word holding a 9-bit repeat count and a 16-bit inverse
+ * repeat count.
+ */
+struct brw_line_stipple
+{   
+   struct header header;
+  
+   struct
+   {
+      GLuint pattern:16; 
+      GLuint pad:16;
+   } bits0;
+   
+   struct
+   {
+      GLuint repeat_count:9; 
+      GLuint pad:7;
+      GLuint inverse_repeat_count:16; 
+   } bits1;
+};
+
+
+/**
+ * Command packet layout: a generic header followed by one word per unit
+ * (vs, gs, clp, sf, wm, cc).  Every word carries a 27-bit offset; the gs
+ * and clp words additionally have an enable bit.
+ */
+struct brw_pipelined_state_pointers
+{
+   struct header header;
+   
+   struct {
+      GLuint pad:5;
+      GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+   } vs;
+   
+   struct
+   {
+      GLuint enable:1;
+      GLuint pad:4;
+      GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+   } gs;
+   
+   struct
+   {
+      GLuint enable:1;
+      GLuint pad:4;
+      GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+   } clp;
+   
+   struct
+   {
+      GLuint pad:5;
+      GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+   } sf;
+
+   struct
+   {
+      GLuint pad:5;
+      GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+   } wm;
+   
+   struct
+   {
+      GLuint pad:5;
+      GLuint offset:27; /* Offset from GENERAL_STATE_BASE. KW: check me! */
+   } cc;
+};
+
+
+/**
+ * Command packet layout: a generic header followed by one word holding a
+ * 5-bit y offset and a 5-bit x offset.
+ */
+struct brw_polygon_stipple_offset
+{
+   struct header header;
+
+   struct {
+      GLuint y_offset:5; 
+      GLuint pad:3;
+      GLuint x_offset:5; 
+      GLuint pad0:19;
+   } bits0;
+};
+
+
+
+/** Command packet layout: a generic header followed by 32 GLuint words. */
+struct brw_polygon_stipple
+{
+   struct header header;
+   GLuint stipple[32];
+};
+
+
+
+/**
+ * Single-word command: one pipeline_select bit, 15 pad bits and a 16-bit
+ * opcode, wrapped in a 'header' member.
+ */
+struct brw_pipeline_select
+{
+   struct
+   {
+      GLuint pipeline_select:1;   
+      GLuint pad:15;
+      GLuint opcode:16;   
+   } header;
+};
+
+
+/**
+ * Command packet layout: a header of bit-fields (widths sum to 32) with
+ * several flush/stall enable bits, a destination-address word (widths sum
+ * to 32) and two plain data words.
+ */
+struct brw_pipe_control
+{
+   struct
+   {
+      GLuint length:8;
+      GLuint notify_enable:1;
+      GLuint texture_cache_flush_enable:1;
+      GLuint indirect_state_pointers_disable:1;
+      GLuint instruction_state_cache_flush_enable:1;
+      GLuint write_cache_flush_enable:1;
+      GLuint depth_stall_enable:1;
+      GLuint post_sync_operation:2;
+
+      GLuint opcode:16;
+   } header;
+
+   struct
+   {
+      GLuint pad:2;
+      GLuint dest_addr_type:1;
+      GLuint dest_addr:29;
+   } bits1;
+
+   GLuint data0;
+   GLuint data1;
+};
+
+
+/**
+ * Command packet layout: a header with one realloc bit per unit (vs, gs,
+ * clp, sf, vfe, cs) and two words of fence values.  Fence fields are 10
+ * bits wide except cs_fence, which is 11 bits.  The widths of every word
+ * sum to 32.
+ */
+struct brw_urb_fence
+{
+   struct
+   {
+      GLuint length:8;   
+      GLuint vs_realloc:1;   
+      GLuint gs_realloc:1;   
+      GLuint clp_realloc:1;   
+      GLuint sf_realloc:1;   
+      GLuint vfe_realloc:1;   
+      GLuint cs_realloc:1;   
+      GLuint pad:2;
+      GLuint opcode:16;   
+   } header;
+
+   struct
+   {
+      GLuint vs_fence:10;  
+      GLuint gs_fence:10;  
+      GLuint clp_fence:10;  
+      GLuint pad:2;
+   } bits0;
+
+   struct
+   {
+      GLuint sf_fence:10;  
+      GLuint vf_fence:10;  
+      GLuint cs_fence:11;  
+      GLuint pad:1;
+   } bits1;
+};
+
+/**
+ * Command packet layout: a generic header followed by one word holding a
+ * 3-bit entry count and a 5-bit entry size.
+ */
+struct brw_cs_urb_state
+{
+   struct header header;
+
+   struct
+   {
+      GLuint nr_urb_entries:3;   
+      GLuint pad:1;
+      GLuint urb_entry_size:5;   
+      GLuint pad0:23;
+   } bits0;
+};
+
+/**
+ * Command packet layout: a header of bit-fields that includes a 'valid'
+ * bit, followed by one word holding a 6-bit buffer length and a 26-bit
+ * buffer address.
+ */
+struct brw_constant_buffer
+{
+   struct
+   {
+      GLuint length:8;   
+      GLuint valid:1;   
+      GLuint pad:7;
+      GLuint opcode:16;   
+   } header;
+
+   struct
+   {
+      GLuint buffer_length:6;   
+      GLuint buffer_address:26;  
+   } bits0;
+};
+
+/**
+ * Command packet layout: a generic header followed by five words.  Each
+ * word starts with a modify_enable bit; bits0..bits2 carry 27-bit
+ * addresses, bits3 and bits4 carry 20-bit upper bounds.
+ */
+struct brw_state_base_address
+{
+   struct header header;
+
+   struct
+   {
+      GLuint modify_enable:1;
+      GLuint pad:4;
+      GLuint general_state_address:27;  
+   } bits0;
+
+   struct
+   {
+      GLuint modify_enable:1;
+      GLuint pad:4;
+      GLuint surface_state_address:27;  
+   } bits1;
+
+   struct
+   {
+      GLuint modify_enable:1;
+      GLuint pad:4;
+      GLuint indirect_object_state_address:27;  
+   } bits2;
+
+   struct
+   {
+      GLuint modify_enable:1;
+      GLuint pad:11;
+      GLuint general_state_upper_bound:20;  
+   } bits3;
+
+   struct
+   {
+      GLuint modify_enable:1;
+      GLuint pad:11;
+      GLuint indirect_object_state_upper_bound:20;  
+   } bits4;
+};
+
+/**
+ * Command packet layout: a generic header followed by one word holding a
+ * 3-bit prefetch count and a 26-bit prefetch pointer.
+ */
+struct brw_state_prefetch
+{
+   struct header header;
+
+   struct
+   {
+      GLuint prefetch_count:3;   
+      GLuint pad:3;
+      GLuint prefetch_pointer:26;  
+   } bits0;
+};
+
+/**
+ * Command packet layout: a generic header followed by one word holding a
+ * 28-bit system instruction pointer.
+ */
+struct brw_system_instruction_pointer
+{
+   struct header header;
+
+   struct
+   {
+      GLuint pad:4;
+      GLuint system_instruction_pointer:28;  
+   } bits0;
+};
+
+
+
+
+/* State structs for the various fixed function units:
+ */
+
+
+/**
+ * First thread word shared by the unit state structs: a 3-bit GRF register
+ * count and a 26-bit kernel start pointer (widths sum to 32).
+ */
+struct thread0
+{
+   GLuint pad0:1;
+   GLuint grf_reg_count:3; 
+   GLuint pad1:2;
+   GLuint kernel_start_pointer:26; /* Offset from GENERAL_STATE_BASE */
+};
+
+/**
+ * Second thread word: exception enable bits, floating point mode, thread
+ * priority, binding table entry count and the single_program_flow bit
+ * (widths sum to 32).
+ */
+struct thread1
+{
+   GLuint ext_halt_exception_enable:1; 
+   GLuint sw_exception_enable:1; 
+   GLuint mask_stack_exception_enable:1; 
+   GLuint timeout_exception_enable:1; 
+   GLuint illegal_op_exception_enable:1; 
+   GLuint pad0:3;
+   GLuint depth_coef_urb_read_offset:6;	/* WM only */
+   GLuint pad1:2;
+   GLuint floating_point_mode:1; 
+   GLuint thread_priority:1; 
+   GLuint binding_table_entry_count:8; 
+   GLuint pad3:5;
+   GLuint single_program_flow:1; 
+};
+
+/**
+ * Third thread word: a 4-bit per-thread scratch space field and a 22-bit
+ * scratch space base pointer (widths sum to 32).
+ */
+struct thread2
+{
+   GLuint per_thread_scratch_space:4; 
+   GLuint pad0:6;
+   GLuint scratch_space_base_pointer:22; 
+};
+
+   
+/**
+ * Fourth thread word: the dispatch GRF start register plus read offset and
+ * read length for URB entries and for constant URB entries (widths sum to
+ * 32).
+ */
+struct thread3
+{
+   GLuint dispatch_grf_start_reg:4; 
+   GLuint urb_entry_read_offset:6; 
+   GLuint pad0:1;
+   GLuint urb_entry_read_length:6; 
+   GLuint pad1:1;
+   GLuint const_urb_entry_read_offset:6; 
+   GLuint pad2:1;
+   GLuint const_urb_entry_read_length:6; 
+   GLuint pad3:1;
+};
+
+
+
+/**
+ * State layout for the clip unit (this struct is the key type registered
+ * for the CLIP_UNIT cache id).
+ *
+ * Uses the shared thread0/thread2/thread3 words but its own thread1
+ * layout, followed by thread4, clip5 and clip6 bit-field words (widths of
+ * each sum to 32) and four floats giving the viewport extents.
+ */
+struct brw_clip_unit_state
+{
+   struct thread0 thread0;
+   struct
+   {
+      GLuint pad0:7;
+      GLuint sw_exception_enable:1;
+      GLuint pad1:3;
+      GLuint mask_stack_exception_enable:1;
+      GLuint pad2:1;
+      GLuint illegal_op_exception_enable:1;
+      GLuint pad3:2;
+      GLuint floating_point_mode:1;
+      GLuint thread_priority:1;
+      GLuint binding_table_entry_count:8;
+      GLuint pad4:5;
+      GLuint single_program_flow:1;
+   } thread1;
+
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct
+   {
+      GLuint pad0:9;
+      GLuint gs_output_stats:1; /* not always */
+      GLuint stats_enable:1; 
+      GLuint nr_urb_entries:7; 
+      GLuint pad1:1;
+      GLuint urb_entry_allocation_size:5; 
+      GLuint pad2:1;
+      GLuint max_threads:5; 	/* may be less */
+      GLuint pad3:2;
+   } thread4;   
+      
+   struct
+   {
+      GLuint pad0:13;
+      GLuint clip_mode:3; 
+      GLuint userclip_enable_flags:8; 
+      GLuint userclip_must_clip:1; 
+      GLuint negative_w_clip_test:1;
+      GLuint guard_band_enable:1; 
+      GLuint viewport_z_clip_enable:1; 
+      GLuint viewport_xy_clip_enable:1; 
+      GLuint vertex_position_space:1; 
+      GLuint api_mode:1; 
+      GLuint pad2:1;
+   } clip5;
+   
+   struct
+   {
+      GLuint pad0:5;
+      GLuint clipper_viewport_state_ptr:27; 
+   } clip6;
+
+   
+   GLfloat viewport_xmin;  
+   GLfloat viewport_xmax;  
+   GLfloat viewport_ymin;  
+   GLfloat viewport_ymax;  
+};
+
+
+
+/**
+ * State layout for the CC unit (this struct is the key type registered for
+ * the CC_UNIT cache id).
+ *
+ * Eight words, cc0..cc7.  cc0..cc6 are bit-field words whose widths each
+ * sum to 32, covering stencil, depth, alpha test, blend, logic op and
+ * dither fields plus the viewport state offset; cc7 holds the alpha
+ * reference, readable either as a float or as four bytes.  The word
+ * structs are named (brw_cc0 .. brw_cc7) so they can be declared on their
+ * own.
+ */
+struct brw_cc_unit_state
+{
+   struct brw_cc0
+   {
+      GLuint pad0:3;
+      GLuint bf_stencil_pass_depth_pass_op:3; 
+      GLuint bf_stencil_pass_depth_fail_op:3; 
+      GLuint bf_stencil_fail_op:3; 
+      GLuint bf_stencil_func:3; 
+      GLuint bf_stencil_enable:1; 
+      GLuint pad1:2;
+      GLuint stencil_write_enable:1; 
+      GLuint stencil_pass_depth_pass_op:3; 
+      GLuint stencil_pass_depth_fail_op:3; 
+      GLuint stencil_fail_op:3; 
+      GLuint stencil_func:3; 
+      GLuint stencil_enable:1; 
+   } cc0;
+
+   
+   struct brw_cc1
+   {
+      GLuint bf_stencil_ref:8; 
+      GLuint stencil_write_mask:8; 
+      GLuint stencil_test_mask:8; 
+      GLuint stencil_ref:8; 
+   } cc1;
+
+   
+   struct brw_cc2
+   {
+      GLuint logicop_enable:1; 
+      GLuint pad0:10;
+      GLuint depth_write_enable:1; 
+      GLuint depth_test_function:3; 
+      GLuint depth_test:1; 
+      GLuint bf_stencil_write_mask:8; 
+      GLuint bf_stencil_test_mask:8; 
+   } cc2;
+
+   
+   struct brw_cc3
+   {
+      GLuint pad0:8;
+      GLuint alpha_test_func:3; 
+      GLuint alpha_test:1; 
+      GLuint blend_enable:1; 
+      GLuint ia_blend_enable:1; 
+      GLuint pad1:1;
+      GLuint alpha_test_format:1;
+      GLuint pad2:16;
+   } cc3;
+   
+   struct brw_cc4
+   {
+      GLuint pad0:5; 
+      GLuint cc_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
+   } cc4;
+   
+   struct brw_cc5
+   {
+      GLuint pad0:2;
+      GLuint ia_dest_blend_factor:5; 
+      GLuint ia_src_blend_factor:5; 
+      GLuint ia_blend_function:3; 
+      GLuint statistics_enable:1; 
+      GLuint logicop_func:4; 
+      GLuint pad1:11;
+      GLuint dither_enable:1; 
+   } cc5;
+
+   struct brw_cc6
+   {
+      GLuint clamp_post_alpha_blend:1; 
+      GLuint clamp_pre_alpha_blend:1; 
+      GLuint clamp_range:2; 
+      GLuint pad0:11;
+      GLuint y_dither_offset:2; 
+      GLuint x_dither_offset:2; 
+      GLuint dest_blend_factor:5; 
+      GLuint src_blend_factor:5; 
+      GLuint blend_function:3; 
+   } cc6;
+
+   struct brw_cc7 {
+      union {
+	 GLfloat f;  
+	 GLubyte ub[4];
+      } alpha_ref;
+   } cc7;
+};
+
+
+/* State block that starts with thread0 .. thread3 (struct types declared
+ * outside this section) and continues with four anonymous bitfield groups:
+ * thread4, sf5, sf6 and sf7.  The widths in each of those four groups add
+ * up to exactly 32.
+ * NOTE(review): presumably "sf" is the strips-and-fans (setup) unit --
+ * confirm against the hardware docs.
+ */
+struct brw_sf_unit_state
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct   /* same field list as brw_vs_unit_state.thread4 */
+   {
+      GLuint pad0:10;
+      GLuint stats_enable:1; 
+      GLuint nr_urb_entries:7; 
+      GLuint pad1:1;
+      GLuint urb_entry_allocation_size:5; 
+      GLuint pad2:1;
+      GLuint max_threads:6; 
+      GLuint pad3:1;
+   } thread4;   
+
+   struct
+   {
+      GLuint front_winding:1; 
+      GLuint viewport_transform:1; 
+      GLuint pad0:3;
+      GLuint sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
+   } sf5;
+   
+   struct
+   {
+      GLuint pad0:9;
+      GLuint dest_org_vbias:4; 
+      GLuint dest_org_hbias:4; 
+      GLuint scissor:1; 
+      GLuint disable_2x2_trifilter:1; 
+      GLuint disable_zero_pix_trifilter:1; 
+      GLuint point_rast_rule:2; 
+      GLuint line_endcap_aa_region_width:2; 
+      GLuint line_width:4; 
+      GLuint fast_scissor_disable:1; 
+      GLuint cull_mode:2; 
+      GLuint aa_enable:1; 
+   } sf6;
+
+   struct
+   {
+      GLuint point_size:11; 
+      GLuint use_point_size_state:1; 
+      GLuint subpixel_precision:1; 
+      GLuint sprite_point:1; 
+      GLuint pad0:10;
+      GLuint aa_line_distance_mode:1;
+      GLuint trifan_pv:2;   /* NOTE(review): _pv presumably = provoking vertex -- confirm */
+      GLuint linestrip_pv:2; 
+      GLuint tristrip_pv:2; 
+      GLuint line_last_pixel_enable:1; 
+   } sf7;
+
+};
+
+/* State block that starts with thread0 .. thread3 (struct types declared
+ * outside this section) and continues with three anonymous bitfield groups:
+ * thread4, gs5 and gs6.  The widths in each group add up to exactly 32.
+ * Unlike the sf/vs variants, thread4 here has a rendering_enable bit and a
+ * 5-bit max_threads.
+ * NOTE(review): presumably "gs" is the geometry-shader unit -- confirm.
+ */
+struct brw_gs_unit_state
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+
+   struct
+   {
+      GLuint pad0:8;
+      GLuint rendering_enable:1; /* for IGDNG */
+      GLuint pad4:1;
+      GLuint stats_enable:1; 
+      GLuint nr_urb_entries:7; 
+      GLuint pad1:1;
+      GLuint urb_entry_allocation_size:5; 
+      GLuint pad2:1;
+      GLuint max_threads:5; 
+      GLuint pad3:2;
+   } thread4;   
+      
+   struct   /* same field list as brw_vs_unit_state.vs5 */
+   {
+      GLuint sampler_count:3; 
+      GLuint pad0:2;
+      GLuint sampler_state_pointer:27; 
+   } gs5;
+
+   
+   struct
+   {
+      GLuint max_vp_index:4; 
+      GLuint pad0:12;
+      GLuint svbi_post_inc_value:10;
+      GLuint pad1:1;
+      GLuint svbi_post_inc_enable:1;
+      GLuint svbi_payload:1;
+      GLuint discard_adjaceny:1;   /* sic: the misspelling is the member's actual name */
+      GLuint reorder_enable:1; 
+      GLuint pad2:1;
+   } gs6;
+};
+
+/* State block that starts with thread0 .. thread3 (struct types declared
+ * outside this section) and continues with three anonymous bitfield groups:
+ * thread4, vs5 and vs6.  The widths in each group add up to exactly 32.
+ * NOTE(review): presumably "vs" is the vertex-shader unit -- confirm.
+ */
+struct brw_vs_unit_state
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+   
+   struct   /* same field list as brw_sf_unit_state.thread4 */
+   {
+      GLuint pad0:10;
+      GLuint stats_enable:1; 
+      GLuint nr_urb_entries:7; 
+      GLuint pad1:1;
+      GLuint urb_entry_allocation_size:5; 
+      GLuint pad2:1;
+      GLuint max_threads:6; 
+      GLuint pad3:1;
+   } thread4;   
+
+   struct
+   {
+      GLuint sampler_count:3; 
+      GLuint pad0:2;
+      GLuint sampler_state_pointer:27; 
+   } vs5;
+
+   struct   /* two flag bits; the remaining 30 bits are padding */
+   {
+      GLuint vs_enable:1; 
+      GLuint vert_cache_disable:1; 
+      GLuint pad0:30;
+   } vs6;
+};
+
+/* State block that starts with thread0 .. thread3 (struct types declared
+ * outside this section), then the bitfield groups wm4 and wm5, two plain
+ * GLfloat members, and finally the groups wm8, wm9 and wm10.  The widths in
+ * every bitfield group add up to exactly 32.
+ * NOTE(review): the member names jump from wm5 to wm8; presumably the two
+ * floats stand in for positions 6 and 7 -- confirm against the hw docs.
+ */
+struct brw_wm_unit_state
+{
+   struct thread0 thread0;
+   struct thread1 thread1;
+   struct thread2 thread2;
+   struct thread3 thread3;
+   
+   struct {
+      GLuint stats_enable:1; 
+      GLuint depth_buffer_clear:1;
+      GLuint sampler_count:3; 
+      GLuint sampler_state_pointer:27; 
+   } wm4;
+   
+   struct
+   {
+      GLuint enable_8_pix:1; 
+      GLuint enable_16_pix:1; 
+      GLuint enable_32_pix:1; 
+      GLuint enable_con_32_pix:1;
+      GLuint enable_con_64_pix:1;
+      GLuint pad0:5;
+      GLuint legacy_global_depth_bias:1; 
+      GLuint line_stipple:1; 
+      GLuint depth_offset:1; 
+      GLuint polygon_stipple:1; 
+      GLuint line_aa_region_width:2; 
+      GLuint line_endcap_aa_region_width:2; 
+      GLuint early_depth_test:1; 
+      GLuint thread_dispatch_enable:1; 
+      GLuint program_uses_depth:1; 
+      GLuint program_computes_depth:1; 
+      GLuint program_uses_killpixel:1; 
+      GLuint legacy_line_rast: 1; 
+      GLuint transposed_urb_read_enable:1; 
+      GLuint max_threads:7; 
+   } wm5;
+   
+   GLfloat global_depth_offset_constant;  
+   GLfloat global_depth_offset_scale;   
+   
+   /* for IGDNG only: wm8, wm9 and wm10 share one layout and differ only in
+    * the _1 / _2 / _3 suffix of their member names. */
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_1:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_1:26;
+   } wm8;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_2:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_2:26;
+   } wm9;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_3:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_3:26;
+   } wm10;       
+};
+/* Four GLfloat color components.
+ * NOTE(review): presumably the record that brw_sampler_state's
+ * ss2.default_color_pointer refers to, in RGBA order -- confirm.
+ */
+struct brw_sampler_default_color {
+   GLfloat color[4];
+};
+/* Sampler state made of four bitfield groups, ss0 .. ss3.  The widths in
+ * each group add up to exactly 32.
+ */
+struct brw_sampler_state
+{
+   
+   struct brw_ss0
+   {
+      GLuint shadow_function:3; 
+      GLuint lod_bias:11; 
+      GLuint min_filter:3; 
+      GLuint mag_filter:3; 
+      GLuint mip_filter:2; 
+      GLuint base_level:5; 
+      GLuint pad:1;
+      GLuint lod_preclamp:1; 
+      GLuint default_color_mode:1; 
+      GLuint pad0:1;
+      GLuint disable:1; 
+   } ss0;
+
+   struct brw_ss1
+   {
+      GLuint r_wrap_mode:3; 
+      GLuint t_wrap_mode:3; 
+      GLuint s_wrap_mode:3; 
+      GLuint pad:3;
+      GLuint max_lod:10; 
+      GLuint min_lod:10; 
+   } ss1;
+
+   
+   struct brw_ss2
+   {
+      GLuint pad:5;
+      GLuint default_color_pointer:27;   /* NOTE(review): presumably addresses a brw_sampler_default_color -- confirm */
+   } ss2;
+   
+   struct brw_ss3
+   {
+      GLuint pad:19;
+      GLuint max_aniso:3; 
+      GLuint chroma_key_mode:1; 
+      GLuint chroma_key_index:2; 
+      GLuint chroma_key_enable:1; 
+      GLuint monochrome_filter_width:3; 
+      GLuint monochrome_filter_height:3; 
+   } ss3;
+};
+
+/* Four GLfloat bounds: x min/max followed by y min/max.
+ * NOTE(review): presumably the record addressed by a clip unit's
+ * clipper_viewport_state_ptr -- confirm.
+ */
+struct brw_clipper_viewport
+{
+   GLfloat xmin;  
+   GLfloat xmax;  
+   GLfloat ymin;  
+   GLfloat ymax;  
+};
+/* Pair of GLfloat depth bounds; brw_dump_cc_viewport() in
+ * brw_structs_dump.c prints both with "%f".
+ * NOTE(review): presumably the record addressed by
+ * brw_cc_unit_state.cc4.cc_viewport_state_offset -- confirm.
+ */
+struct brw_cc_viewport
+{
+   GLfloat min_depth;  
+   GLfloat max_depth;  
+};
+/* Six GLfloat viewport coefficients followed by four GLshort scissor
+ * bounds.
+ * NOTE(review): m00/m11/m22 presumably hold the viewport scale and
+ * m30/m31/m32 the translation of a viewport matrix -- confirm.
+ */
+struct brw_sf_viewport
+{
+   struct {
+      GLfloat m00;  
+      GLfloat m11;  
+      GLfloat m22;  
+      GLfloat m30;  
+      GLfloat m31;  
+      GLfloat m32;  
+   } viewport;
+
+   /* scissor coordinates are inclusive */
+   struct {
+      GLshort xmin;
+      GLshort ymin;
+      GLshort xmax;
+      GLshort ymax;
+   } scissor;
+};
+
+/* Documented in the subsystem/shared-functions/sampler chapter...
+ *
+ * Six members, ss0 .. ss5.  ss1 is one whole GLuint (base_addr); in each of
+ * the other five the bitfield widths add up to exactly 32.
+ */
+struct brw_surface_state
+{
+   struct brw_surf_ss0 {
+      GLuint cube_pos_z:1; 
+      GLuint cube_neg_z:1; 
+      GLuint cube_pos_y:1; 
+      GLuint cube_neg_y:1; 
+      GLuint cube_pos_x:1; 
+      GLuint cube_neg_x:1; 
+      GLuint pad:4;
+      GLuint mipmap_layout_mode:1; 
+      GLuint vert_line_stride_ofs:1; 
+      GLuint vert_line_stride:1; 
+      GLuint color_blend:1; 
+      GLuint writedisable_blue:1; 
+      GLuint writedisable_green:1; 
+      GLuint writedisable_red:1; 
+      GLuint writedisable_alpha:1; 
+      GLuint surface_format:9;     /**< BRW_SURFACEFORMAT_x */
+      GLuint data_return_format:1; 
+      GLuint pad0:1;
+      GLuint surface_type:3;       /**< BRW_SURFACE_1D/2D/3D/CUBE */
+   } ss0;
+   
+   struct brw_surf_ss1 {
+      GLuint base_addr;  
+   } ss1;
+   
+   struct brw_surf_ss2 {
+      GLuint pad:2;
+      GLuint mip_count:4; 
+      GLuint width:13; 
+      GLuint height:13; 
+   } ss2;
+
+   struct brw_surf_ss3 {
+      GLuint tile_walk:1; 
+      GLuint tiled_surface:1; 
+      GLuint pad:1; 
+      GLuint pitch:18; 
+      GLuint depth:11; 
+   } ss3;
+   
+   struct brw_surf_ss4 {
+      GLuint multisample_position_palette_index:3;
+      GLuint pad1:1;
+      GLuint num_multisamples:3;
+      GLuint pad0:1;
+      GLuint render_target_view_extent:9;
+      GLuint min_array_elt:11;
+      GLuint min_lod:4; 
+   } ss4;
+
+   struct brw_surf_ss5 {
+      GLuint pad1:16;
+      GLuint llc_mapping:1;
+      GLuint mlc_mapping:1;
+      GLuint gfdt:1;
+      GLuint gfdt_src:1;
+      GLuint y_offset:4;
+      GLuint pad0:1;
+      GLuint x_offset:7;
+   } ss5;   /* New in G4X */
+
+};
+
+
+/* One vertex-buffer record: a 32-bit bitfield group (vb0) followed by
+ * three plain GLuint members.  brw_vb_array_state holds an array of these.
+ */
+struct brw_vertex_buffer_state
+{
+   struct {
+      GLuint pitch:11; 
+      GLuint pad:15;
+      GLuint access_type:1; 
+      GLuint vb_index:5; 
+   } vb0;
+   
+   GLuint start_addr; 
+   GLuint max_index;   
+#if 1 /* constant condition: the member below is always compiled in */
+   GLuint instance_data_step_rate; /* not included for sequential/random vertices? */
+#endif
+};
+/* A header (struct declared outside this section) followed by a fixed
+ * array of BRW_VBP_MAX vertex-buffer records.
+ */
+#define BRW_VBP_MAX 17 /* number of elements in brw_vb_array_state.vb[] */
+
+struct brw_vb_array_state {
+   struct header header;
+   struct brw_vertex_buffer_state vb[BRW_VBP_MAX];
+};
+
+/* One vertex-element record made of two bitfield groups, ve0 and ve1.  The
+ * widths in each group add up to exactly 32.  brw_vertex_element_packet
+ * holds an array of these.
+ */
+struct brw_vertex_element_state
+{
+   struct
+   {
+      GLuint src_offset:11; 
+      GLuint pad:5;
+      GLuint src_format:9; 
+      GLuint pad0:1;
+      GLuint valid:1; 
+      GLuint vertex_buffer_index:5; 
+   } ve0;
+
+   struct
+   {
+      GLuint dst_offset:8; 
+      GLuint pad:8;
+      GLuint vfcomponent3:4;   /* declared in descending order: 3, 2, 1, 0 */
+      GLuint vfcomponent2:4; 
+      GLuint vfcomponent1:4; 
+      GLuint vfcomponent0:4; 
+   } ve1;
+};
+/* A header (struct declared outside this section) followed by a fixed
+ * array of BRW_VEP_MAX vertex-element records.
+ */
+#define BRW_VEP_MAX 18 /* number of elements in brw_vertex_element_packet.ve[] */
+
+struct brw_vertex_element_packet {
+   struct header header;
+   struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */
+};
+
+/* Bitfield group whose widths add up to exactly 32.  It is embedded as the
+ * "urb" alternative of brw_instruction's bits3 union; the urb_igdng
+ * alternative there begins with the same seven members (opcode .. complete).
+ */
+struct brw_urb_immediate {
+   GLuint opcode:4;
+   GLuint offset:6;
+   GLuint swizzle_control:2; 
+   GLuint pad:1;
+   GLuint allocate:1;
+   GLuint used:1;
+   GLuint complete:1;
+   GLuint response_length:4;
+   GLuint msg_length:4;
+   GLuint msg_target:4;
+   GLuint pad1:3;
+   GLuint end_of_thread:1;
+};
+
+/* Instruction format for the execution units:
+ *
+ * Four members: the header bitfield group, then the unions bits1, bits2
+ * and bits3.  Every bitfield alternative inside them has widths that add
+ * up to exactly 32; bits3 can also be accessed whole as GLint d, GLuint ud
+ * or float f.  Members declared GLint rather than GLuint (the
+ * *_indirect_offset fields and jump_count) are signed bitfields.
+ * NOTE(review): da/ia presumably mean direct/indirect addressing and the
+ * 1/16 suffix the align1/align16 access modes -- confirm against the
+ * instruction-set documentation.
+ */
+ 
+struct brw_instruction
+{
+   struct 
+   {
+      GLuint opcode:7;
+      GLuint pad:1;
+      GLuint access_mode:1;
+      GLuint mask_control:1;
+      GLuint dependency_control:2;
+      GLuint compression_control:2;
+      GLuint thread_control:2;
+      GLuint predicate_control:4;
+      GLuint predicate_inverse:1;
+      GLuint execution_size:3;
+      GLuint destreg__conditionalmod:4; /* destreg - send, conditionalmod - others */
+      GLuint pad0:2;
+      GLuint debug_control:1;
+      GLuint saturate:1;
+   } header;
+
+   union {   /* bits1: four alternative layouts, all starting with the dest/src0 file and type fields */
+      struct
+      {
+	 GLuint dest_reg_file:2;
+	 GLuint dest_reg_type:3;
+	 GLuint src0_reg_file:2;
+	 GLuint src0_reg_type:3;
+	 GLuint src1_reg_file:2;
+	 GLuint src1_reg_type:3;
+	 GLuint pad:1;
+	 GLuint dest_subreg_nr:5;
+	 GLuint dest_reg_nr:8;
+	 GLuint dest_horiz_stride:2;
+	 GLuint dest_address_mode:1;
+      } da1;
+
+      struct
+      {
+	 GLuint dest_reg_file:2;
+	 GLuint dest_reg_type:3;
+	 GLuint src0_reg_file:2;
+	 GLuint src0_reg_type:3;
+	 GLuint src1_reg_file:2;        /* 0x00000c00 */
+	 GLuint src1_reg_type:3;        /* 0x00007000 */
+	 GLuint pad:1;
+	 GLint dest_indirect_offset:10;	/* offset against the deref'd address reg */
+	 GLuint dest_subreg_nr:3; /* subnr for the address reg a0.x */
+	 GLuint dest_horiz_stride:2;
+	 GLuint dest_address_mode:1;
+      } ia1;
+
+      struct
+      {
+	 GLuint dest_reg_file:2;
+	 GLuint dest_reg_type:3;
+	 GLuint src0_reg_file:2;
+	 GLuint src0_reg_type:3;
+	 GLuint src1_reg_file:2;
+	 GLuint src1_reg_type:3;
+	 GLuint pad:1;
+	 GLuint dest_writemask:4;
+	 GLuint dest_subreg_nr:1;
+	 GLuint dest_reg_nr:8;
+	 GLuint pad1:2;
+	 GLuint dest_address_mode:1;
+      } da16;
+
+      struct   /* unlike the other three, this layout has no src1 file/type members */
+      {
+	 GLuint dest_reg_file:2;
+	 GLuint dest_reg_type:3;
+	 GLuint src0_reg_file:2;
+	 GLuint src0_reg_type:3;
+	 GLuint pad0:6;
+	 GLuint dest_writemask:4;
+	 GLint dest_indirect_offset:6;
+	 GLuint dest_subreg_nr:3;
+	 GLuint pad1:2;
+	 GLuint dest_address_mode:1;
+      } ia16;
+   } bits1;
+
+
+   union {   /* bits2: four src0 layouts plus the send_igdng layout */
+      struct
+      {
+	 GLuint src0_subreg_nr:5;
+	 GLuint src0_reg_nr:8;
+	 GLuint src0_abs:1;
+	 GLuint src0_negate:1;
+	 GLuint src0_address_mode:1;
+	 GLuint src0_horiz_stride:2;
+	 GLuint src0_width:3;
+	 GLuint src0_vert_stride:4;
+	 GLuint flag_reg_nr:1;
+	 GLuint pad:6;
+      } da1;
+
+      struct
+      {
+	 GLint src0_indirect_offset:10;
+	 GLuint src0_subreg_nr:3;
+	 GLuint src0_abs:1;
+	 GLuint src0_negate:1;
+	 GLuint src0_address_mode:1;
+	 GLuint src0_horiz_stride:2;
+	 GLuint src0_width:3;
+	 GLuint src0_vert_stride:4;
+	 GLuint flag_reg_nr:1;
+	 GLuint pad:6;	
+      } ia1;
+
+      struct
+      {
+	 GLuint src0_swz_x:2;
+	 GLuint src0_swz_y:2;
+	 GLuint src0_subreg_nr:1;
+	 GLuint src0_reg_nr:8;
+	 GLuint src0_abs:1;
+	 GLuint src0_negate:1;
+	 GLuint src0_address_mode:1;
+	 GLuint src0_swz_z:2;
+	 GLuint src0_swz_w:2;
+	 GLuint pad0:1;
+	 GLuint src0_vert_stride:4;
+	 GLuint flag_reg_nr:1;
+	 GLuint pad1:6;
+      } da16;
+
+      struct
+      {
+	 GLuint src0_swz_x:2;
+	 GLuint src0_swz_y:2;
+	 GLint src0_indirect_offset:6;
+	 GLuint src0_subreg_nr:3;
+	 GLuint src0_abs:1;
+	 GLuint src0_negate:1;
+	 GLuint src0_address_mode:1;
+	 GLuint src0_swz_z:2;
+	 GLuint src0_swz_w:2;
+	 GLuint pad0:1;
+	 GLuint src0_vert_stride:4;
+	 GLuint flag_reg_nr:1;
+	 GLuint pad1:6;
+      } ia16;
+
+       struct 
+       {
+           GLuint pad:26;
+           GLuint end_of_thread:1;
+           GLuint pad1:1;
+           GLuint sfid:4;
+       } send_igdng;  /* for IGDNG only */
+
+   } bits2;
+
+   union   /* bits3: src1 layouts, if_else, the message layouts, or a raw d/ud/f value */
+   {
+      struct
+      {
+	 GLuint src1_subreg_nr:5;
+	 GLuint src1_reg_nr:8;
+	 GLuint src1_abs:1;
+	 GLuint src1_negate:1;
+	 GLuint src1_address_mode:1;
+	 GLuint src1_horiz_stride:2;
+	 GLuint src1_width:3;
+	 GLuint src1_vert_stride:4;
+	 GLuint pad0:7;
+      } da1;
+
+      struct
+      {
+	 GLuint src1_swz_x:2;
+	 GLuint src1_swz_y:2;
+	 GLuint src1_subreg_nr:1;
+	 GLuint src1_reg_nr:8;
+	 GLuint src1_abs:1;
+	 GLuint src1_negate:1;
+	 GLuint src1_address_mode:1;
+	 GLuint src1_swz_z:2;
+	 GLuint src1_swz_w:2;
+	 GLuint pad1:1;
+	 GLuint src1_vert_stride:4;
+	 GLuint pad2:7;
+      } da16;
+
+      struct
+      {
+	 GLint  src1_indirect_offset:10;
+	 GLuint src1_subreg_nr:3;
+	 GLuint src1_abs:1;
+	 GLuint src1_negate:1;
+	 GLuint src1_address_mode:1;
+	 GLuint src1_horiz_stride:2;
+	 GLuint src1_width:3;
+	 GLuint src1_vert_stride:4;
+	 GLuint flag_reg_nr:1;
+	 GLuint pad1:6;	
+      } ia1;
+
+      struct
+      {
+	 GLuint src1_swz_x:2;
+	 GLuint src1_swz_y:2;
+	 GLint  src1_indirect_offset:6;
+	 GLuint src1_subreg_nr:3;
+	 GLuint src1_abs:1;
+	 GLuint src1_negate:1;
+	 GLuint pad0:1;
+	 GLuint src1_swz_z:2;
+	 GLuint src1_swz_w:2;
+	 GLuint pad1:1;
+	 GLuint src1_vert_stride:4;
+	 GLuint flag_reg_nr:1;
+	 GLuint pad2:6;
+      } ia16;
+
+
+      struct
+      {
+	 GLint  jump_count:16;	/* note: signed */
+	 GLuint  pop_count:4;
+	 GLuint  pad0:12;
+      } if_else;
+
+      struct {
+	 GLuint function:4;
+	 GLuint int_type:1;
+	 GLuint precision:1;
+	 GLuint saturate:1;
+	 GLuint data_type:1;
+	 GLuint pad0:8;
+	 GLuint response_length:4;
+	 GLuint msg_length:4;
+	 GLuint msg_target:4;
+	 GLuint pad1:3;
+	 GLuint end_of_thread:1;
+      } math;
+
+      struct {   /* the *_igdng layouts have a 5-bit response_length and no msg_target */
+	 GLuint function:4;
+	 GLuint int_type:1;
+	 GLuint precision:1;
+	 GLuint saturate:1;
+	 GLuint data_type:1;
+	 GLuint snapshot:1;
+	 GLuint pad0:10;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } math_igdng;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint sampler:4;
+	 GLuint return_format:2; 
+	 GLuint msg_type:2;   
+	 GLuint response_length:4;
+	 GLuint msg_length:4;
+	 GLuint msg_target:4;
+	 GLuint pad1:3;
+	 GLuint end_of_thread:1;
+      } sampler;
+
+      struct {
+         GLuint binding_table_index:8;
+         GLuint sampler:4;
+         GLuint msg_type:4;
+         GLuint response_length:4;
+         GLuint msg_length:4;
+         GLuint msg_target:4;
+         GLuint pad1:3;
+         GLuint end_of_thread:1;
+      } sampler_g4x;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint sampler:4;
+	 GLuint msg_type:4;
+	 GLuint simd_mode:2;
+	 GLuint pad0:1;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } sampler_igdng;
+
+      struct brw_urb_immediate urb;
+
+      struct {
+	 GLuint opcode:4;
+	 GLuint offset:6;
+	 GLuint swizzle_control:2; 
+	 GLuint pad:1;
+	 GLuint allocate:1;
+	 GLuint used:1;
+	 GLuint complete:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } urb_igdng;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:4;  
+	 GLuint msg_type:2;  
+	 GLuint target_cache:2;    
+	 GLuint response_length:4;
+	 GLuint msg_length:4;
+	 GLuint msg_target:4;
+	 GLuint pad1:3;
+	 GLuint end_of_thread:1;
+      } dp_read;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;  
+	 GLuint msg_type:3;  
+	 GLuint target_cache:2;    
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_read_igdng;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:3;    
+	 GLuint send_commit_msg:1;
+	 GLuint response_length:4;
+	 GLuint msg_length:4;
+	 GLuint msg_target:4;
+	 GLuint pad1:3;
+	 GLuint end_of_thread:1;
+      } dp_write;
+
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:3;    
+	 GLuint send_commit_msg:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_write_igdng;
+
+      struct {   /* only the fields common to the non-igdng message layouts; the low 16 bits are padding */
+	 GLuint pad:16;
+	 GLuint response_length:4;
+	 GLuint msg_length:4;
+	 GLuint msg_target:4;
+	 GLuint pad1:3;
+	 GLuint end_of_thread:1;
+      } generic;
+
+      struct {   /* only the fields common to the *_igdng message layouts; the low 19 bits are padding */
+	 GLuint pad:19;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } generic_igdng;
+
+      GLint d;
+      GLuint ud;
+      float f;
+   } bits3;
+};
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_structs_dump.c b/src/gallium/drivers/i965/brw_structs_dump.c
new file mode 100644
index 0000000000..cd40fc6d61
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs_dump.c
@@ -0,0 +1,1247 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Dump i965 data structures.
+ *
+ * Generated automatically from brw_structs.h by brw_structs_dump.py.
+ */
+
+#include "util/u_debug.h"
+
+#include "brw_types.h"
+#include "brw_structs.h"
+#include "brw_structs_dump.h"
+
+/**
+ * Write the listed members of a brw_3d_control packet to the debug output,
+ * one "\t\t.<member> = 0x<hex>" line per member.
+ */
+void
+brw_dump_3d_control(const struct brw_3d_control *ptr)
+{
+   /* header members */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.notify_enable = 0x%x\n", ptr->header.notify_enable);
+   debug_printf("\t\t.header.wc_flush_enable = 0x%x\n", ptr->header.wc_flush_enable);
+   debug_printf("\t\t.header.depth_stall_enable = 0x%x\n", ptr->header.depth_stall_enable);
+   debug_printf("\t\t.header.operation = 0x%x\n", ptr->header.operation);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+
+   /* destination and the two trailing dwords */
+   debug_printf("\t\t.dest.dest_addr_type = 0x%x\n", ptr->dest.dest_addr_type);
+   debug_printf("\t\t.dest.dest_addr = 0x%x\n", ptr->dest.dest_addr);
+   debug_printf("\t\t.dword2 = 0x%x\n", ptr->dword2);
+   debug_printf("\t\t.dword3 = 0x%x\n", ptr->dword3);
+}
+
+/**
+ * Write the listed members of a brw_3d_primitive packet to the debug
+ * output, one "\t\t.<member> = 0x<hex>" line per member.
+ */
+void
+brw_dump_3d_primitive(const struct brw_3d_primitive *ptr)
+{
+   /* header members */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.topology = 0x%x\n", ptr->header.topology);
+   debug_printf("\t\t.header.indexed = 0x%x\n", ptr->header.indexed);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+
+   /* payload members */
+   debug_printf("\t\t.verts_per_instance = 0x%x\n", ptr->verts_per_instance);
+   debug_printf("\t\t.start_vert_location = 0x%x\n", ptr->start_vert_location);
+   debug_printf("\t\t.instance_count = 0x%x\n", ptr->instance_count);
+   debug_printf("\t\t.start_instance_location = 0x%x\n", ptr->start_instance_location);
+   debug_printf("\t\t.base_vert_location = 0x%x\n", ptr->base_vert_location);
+}
+
+/**
+ * Write the listed members of a brw_aa_line_parameters packet to the debug
+ * output, one "\t\t.<member> = 0x<hex>" line per member.
+ */
+void
+brw_dump_aa_line_parameters(const struct brw_aa_line_parameters *ptr)
+{
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+
+   debug_printf("\t\t.bits0.aa_coverage_scope = 0x%x\n", ptr->bits0.aa_coverage_scope);
+   debug_printf("\t\t.bits0.aa_coverage_bias = 0x%x\n", ptr->bits0.aa_coverage_bias);
+
+   debug_printf("\t\t.bits1.aa_coverage_endcap_slope = 0x%x\n", ptr->bits1.aa_coverage_endcap_slope);
+   debug_printf("\t\t.bits1.aa_coverage_endcap_bias = 0x%x\n", ptr->bits1.aa_coverage_endcap_bias);
+}
+
+/**
+ * Write the listed members of a brw_binding_table_pointers packet to the
+ * debug output: the two header members, then vs, gs, clp, sf and wm.
+ */
+void
+brw_dump_binding_table_pointers(const struct brw_binding_table_pointers *ptr)
+{
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+
+   debug_printf("\t\t.vs = 0x%x\n", ptr->vs);
+   debug_printf("\t\t.gs = 0x%x\n", ptr->gs);
+   debug_printf("\t\t.clp = 0x%x\n", ptr->clp);
+   debug_printf("\t\t.sf = 0x%x\n", ptr->sf);
+   debug_printf("\t\t.wm = 0x%x\n", ptr->wm);
+}
+
+/**
+ * Write a brw_blend_constant_color packet to the debug output: the two
+ * header members in hex, then the four color components with "%f".
+ */
+void
+brw_dump_blend_constant_color(const struct brw_blend_constant_color *ptr)
+{
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+
+   debug_printf("\t\t.blend_constant_color[0] = %f\n", ptr->blend_constant_color[0]);
+   debug_printf("\t\t.blend_constant_color[1] = %f\n", ptr->blend_constant_color[1]);
+   debug_printf("\t\t.blend_constant_color[2] = %f\n", ptr->blend_constant_color[2]);
+   debug_printf("\t\t.blend_constant_color[3] = %f\n", ptr->blend_constant_color[3]);
+}
+
+/**
+ * Write every non-pad member of a brw_cc0 group to the debug output in hex.
+ */
+void
+brw_dump_cc0(const struct brw_cc0 *ptr)
+{
+   /* bf_* members */
+   debug_printf("\t\t.bf_stencil_pass_depth_pass_op = 0x%x\n", ptr->bf_stencil_pass_depth_pass_op);
+   debug_printf("\t\t.bf_stencil_pass_depth_fail_op = 0x%x\n", ptr->bf_stencil_pass_depth_fail_op);
+   debug_printf("\t\t.bf_stencil_fail_op = 0x%x\n", ptr->bf_stencil_fail_op);
+   debug_printf("\t\t.bf_stencil_func = 0x%x\n", ptr->bf_stencil_func);
+   debug_printf("\t\t.bf_stencil_enable = 0x%x\n", ptr->bf_stencil_enable);
+
+   /* remaining stencil members */
+   debug_printf("\t\t.stencil_write_enable = 0x%x\n", ptr->stencil_write_enable);
+   debug_printf("\t\t.stencil_pass_depth_pass_op = 0x%x\n", ptr->stencil_pass_depth_pass_op);
+   debug_printf("\t\t.stencil_pass_depth_fail_op = 0x%x\n", ptr->stencil_pass_depth_fail_op);
+   debug_printf("\t\t.stencil_fail_op = 0x%x\n", ptr->stencil_fail_op);
+   debug_printf("\t\t.stencil_func = 0x%x\n", ptr->stencil_func);
+   debug_printf("\t\t.stencil_enable = 0x%x\n", ptr->stencil_enable);
+}
+
+/**
+ * Write the four members of a brw_cc1 group to the debug output in hex.
+ */
+void
+brw_dump_cc1(const struct brw_cc1 *ptr)
+{
+   debug_printf("\t\t.bf_stencil_ref = 0x%x\n", ptr->bf_stencil_ref);
+   debug_printf("\t\t.stencil_write_mask = 0x%x\n", ptr->stencil_write_mask);
+   debug_printf("\t\t.stencil_test_mask = 0x%x\n", ptr->stencil_test_mask);
+   debug_printf("\t\t.stencil_ref = 0x%x\n", ptr->stencil_ref);
+}
+
+/**
+ * Write every non-pad member of a brw_cc2 group to the debug output in hex.
+ */
+void
+brw_dump_cc2(const struct brw_cc2 *ptr)
+{
+   debug_printf("\t\t.logicop_enable = 0x%x\n", ptr->logicop_enable);
+
+   /* depth members */
+   debug_printf("\t\t.depth_write_enable = 0x%x\n", ptr->depth_write_enable);
+   debug_printf("\t\t.depth_test_function = 0x%x\n", ptr->depth_test_function);
+   debug_printf("\t\t.depth_test = 0x%x\n", ptr->depth_test);
+
+   /* bf_* stencil masks */
+   debug_printf("\t\t.bf_stencil_write_mask = 0x%x\n", ptr->bf_stencil_write_mask);
+   debug_printf("\t\t.bf_stencil_test_mask = 0x%x\n", ptr->bf_stencil_test_mask);
+}
+
+/**
+ * Write every non-pad member of a brw_cc3 group to the debug output in hex.
+ */
+void
+brw_dump_cc3(const struct brw_cc3 *ptr)
+{
+   debug_printf("\t\t.alpha_test_func = 0x%x\n", ptr->alpha_test_func);
+   debug_printf("\t\t.alpha_test = 0x%x\n", ptr->alpha_test);
+   debug_printf("\t\t.blend_enable = 0x%x\n", ptr->blend_enable);
+   debug_printf("\t\t.ia_blend_enable = 0x%x\n", ptr->ia_blend_enable);
+   debug_printf("\t\t.alpha_test_format = 0x%x\n", ptr->alpha_test_format);
+}
+
+/**
+ * Write the single non-pad member of a brw_cc4 group to the debug output
+ * in hex.
+ */
+void
+brw_dump_cc4(const struct brw_cc4 *ptr)
+{
+   debug_printf("\t\t.cc_viewport_state_offset = 0x%x\n", ptr->cc_viewport_state_offset);
+}
+
+/**
+ * Write every non-pad member of a brw_cc5 group to the debug output in hex.
+ */
+void
+brw_dump_cc5(const struct brw_cc5 *ptr)
+{
+   /* ia_* blend members */
+   debug_printf("\t\t.ia_dest_blend_factor = 0x%x\n", ptr->ia_dest_blend_factor);
+   debug_printf("\t\t.ia_src_blend_factor = 0x%x\n", ptr->ia_src_blend_factor);
+   debug_printf("\t\t.ia_blend_function = 0x%x\n", ptr->ia_blend_function);
+
+   debug_printf("\t\t.statistics_enable = 0x%x\n", ptr->statistics_enable);
+   debug_printf("\t\t.logicop_func = 0x%x\n", ptr->logicop_func);
+   debug_printf("\t\t.dither_enable = 0x%x\n", ptr->dither_enable);
+}
+
+/**
+ * Write every non-pad member of a brw_cc6 group to the debug output in hex.
+ */
+void
+brw_dump_cc6(const struct brw_cc6 *ptr)
+{
+   /* clamp members */
+   debug_printf("\t\t.clamp_post_alpha_blend = 0x%x\n", ptr->clamp_post_alpha_blend);
+   debug_printf("\t\t.clamp_pre_alpha_blend = 0x%x\n", ptr->clamp_pre_alpha_blend);
+   debug_printf("\t\t.clamp_range = 0x%x\n", ptr->clamp_range);
+
+   /* dither offsets */
+   debug_printf("\t\t.y_dither_offset = 0x%x\n", ptr->y_dither_offset);
+   debug_printf("\t\t.x_dither_offset = 0x%x\n", ptr->x_dither_offset);
+
+   /* blend members */
+   debug_printf("\t\t.dest_blend_factor = 0x%x\n", ptr->dest_blend_factor);
+   debug_printf("\t\t.src_blend_factor = 0x%x\n", ptr->src_blend_factor);
+   debug_printf("\t\t.blend_function = 0x%x\n", ptr->blend_function);
+}
+
+/**
+ * Write the alpha_ref union of a brw_cc7 group to the debug output twice:
+ * once as a float ("%f") and once as its four bytes in hex.
+ */
+void
+brw_dump_cc7(const struct brw_cc7 *ptr)
+{
+   /* float view of the union */
+   debug_printf("\t\t.alpha_ref.f = %f\n", ptr->alpha_ref.f);
+
+   /* byte view of the same storage */
+   debug_printf("\t\t.alpha_ref.ub[0] = 0x%x\n", ptr->alpha_ref.ub[0]);
+   debug_printf("\t\t.alpha_ref.ub[1] = 0x%x\n", ptr->alpha_ref.ub[1]);
+   debug_printf("\t\t.alpha_ref.ub[2] = 0x%x\n", ptr->alpha_ref.ub[2]);
+   debug_printf("\t\t.alpha_ref.ub[3] = 0x%x\n", ptr->alpha_ref.ub[3]);
+}
+
+/**
+ * Write every non-pad member of a brw_cc_unit_state block to the debug
+ * output, group by group from cc0 to cc7.  Each line is prefixed with the
+ * group name (".cc0.", ".cc1.", ...); values are printed in hex except
+ * cc7.alpha_ref.f, which uses "%f".
+ */
+void
+brw_dump_cc_unit_state(const struct brw_cc_unit_state *ptr)
+{
+   /* Shorthand pointers to the eight groups of the block. */
+   const struct brw_cc0 *cc0 = &ptr->cc0;
+   const struct brw_cc1 *cc1 = &ptr->cc1;
+   const struct brw_cc2 *cc2 = &ptr->cc2;
+   const struct brw_cc3 *cc3 = &ptr->cc3;
+   const struct brw_cc4 *cc4 = &ptr->cc4;
+   const struct brw_cc5 *cc5 = &ptr->cc5;
+   const struct brw_cc6 *cc6 = &ptr->cc6;
+   const struct brw_cc7 *cc7 = &ptr->cc7;
+
+   /* cc0 */
+   debug_printf("\t\t.cc0.bf_stencil_pass_depth_pass_op = 0x%x\n", cc0->bf_stencil_pass_depth_pass_op);
+   debug_printf("\t\t.cc0.bf_stencil_pass_depth_fail_op = 0x%x\n", cc0->bf_stencil_pass_depth_fail_op);
+   debug_printf("\t\t.cc0.bf_stencil_fail_op = 0x%x\n", cc0->bf_stencil_fail_op);
+   debug_printf("\t\t.cc0.bf_stencil_func = 0x%x\n", cc0->bf_stencil_func);
+   debug_printf("\t\t.cc0.bf_stencil_enable = 0x%x\n", cc0->bf_stencil_enable);
+   debug_printf("\t\t.cc0.stencil_write_enable = 0x%x\n", cc0->stencil_write_enable);
+   debug_printf("\t\t.cc0.stencil_pass_depth_pass_op = 0x%x\n", cc0->stencil_pass_depth_pass_op);
+   debug_printf("\t\t.cc0.stencil_pass_depth_fail_op = 0x%x\n", cc0->stencil_pass_depth_fail_op);
+   debug_printf("\t\t.cc0.stencil_fail_op = 0x%x\n", cc0->stencil_fail_op);
+   debug_printf("\t\t.cc0.stencil_func = 0x%x\n", cc0->stencil_func);
+   debug_printf("\t\t.cc0.stencil_enable = 0x%x\n", cc0->stencil_enable);
+
+   /* cc1 */
+   debug_printf("\t\t.cc1.bf_stencil_ref = 0x%x\n", cc1->bf_stencil_ref);
+   debug_printf("\t\t.cc1.stencil_write_mask = 0x%x\n", cc1->stencil_write_mask);
+   debug_printf("\t\t.cc1.stencil_test_mask = 0x%x\n", cc1->stencil_test_mask);
+   debug_printf("\t\t.cc1.stencil_ref = 0x%x\n", cc1->stencil_ref);
+
+   /* cc2 */
+   debug_printf("\t\t.cc2.logicop_enable = 0x%x\n", cc2->logicop_enable);
+   debug_printf("\t\t.cc2.depth_write_enable = 0x%x\n", cc2->depth_write_enable);
+   debug_printf("\t\t.cc2.depth_test_function = 0x%x\n", cc2->depth_test_function);
+   debug_printf("\t\t.cc2.depth_test = 0x%x\n", cc2->depth_test);
+   debug_printf("\t\t.cc2.bf_stencil_write_mask = 0x%x\n", cc2->bf_stencil_write_mask);
+   debug_printf("\t\t.cc2.bf_stencil_test_mask = 0x%x\n", cc2->bf_stencil_test_mask);
+
+   /* cc3 */
+   debug_printf("\t\t.cc3.alpha_test_func = 0x%x\n", cc3->alpha_test_func);
+   debug_printf("\t\t.cc3.alpha_test = 0x%x\n", cc3->alpha_test);
+   debug_printf("\t\t.cc3.blend_enable = 0x%x\n", cc3->blend_enable);
+   debug_printf("\t\t.cc3.ia_blend_enable = 0x%x\n", cc3->ia_blend_enable);
+   debug_printf("\t\t.cc3.alpha_test_format = 0x%x\n", cc3->alpha_test_format);
+
+   /* cc4 */
+   debug_printf("\t\t.cc4.cc_viewport_state_offset = 0x%x\n", cc4->cc_viewport_state_offset);
+
+   /* cc5 */
+   debug_printf("\t\t.cc5.ia_dest_blend_factor = 0x%x\n", cc5->ia_dest_blend_factor);
+   debug_printf("\t\t.cc5.ia_src_blend_factor = 0x%x\n", cc5->ia_src_blend_factor);
+   debug_printf("\t\t.cc5.ia_blend_function = 0x%x\n", cc5->ia_blend_function);
+   debug_printf("\t\t.cc5.statistics_enable = 0x%x\n", cc5->statistics_enable);
+   debug_printf("\t\t.cc5.logicop_func = 0x%x\n", cc5->logicop_func);
+   debug_printf("\t\t.cc5.dither_enable = 0x%x\n", cc5->dither_enable);
+
+   /* cc6 */
+   debug_printf("\t\t.cc6.clamp_post_alpha_blend = 0x%x\n", cc6->clamp_post_alpha_blend);
+   debug_printf("\t\t.cc6.clamp_pre_alpha_blend = 0x%x\n", cc6->clamp_pre_alpha_blend);
+   debug_printf("\t\t.cc6.clamp_range = 0x%x\n", cc6->clamp_range);
+   debug_printf("\t\t.cc6.y_dither_offset = 0x%x\n", cc6->y_dither_offset);
+   debug_printf("\t\t.cc6.x_dither_offset = 0x%x\n", cc6->x_dither_offset);
+   debug_printf("\t\t.cc6.dest_blend_factor = 0x%x\n", cc6->dest_blend_factor);
+   debug_printf("\t\t.cc6.src_blend_factor = 0x%x\n", cc6->src_blend_factor);
+   debug_printf("\t\t.cc6.blend_function = 0x%x\n", cc6->blend_function);
+
+   /* cc7: the alpha_ref union, as a float and then byte by byte */
+   debug_printf("\t\t.cc7.alpha_ref.f = %f\n", cc7->alpha_ref.f);
+   debug_printf("\t\t.cc7.alpha_ref.ub[0] = 0x%x\n", cc7->alpha_ref.ub[0]);
+   debug_printf("\t\t.cc7.alpha_ref.ub[1] = 0x%x\n", cc7->alpha_ref.ub[1]);
+   debug_printf("\t\t.cc7.alpha_ref.ub[2] = 0x%x\n", cc7->alpha_ref.ub[2]);
+   debug_printf("\t\t.cc7.alpha_ref.ub[3] = 0x%x\n", cc7->alpha_ref.ub[3]);
+}
+
+void
+brw_dump_cc_viewport(const struct brw_cc_viewport *ptr)
+{  /* Emit min_depth and max_depth of *ptr with %f, one debug_printf line each. */
+   debug_printf("\t\t.min_depth = %f\n", ptr->min_depth);
+   debug_printf("\t\t.max_depth = %f\n", ptr->max_depth);
+}
+
+void
+brw_dump_clip_unit_state(const struct brw_clip_unit_state *ptr)
+{  /* Emit thread0..thread4, clip5 and clip6 fields of *ptr in hex, then the four viewport_* bounds with %f. */
+   debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", ptr->thread0.grf_reg_count);
+   debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", ptr->thread0.kernel_start_pointer);
+   debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", ptr->thread1.sw_exception_enable);
+   debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", ptr->thread1.mask_stack_exception_enable);
+   debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", ptr->thread1.illegal_op_exception_enable);
+   debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", ptr->thread1.floating_point_mode);
+   debug_printf("\t\t.thread1.thread_priority = 0x%x\n", ptr->thread1.thread_priority);
+   debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", ptr->thread1.binding_table_entry_count);
+   debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", ptr->thread1.single_program_flow);
+   debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", ptr->thread2.per_thread_scratch_space);
+   debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", ptr->thread2.scratch_space_base_pointer);
+   debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", ptr->thread3.dispatch_grf_start_reg);
+   debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", ptr->thread3.urb_entry_read_offset);
+   debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", ptr->thread3.urb_entry_read_length);
+   debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", ptr->thread3.const_urb_entry_read_offset);
+   debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", ptr->thread3.const_urb_entry_read_length);
+   debug_printf("\t\t.thread4.gs_output_stats = 0x%x\n", ptr->thread4.gs_output_stats);
+   debug_printf("\t\t.thread4.stats_enable = 0x%x\n", ptr->thread4.stats_enable);
+   debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", ptr->thread4.nr_urb_entries);
+   debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", ptr->thread4.urb_entry_allocation_size);
+   debug_printf("\t\t.thread4.max_threads = 0x%x\n", ptr->thread4.max_threads);
+   debug_printf("\t\t.clip5.clip_mode = 0x%x\n", ptr->clip5.clip_mode);
+   debug_printf("\t\t.clip5.userclip_enable_flags = 0x%x\n", ptr->clip5.userclip_enable_flags);
+   debug_printf("\t\t.clip5.userclip_must_clip = 0x%x\n", ptr->clip5.userclip_must_clip);
+   debug_printf("\t\t.clip5.negative_w_clip_test = 0x%x\n", ptr->clip5.negative_w_clip_test);
+   debug_printf("\t\t.clip5.guard_band_enable = 0x%x\n", ptr->clip5.guard_band_enable);
+   debug_printf("\t\t.clip5.viewport_z_clip_enable = 0x%x\n", ptr->clip5.viewport_z_clip_enable);
+   debug_printf("\t\t.clip5.viewport_xy_clip_enable = 0x%x\n", ptr->clip5.viewport_xy_clip_enable);
+   debug_printf("\t\t.clip5.vertex_position_space = 0x%x\n", ptr->clip5.vertex_position_space);
+   debug_printf("\t\t.clip5.api_mode = 0x%x\n", ptr->clip5.api_mode);
+   debug_printf("\t\t.clip6.clipper_viewport_state_ptr = 0x%x\n", ptr->clip6.clipper_viewport_state_ptr);
+   debug_printf("\t\t.viewport_xmin = %f\n", ptr->viewport_xmin);
+   debug_printf("\t\t.viewport_xmax = %f\n", ptr->viewport_xmax);
+   debug_printf("\t\t.viewport_ymin = %f\n", ptr->viewport_ymin);
+   debug_printf("\t\t.viewport_ymax = %f\n", ptr->viewport_ymax);
+}
+
+void
+brw_dump_clipper_viewport(const struct brw_clipper_viewport *ptr)
+{  /* Emit xmin, xmax, ymin and ymax of *ptr with %f, one debug_printf line each. */
+   debug_printf("\t\t.xmin = %f\n", ptr->xmin);
+   debug_printf("\t\t.xmax = %f\n", ptr->xmax);
+   debug_printf("\t\t.ymin = %f\n", ptr->ymin);
+   debug_printf("\t\t.ymax = %f\n", ptr->ymax);
+}
+
+void
+brw_dump_constant_buffer(const struct brw_constant_buffer *ptr)
+{  /* Emit the header and bits0 fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.valid = 0x%x\n", ptr->header.valid);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.buffer_length = 0x%x\n", ptr->bits0.buffer_length);
+   debug_printf("\t\t.bits0.buffer_address = 0x%x\n", ptr->bits0.buffer_address);
+}
+
+void
+brw_dump_cs_urb_state(const struct brw_cs_urb_state *ptr)
+{  /* Emit the header and bits0 fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.nr_urb_entries = 0x%x\n", ptr->bits0.nr_urb_entries);
+   debug_printf("\t\t.bits0.urb_entry_size = 0x%x\n", ptr->bits0.urb_entry_size);
+}
+
+void
+brw_dump_depthbuffer(const struct brw_depthbuffer *ptr)
+{  /* Emit header, dword1, dword2_base_addr, dword3 and dword4 fields of *ptr in hex. */
+   debug_printf("\t\t.header.bits.length = 0x%x\n", ptr->header.bits.length);
+   debug_printf("\t\t.header.bits.opcode = 0x%x\n", ptr->header.bits.opcode);
+   debug_printf("\t\t.dword1.bits.pitch = 0x%x\n", ptr->dword1.bits.pitch);
+   debug_printf("\t\t.dword1.bits.format = 0x%x\n", ptr->dword1.bits.format);
+   debug_printf("\t\t.dword1.bits.software_tiled_rendering_mode = 0x%x\n", ptr->dword1.bits.software_tiled_rendering_mode);
+   debug_printf("\t\t.dword1.bits.depth_offset_disable = 0x%x\n", ptr->dword1.bits.depth_offset_disable);
+   debug_printf("\t\t.dword1.bits.tile_walk = 0x%x\n", ptr->dword1.bits.tile_walk);
+   debug_printf("\t\t.dword1.bits.tiled_surface = 0x%x\n", ptr->dword1.bits.tiled_surface);
+   debug_printf("\t\t.dword1.bits.surface_type = 0x%x\n", ptr->dword1.bits.surface_type);
+   debug_printf("\t\t.dword2_base_addr = 0x%x\n", ptr->dword2_base_addr);
+   debug_printf("\t\t.dword3.bits.mipmap_layout = 0x%x\n", ptr->dword3.bits.mipmap_layout);
+   debug_printf("\t\t.dword3.bits.lod = 0x%x\n", ptr->dword3.bits.lod);
+   debug_printf("\t\t.dword3.bits.width = 0x%x\n", ptr->dword3.bits.width);
+   debug_printf("\t\t.dword3.bits.height = 0x%x\n", ptr->dword3.bits.height);
+   debug_printf("\t\t.dword4.bits.min_array_element = 0x%x\n", ptr->dword4.bits.min_array_element);
+   debug_printf("\t\t.dword4.bits.depth = 0x%x\n", ptr->dword4.bits.depth);
+}
+
+void
+brw_dump_depthbuffer_g4x(const struct brw_depthbuffer_g4x *ptr)
+{  /* Emit header, dword1, dword2_base_addr, dword3, dword4 and dword5 fields of *ptr in hex. */
+   debug_printf("\t\t.header.bits.length = 0x%x\n", ptr->header.bits.length);
+   debug_printf("\t\t.header.bits.opcode = 0x%x\n", ptr->header.bits.opcode);
+   debug_printf("\t\t.dword1.bits.pitch = 0x%x\n", ptr->dword1.bits.pitch);
+   debug_printf("\t\t.dword1.bits.format = 0x%x\n", ptr->dword1.bits.format);
+   debug_printf("\t\t.dword1.bits.software_tiled_rendering_mode = 0x%x\n", ptr->dword1.bits.software_tiled_rendering_mode);
+   debug_printf("\t\t.dword1.bits.depth_offset_disable = 0x%x\n", ptr->dword1.bits.depth_offset_disable);
+   debug_printf("\t\t.dword1.bits.tile_walk = 0x%x\n", ptr->dword1.bits.tile_walk);
+   debug_printf("\t\t.dword1.bits.tiled_surface = 0x%x\n", ptr->dword1.bits.tiled_surface);
+   debug_printf("\t\t.dword1.bits.surface_type = 0x%x\n", ptr->dword1.bits.surface_type);
+   debug_printf("\t\t.dword2_base_addr = 0x%x\n", ptr->dword2_base_addr);
+   debug_printf("\t\t.dword3.bits.mipmap_layout = 0x%x\n", ptr->dword3.bits.mipmap_layout);
+   debug_printf("\t\t.dword3.bits.lod = 0x%x\n", ptr->dword3.bits.lod);
+   debug_printf("\t\t.dword3.bits.width = 0x%x\n", ptr->dword3.bits.width);
+   debug_printf("\t\t.dword3.bits.height = 0x%x\n", ptr->dword3.bits.height);
+   debug_printf("\t\t.dword4.bits.min_array_element = 0x%x\n", ptr->dword4.bits.min_array_element);
+   debug_printf("\t\t.dword4.bits.depth = 0x%x\n", ptr->dword4.bits.depth);
+   debug_printf("\t\t.dword5.bits.xoffset = 0x%x\n", ptr->dword5.bits.xoffset);
+   debug_printf("\t\t.dword5.bits.yoffset = 0x%x\n", ptr->dword5.bits.yoffset);
+}
+
+void
+brw_dump_drawrect(const struct brw_drawrect *ptr)
+{  /* Emit the header fields, the min/max corners and the x/y origin of *ptr in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.xmin = 0x%x\n", ptr->xmin);
+   debug_printf("\t\t.ymin = 0x%x\n", ptr->ymin);
+   debug_printf("\t\t.xmax = 0x%x\n", ptr->xmax);
+   debug_printf("\t\t.ymax = 0x%x\n", ptr->ymax);
+   debug_printf("\t\t.xorg = 0x%x\n", ptr->xorg);
+   debug_printf("\t\t.yorg = 0x%x\n", ptr->yorg);
+}
+
+void
+brw_dump_global_depth_offset_clamp(const struct brw_global_depth_offset_clamp *ptr)
+{  /* Emit the header fields of *ptr in hex, then depth_offset_clamp with %f. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.depth_offset_clamp = %f\n", ptr->depth_offset_clamp);
+}
+
+void
+brw_dump_gs_unit_state(const struct brw_gs_unit_state *ptr)
+{  /* Emit thread0..thread4, gs5 and gs6 fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", ptr->thread0.grf_reg_count);
+   debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", ptr->thread0.kernel_start_pointer);
+   debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", ptr->thread1.ext_halt_exception_enable);
+   debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", ptr->thread1.sw_exception_enable);
+   debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", ptr->thread1.mask_stack_exception_enable);
+   debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", ptr->thread1.timeout_exception_enable);
+   debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", ptr->thread1.illegal_op_exception_enable);
+   debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", ptr->thread1.depth_coef_urb_read_offset);
+   debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", ptr->thread1.floating_point_mode);
+   debug_printf("\t\t.thread1.thread_priority = 0x%x\n", ptr->thread1.thread_priority);
+   debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", ptr->thread1.binding_table_entry_count);
+   debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", ptr->thread1.single_program_flow);
+   debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", ptr->thread2.per_thread_scratch_space);
+   debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", ptr->thread2.scratch_space_base_pointer);
+   debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", ptr->thread3.dispatch_grf_start_reg);
+   debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", ptr->thread3.urb_entry_read_offset);
+   debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", ptr->thread3.urb_entry_read_length);
+   debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", ptr->thread3.const_urb_entry_read_offset);
+   debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", ptr->thread3.const_urb_entry_read_length);
+   debug_printf("\t\t.thread4.rendering_enable = 0x%x\n", ptr->thread4.rendering_enable);
+   debug_printf("\t\t.thread4.stats_enable = 0x%x\n", ptr->thread4.stats_enable);
+   debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", ptr->thread4.nr_urb_entries);
+   debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", ptr->thread4.urb_entry_allocation_size);
+   debug_printf("\t\t.thread4.max_threads = 0x%x\n", ptr->thread4.max_threads);
+   debug_printf("\t\t.gs5.sampler_count = 0x%x\n", ptr->gs5.sampler_count);
+   debug_printf("\t\t.gs5.sampler_state_pointer = 0x%x\n", ptr->gs5.sampler_state_pointer);
+   debug_printf("\t\t.gs6.max_vp_index = 0x%x\n", ptr->gs6.max_vp_index);
+   debug_printf("\t\t.gs6.svbi_post_inc_value = 0x%x\n", ptr->gs6.svbi_post_inc_value);
+   debug_printf("\t\t.gs6.svbi_post_inc_enable = 0x%x\n", ptr->gs6.svbi_post_inc_enable);
+   debug_printf("\t\t.gs6.svbi_payload = 0x%x\n", ptr->gs6.svbi_payload);
+   debug_printf("\t\t.gs6.discard_adjaceny = 0x%x\n", ptr->gs6.discard_adjaceny);
+   debug_printf("\t\t.gs6.reorder_enable = 0x%x\n", ptr->gs6.reorder_enable);
+}
+
+void
+brw_dump_indexbuffer(const struct brw_indexbuffer *ptr)
+{  /* Emit the header.bits fields, buffer_start and buffer_end of *ptr in hex. */
+   debug_printf("\t\t.header.bits.length = 0x%x\n", ptr->header.bits.length);
+   debug_printf("\t\t.header.bits.index_format = 0x%x\n", ptr->header.bits.index_format);
+   debug_printf("\t\t.header.bits.cut_index_enable = 0x%x\n", ptr->header.bits.cut_index_enable);
+   debug_printf("\t\t.header.bits.opcode = 0x%x\n", ptr->header.bits.opcode);
+   debug_printf("\t\t.buffer_start = 0x%x\n", ptr->buffer_start);
+   debug_printf("\t\t.buffer_end = 0x%x\n", ptr->buffer_end);
+}
+
+void
+brw_dump_line_stipple(const struct brw_line_stipple *ptr)
+{  /* Emit the header, bits0 and bits1 fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.pattern = 0x%x\n", ptr->bits0.pattern);
+   debug_printf("\t\t.bits1.repeat_count = 0x%x\n", ptr->bits1.repeat_count);
+   debug_printf("\t\t.bits1.inverse_repeat_count = 0x%x\n", ptr->bits1.inverse_repeat_count);
+}
+
+void
+brw_dump_mi_flush(const struct brw_mi_flush *ptr)
+{  /* Emit the flags and opcode members of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.flags = 0x%x\n", ptr->flags);
+   debug_printf("\t\t.opcode = 0x%x\n", ptr->opcode);
+}
+
+void
+brw_dump_pipe_control(const struct brw_pipe_control *ptr)
+{  /* Emit the header and bits1 fields plus data0 and data1 of *ptr in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.notify_enable = 0x%x\n", ptr->header.notify_enable);
+   debug_printf("\t\t.header.texture_cache_flush_enable = 0x%x\n", ptr->header.texture_cache_flush_enable);
+   debug_printf("\t\t.header.indirect_state_pointers_disable = 0x%x\n", ptr->header.indirect_state_pointers_disable);
+   debug_printf("\t\t.header.instruction_state_cache_flush_enable = 0x%x\n", ptr->header.instruction_state_cache_flush_enable);
+   debug_printf("\t\t.header.write_cache_flush_enable = 0x%x\n", ptr->header.write_cache_flush_enable);
+   debug_printf("\t\t.header.depth_stall_enable = 0x%x\n", ptr->header.depth_stall_enable);
+   debug_printf("\t\t.header.post_sync_operation = 0x%x\n", ptr->header.post_sync_operation);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits1.dest_addr_type = 0x%x\n", ptr->bits1.dest_addr_type);
+   debug_printf("\t\t.bits1.dest_addr = 0x%x\n", ptr->bits1.dest_addr);
+   debug_printf("\t\t.data0 = 0x%x\n", ptr->data0);
+   debug_printf("\t\t.data1 = 0x%x\n", ptr->data1);
+}
+
+void
+brw_dump_pipeline_select(const struct brw_pipeline_select *ptr)
+{  /* Emit header.pipeline_select and header.opcode of *ptr in hex. */
+   debug_printf("\t\t.header.pipeline_select = 0x%x\n", ptr->header.pipeline_select);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+}
+
+void
+brw_dump_pipelined_state_pointers(const struct brw_pipelined_state_pointers *ptr)
+{  /* Emit the header fields and the vs/gs/clp/sf/wm/cc enable and offset fields of *ptr in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.vs.offset = 0x%x\n", ptr->vs.offset);
+   debug_printf("\t\t.gs.enable = 0x%x\n", ptr->gs.enable);
+   debug_printf("\t\t.gs.offset = 0x%x\n", ptr->gs.offset);
+   debug_printf("\t\t.clp.enable = 0x%x\n", ptr->clp.enable);
+   debug_printf("\t\t.clp.offset = 0x%x\n", ptr->clp.offset);
+   debug_printf("\t\t.sf.offset = 0x%x\n", ptr->sf.offset);
+   debug_printf("\t\t.wm.offset = 0x%x\n", ptr->wm.offset);
+   debug_printf("\t\t.cc.offset = 0x%x\n", ptr->cc.offset);
+}
+
+void
+brw_dump_polygon_stipple(const struct brw_polygon_stipple *ptr)
+{  /* Emit the header fields of *ptr in hex, then stipple[0] through stipple[31] in index order. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.stipple[0] = 0x%x\n", ptr->stipple[0]);
+   debug_printf("\t\t.stipple[1] = 0x%x\n", ptr->stipple[1]);
+   debug_printf("\t\t.stipple[2] = 0x%x\n", ptr->stipple[2]);
+   debug_printf("\t\t.stipple[3] = 0x%x\n", ptr->stipple[3]);
+   debug_printf("\t\t.stipple[4] = 0x%x\n", ptr->stipple[4]);
+   debug_printf("\t\t.stipple[5] = 0x%x\n", ptr->stipple[5]);
+   debug_printf("\t\t.stipple[6] = 0x%x\n", ptr->stipple[6]);
+   debug_printf("\t\t.stipple[7] = 0x%x\n", ptr->stipple[7]);
+   debug_printf("\t\t.stipple[8] = 0x%x\n", ptr->stipple[8]);
+   debug_printf("\t\t.stipple[9] = 0x%x\n", ptr->stipple[9]);
+   debug_printf("\t\t.stipple[10] = 0x%x\n", ptr->stipple[10]);
+   debug_printf("\t\t.stipple[11] = 0x%x\n", ptr->stipple[11]);
+   debug_printf("\t\t.stipple[12] = 0x%x\n", ptr->stipple[12]);
+   debug_printf("\t\t.stipple[13] = 0x%x\n", ptr->stipple[13]);
+   debug_printf("\t\t.stipple[14] = 0x%x\n", ptr->stipple[14]);
+   debug_printf("\t\t.stipple[15] = 0x%x\n", ptr->stipple[15]);
+   debug_printf("\t\t.stipple[16] = 0x%x\n", ptr->stipple[16]);
+   debug_printf("\t\t.stipple[17] = 0x%x\n", ptr->stipple[17]);
+   debug_printf("\t\t.stipple[18] = 0x%x\n", ptr->stipple[18]);
+   debug_printf("\t\t.stipple[19] = 0x%x\n", ptr->stipple[19]);
+   debug_printf("\t\t.stipple[20] = 0x%x\n", ptr->stipple[20]);
+   debug_printf("\t\t.stipple[21] = 0x%x\n", ptr->stipple[21]);
+   debug_printf("\t\t.stipple[22] = 0x%x\n", ptr->stipple[22]);
+   debug_printf("\t\t.stipple[23] = 0x%x\n", ptr->stipple[23]);
+   debug_printf("\t\t.stipple[24] = 0x%x\n", ptr->stipple[24]);
+   debug_printf("\t\t.stipple[25] = 0x%x\n", ptr->stipple[25]);
+   debug_printf("\t\t.stipple[26] = 0x%x\n", ptr->stipple[26]);
+   debug_printf("\t\t.stipple[27] = 0x%x\n", ptr->stipple[27]);
+   debug_printf("\t\t.stipple[28] = 0x%x\n", ptr->stipple[28]);
+   debug_printf("\t\t.stipple[29] = 0x%x\n", ptr->stipple[29]);
+   debug_printf("\t\t.stipple[30] = 0x%x\n", ptr->stipple[30]);
+   debug_printf("\t\t.stipple[31] = 0x%x\n", ptr->stipple[31]);
+}
+
+void
+brw_dump_polygon_stipple_offset(const struct brw_polygon_stipple_offset *ptr)
+{  /* Emit the header fields and the bits0 y/x offsets of *ptr in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.y_offset = 0x%x\n", ptr->bits0.y_offset);
+   debug_printf("\t\t.bits0.x_offset = 0x%x\n", ptr->bits0.x_offset);
+}
+
+void
+brw_dump_sampler_default_color(const struct brw_sampler_default_color *ptr)
+{  /* Emit color[0] through color[3] of *ptr with %f, in index order. */
+   debug_printf("\t\t.color[0] = %f\n", ptr->color[0]);
+   debug_printf("\t\t.color[1] = %f\n", ptr->color[1]);
+   debug_printf("\t\t.color[2] = %f\n", ptr->color[2]);
+   debug_printf("\t\t.color[3] = %f\n", ptr->color[3]);
+}
+
+void
+brw_dump_sampler_state(const struct brw_sampler_state *ptr)
+{  /* Emit the ss0, ss1, ss2 and ss3 fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.ss0.shadow_function = 0x%x\n", ptr->ss0.shadow_function);
+   debug_printf("\t\t.ss0.lod_bias = 0x%x\n", ptr->ss0.lod_bias);
+   debug_printf("\t\t.ss0.min_filter = 0x%x\n", ptr->ss0.min_filter);
+   debug_printf("\t\t.ss0.mag_filter = 0x%x\n", ptr->ss0.mag_filter);
+   debug_printf("\t\t.ss0.mip_filter = 0x%x\n", ptr->ss0.mip_filter);
+   debug_printf("\t\t.ss0.base_level = 0x%x\n", ptr->ss0.base_level);
+   debug_printf("\t\t.ss0.lod_preclamp = 0x%x\n", ptr->ss0.lod_preclamp);
+   debug_printf("\t\t.ss0.default_color_mode = 0x%x\n", ptr->ss0.default_color_mode);
+   debug_printf("\t\t.ss0.disable = 0x%x\n", ptr->ss0.disable);
+   debug_printf("\t\t.ss1.r_wrap_mode = 0x%x\n", ptr->ss1.r_wrap_mode);
+   debug_printf("\t\t.ss1.t_wrap_mode = 0x%x\n", ptr->ss1.t_wrap_mode);
+   debug_printf("\t\t.ss1.s_wrap_mode = 0x%x\n", ptr->ss1.s_wrap_mode);
+   debug_printf("\t\t.ss1.max_lod = 0x%x\n", ptr->ss1.max_lod);
+   debug_printf("\t\t.ss1.min_lod = 0x%x\n", ptr->ss1.min_lod);
+   debug_printf("\t\t.ss2.default_color_pointer = 0x%x\n", ptr->ss2.default_color_pointer);
+   debug_printf("\t\t.ss3.max_aniso = 0x%x\n", ptr->ss3.max_aniso);
+   debug_printf("\t\t.ss3.chroma_key_mode = 0x%x\n", ptr->ss3.chroma_key_mode);
+   debug_printf("\t\t.ss3.chroma_key_index = 0x%x\n", ptr->ss3.chroma_key_index);
+   debug_printf("\t\t.ss3.chroma_key_enable = 0x%x\n", ptr->ss3.chroma_key_enable);
+   debug_printf("\t\t.ss3.monochrome_filter_width = 0x%x\n", ptr->ss3.monochrome_filter_width);
+   debug_printf("\t\t.ss3.monochrome_filter_height = 0x%x\n", ptr->ss3.monochrome_filter_height);
+}
+
+void
+brw_dump_sf_unit_state(const struct brw_sf_unit_state *ptr)
+{  /* Emit thread0..thread4, sf5, sf6 and sf7 fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", ptr->thread0.grf_reg_count);
+   debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", ptr->thread0.kernel_start_pointer);
+   debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", ptr->thread1.ext_halt_exception_enable);
+   debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", ptr->thread1.sw_exception_enable);
+   debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", ptr->thread1.mask_stack_exception_enable);
+   debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", ptr->thread1.timeout_exception_enable);
+   debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", ptr->thread1.illegal_op_exception_enable);
+   debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", ptr->thread1.depth_coef_urb_read_offset);
+   debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", ptr->thread1.floating_point_mode);
+   debug_printf("\t\t.thread1.thread_priority = 0x%x\n", ptr->thread1.thread_priority);
+   debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", ptr->thread1.binding_table_entry_count);
+   debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", ptr->thread1.single_program_flow);
+   debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", ptr->thread2.per_thread_scratch_space);
+   debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", ptr->thread2.scratch_space_base_pointer);
+   debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", ptr->thread3.dispatch_grf_start_reg);
+   debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", ptr->thread3.urb_entry_read_offset);
+   debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", ptr->thread3.urb_entry_read_length);
+   debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", ptr->thread3.const_urb_entry_read_offset);
+   debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", ptr->thread3.const_urb_entry_read_length);
+   debug_printf("\t\t.thread4.stats_enable = 0x%x\n", ptr->thread4.stats_enable);
+   debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", ptr->thread4.nr_urb_entries);
+   debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", ptr->thread4.urb_entry_allocation_size);
+   debug_printf("\t\t.thread4.max_threads = 0x%x\n", ptr->thread4.max_threads);
+   debug_printf("\t\t.sf5.front_winding = 0x%x\n", ptr->sf5.front_winding);
+   debug_printf("\t\t.sf5.viewport_transform = 0x%x\n", ptr->sf5.viewport_transform);
+   debug_printf("\t\t.sf5.sf_viewport_state_offset = 0x%x\n", ptr->sf5.sf_viewport_state_offset);
+   debug_printf("\t\t.sf6.dest_org_vbias = 0x%x\n", ptr->sf6.dest_org_vbias);
+   debug_printf("\t\t.sf6.dest_org_hbias = 0x%x\n", ptr->sf6.dest_org_hbias);
+   debug_printf("\t\t.sf6.scissor = 0x%x\n", ptr->sf6.scissor);
+   debug_printf("\t\t.sf6.disable_2x2_trifilter = 0x%x\n", ptr->sf6.disable_2x2_trifilter);
+   debug_printf("\t\t.sf6.disable_zero_pix_trifilter = 0x%x\n", ptr->sf6.disable_zero_pix_trifilter);
+   debug_printf("\t\t.sf6.point_rast_rule = 0x%x\n", ptr->sf6.point_rast_rule);
+   debug_printf("\t\t.sf6.line_endcap_aa_region_width = 0x%x\n", ptr->sf6.line_endcap_aa_region_width);
+   debug_printf("\t\t.sf6.line_width = 0x%x\n", ptr->sf6.line_width);
+   debug_printf("\t\t.sf6.fast_scissor_disable = 0x%x\n", ptr->sf6.fast_scissor_disable);
+   debug_printf("\t\t.sf6.cull_mode = 0x%x\n", ptr->sf6.cull_mode);
+   debug_printf("\t\t.sf6.aa_enable = 0x%x\n", ptr->sf6.aa_enable);
+   debug_printf("\t\t.sf7.point_size = 0x%x\n", ptr->sf7.point_size);
+   debug_printf("\t\t.sf7.use_point_size_state = 0x%x\n", ptr->sf7.use_point_size_state);
+   debug_printf("\t\t.sf7.subpixel_precision = 0x%x\n", ptr->sf7.subpixel_precision);
+   debug_printf("\t\t.sf7.sprite_point = 0x%x\n", ptr->sf7.sprite_point);
+   debug_printf("\t\t.sf7.aa_line_distance_mode = 0x%x\n", ptr->sf7.aa_line_distance_mode);
+   debug_printf("\t\t.sf7.trifan_pv = 0x%x\n", ptr->sf7.trifan_pv);
+   debug_printf("\t\t.sf7.linestrip_pv = 0x%x\n", ptr->sf7.linestrip_pv);
+   debug_printf("\t\t.sf7.tristrip_pv = 0x%x\n", ptr->sf7.tristrip_pv);
+   debug_printf("\t\t.sf7.line_last_pixel_enable = 0x%x\n", ptr->sf7.line_last_pixel_enable);
+}
+
+void
+brw_dump_sf_viewport(const struct brw_sf_viewport *ptr)
+{  /* Emit the six viewport.m* members of *ptr with %f, then the four scissor bounds in hex. */
+   debug_printf("\t\t.viewport.m00 = %f\n", ptr->viewport.m00);
+   debug_printf("\t\t.viewport.m11 = %f\n", ptr->viewport.m11);
+   debug_printf("\t\t.viewport.m22 = %f\n", ptr->viewport.m22);
+   debug_printf("\t\t.viewport.m30 = %f\n", ptr->viewport.m30);
+   debug_printf("\t\t.viewport.m31 = %f\n", ptr->viewport.m31);
+   debug_printf("\t\t.viewport.m32 = %f\n", ptr->viewport.m32);
+   debug_printf("\t\t.scissor.xmin = 0x%x\n", ptr->scissor.xmin);
+   debug_printf("\t\t.scissor.ymin = 0x%x\n", ptr->scissor.ymin);
+   debug_printf("\t\t.scissor.xmax = 0x%x\n", ptr->scissor.xmax);
+   debug_printf("\t\t.scissor.ymax = 0x%x\n", ptr->scissor.ymax);
+}
+
+void
+brw_dump_ss0(const struct brw_ss0 *ptr)
+{  /* Emit each member of *ptr read below in hex, one debug_printf line each. */
+   debug_printf("\t\t.shadow_function = 0x%x\n", ptr->shadow_function);
+   debug_printf("\t\t.lod_bias = 0x%x\n", ptr->lod_bias);
+   debug_printf("\t\t.min_filter = 0x%x\n", ptr->min_filter);
+   debug_printf("\t\t.mag_filter = 0x%x\n", ptr->mag_filter);
+   debug_printf("\t\t.mip_filter = 0x%x\n", ptr->mip_filter);
+   debug_printf("\t\t.base_level = 0x%x\n", ptr->base_level);
+   debug_printf("\t\t.lod_preclamp = 0x%x\n", ptr->lod_preclamp);
+   debug_printf("\t\t.default_color_mode = 0x%x\n", ptr->default_color_mode);
+   debug_printf("\t\t.disable = 0x%x\n", ptr->disable);
+}
+
+void
+brw_dump_ss1(const struct brw_ss1 *ptr)
+{  /* Emit the three wrap modes and the max/min LOD members of *ptr in hex. */
+   debug_printf("\t\t.r_wrap_mode = 0x%x\n", ptr->r_wrap_mode);
+   debug_printf("\t\t.t_wrap_mode = 0x%x\n", ptr->t_wrap_mode);
+   debug_printf("\t\t.s_wrap_mode = 0x%x\n", ptr->s_wrap_mode);
+   debug_printf("\t\t.max_lod = 0x%x\n", ptr->max_lod);
+   debug_printf("\t\t.min_lod = 0x%x\n", ptr->min_lod);
+}
+
+void
+brw_dump_ss2(const struct brw_ss2 *ptr)
+{  /* Emit default_color_pointer of *ptr in hex on a single debug_printf line. */
+   debug_printf("\t\t.default_color_pointer = 0x%x\n", ptr->default_color_pointer);
+}
+
+void
+brw_dump_ss3(const struct brw_ss3 *ptr)
+{  /* Emit max_aniso, the chroma_key_* members and the monochrome filter size of *ptr in hex. */
+   debug_printf("\t\t.max_aniso = 0x%x\n", ptr->max_aniso);
+   debug_printf("\t\t.chroma_key_mode = 0x%x\n", ptr->chroma_key_mode);
+   debug_printf("\t\t.chroma_key_index = 0x%x\n", ptr->chroma_key_index);
+   debug_printf("\t\t.chroma_key_enable = 0x%x\n", ptr->chroma_key_enable);
+   debug_printf("\t\t.monochrome_filter_width = 0x%x\n", ptr->monochrome_filter_width);
+   debug_printf("\t\t.monochrome_filter_height = 0x%x\n", ptr->monochrome_filter_height);
+}
+
+void
+brw_dump_state_base_address(const struct brw_state_base_address *ptr)
+{  /* Emit the header fields and the bits0..bits4 fields of *ptr in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.modify_enable = 0x%x\n", ptr->bits0.modify_enable);
+   debug_printf("\t\t.bits0.general_state_address = 0x%x\n", ptr->bits0.general_state_address);
+   debug_printf("\t\t.bits1.modify_enable = 0x%x\n", ptr->bits1.modify_enable);
+   debug_printf("\t\t.bits1.surface_state_address = 0x%x\n", ptr->bits1.surface_state_address);
+   debug_printf("\t\t.bits2.modify_enable = 0x%x\n", ptr->bits2.modify_enable);
+   debug_printf("\t\t.bits2.indirect_object_state_address = 0x%x\n", ptr->bits2.indirect_object_state_address);
+   debug_printf("\t\t.bits3.modify_enable = 0x%x\n", ptr->bits3.modify_enable);
+   debug_printf("\t\t.bits3.general_state_upper_bound = 0x%x\n", ptr->bits3.general_state_upper_bound);
+   debug_printf("\t\t.bits4.modify_enable = 0x%x\n", ptr->bits4.modify_enable);
+   debug_printf("\t\t.bits4.indirect_object_state_upper_bound = 0x%x\n", ptr->bits4.indirect_object_state_upper_bound);
+}
+
+void
+brw_dump_state_prefetch(const struct brw_state_prefetch *ptr)
+{  /* Emit the header fields and the bits0 prefetch count and pointer of *ptr in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.prefetch_count = 0x%x\n", ptr->bits0.prefetch_count);
+   debug_printf("\t\t.bits0.prefetch_pointer = 0x%x\n", ptr->bits0.prefetch_pointer);
+}
+
+void
+brw_dump_surf_ss0(const struct brw_surf_ss0 *ptr)
+{  /* Emit each member of *ptr read below in hex, one debug_printf line each. */
+   debug_printf("\t\t.cube_pos_z = 0x%x\n", ptr->cube_pos_z);
+   debug_printf("\t\t.cube_neg_z = 0x%x\n", ptr->cube_neg_z);
+   debug_printf("\t\t.cube_pos_y = 0x%x\n", ptr->cube_pos_y);
+   debug_printf("\t\t.cube_neg_y = 0x%x\n", ptr->cube_neg_y);
+   debug_printf("\t\t.cube_pos_x = 0x%x\n", ptr->cube_pos_x);
+   debug_printf("\t\t.cube_neg_x = 0x%x\n", ptr->cube_neg_x);
+   debug_printf("\t\t.mipmap_layout_mode = 0x%x\n", ptr->mipmap_layout_mode);
+   debug_printf("\t\t.vert_line_stride_ofs = 0x%x\n", ptr->vert_line_stride_ofs);
+   debug_printf("\t\t.vert_line_stride = 0x%x\n", ptr->vert_line_stride);
+   debug_printf("\t\t.color_blend = 0x%x\n", ptr->color_blend);
+   debug_printf("\t\t.writedisable_blue = 0x%x\n", ptr->writedisable_blue);
+   debug_printf("\t\t.writedisable_green = 0x%x\n", ptr->writedisable_green);
+   debug_printf("\t\t.writedisable_red = 0x%x\n", ptr->writedisable_red);
+   debug_printf("\t\t.writedisable_alpha = 0x%x\n", ptr->writedisable_alpha);
+   debug_printf("\t\t.surface_format = 0x%x\n", ptr->surface_format);
+   debug_printf("\t\t.data_return_format = 0x%x\n", ptr->data_return_format);
+   debug_printf("\t\t.surface_type = 0x%x\n", ptr->surface_type);
+}
+
+void
+brw_dump_surf_ss1(const struct brw_surf_ss1 *ptr)
+{  /* Emit base_addr of *ptr in hex on a single debug_printf line. */
+   debug_printf("\t\t.base_addr = 0x%x\n", ptr->base_addr);
+}
+
+void
+brw_dump_surf_ss2(const struct brw_surf_ss2 *ptr)
+{  /* Emit mip_count, width and height of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.mip_count = 0x%x\n", ptr->mip_count);
+   debug_printf("\t\t.width = 0x%x\n", ptr->width);
+   debug_printf("\t\t.height = 0x%x\n", ptr->height);
+}
+
+void
+brw_dump_surf_ss3(const struct brw_surf_ss3 *ptr)
+{  /* Emit tile_walk, tiled_surface, pitch and depth of *ptr in hex. */
+   debug_printf("\t\t.tile_walk = 0x%x\n", ptr->tile_walk);
+   debug_printf("\t\t.tiled_surface = 0x%x\n", ptr->tiled_surface);
+   debug_printf("\t\t.pitch = 0x%x\n", ptr->pitch);
+   debug_printf("\t\t.depth = 0x%x\n", ptr->depth);
+}
+
+void
+brw_dump_surf_ss4(const struct brw_surf_ss4 *ptr)
+{  /* Emit each member of *ptr read below in hex, one debug_printf line each. */
+   debug_printf("\t\t.multisample_position_palette_index = 0x%x\n", ptr->multisample_position_palette_index);
+   debug_printf("\t\t.num_multisamples = 0x%x\n", ptr->num_multisamples);
+   debug_printf("\t\t.render_target_view_extent = 0x%x\n", ptr->render_target_view_extent);
+   debug_printf("\t\t.min_array_elt = 0x%x\n", ptr->min_array_elt);
+   debug_printf("\t\t.min_lod = 0x%x\n", ptr->min_lod);
+}
+
+void
+brw_dump_surf_ss5(const struct brw_surf_ss5 *ptr)
+{  /* Emit each member of *ptr read below in hex, one debug_printf line each. */
+   debug_printf("\t\t.llc_mapping = 0x%x\n", ptr->llc_mapping);
+   debug_printf("\t\t.mlc_mapping = 0x%x\n", ptr->mlc_mapping);
+   debug_printf("\t\t.gfdt = 0x%x\n", ptr->gfdt);
+   debug_printf("\t\t.gfdt_src = 0x%x\n", ptr->gfdt_src);
+   debug_printf("\t\t.y_offset = 0x%x\n", ptr->y_offset);
+   debug_printf("\t\t.x_offset = 0x%x\n", ptr->x_offset);
+}
+
+void
+brw_dump_surface_state(const struct brw_surface_state *ptr)
+{  /* Print every field of the ss0..ss5 members of *ptr in hex, in order. */
+   debug_printf("\t\t.ss0.cube_pos_z = 0x%x\n", ptr->ss0.cube_pos_z);
+   debug_printf("\t\t.ss0.cube_neg_z = 0x%x\n", ptr->ss0.cube_neg_z);
+   debug_printf("\t\t.ss0.cube_pos_y = 0x%x\n", ptr->ss0.cube_pos_y);
+   debug_printf("\t\t.ss0.cube_neg_y = 0x%x\n", ptr->ss0.cube_neg_y);
+   debug_printf("\t\t.ss0.cube_pos_x = 0x%x\n", ptr->ss0.cube_pos_x);
+   debug_printf("\t\t.ss0.cube_neg_x = 0x%x\n", ptr->ss0.cube_neg_x);
+   debug_printf("\t\t.ss0.mipmap_layout_mode = 0x%x\n", ptr->ss0.mipmap_layout_mode);
+   debug_printf("\t\t.ss0.vert_line_stride_ofs = 0x%x\n", ptr->ss0.vert_line_stride_ofs);
+   debug_printf("\t\t.ss0.vert_line_stride = 0x%x\n", ptr->ss0.vert_line_stride);
+   debug_printf("\t\t.ss0.color_blend = 0x%x\n", ptr->ss0.color_blend);
+   debug_printf("\t\t.ss0.writedisable_blue = 0x%x\n", ptr->ss0.writedisable_blue);
+   debug_printf("\t\t.ss0.writedisable_green = 0x%x\n", ptr->ss0.writedisable_green);
+   debug_printf("\t\t.ss0.writedisable_red = 0x%x\n", ptr->ss0.writedisable_red);
+   debug_printf("\t\t.ss0.writedisable_alpha = 0x%x\n", ptr->ss0.writedisable_alpha);
+   debug_printf("\t\t.ss0.surface_format = 0x%x\n", ptr->ss0.surface_format);
+   debug_printf("\t\t.ss0.data_return_format = 0x%x\n", ptr->ss0.data_return_format);
+   debug_printf("\t\t.ss0.surface_type = 0x%x\n", ptr->ss0.surface_type);
+   debug_printf("\t\t.ss1.base_addr = 0x%x\n", ptr->ss1.base_addr);
+   debug_printf("\t\t.ss2.mip_count = 0x%x\n", ptr->ss2.mip_count);
+   debug_printf("\t\t.ss2.width = 0x%x\n", ptr->ss2.width);
+   debug_printf("\t\t.ss2.height = 0x%x\n", ptr->ss2.height);
+   debug_printf("\t\t.ss3.tile_walk = 0x%x\n", ptr->ss3.tile_walk);
+   debug_printf("\t\t.ss3.tiled_surface = 0x%x\n", ptr->ss3.tiled_surface);
+   debug_printf("\t\t.ss3.pitch = 0x%x\n", ptr->ss3.pitch);
+   debug_printf("\t\t.ss3.depth = 0x%x\n", ptr->ss3.depth);
+   debug_printf("\t\t.ss4.multisample_position_palette_index = 0x%x\n", ptr->ss4.multisample_position_palette_index);
+   debug_printf("\t\t.ss4.num_multisamples = 0x%x\n", ptr->ss4.num_multisamples);
+   debug_printf("\t\t.ss4.render_target_view_extent = 0x%x\n", ptr->ss4.render_target_view_extent);
+   debug_printf("\t\t.ss4.min_array_elt = 0x%x\n", ptr->ss4.min_array_elt);
+   debug_printf("\t\t.ss4.min_lod = 0x%x\n", ptr->ss4.min_lod);
+   debug_printf("\t\t.ss5.llc_mapping = 0x%x\n", ptr->ss5.llc_mapping);
+   debug_printf("\t\t.ss5.mlc_mapping = 0x%x\n", ptr->ss5.mlc_mapping);
+   debug_printf("\t\t.ss5.gfdt = 0x%x\n", ptr->ss5.gfdt);
+   debug_printf("\t\t.ss5.gfdt_src = 0x%x\n", ptr->ss5.gfdt_src);
+   debug_printf("\t\t.ss5.y_offset = 0x%x\n", ptr->ss5.y_offset);
+   debug_printf("\t\t.ss5.x_offset = 0x%x\n", ptr->ss5.x_offset);
+}
+
+void
+brw_dump_system_instruction_pointer(const struct brw_system_instruction_pointer *ptr)
+{  /* Print the header and bits0 fields of *ptr in hex via debug_printf. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.system_instruction_pointer = 0x%x\n", ptr->bits0.system_instruction_pointer);
+}
+
+void
+brw_dump_urb_fence(const struct brw_urb_fence *ptr)
+{  /* Print the header, bits0 and bits1 fields of *ptr in hex, in order. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.vs_realloc = 0x%x\n", ptr->header.vs_realloc);
+   debug_printf("\t\t.header.gs_realloc = 0x%x\n", ptr->header.gs_realloc);
+   debug_printf("\t\t.header.clp_realloc = 0x%x\n", ptr->header.clp_realloc);
+   debug_printf("\t\t.header.sf_realloc = 0x%x\n", ptr->header.sf_realloc);
+   debug_printf("\t\t.header.vfe_realloc = 0x%x\n", ptr->header.vfe_realloc);
+   debug_printf("\t\t.header.cs_realloc = 0x%x\n", ptr->header.cs_realloc);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.bits0.vs_fence = 0x%x\n", ptr->bits0.vs_fence);
+   debug_printf("\t\t.bits0.gs_fence = 0x%x\n", ptr->bits0.gs_fence);
+   debug_printf("\t\t.bits0.clp_fence = 0x%x\n", ptr->bits0.clp_fence);
+   debug_printf("\t\t.bits1.sf_fence = 0x%x\n", ptr->bits1.sf_fence);
+   debug_printf("\t\t.bits1.vf_fence = 0x%x\n", ptr->bits1.vf_fence);
+   debug_printf("\t\t.bits1.cs_fence = 0x%x\n", ptr->bits1.cs_fence);
+}
+
+void
+brw_dump_urb_immediate(const struct brw_urb_immediate *ptr)
+{  /* Print the ten fields of *ptr in hex, one debug_printf line each. */
+   debug_printf("\t\t.opcode = 0x%x\n", ptr->opcode);
+   debug_printf("\t\t.offset = 0x%x\n", ptr->offset);
+   debug_printf("\t\t.swizzle_control = 0x%x\n", ptr->swizzle_control);
+   debug_printf("\t\t.allocate = 0x%x\n", ptr->allocate);
+   debug_printf("\t\t.used = 0x%x\n", ptr->used);
+   debug_printf("\t\t.complete = 0x%x\n", ptr->complete);
+   debug_printf("\t\t.response_length = 0x%x\n", ptr->response_length);
+   debug_printf("\t\t.msg_length = 0x%x\n", ptr->msg_length);
+   debug_printf("\t\t.msg_target = 0x%x\n", ptr->msg_target);
+   debug_printf("\t\t.end_of_thread = 0x%x\n", ptr->end_of_thread);
+}
+
+void
+brw_dump_vb_array_state(const struct brw_vb_array_state *ptr)
+{  /* Print the header, then the fields of vb[0]..vb[16], in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.vb[0].vb0.pitch = 0x%x\n", ptr->vb[0].vb0.pitch);
+   debug_printf("\t\t.vb[0].vb0.access_type = 0x%x\n", ptr->vb[0].vb0.access_type);
+   debug_printf("\t\t.vb[0].vb0.vb_index = 0x%x\n", ptr->vb[0].vb0.vb_index);
+   debug_printf("\t\t.vb[0].start_addr = 0x%x\n", ptr->vb[0].start_addr);
+   debug_printf("\t\t.vb[0].max_index = 0x%x\n", ptr->vb[0].max_index);
+   debug_printf("\t\t.vb[0].instance_data_step_rate = 0x%x\n", ptr->vb[0].instance_data_step_rate);
+   debug_printf("\t\t.vb[1].vb0.pitch = 0x%x\n", ptr->vb[1].vb0.pitch);
+   debug_printf("\t\t.vb[1].vb0.access_type = 0x%x\n", ptr->vb[1].vb0.access_type);
+   debug_printf("\t\t.vb[1].vb0.vb_index = 0x%x\n", ptr->vb[1].vb0.vb_index);
+   debug_printf("\t\t.vb[1].start_addr = 0x%x\n", ptr->vb[1].start_addr);
+   debug_printf("\t\t.vb[1].max_index = 0x%x\n", ptr->vb[1].max_index);
+   debug_printf("\t\t.vb[1].instance_data_step_rate = 0x%x\n", ptr->vb[1].instance_data_step_rate);
+   debug_printf("\t\t.vb[2].vb0.pitch = 0x%x\n", ptr->vb[2].vb0.pitch);
+   debug_printf("\t\t.vb[2].vb0.access_type = 0x%x\n", ptr->vb[2].vb0.access_type);
+   debug_printf("\t\t.vb[2].vb0.vb_index = 0x%x\n", ptr->vb[2].vb0.vb_index);
+   debug_printf("\t\t.vb[2].start_addr = 0x%x\n", ptr->vb[2].start_addr);
+   debug_printf("\t\t.vb[2].max_index = 0x%x\n", ptr->vb[2].max_index);
+   debug_printf("\t\t.vb[2].instance_data_step_rate = 0x%x\n", ptr->vb[2].instance_data_step_rate);
+   debug_printf("\t\t.vb[3].vb0.pitch = 0x%x\n", ptr->vb[3].vb0.pitch);
+   debug_printf("\t\t.vb[3].vb0.access_type = 0x%x\n", ptr->vb[3].vb0.access_type);
+   debug_printf("\t\t.vb[3].vb0.vb_index = 0x%x\n", ptr->vb[3].vb0.vb_index);
+   debug_printf("\t\t.vb[3].start_addr = 0x%x\n", ptr->vb[3].start_addr);
+   debug_printf("\t\t.vb[3].max_index = 0x%x\n", ptr->vb[3].max_index);
+   debug_printf("\t\t.vb[3].instance_data_step_rate = 0x%x\n", ptr->vb[3].instance_data_step_rate);
+   debug_printf("\t\t.vb[4].vb0.pitch = 0x%x\n", ptr->vb[4].vb0.pitch);
+   debug_printf("\t\t.vb[4].vb0.access_type = 0x%x\n", ptr->vb[4].vb0.access_type);
+   debug_printf("\t\t.vb[4].vb0.vb_index = 0x%x\n", ptr->vb[4].vb0.vb_index);
+   debug_printf("\t\t.vb[4].start_addr = 0x%x\n", ptr->vb[4].start_addr);
+   debug_printf("\t\t.vb[4].max_index = 0x%x\n", ptr->vb[4].max_index);
+   debug_printf("\t\t.vb[4].instance_data_step_rate = 0x%x\n", ptr->vb[4].instance_data_step_rate);
+   debug_printf("\t\t.vb[5].vb0.pitch = 0x%x\n", ptr->vb[5].vb0.pitch);
+   debug_printf("\t\t.vb[5].vb0.access_type = 0x%x\n", ptr->vb[5].vb0.access_type);
+   debug_printf("\t\t.vb[5].vb0.vb_index = 0x%x\n", ptr->vb[5].vb0.vb_index);
+   debug_printf("\t\t.vb[5].start_addr = 0x%x\n", ptr->vb[5].start_addr);
+   debug_printf("\t\t.vb[5].max_index = 0x%x\n", ptr->vb[5].max_index);
+   debug_printf("\t\t.vb[5].instance_data_step_rate = 0x%x\n", ptr->vb[5].instance_data_step_rate);
+   debug_printf("\t\t.vb[6].vb0.pitch = 0x%x\n", ptr->vb[6].vb0.pitch);
+   debug_printf("\t\t.vb[6].vb0.access_type = 0x%x\n", ptr->vb[6].vb0.access_type);
+   debug_printf("\t\t.vb[6].vb0.vb_index = 0x%x\n", ptr->vb[6].vb0.vb_index);
+   debug_printf("\t\t.vb[6].start_addr = 0x%x\n", ptr->vb[6].start_addr);
+   debug_printf("\t\t.vb[6].max_index = 0x%x\n", ptr->vb[6].max_index);
+   debug_printf("\t\t.vb[6].instance_data_step_rate = 0x%x\n", ptr->vb[6].instance_data_step_rate);
+   debug_printf("\t\t.vb[7].vb0.pitch = 0x%x\n", ptr->vb[7].vb0.pitch);
+   debug_printf("\t\t.vb[7].vb0.access_type = 0x%x\n", ptr->vb[7].vb0.access_type);
+   debug_printf("\t\t.vb[7].vb0.vb_index = 0x%x\n", ptr->vb[7].vb0.vb_index);
+   debug_printf("\t\t.vb[7].start_addr = 0x%x\n", ptr->vb[7].start_addr);
+   debug_printf("\t\t.vb[7].max_index = 0x%x\n", ptr->vb[7].max_index);
+   debug_printf("\t\t.vb[7].instance_data_step_rate = 0x%x\n", ptr->vb[7].instance_data_step_rate);
+   debug_printf("\t\t.vb[8].vb0.pitch = 0x%x\n", ptr->vb[8].vb0.pitch);
+   debug_printf("\t\t.vb[8].vb0.access_type = 0x%x\n", ptr->vb[8].vb0.access_type);
+   debug_printf("\t\t.vb[8].vb0.vb_index = 0x%x\n", ptr->vb[8].vb0.vb_index);
+   debug_printf("\t\t.vb[8].start_addr = 0x%x\n", ptr->vb[8].start_addr);
+   debug_printf("\t\t.vb[8].max_index = 0x%x\n", ptr->vb[8].max_index);
+   debug_printf("\t\t.vb[8].instance_data_step_rate = 0x%x\n", ptr->vb[8].instance_data_step_rate);
+   debug_printf("\t\t.vb[9].vb0.pitch = 0x%x\n", ptr->vb[9].vb0.pitch);
+   debug_printf("\t\t.vb[9].vb0.access_type = 0x%x\n", ptr->vb[9].vb0.access_type);
+   debug_printf("\t\t.vb[9].vb0.vb_index = 0x%x\n", ptr->vb[9].vb0.vb_index);
+   debug_printf("\t\t.vb[9].start_addr = 0x%x\n", ptr->vb[9].start_addr);
+   debug_printf("\t\t.vb[9].max_index = 0x%x\n", ptr->vb[9].max_index);
+   debug_printf("\t\t.vb[9].instance_data_step_rate = 0x%x\n", ptr->vb[9].instance_data_step_rate);
+   debug_printf("\t\t.vb[10].vb0.pitch = 0x%x\n", ptr->vb[10].vb0.pitch);
+   debug_printf("\t\t.vb[10].vb0.access_type = 0x%x\n", ptr->vb[10].vb0.access_type);
+   debug_printf("\t\t.vb[10].vb0.vb_index = 0x%x\n", ptr->vb[10].vb0.vb_index);
+   debug_printf("\t\t.vb[10].start_addr = 0x%x\n", ptr->vb[10].start_addr);
+   debug_printf("\t\t.vb[10].max_index = 0x%x\n", ptr->vb[10].max_index);
+   debug_printf("\t\t.vb[10].instance_data_step_rate = 0x%x\n", ptr->vb[10].instance_data_step_rate);
+   debug_printf("\t\t.vb[11].vb0.pitch = 0x%x\n", ptr->vb[11].vb0.pitch);
+   debug_printf("\t\t.vb[11].vb0.access_type = 0x%x\n", ptr->vb[11].vb0.access_type);
+   debug_printf("\t\t.vb[11].vb0.vb_index = 0x%x\n", ptr->vb[11].vb0.vb_index);
+   debug_printf("\t\t.vb[11].start_addr = 0x%x\n", ptr->vb[11].start_addr);
+   debug_printf("\t\t.vb[11].max_index = 0x%x\n", ptr->vb[11].max_index);
+   debug_printf("\t\t.vb[11].instance_data_step_rate = 0x%x\n", ptr->vb[11].instance_data_step_rate);
+   debug_printf("\t\t.vb[12].vb0.pitch = 0x%x\n", ptr->vb[12].vb0.pitch);
+   debug_printf("\t\t.vb[12].vb0.access_type = 0x%x\n", ptr->vb[12].vb0.access_type);
+   debug_printf("\t\t.vb[12].vb0.vb_index = 0x%x\n", ptr->vb[12].vb0.vb_index);
+   debug_printf("\t\t.vb[12].start_addr = 0x%x\n", ptr->vb[12].start_addr);
+   debug_printf("\t\t.vb[12].max_index = 0x%x\n", ptr->vb[12].max_index);
+   debug_printf("\t\t.vb[12].instance_data_step_rate = 0x%x\n", ptr->vb[12].instance_data_step_rate);
+   debug_printf("\t\t.vb[13].vb0.pitch = 0x%x\n", ptr->vb[13].vb0.pitch);
+   debug_printf("\t\t.vb[13].vb0.access_type = 0x%x\n", ptr->vb[13].vb0.access_type);
+   debug_printf("\t\t.vb[13].vb0.vb_index = 0x%x\n", ptr->vb[13].vb0.vb_index);
+   debug_printf("\t\t.vb[13].start_addr = 0x%x\n", ptr->vb[13].start_addr);
+   debug_printf("\t\t.vb[13].max_index = 0x%x\n", ptr->vb[13].max_index);
+   debug_printf("\t\t.vb[13].instance_data_step_rate = 0x%x\n", ptr->vb[13].instance_data_step_rate);
+   debug_printf("\t\t.vb[14].vb0.pitch = 0x%x\n", ptr->vb[14].vb0.pitch);
+   debug_printf("\t\t.vb[14].vb0.access_type = 0x%x\n", ptr->vb[14].vb0.access_type);
+   debug_printf("\t\t.vb[14].vb0.vb_index = 0x%x\n", ptr->vb[14].vb0.vb_index);
+   debug_printf("\t\t.vb[14].start_addr = 0x%x\n", ptr->vb[14].start_addr);
+   debug_printf("\t\t.vb[14].max_index = 0x%x\n", ptr->vb[14].max_index);
+   debug_printf("\t\t.vb[14].instance_data_step_rate = 0x%x\n", ptr->vb[14].instance_data_step_rate);
+   debug_printf("\t\t.vb[15].vb0.pitch = 0x%x\n", ptr->vb[15].vb0.pitch);
+   debug_printf("\t\t.vb[15].vb0.access_type = 0x%x\n", ptr->vb[15].vb0.access_type);
+   debug_printf("\t\t.vb[15].vb0.vb_index = 0x%x\n", ptr->vb[15].vb0.vb_index);
+   debug_printf("\t\t.vb[15].start_addr = 0x%x\n", ptr->vb[15].start_addr);
+   debug_printf("\t\t.vb[15].max_index = 0x%x\n", ptr->vb[15].max_index);
+   debug_printf("\t\t.vb[15].instance_data_step_rate = 0x%x\n", ptr->vb[15].instance_data_step_rate);
+   debug_printf("\t\t.vb[16].vb0.pitch = 0x%x\n", ptr->vb[16].vb0.pitch);
+   debug_printf("\t\t.vb[16].vb0.access_type = 0x%x\n", ptr->vb[16].vb0.access_type);
+   debug_printf("\t\t.vb[16].vb0.vb_index = 0x%x\n", ptr->vb[16].vb0.vb_index);
+   debug_printf("\t\t.vb[16].start_addr = 0x%x\n", ptr->vb[16].start_addr);
+   debug_printf("\t\t.vb[16].max_index = 0x%x\n", ptr->vb[16].max_index);
+   debug_printf("\t\t.vb[16].instance_data_step_rate = 0x%x\n", ptr->vb[16].instance_data_step_rate);
+}
+
+void
+brw_dump_vertex_buffer_state(const struct brw_vertex_buffer_state *ptr)
+{  /* Print the vb0 sub-fields and the three remaining fields of *ptr in hex. */
+   debug_printf("\t\t.vb0.pitch = 0x%x\n", ptr->vb0.pitch);
+   debug_printf("\t\t.vb0.access_type = 0x%x\n", ptr->vb0.access_type);
+   debug_printf("\t\t.vb0.vb_index = 0x%x\n", ptr->vb0.vb_index);
+   debug_printf("\t\t.start_addr = 0x%x\n", ptr->start_addr);
+   debug_printf("\t\t.max_index = 0x%x\n", ptr->max_index);
+   debug_printf("\t\t.instance_data_step_rate = 0x%x\n", ptr->instance_data_step_rate);
+}
+
+void
+brw_dump_vertex_element_packet(const struct brw_vertex_element_packet *ptr)
+{  /* Print the header, then the ve0/ve1 fields of ve[0]..ve[17], in hex. */
+   debug_printf("\t\t.header.length = 0x%x\n", ptr->header.length);
+   debug_printf("\t\t.header.opcode = 0x%x\n", ptr->header.opcode);
+   debug_printf("\t\t.ve[0].ve0.src_offset = 0x%x\n", ptr->ve[0].ve0.src_offset);
+   debug_printf("\t\t.ve[0].ve0.src_format = 0x%x\n", ptr->ve[0].ve0.src_format);
+   debug_printf("\t\t.ve[0].ve0.valid = 0x%x\n", ptr->ve[0].ve0.valid);
+   debug_printf("\t\t.ve[0].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[0].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[0].ve1.dst_offset = 0x%x\n", ptr->ve[0].ve1.dst_offset);
+   debug_printf("\t\t.ve[0].ve1.vfcomponent3 = 0x%x\n", ptr->ve[0].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[0].ve1.vfcomponent2 = 0x%x\n", ptr->ve[0].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[0].ve1.vfcomponent1 = 0x%x\n", ptr->ve[0].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[0].ve1.vfcomponent0 = 0x%x\n", ptr->ve[0].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[1].ve0.src_offset = 0x%x\n", ptr->ve[1].ve0.src_offset);
+   debug_printf("\t\t.ve[1].ve0.src_format = 0x%x\n", ptr->ve[1].ve0.src_format);
+   debug_printf("\t\t.ve[1].ve0.valid = 0x%x\n", ptr->ve[1].ve0.valid);
+   debug_printf("\t\t.ve[1].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[1].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[1].ve1.dst_offset = 0x%x\n", ptr->ve[1].ve1.dst_offset);
+   debug_printf("\t\t.ve[1].ve1.vfcomponent3 = 0x%x\n", ptr->ve[1].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[1].ve1.vfcomponent2 = 0x%x\n", ptr->ve[1].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[1].ve1.vfcomponent1 = 0x%x\n", ptr->ve[1].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[1].ve1.vfcomponent0 = 0x%x\n", ptr->ve[1].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[2].ve0.src_offset = 0x%x\n", ptr->ve[2].ve0.src_offset);
+   debug_printf("\t\t.ve[2].ve0.src_format = 0x%x\n", ptr->ve[2].ve0.src_format);
+   debug_printf("\t\t.ve[2].ve0.valid = 0x%x\n", ptr->ve[2].ve0.valid);
+   debug_printf("\t\t.ve[2].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[2].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[2].ve1.dst_offset = 0x%x\n", ptr->ve[2].ve1.dst_offset);
+   debug_printf("\t\t.ve[2].ve1.vfcomponent3 = 0x%x\n", ptr->ve[2].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[2].ve1.vfcomponent2 = 0x%x\n", ptr->ve[2].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[2].ve1.vfcomponent1 = 0x%x\n", ptr->ve[2].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[2].ve1.vfcomponent0 = 0x%x\n", ptr->ve[2].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[3].ve0.src_offset = 0x%x\n", ptr->ve[3].ve0.src_offset);
+   debug_printf("\t\t.ve[3].ve0.src_format = 0x%x\n", ptr->ve[3].ve0.src_format);
+   debug_printf("\t\t.ve[3].ve0.valid = 0x%x\n", ptr->ve[3].ve0.valid);
+   debug_printf("\t\t.ve[3].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[3].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[3].ve1.dst_offset = 0x%x\n", ptr->ve[3].ve1.dst_offset);
+   debug_printf("\t\t.ve[3].ve1.vfcomponent3 = 0x%x\n", ptr->ve[3].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[3].ve1.vfcomponent2 = 0x%x\n", ptr->ve[3].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[3].ve1.vfcomponent1 = 0x%x\n", ptr->ve[3].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[3].ve1.vfcomponent0 = 0x%x\n", ptr->ve[3].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[4].ve0.src_offset = 0x%x\n", ptr->ve[4].ve0.src_offset);
+   debug_printf("\t\t.ve[4].ve0.src_format = 0x%x\n", ptr->ve[4].ve0.src_format);
+   debug_printf("\t\t.ve[4].ve0.valid = 0x%x\n", ptr->ve[4].ve0.valid);
+   debug_printf("\t\t.ve[4].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[4].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[4].ve1.dst_offset = 0x%x\n", ptr->ve[4].ve1.dst_offset);
+   debug_printf("\t\t.ve[4].ve1.vfcomponent3 = 0x%x\n", ptr->ve[4].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[4].ve1.vfcomponent2 = 0x%x\n", ptr->ve[4].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[4].ve1.vfcomponent1 = 0x%x\n", ptr->ve[4].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[4].ve1.vfcomponent0 = 0x%x\n", ptr->ve[4].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[5].ve0.src_offset = 0x%x\n", ptr->ve[5].ve0.src_offset);
+   debug_printf("\t\t.ve[5].ve0.src_format = 0x%x\n", ptr->ve[5].ve0.src_format);
+   debug_printf("\t\t.ve[5].ve0.valid = 0x%x\n", ptr->ve[5].ve0.valid);
+   debug_printf("\t\t.ve[5].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[5].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[5].ve1.dst_offset = 0x%x\n", ptr->ve[5].ve1.dst_offset);
+   debug_printf("\t\t.ve[5].ve1.vfcomponent3 = 0x%x\n", ptr->ve[5].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[5].ve1.vfcomponent2 = 0x%x\n", ptr->ve[5].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[5].ve1.vfcomponent1 = 0x%x\n", ptr->ve[5].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[5].ve1.vfcomponent0 = 0x%x\n", ptr->ve[5].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[6].ve0.src_offset = 0x%x\n", ptr->ve[6].ve0.src_offset);
+   debug_printf("\t\t.ve[6].ve0.src_format = 0x%x\n", ptr->ve[6].ve0.src_format);
+   debug_printf("\t\t.ve[6].ve0.valid = 0x%x\n", ptr->ve[6].ve0.valid);
+   debug_printf("\t\t.ve[6].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[6].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[6].ve1.dst_offset = 0x%x\n", ptr->ve[6].ve1.dst_offset);
+   debug_printf("\t\t.ve[6].ve1.vfcomponent3 = 0x%x\n", ptr->ve[6].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[6].ve1.vfcomponent2 = 0x%x\n", ptr->ve[6].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[6].ve1.vfcomponent1 = 0x%x\n", ptr->ve[6].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[6].ve1.vfcomponent0 = 0x%x\n", ptr->ve[6].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[7].ve0.src_offset = 0x%x\n", ptr->ve[7].ve0.src_offset);
+   debug_printf("\t\t.ve[7].ve0.src_format = 0x%x\n", ptr->ve[7].ve0.src_format);
+   debug_printf("\t\t.ve[7].ve0.valid = 0x%x\n", ptr->ve[7].ve0.valid);
+   debug_printf("\t\t.ve[7].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[7].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[7].ve1.dst_offset = 0x%x\n", ptr->ve[7].ve1.dst_offset);
+   debug_printf("\t\t.ve[7].ve1.vfcomponent3 = 0x%x\n", ptr->ve[7].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[7].ve1.vfcomponent2 = 0x%x\n", ptr->ve[7].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[7].ve1.vfcomponent1 = 0x%x\n", ptr->ve[7].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[7].ve1.vfcomponent0 = 0x%x\n", ptr->ve[7].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[8].ve0.src_offset = 0x%x\n", ptr->ve[8].ve0.src_offset);
+   debug_printf("\t\t.ve[8].ve0.src_format = 0x%x\n", ptr->ve[8].ve0.src_format);
+   debug_printf("\t\t.ve[8].ve0.valid = 0x%x\n", ptr->ve[8].ve0.valid);
+   debug_printf("\t\t.ve[8].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[8].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[8].ve1.dst_offset = 0x%x\n", ptr->ve[8].ve1.dst_offset);
+   debug_printf("\t\t.ve[8].ve1.vfcomponent3 = 0x%x\n", ptr->ve[8].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[8].ve1.vfcomponent2 = 0x%x\n", ptr->ve[8].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[8].ve1.vfcomponent1 = 0x%x\n", ptr->ve[8].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[8].ve1.vfcomponent0 = 0x%x\n", ptr->ve[8].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[9].ve0.src_offset = 0x%x\n", ptr->ve[9].ve0.src_offset);
+   debug_printf("\t\t.ve[9].ve0.src_format = 0x%x\n", ptr->ve[9].ve0.src_format);
+   debug_printf("\t\t.ve[9].ve0.valid = 0x%x\n", ptr->ve[9].ve0.valid);
+   debug_printf("\t\t.ve[9].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[9].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[9].ve1.dst_offset = 0x%x\n", ptr->ve[9].ve1.dst_offset);
+   debug_printf("\t\t.ve[9].ve1.vfcomponent3 = 0x%x\n", ptr->ve[9].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[9].ve1.vfcomponent2 = 0x%x\n", ptr->ve[9].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[9].ve1.vfcomponent1 = 0x%x\n", ptr->ve[9].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[9].ve1.vfcomponent0 = 0x%x\n", ptr->ve[9].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[10].ve0.src_offset = 0x%x\n", ptr->ve[10].ve0.src_offset);
+   debug_printf("\t\t.ve[10].ve0.src_format = 0x%x\n", ptr->ve[10].ve0.src_format);
+   debug_printf("\t\t.ve[10].ve0.valid = 0x%x\n", ptr->ve[10].ve0.valid);
+   debug_printf("\t\t.ve[10].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[10].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[10].ve1.dst_offset = 0x%x\n", ptr->ve[10].ve1.dst_offset);
+   debug_printf("\t\t.ve[10].ve1.vfcomponent3 = 0x%x\n", ptr->ve[10].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[10].ve1.vfcomponent2 = 0x%x\n", ptr->ve[10].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[10].ve1.vfcomponent1 = 0x%x\n", ptr->ve[10].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[10].ve1.vfcomponent0 = 0x%x\n", ptr->ve[10].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[11].ve0.src_offset = 0x%x\n", ptr->ve[11].ve0.src_offset);
+   debug_printf("\t\t.ve[11].ve0.src_format = 0x%x\n", ptr->ve[11].ve0.src_format);
+   debug_printf("\t\t.ve[11].ve0.valid = 0x%x\n", ptr->ve[11].ve0.valid);
+   debug_printf("\t\t.ve[11].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[11].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[11].ve1.dst_offset = 0x%x\n", ptr->ve[11].ve1.dst_offset);
+   debug_printf("\t\t.ve[11].ve1.vfcomponent3 = 0x%x\n", ptr->ve[11].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[11].ve1.vfcomponent2 = 0x%x\n", ptr->ve[11].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[11].ve1.vfcomponent1 = 0x%x\n", ptr->ve[11].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[11].ve1.vfcomponent0 = 0x%x\n", ptr->ve[11].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[12].ve0.src_offset = 0x%x\n", ptr->ve[12].ve0.src_offset);
+   debug_printf("\t\t.ve[12].ve0.src_format = 0x%x\n", ptr->ve[12].ve0.src_format);
+   debug_printf("\t\t.ve[12].ve0.valid = 0x%x\n", ptr->ve[12].ve0.valid);
+   debug_printf("\t\t.ve[12].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[12].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[12].ve1.dst_offset = 0x%x\n", ptr->ve[12].ve1.dst_offset);
+   debug_printf("\t\t.ve[12].ve1.vfcomponent3 = 0x%x\n", ptr->ve[12].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[12].ve1.vfcomponent2 = 0x%x\n", ptr->ve[12].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[12].ve1.vfcomponent1 = 0x%x\n", ptr->ve[12].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[12].ve1.vfcomponent0 = 0x%x\n", ptr->ve[12].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[13].ve0.src_offset = 0x%x\n", ptr->ve[13].ve0.src_offset);
+   debug_printf("\t\t.ve[13].ve0.src_format = 0x%x\n", ptr->ve[13].ve0.src_format);
+   debug_printf("\t\t.ve[13].ve0.valid = 0x%x\n", ptr->ve[13].ve0.valid);
+   debug_printf("\t\t.ve[13].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[13].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[13].ve1.dst_offset = 0x%x\n", ptr->ve[13].ve1.dst_offset);
+   debug_printf("\t\t.ve[13].ve1.vfcomponent3 = 0x%x\n", ptr->ve[13].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[13].ve1.vfcomponent2 = 0x%x\n", ptr->ve[13].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[13].ve1.vfcomponent1 = 0x%x\n", ptr->ve[13].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[13].ve1.vfcomponent0 = 0x%x\n", ptr->ve[13].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[14].ve0.src_offset = 0x%x\n", ptr->ve[14].ve0.src_offset);
+   debug_printf("\t\t.ve[14].ve0.src_format = 0x%x\n", ptr->ve[14].ve0.src_format);
+   debug_printf("\t\t.ve[14].ve0.valid = 0x%x\n", ptr->ve[14].ve0.valid);
+   debug_printf("\t\t.ve[14].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[14].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[14].ve1.dst_offset = 0x%x\n", ptr->ve[14].ve1.dst_offset);
+   debug_printf("\t\t.ve[14].ve1.vfcomponent3 = 0x%x\n", ptr->ve[14].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[14].ve1.vfcomponent2 = 0x%x\n", ptr->ve[14].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[14].ve1.vfcomponent1 = 0x%x\n", ptr->ve[14].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[14].ve1.vfcomponent0 = 0x%x\n", ptr->ve[14].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[15].ve0.src_offset = 0x%x\n", ptr->ve[15].ve0.src_offset);
+   debug_printf("\t\t.ve[15].ve0.src_format = 0x%x\n", ptr->ve[15].ve0.src_format);
+   debug_printf("\t\t.ve[15].ve0.valid = 0x%x\n", ptr->ve[15].ve0.valid);
+   debug_printf("\t\t.ve[15].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[15].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[15].ve1.dst_offset = 0x%x\n", ptr->ve[15].ve1.dst_offset);
+   debug_printf("\t\t.ve[15].ve1.vfcomponent3 = 0x%x\n", ptr->ve[15].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[15].ve1.vfcomponent2 = 0x%x\n", ptr->ve[15].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[15].ve1.vfcomponent1 = 0x%x\n", ptr->ve[15].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[15].ve1.vfcomponent0 = 0x%x\n", ptr->ve[15].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[16].ve0.src_offset = 0x%x\n", ptr->ve[16].ve0.src_offset);
+   debug_printf("\t\t.ve[16].ve0.src_format = 0x%x\n", ptr->ve[16].ve0.src_format);
+   debug_printf("\t\t.ve[16].ve0.valid = 0x%x\n", ptr->ve[16].ve0.valid);
+   debug_printf("\t\t.ve[16].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[16].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[16].ve1.dst_offset = 0x%x\n", ptr->ve[16].ve1.dst_offset);
+   debug_printf("\t\t.ve[16].ve1.vfcomponent3 = 0x%x\n", ptr->ve[16].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[16].ve1.vfcomponent2 = 0x%x\n", ptr->ve[16].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[16].ve1.vfcomponent1 = 0x%x\n", ptr->ve[16].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[16].ve1.vfcomponent0 = 0x%x\n", ptr->ve[16].ve1.vfcomponent0);
+   debug_printf("\t\t.ve[17].ve0.src_offset = 0x%x\n", ptr->ve[17].ve0.src_offset);
+   debug_printf("\t\t.ve[17].ve0.src_format = 0x%x\n", ptr->ve[17].ve0.src_format);
+   debug_printf("\t\t.ve[17].ve0.valid = 0x%x\n", ptr->ve[17].ve0.valid);
+   debug_printf("\t\t.ve[17].ve0.vertex_buffer_index = 0x%x\n", ptr->ve[17].ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve[17].ve1.dst_offset = 0x%x\n", ptr->ve[17].ve1.dst_offset);
+   debug_printf("\t\t.ve[17].ve1.vfcomponent3 = 0x%x\n", ptr->ve[17].ve1.vfcomponent3);
+   debug_printf("\t\t.ve[17].ve1.vfcomponent2 = 0x%x\n", ptr->ve[17].ve1.vfcomponent2);
+   debug_printf("\t\t.ve[17].ve1.vfcomponent1 = 0x%x\n", ptr->ve[17].ve1.vfcomponent1);
+   debug_printf("\t\t.ve[17].ve1.vfcomponent0 = 0x%x\n", ptr->ve[17].ve1.vfcomponent0);
+}
+
+void
+brw_dump_vertex_element_state(const struct brw_vertex_element_state *ptr)
+{  /* Print the ve0 and ve1 sub-fields of *ptr in hex, one per line. */
+   debug_printf("\t\t.ve0.src_offset = 0x%x\n", ptr->ve0.src_offset);
+   debug_printf("\t\t.ve0.src_format = 0x%x\n", ptr->ve0.src_format);
+   debug_printf("\t\t.ve0.valid = 0x%x\n", ptr->ve0.valid);
+   debug_printf("\t\t.ve0.vertex_buffer_index = 0x%x\n", ptr->ve0.vertex_buffer_index);
+   debug_printf("\t\t.ve1.dst_offset = 0x%x\n", ptr->ve1.dst_offset);
+   debug_printf("\t\t.ve1.vfcomponent3 = 0x%x\n", ptr->ve1.vfcomponent3);
+   debug_printf("\t\t.ve1.vfcomponent2 = 0x%x\n", ptr->ve1.vfcomponent2);
+   debug_printf("\t\t.ve1.vfcomponent1 = 0x%x\n", ptr->ve1.vfcomponent1);
+   debug_printf("\t\t.ve1.vfcomponent0 = 0x%x\n", ptr->ve1.vfcomponent0);
+}
+
+void
+brw_dump_vf_statistics(const struct brw_vf_statistics *ptr)
+{  /* Print statistics_enable and opcode of *ptr in hex via debug_printf. */
+   debug_printf("\t\t.statistics_enable = 0x%x\n", ptr->statistics_enable);
+   debug_printf("\t\t.opcode = 0x%x\n", ptr->opcode);
+}
+
+void
+brw_dump_vs_unit_state(const struct brw_vs_unit_state *ptr)
+{  /* Print the thread0..thread4, vs5 and vs6 fields of *ptr in hex. */
+   debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", ptr->thread0.grf_reg_count);
+   debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", ptr->thread0.kernel_start_pointer);
+   debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", ptr->thread1.ext_halt_exception_enable);
+   debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", ptr->thread1.sw_exception_enable);
+   debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", ptr->thread1.mask_stack_exception_enable);
+   debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", ptr->thread1.timeout_exception_enable);
+   debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", ptr->thread1.illegal_op_exception_enable);
+   debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", ptr->thread1.depth_coef_urb_read_offset);
+   debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", ptr->thread1.floating_point_mode);
+   debug_printf("\t\t.thread1.thread_priority = 0x%x\n", ptr->thread1.thread_priority);
+   debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", ptr->thread1.binding_table_entry_count);
+   debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", ptr->thread1.single_program_flow);
+   debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", ptr->thread2.per_thread_scratch_space);
+   debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", ptr->thread2.scratch_space_base_pointer);
+   debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", ptr->thread3.dispatch_grf_start_reg);
+   debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", ptr->thread3.urb_entry_read_offset);
+   debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", ptr->thread3.urb_entry_read_length);
+   debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", ptr->thread3.const_urb_entry_read_offset);
+   debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", ptr->thread3.const_urb_entry_read_length);
+   debug_printf("\t\t.thread4.stats_enable = 0x%x\n", ptr->thread4.stats_enable);
+   debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", ptr->thread4.nr_urb_entries);
+   debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", ptr->thread4.urb_entry_allocation_size);
+   debug_printf("\t\t.thread4.max_threads = 0x%x\n", ptr->thread4.max_threads);
+   debug_printf("\t\t.vs5.sampler_count = 0x%x\n", ptr->vs5.sampler_count);
+   debug_printf("\t\t.vs5.sampler_state_pointer = 0x%x\n", ptr->vs5.sampler_state_pointer);
+   debug_printf("\t\t.vs6.vs_enable = 0x%x\n", ptr->vs6.vs_enable);
+   debug_printf("\t\t.vs6.vert_cache_disable = 0x%x\n", ptr->vs6.vert_cache_disable);
+}
+
+/**
+ * Print the fields of a WM unit state structure, one per line, through
+ * debug_printf().  Integer fields are shown in hexadecimal; the two global
+ * depth offset values are shown with %f.
+ */
+void
+brw_dump_wm_unit_state(const struct brw_wm_unit_state *ptr)
+{
+   /* thread0 .. thread3 */
+   debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", ptr->thread0.grf_reg_count);
+   debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", ptr->thread0.kernel_start_pointer);
+   debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", ptr->thread1.ext_halt_exception_enable);
+   debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", ptr->thread1.sw_exception_enable);
+   debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", ptr->thread1.mask_stack_exception_enable);
+   debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", ptr->thread1.timeout_exception_enable);
+   debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", ptr->thread1.illegal_op_exception_enable);
+   debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", ptr->thread1.depth_coef_urb_read_offset);
+   debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", ptr->thread1.floating_point_mode);
+   debug_printf("\t\t.thread1.thread_priority = 0x%x\n", ptr->thread1.thread_priority);
+   debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", ptr->thread1.binding_table_entry_count);
+   debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", ptr->thread1.single_program_flow);
+   debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", ptr->thread2.per_thread_scratch_space);
+   debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", ptr->thread2.scratch_space_base_pointer);
+   debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", ptr->thread3.dispatch_grf_start_reg);
+   debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", ptr->thread3.urb_entry_read_offset);
+   debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", ptr->thread3.urb_entry_read_length);
+   debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", ptr->thread3.const_urb_entry_read_offset);
+   debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", ptr->thread3.const_urb_entry_read_length);
+
+   /* wm4, wm5 */
+   debug_printf("\t\t.wm4.stats_enable = 0x%x\n", ptr->wm4.stats_enable);
+   debug_printf("\t\t.wm4.depth_buffer_clear = 0x%x\n", ptr->wm4.depth_buffer_clear);
+   debug_printf("\t\t.wm4.sampler_count = 0x%x\n", ptr->wm4.sampler_count);
+   debug_printf("\t\t.wm4.sampler_state_pointer = 0x%x\n", ptr->wm4.sampler_state_pointer);
+   debug_printf("\t\t.wm5.enable_8_pix = 0x%x\n", ptr->wm5.enable_8_pix);
+   debug_printf("\t\t.wm5.enable_16_pix = 0x%x\n", ptr->wm5.enable_16_pix);
+   debug_printf("\t\t.wm5.enable_32_pix = 0x%x\n", ptr->wm5.enable_32_pix);
+   debug_printf("\t\t.wm5.enable_con_32_pix = 0x%x\n", ptr->wm5.enable_con_32_pix);
+   debug_printf("\t\t.wm5.enable_con_64_pix = 0x%x\n", ptr->wm5.enable_con_64_pix);
+   debug_printf("\t\t.wm5.legacy_global_depth_bias = 0x%x\n", ptr->wm5.legacy_global_depth_bias);
+   debug_printf("\t\t.wm5.line_stipple = 0x%x\n", ptr->wm5.line_stipple);
+   debug_printf("\t\t.wm5.depth_offset = 0x%x\n", ptr->wm5.depth_offset);
+   debug_printf("\t\t.wm5.polygon_stipple = 0x%x\n", ptr->wm5.polygon_stipple);
+   debug_printf("\t\t.wm5.line_aa_region_width = 0x%x\n", ptr->wm5.line_aa_region_width);
+   debug_printf("\t\t.wm5.line_endcap_aa_region_width = 0x%x\n", ptr->wm5.line_endcap_aa_region_width);
+   debug_printf("\t\t.wm5.early_depth_test = 0x%x\n", ptr->wm5.early_depth_test);
+   debug_printf("\t\t.wm5.thread_dispatch_enable = 0x%x\n", ptr->wm5.thread_dispatch_enable);
+   debug_printf("\t\t.wm5.program_uses_depth = 0x%x\n", ptr->wm5.program_uses_depth);
+   debug_printf("\t\t.wm5.program_computes_depth = 0x%x\n", ptr->wm5.program_computes_depth);
+   debug_printf("\t\t.wm5.program_uses_killpixel = 0x%x\n", ptr->wm5.program_uses_killpixel);
+   debug_printf("\t\t.wm5.legacy_line_rast = 0x%x\n", ptr->wm5.legacy_line_rast);
+   debug_printf("\t\t.wm5.transposed_urb_read_enable = 0x%x\n", ptr->wm5.transposed_urb_read_enable);
+   debug_printf("\t\t.wm5.max_threads = 0x%x\n", ptr->wm5.max_threads);
+
+   /* floating point members */
+   debug_printf("\t\t.global_depth_offset_constant = %f\n", ptr->global_depth_offset_constant);
+   debug_printf("\t\t.global_depth_offset_scale = %f\n", ptr->global_depth_offset_scale);
+
+   /* wm8 .. wm10 */
+   debug_printf("\t\t.wm8.grf_reg_count_1 = 0x%x\n", ptr->wm8.grf_reg_count_1);
+   debug_printf("\t\t.wm8.kernel_start_pointer_1 = 0x%x\n", ptr->wm8.kernel_start_pointer_1);
+   debug_printf("\t\t.wm9.grf_reg_count_2 = 0x%x\n", ptr->wm9.grf_reg_count_2);
+   debug_printf("\t\t.wm9.kernel_start_pointer_2 = 0x%x\n", ptr->wm9.kernel_start_pointer_2);
+   debug_printf("\t\t.wm10.grf_reg_count_3 = 0x%x\n", ptr->wm10.grf_reg_count_3);
+   debug_printf("\t\t.wm10.kernel_start_pointer_3 = 0x%x\n", ptr->wm10.kernel_start_pointer_3);
+}
+
diff --git a/src/gallium/drivers/i965/brw_structs_dump.h b/src/gallium/drivers/i965/brw_structs_dump.h
new file mode 100644
index 0000000000..7c02dbfe33
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs_dump.h
@@ -0,0 +1,276 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Dump i965 data structures.
+ *
+ * Generated automatically from brw_structs.h by brw_structs_dump.py.
+ */
+
+#ifndef BRW_STRUCTS_DUMP_H
+#define BRW_STRUCTS_DUMP_H
+
+struct brw_3d_control;
+struct brw_3d_primitive;
+struct brw_aa_line_parameters;
+struct brw_binding_table_pointers;
+struct brw_blend_constant_color;
+struct brw_cc0;
+struct brw_cc1;
+struct brw_cc2;
+struct brw_cc3;
+struct brw_cc4;
+struct brw_cc5;
+struct brw_cc6;
+struct brw_cc7;
+struct brw_cc_unit_state;
+struct brw_cc_viewport;
+struct brw_clip_unit_state;
+struct brw_clipper_viewport;
+struct brw_constant_buffer;
+struct brw_cs_urb_state;
+struct brw_depthbuffer;
+struct brw_depthbuffer_g4x;
+struct brw_drawrect;
+struct brw_global_depth_offset_clamp;
+struct brw_gs_unit_state;
+struct brw_indexbuffer;
+struct brw_line_stipple;
+struct brw_mi_flush;
+struct brw_pipe_control;
+struct brw_pipeline_select;
+struct brw_pipelined_state_pointers;
+struct brw_polygon_stipple;
+struct brw_polygon_stipple_offset;
+struct brw_sampler_default_color;
+struct brw_sampler_state;
+struct brw_sf_unit_state;
+struct brw_sf_viewport;
+struct brw_ss0;
+struct brw_ss1;
+struct brw_ss2;
+struct brw_ss3;
+struct brw_state_base_address;
+struct brw_state_prefetch;
+struct brw_surf_ss0;
+struct brw_surf_ss1;
+struct brw_surf_ss2;
+struct brw_surf_ss3;
+struct brw_surf_ss4;
+struct brw_surf_ss5;
+struct brw_surface_state;
+struct brw_system_instruction_pointer;
+struct brw_urb_fence;
+struct brw_urb_immediate;
+struct brw_vb_array_state;
+struct brw_vertex_buffer_state;
+struct brw_vertex_element_packet;
+struct brw_vertex_element_state;
+struct brw_vf_statistics;
+struct brw_vs_unit_state;
+struct brw_wm_unit_state;
+
+void
+brw_dump_3d_control(const struct brw_3d_control *ptr);
+
+void
+brw_dump_3d_primitive(const struct brw_3d_primitive *ptr);
+
+void
+brw_dump_aa_line_parameters(const struct brw_aa_line_parameters *ptr);
+
+void
+brw_dump_binding_table_pointers(const struct brw_binding_table_pointers *ptr);
+
+void
+brw_dump_blend_constant_color(const struct brw_blend_constant_color *ptr);
+
+void
+brw_dump_cc0(const struct brw_cc0 *ptr);
+
+void
+brw_dump_cc1(const struct brw_cc1 *ptr);
+
+void
+brw_dump_cc2(const struct brw_cc2 *ptr);
+
+void
+brw_dump_cc3(const struct brw_cc3 *ptr);
+
+void
+brw_dump_cc4(const struct brw_cc4 *ptr);
+
+void
+brw_dump_cc5(const struct brw_cc5 *ptr);
+
+void
+brw_dump_cc6(const struct brw_cc6 *ptr);
+
+void
+brw_dump_cc7(const struct brw_cc7 *ptr);
+
+void
+brw_dump_cc_unit_state(const struct brw_cc_unit_state *ptr);
+
+void
+brw_dump_cc_viewport(const struct brw_cc_viewport *ptr);
+
+void
+brw_dump_clip_unit_state(const struct brw_clip_unit_state *ptr);
+
+void
+brw_dump_clipper_viewport(const struct brw_clipper_viewport *ptr);
+
+void
+brw_dump_constant_buffer(const struct brw_constant_buffer *ptr);
+
+void
+brw_dump_cs_urb_state(const struct brw_cs_urb_state *ptr);
+
+void
+brw_dump_depthbuffer(const struct brw_depthbuffer *ptr);
+
+void
+brw_dump_depthbuffer_g4x(const struct brw_depthbuffer_g4x *ptr);
+
+void
+brw_dump_drawrect(const struct brw_drawrect *ptr);
+
+void
+brw_dump_global_depth_offset_clamp(const struct brw_global_depth_offset_clamp *ptr);
+
+void
+brw_dump_gs_unit_state(const struct brw_gs_unit_state *ptr);
+
+void
+brw_dump_indexbuffer(const struct brw_indexbuffer *ptr);
+
+void
+brw_dump_line_stipple(const struct brw_line_stipple *ptr);
+
+void
+brw_dump_mi_flush(const struct brw_mi_flush *ptr);
+
+void
+brw_dump_pipe_control(const struct brw_pipe_control *ptr);
+
+void
+brw_dump_pipeline_select(const struct brw_pipeline_select *ptr);
+
+void
+brw_dump_pipelined_state_pointers(const struct brw_pipelined_state_pointers *ptr);
+
+void
+brw_dump_polygon_stipple(const struct brw_polygon_stipple *ptr);
+
+void
+brw_dump_polygon_stipple_offset(const struct brw_polygon_stipple_offset *ptr);
+
+void
+brw_dump_sampler_default_color(const struct brw_sampler_default_color *ptr);
+
+void
+brw_dump_sampler_state(const struct brw_sampler_state *ptr);
+
+void
+brw_dump_sf_unit_state(const struct brw_sf_unit_state *ptr);
+
+void
+brw_dump_sf_viewport(const struct brw_sf_viewport *ptr);
+
+void
+brw_dump_ss0(const struct brw_ss0 *ptr);
+
+void
+brw_dump_ss1(const struct brw_ss1 *ptr);
+
+void
+brw_dump_ss2(const struct brw_ss2 *ptr);
+
+void
+brw_dump_ss3(const struct brw_ss3 *ptr);
+
+void
+brw_dump_state_base_address(const struct brw_state_base_address *ptr);
+
+void
+brw_dump_state_prefetch(const struct brw_state_prefetch *ptr);
+
+void
+brw_dump_surf_ss0(const struct brw_surf_ss0 *ptr);
+
+void
+brw_dump_surf_ss1(const struct brw_surf_ss1 *ptr);
+
+void
+brw_dump_surf_ss2(const struct brw_surf_ss2 *ptr);
+
+void
+brw_dump_surf_ss3(const struct brw_surf_ss3 *ptr);
+
+void
+brw_dump_surf_ss4(const struct brw_surf_ss4 *ptr);
+
+void
+brw_dump_surf_ss5(const struct brw_surf_ss5 *ptr);
+
+void
+brw_dump_surface_state(const struct brw_surface_state *ptr);
+
+void
+brw_dump_system_instruction_pointer(const struct brw_system_instruction_pointer *ptr);
+
+void
+brw_dump_urb_fence(const struct brw_urb_fence *ptr);
+
+void
+brw_dump_urb_immediate(const struct brw_urb_immediate *ptr);
+
+void
+brw_dump_vb_array_state(const struct brw_vb_array_state *ptr);
+
+void
+brw_dump_vertex_buffer_state(const struct brw_vertex_buffer_state *ptr);
+
+void
+brw_dump_vertex_element_packet(const struct brw_vertex_element_packet *ptr);
+
+void
+brw_dump_vertex_element_state(const struct brw_vertex_element_state *ptr);
+
+void
+brw_dump_vf_statistics(const struct brw_vf_statistics *ptr);
+
+void
+brw_dump_vs_unit_state(const struct brw_vs_unit_state *ptr);
+
+void
+brw_dump_wm_unit_state(const struct brw_wm_unit_state *ptr);
+
+
+#endif /* BRW_STRUCTS_DUMP_H */
diff --git a/src/gallium/drivers/i965/brw_structs_dump.py b/src/gallium/drivers/i965/brw_structs_dump.py
new file mode 100755
index 0000000000..6dba49ad91
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs_dump.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python
+'''
+Generates dumpers for the i965 state structures using pygccxml.
+
+Run as 
+
+  PYTHONPATH=/path/to/pygccxml-1.0.0 python brw_structs_dump.py
+
+Jose Fonseca <jfonseca@vmware.com>
+'''
+
+copyright = '''
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+ '''
+
+import os
+import sys
+import re
+
+from pygccxml import parser
+from pygccxml import declarations
+
+from pygccxml.declarations import algorithm
+from pygccxml.declarations import decl_visitor
+from pygccxml.declarations import type_traits
+from pygccxml.declarations import type_visitor
+
+
+# When true, decl_dumper_t.visit_enumeration() emits a C switch statement
+# that prints each enumerator by name; when false, enumeration members are
+# printed as plain integers with %i.
+enums = True
+
+
+def vars_filter(variable):
+    '''Tell whether a member variable should appear in the generated dump.
+
+    Returns False for members whose name begins with "pad" and for members
+    named "dword"; returns True for everything else.
+    '''
+    name = variable.name
+    # Raw string literal: in a plain literal '\d' is an invalid escape
+    # sequence, which newer Python versions warn about.
+    return not re.match(r'^pad\d*', name) and name != 'dword'
+
+
+class decl_dumper_t(decl_visitor.decl_visitor_t):
+    '''Declaration visitor that writes C dump code for one declaration.
+
+    stream receives the generated C text, instance is the member access
+    path that gets appended to "(*ptr)", and decl is the declaration
+    being visited.
+    '''
+
+    def __init__(self, stream, instance = '', decl = None):
+        decl_visitor.decl_visitor_t.__init__(self)
+        self.stream = stream
+        self._instance = instance
+        self.decl = decl
+
+    def clone(self):
+        return decl_dumper_t(self.stream, self._instance, self.decl)
+
+    def visit_class(self):
+        '''Dump every member of a struct or union that passes vars_filter().'''
+        assert self.decl.class_type in ('struct', 'union')
+
+        for member in self.decl.variables(recursive = False):
+            if not vars_filter(member):
+                continue
+            member_path = self._instance + '.' + member.name
+            dump_type(self.stream, member_path, member.type)
+
+    def visit_enumeration(self):
+        '''Dump an enumeration member, by name when the enums switch is on.'''
+        write = self.stream.write
+        lvalue = "(*ptr)" + self._instance
+        # Statement that prints the raw integer value; it is used on its
+        # own when enums is off and as the default case otherwise.
+        print_as_int = 'debug_printf("\\t\\t%s = %%i\\n", %s);\n' % (self._instance, lvalue)
+
+        if not enums:
+            write('   ' + print_as_int)
+            return
+
+        write('   switch(%s) {\n' % (lvalue,))
+        for enumerator, _ in self.decl.values:
+            write('   case %s:\n' % (enumerator,))
+            write('      debug_printf("\\t\\t%s = %s\\n");\n' % (self._instance, enumerator))
+            write('      break;\n')
+        write('   default:\n')
+        write('      ' + print_as_int)
+        write('      break;\n')
+        write('   }\n')
+
+
+def dump_decl(stream, instance, decl):
+    '''Run a decl_dumper_t over decl, writing the generated C to stream.'''
+    algorithm.apply_visitor(decl_dumper_t(stream, instance, decl), decl)
+
+
+class type_dumper_t(type_visitor.type_visitor_t):
+    '''Type visitor that writes the debug_printf() call for one member.
+
+    instance is the member access path (for example
+    ".thread0.grf_reg_count") that is appended to "(*ptr)" in the generated
+    C.  Integer types are printed in hexadecimal with a "0x" prefix,
+    bool with %i, floating point types with %f and pointers with %p.
+    Arrays are expanded one element at a time and declarated types are
+    handed over to dump_decl().
+    '''
+
+    def __init__(self, stream, instance, type_):
+        type_visitor.type_visitor_t.__init__(self)
+        self.stream = stream
+        self.instance = instance
+        self.type = type_
+
+    def clone(self):
+        # The constructor takes the stream as its first argument; without
+        # it the call fails with a TypeError.
+        return type_dumper_t(self.stream, self.instance, self.type)
+
+    def visit_bool(self):
+        self.print_instance('%i')
+
+    def visit_char(self):
+        self.print_instance('0x%x')
+
+    def visit_unsigned_char(self):
+        self.print_instance('0x%x')
+
+    def visit_signed_char(self):
+        self.print_instance('0x%x')
+
+    def visit_wchar(self):
+        self.print_instance('0x%x')
+
+    def visit_short_int(self):
+        self.print_instance('0x%x')
+
+    def visit_short_unsigned_int(self):
+        self.print_instance('0x%x')
+
+    def visit_int(self):
+        self.print_instance('0x%x')
+
+    def visit_unsigned_int(self):
+        self.print_instance('0x%x')
+
+    def visit_long_int(self):
+        self.print_instance('0x%lx')
+
+    def visit_long_unsigned_int(self):
+        # The "0x" prefix has to come before the conversion: '%0xlx' is
+        # read by printf as "%0x" followed by the literal text "lx".
+        self.print_instance('0x%lx')
+
+    def visit_long_long_int(self):
+        # Same ordering as above; '%0xllx' would not consume a long long.
+        self.print_instance('0x%llx')
+
+    def visit_long_long_unsigned_int(self):
+        self.print_instance('0x%llx')
+
+    def visit_float(self):
+        self.print_instance('%f')
+
+    def visit_double(self):
+        self.print_instance('%f')
+
+    def visit_array(self):
+        # One dump statement per element, indexed as instance[i].
+        for i in range(type_traits.array_size(self.type)):
+            dump_type(self.stream, self.instance + '[%i]' % i, type_traits.base_type(self.type))
+
+    def visit_pointer(self):
+        self.print_instance('%p')
+
+    def visit_declarated(self):
+        decl = type_traits.remove_declarated(self.type)
+        dump_decl(self.stream, self.instance, decl)
+
+    def print_instance(self, format):
+        '''Write a debug_printf() line that prints this member using format.'''
+        self.stream.write('   debug_printf("\\t\\t%s = %s\\n", %s);\n' % (self.instance, format, "(*ptr)" + self.instance))
+
+
+
+def dump_type(stream, instance, type_):
+    '''Strip aliases from type_ and run a type_dumper_t over the result.'''
+    resolved = type_traits.remove_alias(type_)
+    algorithm.apply_visitor(type_dumper_t(stream, instance, resolved), resolved)
+
+
+def dump_struct_interface(stream, class_, suffix = ';'):
+    '''Write the signature of the dump function for class_ to stream.
+
+    For a struct named brw_foo this writes "void" on one line and
+    "brw_dump_foo(const struct brw_foo *ptr)" plus suffix on the next.
+    '''
+    struct_name = class_.name
+    assert struct_name.startswith('brw_')
+    head, tail = struct_name[:4], struct_name[4:]
+    func_name = head + 'dump_' + tail
+    stream.write('void\n')
+    stream.write('%s(const struct %s *ptr)%s\n' % (func_name, struct_name, suffix))
+
+
+def dump_struct_implementation(stream, decls, class_):
+    '''Write the complete C dump function for class_ to stream.
+
+    decls is accepted but not used by this function.
+    '''
+    emit = stream.write
+    # Signature without the trailing ';', then the braces around the body.
+    dump_struct_interface(stream, class_, suffix = '')
+    emit('{\n')
+    dump_decl(stream, '', class_)
+    for closing in ('}\n', '\n'):
+        emit(closing)
+
+
+def dump_header(stream):
+    '''Write the license text and the "@file" comment shared by both outputs.'''
+    file_comment = (
+        '\n',
+        '/**\n',
+        ' * @file\n',
+        ' * Dump i965 data structures.\n',
+        ' *\n',
+        ' * Generated automatically from brw_structs.h by brw_structs_dump.py.\n',
+        ' */\n',
+        '\n',
+    )
+    stream.write(copyright.strip() + '\n')
+    for chunk in file_comment:
+        stream.write(chunk)
+
+
+def dump_interfaces(decls, global_ns, names):
+    '''Write brw_structs_dump.h into the current directory.
+
+    The header holds a forward declaration and a dump function prototype
+    for every struct listed in names.  Each name must match exactly one
+    class in global_ns.  decls is accepted but not used.
+    '''
+    # The with block closes the file even when a lookup below raises;
+    # previously the handle was never closed explicitly.
+    with open('brw_structs_dump.h', 'wt') as stream:
+        dump_header(stream)
+
+        stream.write('#ifndef BRW_STRUCTS_DUMP_H\n')
+        stream.write('#define BRW_STRUCTS_DUMP_H\n')
+        stream.write('\n')
+
+        for name in names:
+            stream.write('struct %s;\n' % (name,))
+        stream.write('\n')
+
+        for name in names:
+            (class_,) = global_ns.classes(name = name)
+            dump_struct_interface(stream, class_)
+            stream.write('\n')
+        stream.write('\n')
+
+        stream.write('#endif /* BRW_STRUCTS_DUMP_H */\n')
+
+
+def dump_implementations(decls, global_ns, names):
+    '''Write brw_structs_dump.c into the current directory.
+
+    The file holds the include lines followed by one dump function for
+    every struct listed in names.  Each name must match exactly one class
+    in global_ns.
+    '''
+    # The with block closes the file even when generation fails part way;
+    # previously the handle was never closed explicitly.
+    with open('brw_structs_dump.c', 'wt') as stream:
+        dump_header(stream)
+
+        stream.write('#include "util/u_debug.h"\n')
+        stream.write('\n')
+        stream.write('#include "brw_types.h"\n')
+        stream.write('#include "brw_structs.h"\n')
+        stream.write('#include "brw_structs_dump.h"\n')
+        stream.write('\n')
+
+        for name in names:
+            (class_,) = global_ns.classes(name = name)
+            dump_struct_implementation(stream, decls, class_)
+
+
+def decl_filter(decl):
+    '''Select the declarations to dump: brw_* names except brw_instruction.'''
+    excluded = ('brw_instruction',)
+    decl_name = decl.name
+    if not decl_name.startswith('brw_'):
+        return False
+    return decl_name not in excluded
+
+
+def main():
+    '''Parse the brw headers and regenerate brw_structs_dump.h and .c.
+
+    The headers and the output files are referenced by relative path.
+    '''
+    headers = [
+        'brw_types.h',
+        'brw_structs.h',
+    ]
+
+    config = parser.config_t(
+        include_paths = ['../../include'],
+        compiler = 'gcc',
+    )
+
+    decls = parser.parse(headers, config, parser.COMPILATION_MODE.ALL_AT_ONCE)
+    global_ns = declarations.get_global_namespace(decls)
+
+    # Sorted so that the generated files list the structs in a stable order.
+    names = sorted(class_.name for class_ in global_ns.classes(decl_filter))
+
+    dump_interfaces(decls, global_ns, names)
+    dump_implementations(decls, global_ns, names)
+
+
+# Run the generator only when executed as a script.
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
new file mode 100644
index 0000000000..f96301e99e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -0,0 +1,95 @@
+
+#include "brw_context.h"
+#include "brw_pipe_rast.h"
+
+
+#if 0
+
+/* Decide whether the current state requires the software TNL fallback.
+ * Returns TRUE when a fallback is needed, FALSE otherwise.
+ *
+ * NOTE: this function is compiled out by the enclosing #if 0, and
+ * everything after the "short-circuit" return below is unreachable.
+ */
+static GLboolean need_swtnl( struct brw_context *brw )
+{
+   const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
+
+   /* If we don't require strict OpenGL conformance, never
+    * use fallbacks.  If we're forcing fallbacks, always
+    * use fallbacks.
+    */
+   if (brw->flags.no_swtnl)
+      return FALSE;
+
+   if (brw->flags.force_swtnl)
+      return TRUE;
+
+   /* Exceeding hw limits on number of VS inputs?
+    * Having no vertex elements at all also takes the fallback.
+    */
+   if (brw->curr.num_vertex_elements == 0 ||
+       brw->curr.num_vertex_elements >= BRW_VEP_MAX) {
+      return TRUE;
+   }
+
+   /* Position array with zero stride?
+    *
+    * XXX: position isn't always at zero...
+    * XXX: eliminate zero-stride arrays
+    */
+   {
+      int ve0_vb = brw->curr.vertex_element[0].vertex_buffer_index;
+      
+      if (brw->curr.vertex_buffer[ve0_vb].stride == 0)
+	 return TRUE;
+   }
+
+   /* XXX: short-circuit
+    *
+    * None of the checks below this return are ever executed.
+    */
+   return FALSE;
+
+   /* Smooth polygons. */
+   if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) {
+      if (rast->poly_smooth)
+	 return TRUE;
+
+   }
+   
+   /* Lines, or triangles drawn in line fill mode. */
+   if (brw->reduced_primitive == PIPE_PRIM_LINES ||
+       (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+	(rast->fill_cw == PIPE_POLYGON_MODE_LINE ||
+	 rast->fill_ccw == PIPE_POLYGON_MODE_LINE)))
+   {
+      /* BRW hardware will do AA lines, but they are non-conformant it
+       * seems.  TBD whether we keep this fallback:
+       */
+      if (rast->line_smooth)
+	 return TRUE;
+
+      /* XXX: was a fallback in mesa (gs doesn't get enough
+       * information to know when to reset stipple counter), but there
+       * must be a way around it.
+       */
+      if (rast->line_stipple_enable &&
+	  (brw->reduced_primitive == PIPE_PRIM_TRIANGLES ||
+	   brw->primitive == PIPE_PRIM_LINE_LOOP || 
+	   brw->primitive == PIPE_PRIM_LINE_STRIP))
+	 return TRUE;
+   }
+
+   
+   /* Points, or triangles drawn in point fill mode. */
+   if (brw->reduced_primitive == PIPE_PRIM_POINTS ||
+       (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+	(rast->fill_cw == PIPE_POLYGON_MODE_POINT ||
+	 rast->fill_ccw == PIPE_POLYGON_MODE_POINT)))
+   {
+      if (rast->point_smooth)
+	 return TRUE;
+   }
+
+   /* BRW hardware doesn't handle CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats CLAMP
+    * as CLAMP_TO_EDGE instead.  If we're using CLAMP, and
+    * we want strict conformance, force the fallback.
+    *
+    * XXX: need a workaround for this.  No check is implemented here.
+    */
+      
+   /* Nothing stopping us from the fast path now */
+   return FALSE;
+}
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_types.h b/src/gallium/drivers/i965/brw_types.h
new file mode 100644
index 0000000000..89e08a5c80
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_types.h
@@ -0,0 +1,21 @@
+#ifndef BRW_TYPES_H
+#define BRW_TYPES_H
+
+/*
+ * GL-style scalar type names, defined in terms of fixed-width C types.
+ */
+
+#include "pipe/p_compiler.h"
+
+/* Unsigned integer types. */
+typedef uint32_t GLuint;
+typedef uint8_t GLubyte;
+typedef uint16_t GLushort;
+/* Signed integer types. */
+typedef int32_t GLint;
+typedef int8_t GLbyte;
+typedef int16_t GLshort;
+typedef float GLfloat;
+
+/* no GLenum, translate all away */
+
+typedef uint8_t GLboolean;
+
+/* GL boolean constants, expressed through the FALSE/TRUE macros. */
+#define GL_FALSE FALSE
+#define GL_TRUE TRUE
+
+#endif /* BRW_TYPES_H */
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
new file mode 100644
index 0000000000..907ec56c6c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -0,0 +1,263 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+        
+
+
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+/* Indices into the limits[] table, one per unit that is given URB space. */
+#define VS 0
+#define GS 1
+#define CLP 2
+#define SF 3
+#define CS 4
+
+/** @file brw_urb.c
+ *
+ * Manages the division of the URB space between the various fixed-function
+ * units.
+ *
+ * See the Thread Initiation Management section of the GEN4 B-Spec, and
+ * the individual *_STATE structures for restrictions on numbers of
+ * entries and threads.
+ */
+
+/*
+ * Generally, a unit requires a min_nr_entries based on how many entries
+ * it produces before the downstream unit gets unblocked and can use and
+ * dereference some of its handles.
+ *
+ * The SF unit preallocates a PUE at the start of thread dispatch, and only
+ * uses that one.  So it requires one entry per thread.
+ *
+ * For CLIP, the SF unit will hold the previous primitive while the
+ * next is getting assembled, meaning that linestrips require 3 CLIP VUEs
+ * (vertices) to ensure continued processing, trifans require 4, and tristrips
+ * require 5.  There can be 1 or 2 threads, and each has the same requirement.
+ *
+ * GS has the same requirement as CLIP, but it never handles tristrips,
+ * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces.
+ * We only run it single-threaded.
+ *
+ * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X).
+ * Each thread processes 2 preallocated VUEs (vertices) at a time, and they
+ * get streamed down as soon as threads processing earlier vertices get
+ * theirs accepted.
+ *
+ * Each unit will take the number of URB entries we give it (based on the
+ * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs,
+ * and brw_curbe.c for the CURBEs) and decide the maximum number of
+ * threads it can support based on that, in brw_*_state.c.
+ *
+ * XXX: Are the min_entry_size numbers useful?
+ * XXX: Verify min_nr_entries, esp for VS.
+ * XXX: Verify SF min_entry_size.
+ */
+/* Per-unit URB limits, indexed by the VS/GS/CLP/SF/CS constants.
+ *
+ * NOTE(review): the unit in which entry sizes are expressed is not stated
+ * in this file - confirm against the code that computes urb_entry_size.
+ */
+static const struct urb_limits {
+   GLuint min_nr_entries;	/* entry count used in constrained mode */
+   GLuint preferred_nr_entries;	/* entry count tried first */
+   GLuint min_entry_size;	/* requested entry sizes are raised to this */
+   GLuint max_entry_size;	/* not referenced in this file */
+} limits[CS+1] = {
+   { 16, 32, 1, 5 },			/* vs */
+   { 4, 8,  1, 5 },			/* gs */
+   { 5, 10,  1, 5 },			/* clp */
+   { 1, 8,  1, 12 },		        /* sf */
+   { 1, 4,  1, 32 }			/* cs */
+};
+
+
+/* Lay the unit regions out back to back in the order VS, GS, CLIP, SF, CS,
+ * using the entry counts and entry sizes currently stored in brw->urb.
+ *
+ * Side effect: the *_start offsets in brw->urb are always overwritten,
+ * whether or not the layout fits.
+ *
+ * Returns non-zero when the end of the CS region does not exceed
+ * URB_SIZES(brw), zero otherwise.
+ */
+static GLboolean check_urb_layout( struct brw_context *brw )
+{
+   brw->urb.vs_start = 0;
+   /* VS, GS and CLIP entries all use the vertex entry size (vsize). */
+   brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize;
+   brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize;
+   brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
+   brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
+
+   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw);
+}
+
+/* Most minimal update, forces re-emit of URB fence packet after GS
+ * unit turned on/off.
+ *
+ * Recomputes the per-unit URB entry counts and start offsets when the
+ * required entry sizes no longer match the stored ones, and sets
+ * BRW_NEW_URB_FENCE when it does so.  Always returns 0; if even the
+ * minimum entry counts do not fit, the process is ended with exit(1).
+ */
+static int recalculate_urb_fence( struct brw_context *brw )
+{
+   GLuint csize = brw->curbe.total_size;
+   GLuint vsize = brw->vs.prog_data->urb_entry_size;
+   GLuint sfsize = brw->sf.prog_data->urb_entry_size;
+
+   /* Raise each requested entry size to the per-unit minimum. */
+   if (csize < limits[CS].min_entry_size)
+      csize = limits[CS].min_entry_size;
+
+   if (vsize < limits[VS].min_entry_size)
+      vsize = limits[VS].min_entry_size;
+
+   if (sfsize < limits[SF].min_entry_size)
+      sfsize = limits[SF].min_entry_size;
+
+   /* Recalculate when a stored size is too small, or when we are in
+    * constrained mode and a stored size is larger than now required.
+    */
+   if (brw->urb.vsize < vsize ||
+       brw->urb.sfsize < sfsize ||
+       brw->urb.csize < csize ||
+       (brw->urb.constrained && (brw->urb.vsize > vsize ||
+				 brw->urb.sfsize > sfsize ||
+				 brw->urb.csize > csize))) {
+      
+
+      brw->urb.csize = csize;
+      brw->urb.sfsize = sfsize;
+      brw->urb.vsize = vsize;
+
+      /* Start from the preferred entry counts. */
+      brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;	
+      brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries;	
+      brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
+      brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;	
+      brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;	
+
+      brw->urb.constrained = 0;
+
+      /* Try larger VS (and, on IGDNG, SF) entry counts first; when they
+       * do not fit, go back to the preferred counts and mark the layout
+       * as constrained.  check_urb_layout() also updates the start
+       * offsets as a side effect.
+       */
+      if (BRW_IS_IGDNG(brw)) {
+         brw->urb.nr_vs_entries = 128;
+         brw->urb.nr_sf_entries = 48;
+         if (check_urb_layout(brw)) {
+            goto done;
+         } else {
+            brw->urb.constrained = 1;
+            brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+            brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+         }
+      } else if (BRW_IS_G4X(brw)) {
+	 brw->urb.nr_vs_entries = 64;
+	 if (check_urb_layout(brw)) {
+	    goto done;
+	 } else {
+	    brw->urb.constrained = 1;
+	    brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+	 }
+      }
+
+      /* Debug option: force the minimum entry counts. */
+      if (BRW_DEBUG & DEBUG_MIN_URB) {
+	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;	
+	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;	
+	 brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
+	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries;	
+	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries;	
+	 brw->urb.constrained = 1;
+      }
+
+      /* If the chosen counts do not fit, drop to the minimum counts. */
+      if (!check_urb_layout(brw)) {
+	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;	
+	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;	
+	 brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
+	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries;	
+	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries;	
+
+	 /* Mark us as operating with constrained nr_entries, so that next
+	  * time we recalculate we'll resize the fences in the hope of
+	  * escaping constrained mode and getting back to normal performance.
+	  */
+	 brw->urb.constrained = 1;
+	 
+	 if (!check_urb_layout(brw)) {
+	    /* This is impossible, given the maximal sizes of urb
+	     * entries and the values for minimum nr of entries
+	     * provided above.
+	     */
+	    debug_printf("couldn't calculate URB layout!\n");
+	    exit(1);
+	 }
+	 
+	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
+	    debug_printf("URB CONSTRAINED\n");
+      }
+
+done:
+      /* Reached directly by the goto above when the enlarged layout fits. */
+      if (BRW_DEBUG & DEBUG_URB)
+	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+		      brw->urb.vs_start,
+		      brw->urb.gs_start,
+		      brw->urb.clip_start,
+		      brw->urb.sf_start,
+		      brw->urb.cs_start, 
+		      URB_SIZES(brw));
+      
+      brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
+   }
+
+   return 0;
+}
+
+
+/* Tracked-state entry that installs recalculate_urb_fence() as its prepare
+ * hook.
+ *
+ * NOTE(review): presumably the hook is run when any of the dirty bits
+ * listed here is set - confirm against the state upload code.
+ */
+const struct brw_tracked_state brw_recalculate_urb_fence = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_CURBE_OFFSETS,
+      .cache = (CACHE_NEW_VS_PROG |
+		CACHE_NEW_SF_PROG)
+   },
+   .prepare = recalculate_urb_fence
+};
+
+
+
+
+
+/* Build a URB_FENCE command from the start offsets stored in brw->urb and
+ * hand it to BRW_BATCH_STRUCT().  Always returns 0.
+ */
+int brw_upload_urb_fence(struct brw_context *brw)
+{
+   struct brw_urb_fence uf;
+
+   memset(&uf, 0, sizeof uf);
+
+   /* The ordering below is correct, not the layout in the
+    * instruction.
+    *
+    * There are 256/384 urb reg pairs in total.
+    *
+    * Each unit's fence is the start offset of the unit that follows it;
+    * the last one is the total URB size.
+    */
+   uf.bits0.vs_fence  = brw->urb.gs_start;
+   uf.bits0.gs_fence  = brw->urb.clip_start;
+   uf.bits0.clp_fence = brw->urb.sf_start;
+   uf.bits1.sf_fence  = brw->urb.cs_start;
+   uf.bits1.cs_fence  = URB_SIZES(brw);
+
+   /* Command header: request reallocation for every unit. */
+   uf.header.opcode = CMD_URB_FENCE;
+   uf.header.length = sizeof uf / 4 - 2;
+   uf.header.cs_realloc = 1;
+   uf.header.vfe_realloc = 1;
+   uf.header.sf_realloc = 1;
+   uf.header.clp_realloc = 1;
+   uf.header.gs_realloc = 1;
+   uf.header.vs_realloc = 1;
+
+   BRW_BATCH_STRUCT(brw, &uf);
+   return 0;
+}
diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h
new file mode 100644
index 0000000000..b5f9a36e7b
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_util.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+          
+
+#ifndef BRW_UTIL_H
+#define BRW_UTIL_H
+
+#include "brw_types.h"
+
+/* Utility routines declared here for the other i965 sources. */
+GLuint brw_count_bits(GLuint val);
+GLuint brw_translate_blend_factor(unsigned factor);
+GLuint brw_translate_blend_equation(unsigned mode);
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
new file mode 100644
index 0000000000..ca8ee79550
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -0,0 +1,129 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "tgsi/tgsi_dump.h"           
+
+#include "brw_context.h"
+#include "brw_vs.h"
+#include "brw_state.h"
+
+
+
+/* Compile vertex shader 'vp' for 'key' and upload the result to the
+ * program cache; bo_out is handed straight to brw_upload_cache().
+ */
+static enum pipe_error do_vs_prog( struct brw_context *brw, 
+                                   struct brw_vertex_shader *vp,
+                                   struct brw_vs_prog_key *key,
+                                   struct brw_winsys_buffer **bo_out)
+{
+   struct brw_vs_compile compile;
+   const GLuint *insns;
+   GLuint insns_size;
+   enum pipe_error err;
+
+   /* Zero the whole compile state, then seed it from key and shader. */
+   memset(&compile, 0, sizeof(compile));
+   memcpy(&compile.key, key, sizeof(*key));
+
+   brw_init_compile(brw, &compile.func);
+   compile.vp = vp;
+
+   compile.prog_data.nr_outputs = vp->info.num_outputs;
+   compile.prog_data.nr_inputs = vp->info.num_inputs;
+
+   /* NOTE(review): the TGSI dump is unconditional -- debug leftover? */
+   if (1)
+      tgsi_dump(compile.vp->tokens, 0);
+
+   /* Emit GEN4 code, then fetch the assembled program. */
+   brw_vs_emit(&compile);
+
+   err = brw_get_program(&compile.func, &insns, &insns_size);
+   if (err)
+      return err;
+
+   /* Store program and prog_data in the cache under the key. */
+   err = brw_upload_cache( &brw->cache, BRW_VS_PROG,
+                           &compile.key, brw_vs_prog_key_size(&compile.key),
+                           NULL, 0,
+                           insns, insns_size,
+                           &compile.prog_data,
+                           &brw->vs.prog_data,
+                           bo_out);
+   return err ? err : PIPE_OK;
+}
+
+
+/* Build the VS program key from current state and bind the matching
+ * program, compiling it only when the cache has no entry for the key.
+ */
+static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
+{
+   struct brw_vertex_shader *shader = brw->curr.vertex_shader;
+   struct brw_fs_signature *fs_sig = &brw->curr.fragment_shader->signature;
+   struct brw_vs_prog_key key;
+   enum pipe_error err;
+
+   /* Zero first: only brw_fs_signature_size() bytes of the signature
+    * are copied below, the rest of the key must not hold garbage.
+    */
+   memset(&key, 0, sizeof(key));
+   memcpy(&key.fs_signature, fs_sig, brw_fs_signature_size(fs_sig));
+   key.nr_userclip = brw->curr.ucp.nr;
+   key.program_string_id = shader->id;
+
+   /* Cache hit: nothing to compile. */
+   if (brw_search_cache(&brw->cache, BRW_VS_PROG,
+                        &key, brw_vs_prog_key_size(&key),
+                        NULL, 0,
+                        &brw->vs.prog_data,
+                        &brw->vs.prog_bo))
+      return PIPE_OK;
+
+   /* Cache miss: compile and upload now. */
+   err = do_vs_prog(brw, shader, &key, &brw->vs.prog_bo);
+   return err ? err : PIPE_OK;
+}
+
+
+/* Tracked-state atom for the vertex program: brw_upload_vs_prog() is
+ * its prepare hook, keyed on the flags listed below.
+ */
+const struct brw_tracked_state brw_vs_prog = {
+   .dirty = {
+      .mesa  = PIPE_NEW_CLIP | PIPE_NEW_RAST |
+               PIPE_NEW_FRAGMENT_SIGNATURE,
+      .brw   = BRW_NEW_VERTEX_PROGRAM,
+      .cache = 0
+   },
+   .prepare = brw_upload_vs_prog
+};
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
new file mode 100644
index 0000000000..944d88c84c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -0,0 +1,106 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+            
+
+#ifndef BRW_VS_H
+#define BRW_VS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+
+/* VS cache key; keep fs_signature last, the size macro counts only its used part. */
+struct brw_vs_prog_key {
+   GLuint program_string_id;
+   GLuint nr_userclip:4;
+   GLuint pad:28;               /* 4 + 28 bits: name the whole word (assumes 32-bit GLuint) */
+   struct brw_fs_signature fs_signature;
+};
+#define brw_vs_prog_key_size(s) (offsetof(struct brw_vs_prog_key, fs_signature) + \
+                                 brw_fs_signature_size(&(s)->fs_signature))
+
+
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
+/* Working state for compiling one vertex shader (see do_vs_prog()). */
+struct brw_vs_compile {
+   struct brw_compile func;
+   struct brw_vs_prog_key key;
+   struct brw_vs_prog_data prog_data;
+   struct brw_chipset chipset;
+
+   struct brw_vertex_shader *vp;
+
+   GLuint nr_inputs;
+   GLuint nr_outputs;
+   GLuint nr_immediates;
+   GLfloat immediate[128][4];
+   /* GRF range reserved in brw_vs_alloc_regs() for outputs beyond the MRFs */
+   GLuint overflow_grf_start;
+   GLuint overflow_count;
+   /* scratch GRF window managed by get_tmp()/release_tmp()/release_tmps() */
+   GLuint first_tmp;
+   GLuint last_tmp;
+
+   struct brw_reg r0;
+   struct brw_reg r1;
+   struct brw_reg regs[TGSI_FILE_COUNT][128];
+   struct brw_reg tmp;
+   struct brw_reg stack;
+
+   struct {	
+       GLboolean used_in_src;
+       struct brw_reg reg;
+   } output_regs[128];
+
+   struct brw_reg userplane[6];
+
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
+
+   struct brw_instruction *if_inst[MAX_IF_DEPTH];
+   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+   GLuint insn;
+   GLuint if_depth;
+   GLuint loop_depth;
+   GLuint end_offset;
+
+   struct brw_indirect stack_index;
+};
+
+/* Called from do_vs_prog() to emit GEN4 code for the compile in 'c'. */
+void brw_vs_emit( struct brw_vs_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
new file mode 100644
index 0000000000..5dcbd597dd
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -0,0 +1,1657 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_shader_tokens.h"
+            
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_vs.h"
+#include "brw_debug.h"
+#include "brw_disasm.h"
+
+/* Pick vec4 'slot' counting from GRF 'reg': two slots per register,
+ * returned with stride(0, 4, 1) applied. */
+static INLINE struct brw_reg brw_vec4_grf_repeat( GLuint reg, GLuint slot )
+{
+   const int second_half = slot % 2;   /* 0: elements 0-3, 1: elements 4-7 */
+   const int grf_nr = reg + slot / 2;
+
+   return stride(brw_vec4_grf(grf_nr, second_half * 4), 0, 4, 1);
+}
+
+
+/* Hand out the next scratch GRF and grow total_grf if it is exceeded. */
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+   const GLuint nr = c->last_tmp++;
+
+   if (c->prog_data.total_grf < c->last_tmp)
+      c->prog_data.total_grf = c->last_tmp;
+   return brw_vec8_grf(nr, 0);
+}
+
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{  /* Only the most recently allocated scratch reg can be handed back. */
+   if (c->last_tmp - 1 == tmp.nr)
+      c->last_tmp -= 1;
+}
+			       
+static void release_tmps( struct brw_vs_compile *c )
+{ /* Rewind the scratch allocator to its start: frees every tmp at once. */
+   c->last_tmp = c->first_tmp;
+}
+
+
+/* TRUE when VS output 'vs_output' carries TGSI_SEMANTIC_POSITION, index 0. */
+static boolean is_position_output( struct brw_vs_compile *c,
+                                   unsigned vs_output )
+{
+   const struct brw_vertex_shader *shader = c->vp;
+
+   if (shader->info.output_semantic_name[vs_output] != TGSI_SEMANTIC_POSITION)
+      return FALSE;
+   return shader->info.output_semantic_index[vs_output] == 0;
+}
+
+
+/* Find the FS-signature input matching VS output 'vs_output' by semantic
+ * name and index; on success store its position in *fs_input_slot. */
+static boolean find_output_slot( struct brw_vs_compile *c,
+                                  unsigned vs_output,
+                                  unsigned *fs_input_slot )
+{
+   const unsigned name = c->vp->info.output_semantic_name[vs_output];
+   const unsigned idx = c->vp->info.output_semantic_index[vs_output];
+   unsigned slot = 0;
+
+   for (; slot < c->key.fs_signature.nr_inputs; slot++) {
+      if (c->key.fs_signature.input[slot].semantic != name ||
+          c->key.fs_signature.input[slot].semantic_index != idx)
+         continue;
+      *fs_input_slot = slot;
+      return TRUE;
+   }
+   return FALSE;
+}
+
+
+/**
+ * Preallocate GRF registers before code emit: r0, curbe constants,
+ * inputs, outputs, temporaries, address regs and scratch, in that order.
+ * Do things as simply as possible, allocating everything ahead of time.
+ */
+static void brw_vs_alloc_regs( struct brw_vs_compile *c )
+{
+   GLuint i, reg = 0, subreg = 0, mrf;
+   int attributes_in_vue;
+
+   /* Determine whether to use a real constant buffer or use a block
+    * of GRF registers for constants.  The latter is faster but only
+    * works if everything fits in the GRF.
+    * XXX this heuristic/check may need some fine tuning...
+    */
+   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1 +
+       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
+       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 > BRW_MAX_GRF)
+      c->vp->use_const_buffer = GL_TRUE;
+   else {
+      /* XXX: immediates can go elsewhere if necessary:
+       */
+      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
+	     c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 <= BRW_MAX_GRF);
+
+      c->vp->use_const_buffer = GL_FALSE;
+   }
+
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
+
+   /* r0 -- reserved as usual
+    */
+   c->r0 = brw_vec8_grf(reg, 0);
+   reg++;
+
+   /* User clip planes from curbe: 
+    */
+   if (c->key.nr_userclip) {
+      /* Skip over fixed planes:  Or never read them into vs unit?
+       */
+      subreg += 6;
+
+      for (i = 0; i < c->key.nr_userclip; i++, subreg++) {
+	 c->userplane[i] = 
+            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
+      }     
+
+      /* Deal with curbe alignment:
+       */
+      subreg = align(subreg, 2);
+      /*reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;*/
+   }
+
+
+   /* Immediates: always in the curbe.
+    *
+    * XXX: Can try to encode some immediates as brw immediates
+    * XXX: Make sure ureg sets minimal immediate size and respect it
+    * here.
+    */
+   for (i = 0; i < c->vp->info.immediate_count; i++, subreg++) {
+      c->regs[TGSI_FILE_IMMEDIATE][i] = 
+         stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
+   }
+   c->prog_data.nr_params = c->vp->info.immediate_count * 4;
+
+
+   /* Vertex constant buffer.
+    *
+    * Constants from the buffer can be either cached in the curbe or
+    * loaded as needed from the actual constant buffer.
+    */
+   if (!c->vp->use_const_buffer) {
+      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+      for (i = 0; i < nr_params; i++, subreg++) {
+         c->regs[TGSI_FILE_CONSTANT][i] =
+            stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
+      }
+
+      c->prog_data.nr_params += nr_params * 4;
+   }
+
+   /* All regs allocated
+    */
+   reg += (subreg + 1) / 2;
+   c->prog_data.curb_read_length = reg - 1;
+
+
+   /* Allocate input regs:  
+    */
+   c->nr_inputs = c->vp->info.num_inputs;
+   for (i = 0; i < c->nr_inputs; i++) {
+      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
+      reg++;
+   }
+
+   /* If there are no inputs, we'll still be reading one attribute's worth
+    * because it's required -- see urb_read_length setting.
+    */
+   if (c->nr_inputs == 0)
+      reg++;
+
+
+
+   /* Allocate outputs.  The non-position outputs go straight into message regs.
+    */
+   c->nr_outputs = c->prog_data.nr_outputs;
+
+   if (c->chipset.is_igdng)
+      mrf = 8;
+   else
+      mrf = 4;
+
+   /* NOTE(review): threshold ignores the 'mrf' base used below -- confirm. */
+   if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
+      c->overflow_grf_start = reg;
+      c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
+      reg += c->overflow_count;
+   }
+
+   /* XXX: need to access vertex output semantics here:
+    */
+   for (i = 0; i < c->nr_outputs; i++) {
+      unsigned slot;
+
+      /* XXX: Put output position in slot zero always.  Clipper, etc,
+       * need access to this reg.
+       */
+      if (is_position_output(c, i)) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
+	 reg++;
+      }
+      else if (find_output_slot(c, i, &slot)) {
+         
+         if (0 /* is_psize_output(c, i) */ ) {
+            /* c->psize_out.grf = reg; */
+            /* c->psize_out.mrf = i; */
+         }
+         
+         /* The first (16-4) outputs can go straight into the message regs.
+          * NOTE(review): overflow index omits 'mrf'; can fall below start? */
+         if (slot + mrf < BRW_MAX_MRF) {
+            c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
+         }
+         else {
+            int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
+            c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
+         }
+      }
+      else {
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
+      }
+   }     
+
+   /* Allocate program temporaries:
+    */
+   
+   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
+      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
+      reg++;
+   }
+
+   /* Address reg(s).  Don't try to use the internal address reg until
+    * deref time.
+    */
+   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
+      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
+					     reg,
+					     0,
+					     BRW_REGISTER_TYPE_D,
+					     BRW_VERTICAL_STRIDE_8,
+					     BRW_WIDTH_8,
+					     BRW_HORIZONTAL_STRIDE_1,
+					     BRW_SWIZZLE_XXXX,
+					     BRW_WRITEMASK_X);
+      reg++;
+   }
+
+   if (c->vp->use_const_buffer) {
+      for (i = 0; i < 3; i++) {
+         c->current_const[i].index = -1;
+         c->current_const[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+   }
+
+#if 0
+   for (i = 0; i < 128; i++) {
+      if (c->output_regs[i].used_in_src) {
+         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+   }
+#endif
+
+   if (c->vp->has_flow_control) {
+      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
+      reg += 2;
+   }
+
+   /* Some opcodes need an internal temporary:
+    */
+   c->first_tmp = reg;
+   c->last_tmp = reg;		/* for allocation purposes */
+
+   /* Each input reg holds data from two vertices.  The
+    * urb_read_length is the number of registers read from *each*
+    * vertex urb, so is half the amount:
+    */
+   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+
+   /* Setting this field to 0 leads to undefined behavior according to the
+    * the VS_STATE docs.  Our VUEs will always have at least one attribute
+    * sitting in them, even if it's padding.
+    */
+   if (c->prog_data.urb_read_length == 0)
+      c->prog_data.urb_read_length = 1;
+
+   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
+    * them to fit the biggest thing they need to.
+    */
+   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
+
+   if (c->chipset.is_igdng)
+      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+   else
+      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
+
+   c->prog_data.total_grf = reg;
+
+   if (BRW_DEBUG & DEBUG_VS) {
+      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, 
+		   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
+      debug_printf("%s NumTemps %d\n", __FUNCTION__,
+		   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
+      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
+   }
+}
+
+
+/**
+ * Call \p func, routing the result through a scratch register (then a
+ * MOV into \p dst) whenever \p dst and \p arg0 share file and number.
+ */
+static void unalias1( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      void (*func)( struct brw_vs_compile *,
+				    struct brw_reg,
+				    struct brw_reg ))
+{
+   struct brw_reg scratch;
+
+   if (dst.file != arg0.file || dst.nr != arg0.nr) {
+      func(c, dst, arg0);
+      return;
+   }
+   scratch = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+   func(c, scratch, arg0);
+   brw_MOV(&c->func, dst, scratch);
+   release_tmp(c, scratch);
+}
+
+/**
+ * \sa unalias1
+ * Two-source variant: \p dst may alias \p arg0 or \p arg1.
+ */
+static void unalias2( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1,
+		      void (*func)( struct brw_vs_compile *,
+				    struct brw_reg,
+				    struct brw_reg,
+				    struct brw_reg ))
+{
+   const GLboolean hits0 = (dst.file == arg0.file && dst.nr == arg0.nr);
+   const GLboolean hits1 = (dst.file == arg1.file && dst.nr == arg1.nr);
+
+   if (hits0 || hits1) {
+      struct brw_reg scratch = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+      func(c, scratch, arg0, arg1);
+      brw_MOV(&c->func, dst, scratch);
+      release_tmp(c, scratch);
+      return;
+   }
+   func(c, dst, arg0, arg1);
+}
+
+/**
+ * \sa unalias1
+ * Three-source variant: \p dst may alias any of the three sources.
+ */
+static void unalias3( struct brw_vs_compile *c,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1,
+		      struct brw_reg arg2,
+		      void (*func)( struct brw_vs_compile *,
+				    struct brw_reg,
+				    struct brw_reg,
+				    struct brw_reg,
+				    struct brw_reg ))
+{
+   const GLboolean clash =
+      (dst.file == arg0.file && dst.nr == arg0.nr) ||
+      (dst.file == arg1.file && dst.nr == arg1.nr) ||
+      (dst.file == arg2.file && dst.nr == arg2.nr);
+   struct brw_reg out = dst;
+   if (clash)
+      out = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+   func(c, out, arg0, arg1, arg2);
+   if (clash) {
+      brw_MOV(&c->func, dst, out);
+      release_tmp(c, out);
+   }
+}
+
+/* Set-on-compare: dst is written 0.0, the compare is issued, then dst
+ * is written 1.0; the flag value is set back to 0xff afterwards. */
+static void emit_sop( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1,
+                      GLuint cond)
+{
+   brw_MOV(p, dst, brw_imm_f(0.0f));     /* default result */
+   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
+   brw_MOV(p, dst, brw_imm_f(1.0f));     /* NOTE(review): presumably predicated by the CMP */
+   brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+/* SEQ: emit_sop() with the "equal" condition. */
+static void emit_seq( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_EQ;
+   emit_sop(p, dst, arg0, arg1, cond);
+}
+
+/* SNE: emit_sop() with the "not equal" condition. */
+static void emit_sne( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_NEQ;
+   emit_sop(p, dst, arg0, arg1, cond);
+}
+/* SLT: emit_sop() with the "less than" condition. */
+static void emit_slt( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_L;
+   emit_sop(p, dst, arg0, arg1, cond);
+}
+
+/* SLE: emit_sop() with the "less or equal" condition. */
+static void emit_sle( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_LE;
+   emit_sop(p, dst, arg0, arg1, cond);
+}
+
+/* SGT: emit_sop() with the "greater than" condition. */
+static void emit_sgt( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_G;
+   emit_sop(p, dst, arg0, arg1, cond);
+}
+
+/* SGE: emit_sop() with the "greater or equal" condition. */
+static void emit_sge( struct brw_compile *p, struct brw_reg dst,
+                      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_GE;
+   emit_sop(p, dst, arg0, arg1, cond);
+}
+
+/* Intended dst = max(arg0, arg1): compare arg0 < arg1, SEL arg1 over arg0. */
+static void emit_max( struct brw_compile *p, struct brw_reg dst,
+		      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint less = BRW_CONDITIONAL_L;
+   brw_CMP(p, brw_null_reg(), less, arg0, arg1);
+   brw_SEL(p, dst, arg1, arg0);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+/* Intended dst = min(arg0, arg1): compare arg0 < arg1, SEL arg0 over arg1. */
+static void emit_min( struct brw_compile *p, struct brw_reg dst,
+		      struct brw_reg arg0, struct brw_reg arg1 )
+{
+   const GLuint less = BRW_CONDITIONAL_L;
+   brw_CMP(p, brw_null_reg(), less, arg0, arg1);
+   brw_SEL(p, dst, arg0, arg1);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+
+/* One-operand brw_math() call.  The result goes through a scratch GRF
+ * and a MOV unless dst is a GRF with a full (0xf) writemask.
+ */
+static void emit_math1( struct brw_vs_compile *c,
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			GLuint precision)
+{
+   /* There are various odd behaviours with SEND on the simulator.  In
+    * addition there are documented issues with the fact that the GEN4
+    * processor doesn't do dependency control properly on SEND
+    * results.  So, on balance, this kludge to get around failures
+    * with writemasked math results looks like it might be necessary
+    * whether that turns out to be a simulator bug or not:
+    */
+   struct brw_compile *p = &c->func;
+   const GLboolean direct = (dst.file == BRW_GENERAL_REGISTER_FILE &&
+			     dst.dw1.bits.writemask == 0xf);
+   struct brw_reg result = direct ? dst : get_tmp(c);
+
+   brw_math(p, 
+	    result,
+	    function,
+	    BRW_MATH_SATURATE_NONE,
+	    2,
+	    arg0,
+	    BRW_MATH_DATA_SCALAR,
+	    precision);
+
+   if (!direct) {
+      brw_MOV(p, dst, result);
+      release_tmp(c, result);
+   }
+}
+
+
+/* Two-operand brw_math() call: arg1 is first moved to message reg 3.
+ * Same scratch-GRF detour as emit_math1() for partial/non-GRF dst.
+ */
+static void emit_math2( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   const GLboolean direct = (dst.file == BRW_GENERAL_REGISTER_FILE &&
+			     dst.dw1.bits.writemask == 0xf);
+   struct brw_reg result = direct ? dst : get_tmp(c);
+
+   brw_MOV(p, brw_message_reg(3), arg1);
+
+   brw_math(p, 
+	    result,
+	    function,
+	    BRW_MATH_SATURATE_NONE,
+	    2,
+	    arg0,
+	    BRW_MATH_DATA_SCALAR,
+	    precision);
+
+   if (!direct) {
+      brw_MOV(p, dst, result);
+      release_tmp(c, result);
+   }
+}
+
+
+/* EXP, for the channels enabled in dst:
+ *   x = 2^floor(arg0.x), y = arg0.x - floor(arg0.x),
+ *   z = mathbox EXP(arg0.x), w = 1.0
+ */
+static void emit_exp_noalias( struct brw_vs_compile *c,
+			      struct brw_reg dst,
+			      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+   const GLuint mask = dst.dw1.bits.writemask;
+
+   if (mask & BRW_WRITEMASK_X) {
+      /* Build the float 2^floor(x) by hand in an integer-typed temp. */
+      struct brw_reg tmp = get_tmp(c);
+      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
+      struct brw_reg dst_x_d = brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X);
+
+      /* tmp_d = floor(arg0.x) */
+      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
+
+      /* Bias the exponent for floating point: exp += 127 */
+      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));
+
+      /* Shift the biased exponent up by 23 bits into dst.x; excess
+       * drops off the edge.
+       */
+      brw_SHL(p, dst_x_d, tmp_d, brw_imm_d(23));
+
+      release_tmp(c, tmp);
+   }
+
+   if (mask & BRW_WRITEMASK_Y) {
+      /* result[1] = arg0.x - floor(arg0.x) */
+      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
+   }
+
+   if (mask & BRW_WRITEMASK_Z) {
+      /* As with the LOG instruction, we might be better off just
+       * doing a taylor expansion here, seeing as we have to do all
+       * the prep work.
+       *
+       * If mathbox partial precision is too low, consider also:
+       * result[2] = result[0] * EXP(result[1])
+       */
+      emit_math1(c, 
+		 BRW_MATH_FUNCTION_EXP, 
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
+		 brw_swizzle1(arg0, 0), 
+		 BRW_MATH_PRECISION_FULL);
+   }
+
+   if (mask & BRW_WRITEMASK_W) {
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));   /* result[3] = 1.0 */
+   }
+}
+
+
+/* LOG, for the channels enabled in dst: x = unbiased exponent of
+ * |arg0.x|, y = its mantissa with exponent bits 127<<23,
+ * z = x + mathbox LOG(y), w = 1.0.
+ */
+static void emit_log_noalias( struct brw_vs_compile *c,
+			      struct brw_reg dst,
+			      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+   const GLuint mask = dst.dw1.bits.writemask;
+   const GLboolean need_tmp = (mask != 0xf ||
+			       dst.file != BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg src_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
+   /* Work in dst itself when possible, else in a scratch GRF. */
+   struct brw_reg work = need_tmp ? get_tmp(c) : dst;
+   struct brw_reg work_ud = retype(work, BRW_REGISTER_TYPE_UD);
+
+   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
+    * according to spec:
+    *
+    * These almost look like they could be joined up, but not really
+    * practical:
+    *
+    * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
+    * result[1].i =  (x.i & ((1<<23)-1))  | (127<<23)
+    */
+   if (mask & BRW_WRITEMASK_XZ) {
+      brw_AND(p, 
+	      brw_writemask(work_ud, BRW_WRITEMASK_X),
+	      brw_swizzle1(src_ud, 0),
+	      brw_imm_ud((1U<<31)-1));
+
+      brw_SHR(p, 
+	      brw_writemask(work_ud, BRW_WRITEMASK_X), 
+	      work_ud,
+	      brw_imm_ud(23));
+
+      brw_ADD(p, 
+	      brw_writemask(work, BRW_WRITEMASK_X), 
+	      retype(work_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
+	      brw_imm_d(-127));
+   }
+
+   if (mask & BRW_WRITEMASK_YZ) {
+      brw_AND(p, 
+	      brw_writemask(work_ud, BRW_WRITEMASK_Y),
+	      brw_swizzle1(src_ud, 0),
+	      brw_imm_ud((1<<23)-1));
+
+      brw_OR(p, 
+	     brw_writemask(work_ud, BRW_WRITEMASK_Y), 
+	     work_ud,
+	     brw_imm_ud(127<<23));
+   }
+
+   if (mask & BRW_WRITEMASK_Z) {
+      /* result[2] = result[0] + LOG2(result[1]); */
+
+      /* Why bother?  The above is just a hint how to do this with a
+       * taylor series.  Maybe we *should* use a taylor series as by
+       * the time all the above has been done it's almost certainly
+       * quicker than calling the mathbox, even with low precision.
+       * 
+       * Options are:
+       *    - result[0] + mathbox.LOG2(result[1])
+       *    - mathbox.LOG2(arg0.x)
+       *    - result[0] + inline_taylor_approx(result[1])
+       */
+      emit_math1(c, 
+		 BRW_MATH_FUNCTION_LOG, 
+		 brw_writemask(work, BRW_WRITEMASK_Z), 
+		 brw_swizzle1(work, 1), 
+		 BRW_MATH_PRECISION_FULL);
+
+      brw_ADD(p, 
+	      brw_writemask(work, BRW_WRITEMASK_Z), 
+	      brw_swizzle1(work, 2), 
+	      brw_swizzle1(work, 0));
+   }
+
+   if (mask & BRW_WRITEMASK_W) {
+      /* result[3] = 1.0; */
+      brw_MOV(p, brw_writemask(work, BRW_WRITEMASK_W), brw_imm_f(1));
+   }
+
+   if (need_tmp) {
+      brw_MOV(p, dst, work);
+      release_tmp(c, work);
+   }
+}
+
+
+/* DST per enabled channel: x = 1.0, y = arg0 * arg1, z = arg0, w = arg1.
+ * Needs unaliasing - consider swizzles:   r0 = DST r0.xxxx r1 */
+static void emit_dst_noalias( struct brw_vs_compile *c, 
+			      struct brw_reg dst,
+			      struct brw_reg arg0,
+			      struct brw_reg arg1)
+{
+   struct brw_compile *p = &c->func;
+   const GLuint mask = dst.dw1.bits.writemask;
+
+   /* One instruction per enabled channel; there must be a better way. */
+   if (mask & BRW_WRITEMASK_X)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
+   if (mask & BRW_WRITEMASK_Y)
+      brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
+   if (mask & BRW_WRITEMASK_Z)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
+   if (mask & BRW_WRITEMASK_W)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
+}
+
+
+/* Cross product: MUL into the null reg, then MAC with the negated term. */
+static void emit_xpd( struct brw_compile *p, struct brw_reg dst,
+		      struct brw_reg t, struct brw_reg u)
+{
+   const struct brw_reg t_zxy = brw_swizzle(t, 2,0,1,3);
+   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
+   brw_MAC(p, dst, negate(t_zxy), brw_swizzle(u,1,2,0,3));
+}
+
+
+/* LIT.  When dst is not a GRF, a scratch GRF holds the POW base that
+ * is read back as a source; otherwise dst itself is used for that. */
+static void emit_lit_noalias( struct brw_vs_compile *c, 
+			      struct brw_reg dst,
+			      struct brw_reg arg0 )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *if_insn;
+   struct brw_reg tmp = dst;
+   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
+
+   if (need_tmp) 
+      tmp = get_tmp(c);
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1)); 
+
+   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
+    * to get all channels active inside the IF.  In the clipping code
+    * we run with NoMask, so it's not an option and we can use
+    * BRW_EXECUTE_1 for all comparisions.
+    */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
+   if_insn = brw_IF(p, BRW_EXECUTE_8);
+   {
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
+
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      emit_math2(c, 
+		 BRW_MATH_FUNCTION_POW, 
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
+		 brw_swizzle1(tmp, 2),
+		 brw_swizzle1(arg0, 3),
+		 BRW_MATH_PRECISION_PARTIAL);      
+   }
+   brw_ENDIF(p, if_insn);
+   /* Release only what was allocated here: tmp may just be dst. */
+   if (need_tmp)
+      release_tmp(c, tmp);
+}
+
+static void emit_lrp_noalias(struct brw_vs_compile *c, struct brw_reg dst,
+                             struct brw_reg arg0, struct brw_reg arg1,
+                             struct brw_reg arg2)
+{
+   struct brw_compile *func = &c->func;
+   struct brw_reg one_minus_arg0 = dst;   /* dst doubles as scratch */
+
+   /* Three-instruction LRP; dst is clobbered first, hence "noalias". */
+   brw_ADD(func, one_minus_arg0, negate(arg0), brw_imm_f(1.0)); /* 1 - arg0 */
+   brw_MUL(func, brw_null_reg(), one_minus_arg0, arg2);   /* to the null reg */
+   brw_MAC(func, dst, arg0, arg1);
+}
+
+/** Normalize arg0 into dst using a 3- or 4-component dot product. */
+static void emit_nrm( struct brw_vs_compile *c,
+                      struct brw_reg dst,
+                      struct brw_reg arg0,
+                      int num_comps)
+{
+   struct brw_compile *func = &c->func;
+   struct brw_reg scale = get_tmp(c);
+
+   /* scale = dot(arg0, arg0); anything other than 3 components uses DP4 */
+   if (num_comps != 3)
+      brw_DP4(func, scale, arg0, arg0);
+   else
+      brw_DP3(func, scale, arg0, arg0);
+
+   /* scale = 1 / sqrt(scale) */
+   emit_math1(c, BRW_MATH_FUNCTION_RSQ, scale, scale, BRW_MATH_PRECISION_FULL);
+
+   /* dst = arg0 * scale */
+   brw_MUL(func, dst, arg0, scale);
+
+   release_tmp(c, scale);
+}
+
+
+static struct brw_reg
+get_constant(struct brw_vs_compile *c,
+	     GLuint argIndex,
+	     GLuint index,
+	     GLboolean relAddr)
+{
+   /* Fetch constant 'index' for source operand slot 'argIndex' from the
+    * vertex constant buffer and return the register that holds it.  With
+    * relAddr the address register is applied and two reads are merged;
+    * otherwise the low four floats are replicated (XYZWXYZW). */
+   struct brw_compile *p = &c->func;
+   struct brw_reg const_reg;
+   struct brw_reg const2_reg;
+
+   assert(argIndex < 3);
+
+   if (c->current_const[argIndex].index != index || relAddr) {
+      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
+
+      /* Remember which constant the slot holds so a repeated direct access
+       * can skip the read.  A relative load depends on the address register
+       * at run time, so mark the slot invalid instead of caching 'index'. */
+      c->current_const[argIndex].index = relAddr ? -1 : (GLint) index;
+
+      /* need to fetch the constant now */
+      brw_dp_READ_4_vs(p,
+                       c->current_const[argIndex].reg,/* writeback dest */
+                       0,                             /* oword */
+                       relAddr,                       /* relative indexing? */
+                       addrReg,                       /* address register */
+                       16 * index,               /* byte offset */
+                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+                       );
+
+      if (relAddr) {
+         /* second read */
+         const2_reg = get_tmp(c);
+         /* use upper half of address reg for second read */
+         addrReg = stride(addrReg, 0, 4, 0);
+         addrReg.subnr = 16;
+
+         brw_dp_READ_4_vs(p,
+                          const2_reg,              /* writeback dest */
+                          1,                       /* oword */
+                          relAddr,                 /* relative indexing? */
+                          addrReg,                 /* address register */
+                          16 * index,         /* byte offset */
+                          SURF_INDEX_VERT_CONST_BUFFER
+                          );
+      }
+   }
+
+   const_reg = c->current_const[argIndex].reg;
+
+   if (relAddr) {
+      /* merge the two Owords: const_reg[7..4] = const2_reg[7..4] */
+      brw_MOV(p, suboffset(stride(const_reg, 0, 4, 1), 4),
+              suboffset(stride(const2_reg, 0, 4, 1), 4));
+      release_tmp(c, const2_reg);
+   }
+   else {
+      /* replicate lower four floats into upper half (to get XYZWXYZW) */
+      const_reg = stride(const_reg, 0, 4, 0);
+      const_reg.subnr = 0;
+   }
+
+   return const_reg;
+}
+
+
+#if 0
+/* NOTE(review): compiled out; get_src_reg()/get_dst() appear to do these lookups now. */
+/* TODO: relative addressing!
+ */
+static struct brw_reg get_reg( struct brw_vs_compile *c,
+			       enum tgsi_file_type file,
+			       GLuint index )
+{
+   switch (file) {
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+   case TGSI_FILE_CONSTANT:
+      assert(c->regs[file][index].nr != 0);
+      return c->regs[file][index];
+
+   case TGSI_FILE_ADDRESS:
+      assert(index == 0);
+      return c->regs[file][index];
+
+   case TGSI_FILE_NULL:			/* undef values */
+      return brw_null_reg();
+
+   default:
+      assert(0);
+      return brw_null_reg();
+   }
+}
+
+#endif
+
+
+/**
+ * Indirect addressing: return a temp loaded from reg[[arg] + offset].
+ * The temp is left allocated; this function does not release it.
+ */
+static struct brw_reg deref( struct brw_vs_compile *c,
+                             struct brw_reg arg,
+                             GLint offset)
+{
+   struct brw_compile *func = &c->func;
+   struct brw_reg result = vec4(get_tmp(c));
+   struct brw_reg addr_file_reg = c->regs[TGSI_FILE_ADDRESS][0];
+   struct brw_reg vp_addr = retype(vec1(addr_file_reg), BRW_REGISTER_TYPE_UW);
+   GLuint base_bytes = arg.nr * 32 + arg.subnr + offset * 16;
+   struct brw_reg via_a0 = brw_vec4_indirect(0,0);
+
+   brw_push_insn_state(func);
+   brw_set_access_mode(func, BRW_ALIGN_1);
+
+   /* Clunky: a0 is loaded twice and each 4-dword half is fetched in turn.
+    * There ought to be a single-pass way, but it could not be made to work.
+    */
+   /* first half: address taken from vp_addr */
+   brw_ADD(func, brw_address_reg(0), vp_addr, brw_imm_d(base_bytes));
+   brw_MOV(func, result, via_a0);
+
+   /* second half: address taken from suboffset(vp_addr, 8) */
+   brw_ADD(func, brw_address_reg(0), suboffset(vp_addr, 8), brw_imm_d(base_bytes));
+   brw_MOV(func, suboffset(result, 4), via_a0);
+
+   brw_pop_insn_state(func);
+
+   /* NOTE: result is not released here */
+   return vec8(result);
+}
+
+
+/**
+ * Map a TGSI source operand (file, index) onto a brw register.
+ * relAddr selects indirect access for the files that support it here.
+ */
+static struct brw_reg
+get_src_reg( struct brw_vs_compile *c,
+             GLuint argIndex,
+             GLuint file,
+             GLint index,
+             GLboolean relAddr )
+{
+   switch (file) {
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+      if (!relAddr) {
+         assert(c->regs[file][index].nr != 0);
+         return c->regs[file][index];
+      }
+      /* relative: dereference starting at the file's register 0 */
+      return deref(c, c->regs[file][0], index);
+
+   case TGSI_FILE_IMMEDIATE:
+      return c->regs[file][index];
+
+   case TGSI_FILE_CONSTANT:
+      /* the constant-buffer path takes relAddr as an argument itself */
+      if (c->vp->use_const_buffer)
+         return get_constant(c, argIndex, index, relAddr);
+
+      if (relAddr)
+         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
+
+      assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
+      return c->regs[TGSI_FILE_CONSTANT][index];
+
+   case TGSI_FILE_ADDRESS:
+      assert(index == 0);
+      return c->regs[file][index];
+
+   case TGSI_FILE_NULL:
+      /* this is a normal case since we loop over all three src args */
+      return brw_null_reg();
+
+   default:
+      break;
+   }
+
+   assert(0);
+   return brw_null_reg();
+}
+
+
+static void emit_arl( struct brw_vs_compile *c,
+                      struct brw_reg dst,
+                      struct brw_reg arg0 )
+{
+   struct brw_compile *func = &c->func;
+   const GLboolean use_scratch = (dst.file != BRW_GENERAL_REGISTER_FILE);
+   /* Round straight into dst when it is a GRF; otherwise go through a
+    * scratch temp. */
+   struct brw_reg rounded = use_scratch ? get_tmp(c) : dst;
+
+   /* ARL: dst = RNDD(arg0) * 16 */
+   brw_RNDD(func, rounded, arg0);
+   brw_MUL(func, dst, rounded, brw_imm_d(16));
+
+   if (use_scratch)
+      release_tmp(c, rounded);
+}
+
+
+/**
+ * Build the brw register for one source operand of a TGSI instruction,
+ * applying the operand's swizzle and negate flag.
+ */
+static struct brw_reg get_arg( struct brw_vs_compile *c,
+                               const struct tgsi_full_src_register *src,
+                               GLuint argIndex )
+{
+   struct brw_reg arg;
+
+   /* NULL-file operands map straight to the null register. */
+   if (src->Register.File == TGSI_FILE_NULL)
+      return brw_null_reg();
+
+   arg = get_src_reg(c,
+                     argIndex,
+                     src->Register.File,
+                     src->Register.Index,
+                     src->Register.Indirect);
+
+   /* Convert the 3-bit TGSI swizzles to the 2-bit brw encoding. */
+   arg.dw1.bits.swizzle = BRW_SWIZZLE4(src->Register.SwizzleX,
+                                       src->Register.SwizzleY,
+                                       src->Register.SwizzleZ,
+                                       src->Register.SwizzleW);
+
+   arg.negate = src->Register.Negate ? 1 : 0;
+   /* XXX: abs, absneg are not handled */
+
+   return arg;
+}
+
+
+/**
+ * Look up the brw register that backs a TGSI destination (file, index)
+ * and install the given writemask on it.
+ */
+static struct brw_reg get_dst( struct brw_vs_compile *c,
+                               unsigned file,
+                               unsigned index,
+                               unsigned writemask )
+{
+   struct brw_reg dst_reg;
+
+   if (file == TGSI_FILE_TEMPORARY || file == TGSI_FILE_OUTPUT) {
+      assert(c->regs[file][index].nr != 0);
+      dst_reg = c->regs[file][index];
+   }
+   else if (file == TGSI_FILE_ADDRESS) {
+      assert(index == 0);
+      dst_reg = c->regs[file][index];
+   }
+   else if (file == TGSI_FILE_NULL) {
+      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
+      dst_reg = brw_null_reg();
+   }
+   else {
+      assert(0);
+      dst_reg = brw_null_reg();
+   }
+
+   /* the writemask is applied whichever register was picked */
+   dst_reg.dw1.bits.writemask = writemask;
+
+   return dst_reg;
+}
+
+
+
+
+/**
+ * Post-vertex-program processing.  Builds the vertex header (header dword,
+ * NDC, position) in the message registers and sends the results to the URB. */
+static void emit_vertex_write( struct brw_vs_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg m0 = brw_message_reg(0);
+   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
+   struct brw_reg ndc;
+   int eot;
+   int i;
+   GLuint len_vertext_header = 2;	/* (sic) header term of the URB message length */
+
+   /* Build ndc coords */
+   ndc = get_tmp(c);
+   /* ndc = 1.0 / pos.w */
+   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+   /* ndc.xyz = pos * ndc */
+   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
+
+   /* Update the header for point size, user clipping flags, and -ve rhw
+    * workaround.
+    */
+   if (c->prog_data.writes_psiz ||
+       c->key.nr_userclip || 
+       c->chipset.is_965)
+   {
+      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+      GLuint i;	/* NOTE(review): shadows the function-scope 'int i' */
+
+      brw_MOV(p, header1, brw_imm_ud(0));
+
+      brw_set_access_mode(p, BRW_ALIGN_16);	
+
+      if (c->prog_data.writes_psiz) {
+	 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
+	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
+      }
+
+      for (i = 0; i < c->key.nr_userclip; i++) {
+	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i)); /* bit i <-> user plane i */
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      }
+
+      /* i965 clipping workaround: 
+       * 1) Test for -ve rhw
+       * 2) If set, 
+       *      set ndc = (0,0,0,0)
+       *      set ucp[6] = 1
+       *
+       * Later, clipping will detect ucp[6] and ensure the primitive is
+       * clipped against all fixed planes.
+       */
+      if (c->chipset.is_965) {
+	 brw_CMP(p,
+		 vec8(brw_null_reg()),
+		 BRW_CONDITIONAL_L,
+		 brw_swizzle1(ndc, 3),
+		 brw_imm_f(0));
+   
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_MOV(p, ndc, brw_imm_f(0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      }
+
+      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
+      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
+      brw_set_access_mode(p, BRW_ALIGN_16);
+
+      release_tmp(c, header1);
+   }
+   else {
+      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); /* no header bits needed */
+   }
+
+   /* Emit the (interleaved) headers for the two vertices - an 8-reg
+    * of zeros followed by two sets of NDC coordinates:
+    */
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, offset(m0, 2), ndc);
+
+   if (c->chipset.is_igdng) {
+       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+       brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
+       /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
+        * Seems it is useless for us.
+        * m6 is used for aligning, so that the remainder of vertex element is 
+        * reg-aligned.
+        */
+       brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
+       len_vertext_header = 6;
+   } else {
+       brw_MOV(p, offset(m0, 3), pos);
+       len_vertext_header = 2;
+   }
+
+   eot = (c->overflow_count == 0);	/* set only when no overflow writes follow */
+
+   brw_urb_WRITE(p, 
+		 brw_null_reg(), /* dest */
+		 0,		/* starting mrf reg nr */
+		 c->r0,		/* src */
+		 0,		/* allocate */
+		 1,		/* used */
+		 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
+		 0,		/* response len */
+		 eot, 		/* eot */
+		 eot, 		/* writes complete */
+		 0, 		/* urb destination offset */
+		 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   /* Not all of the vertex outputs/results fit into the MRF.
+    * Move the overflowed attributes from the GRF to the MRF and
+    * issue another brw_urb_WRITE().
+    */
+   for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
+      unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
+      GLuint j;
+
+      eot = (i + nr >= c->overflow_count);	/* true for the final chunk */
+
+      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
+       * at mrf[4] atm...  NOTE(review): 4+j is not bounded by BRW_MAX_MRF here.
+       */
+      for (j = 0; j < nr; j++) {
+	 brw_MOV(p, brw_message_reg(4+j), 
+                 brw_vec8_grf(c->overflow_grf_start + i + j, 0));
+      }
+
+      brw_urb_WRITE(p,
+                    brw_null_reg(), /* dest */
+                    4,              /* starting mrf reg nr */
+                    c->r0,          /* src */
+                    0,              /* allocate */
+                    1,              /* used */
+                    nr+1,          /* msg len */
+                    0,              /* response len */
+                    eot,            /* eot */
+                    eot,            /* writes complete */
+                    i-1,            /* urb destination offset; NOTE(review): -1 when i == 0 - confirm */
+                    BRW_URB_SWIZZLE_INTERLEAVE);
+   }
+}
+
+
+/**
+ * Post-codegen fix-ups: resolve subroutine calls and patch the END code.
+ * \param end_inst  the brw instruction emitted for END
+ * \param last_inst  slot just past the last instruction that had been
+ *                   emitted when the vertex write code was about to start
+ */
+static void
+post_vs_emit( struct brw_vs_compile *c,
+              struct brw_instruction *end_inst,
+              struct brw_instruction *last_inst )
+{
+   const GLint distance = last_inst - end_inst;
+
+   brw_resolve_cals(&c->func);
+
+   if (distance <= 1) {
+      /* nothing to skip over: END degenerates to a NOP */
+      end_inst->header.opcode = BRW_OPCODE_NOP;
+      return;
+   }
+   /* patch END to jump past subroutines, etc (distance * 16) */
+   brw_set_src1(end_inst, brw_imm_d(distance * 16));
+}
+
+static uint32_t
+get_predicate(const struct tgsi_full_instruction *inst)
+{
+   /* XXX: disabling for now: 'inst' is ignored and every caller gets
+    * BRW_PREDICATE_NORMAL (the #else branch); the #if 0 code is inactive. */
+#if 0
+   if (inst->dst.CondMask == COND_TR)
+      return BRW_PREDICATE_NONE;
+
+   /* All of GLSL only produces predicates for COND_NE and one channel per
+    * vector.  Fail badly if someone starts doing something else, as it might
+    * mean infinite looping or something.
+    *
+    * We'd like to support all the condition codes, but our hardware doesn't
+    * quite match the Mesa IR, which is modeled after the NV extensions.  For
+    * those, the instruction may update the condition codes or not, then any
+    * later instruction may use one of those condition codes.  For gen4, the
+    * instruction may update the flags register based on one of the condition
+    * codes output by the instruction, and then further instructions may
+    * predicate on that.  We can probably support this, but it won't
+    * necessarily be easy.
+    */
+/*   assert(inst->dst.CondMask == COND_NE); */
+
+   switch (inst->dst.CondSwizzle) {
+   case SWIZZLE_XXXX:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+   case SWIZZLE_YYYY:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+   case SWIZZLE_ZZZZ:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+   case SWIZZLE_WWWW:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+   default:
+      debug_printf("Unexpected predicate: 0x%08x\n",
+		    inst->dst.CondMask);
+      return BRW_PREDICATE_NORMAL;
+   }
+#else
+   return BRW_PREDICATE_NORMAL;	/* the only live statement in this function */
+#endif
+}
+
+/**
+ * Emit native code for one TGSI instruction: look up the source and
+ * destination registers, emit the sequence for the opcode, then release
+ * any temporaries. */
+static void emit_insn(struct brw_vs_compile *c,
+		      const struct tgsi_full_instruction *inst)
+{
+   unsigned opcode = inst->Instruction.Opcode;
+   unsigned label = inst->Label.Label;
+   struct brw_compile *p = &c->func;
+   struct brw_reg args[3], dst;
+   GLuint i;
+
+   /* Get argument regs.
+    */
+   for (i = 0; i < 3; i++) {
+      args[i] = get_arg(c, &inst->Src[i], i);
+   }
+
+   /* Get dest regs.  Note that it is possible for a reg to be both
+    * dst and arg, given the static allocation of registers.  So
+    * care needs to be taken emitting multi-operation instructions.
+    */ 
+   dst = get_dst(c, 
+		 inst->Dst[0].Register.File,
+		 inst->Dst[0].Register.Index,
+		 inst->Dst[0].Register.WriteMask);
+
+   /* XXX: saturate is not implemented; it is only reported */
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
+      debug_printf("Unsupported saturate in vertex shader");
+   }
+
+   switch (opcode) {
+   case TGSI_OPCODE_ABS:
+      brw_MOV(p, dst, brw_abs(args[0]));
+      break;
+   case TGSI_OPCODE_ADD:
+      brw_ADD(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_COS:
+      emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_DP3:
+      brw_DP3(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DP4:
+      brw_DP4(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DPH:
+      brw_DPH(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_NRM:
+      emit_nrm(c, dst, args[0], 3);
+      break;
+   case TGSI_OPCODE_NRM4:
+      emit_nrm(c, dst, args[0], 4);
+      break;
+   case TGSI_OPCODE_DST:
+      unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
+      break;
+   case TGSI_OPCODE_EXP:
+      unalias1(c, dst, args[0], emit_exp_noalias);
+      break;
+   case TGSI_OPCODE_EX2:
+      emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_ARL:
+      emit_arl(c, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FLR:
+      brw_RNDD(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FRC:
+      brw_FRC(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_LOG:
+      unalias1(c, dst, args[0], emit_log_noalias);
+      break;
+   case TGSI_OPCODE_LG2:
+      emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_LIT:
+      unalias1(c, dst, args[0], emit_lit_noalias);
+      break;
+   case TGSI_OPCODE_LRP:
+      unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
+      break;
+   case TGSI_OPCODE_MAD:
+      brw_MOV(p, brw_acc_reg(), args[2]);	/* preload the accumulator */
+      brw_MAC(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MAX:
+      emit_max(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MIN:
+      emit_min(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MOV:
+      brw_MOV(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_MUL:
+      brw_MUL(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
+      break;
+   case TGSI_OPCODE_RCP:
+      emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_RSQ:
+      emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, 
+                 brw_swizzle(args[0], 0,0,0,0), BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_SEQ:
+      emit_seq(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SIN:
+      emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sne(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sge(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sgt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_slt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sle(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SUB:
+      brw_ADD(p, dst, args[0], negate(args[1]));
+      break;
+   case TGSI_OPCODE_TRUNC:
+      /* round toward zero */
+      brw_RNDZ(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_IF:
+      assert(c->if_depth < MAX_IF_DEPTH);
+      c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
+      /* Note that brw_IF smashes the predicate_control field. */
+      c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
+      c->if_depth++;
+      break;
+   case TGSI_OPCODE_ELSE:
+      assert(c->if_depth > 0);
+      c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(c->if_depth > 0);
+      brw_ENDIF(p, c->if_inst[--c->if_depth]);
+      break;			
+   case TGSI_OPCODE_BGNLOOP:
+      c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_BRK:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_BREAK(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CONT:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_CONT(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_ENDLOOP: 
+   {
+      struct brw_instruction *inst0, *inst1;
+      GLuint br = 1;
+
+      assert(c->loop_depth > 0);
+      c->loop_depth--;
+
+      if (c->chipset.is_igdng)
+	 br = 2;
+
+      inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
+      /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+      /* header.opcode holds hardware (BRW_OPCODE_*) values, not TGSI ones */
+      while (inst0 > c->loop_inst[c->loop_depth]) {
+	 inst0--;
+	 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	    inst0->bits3.if_else.pop_count = 0;
+	 }
+	 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	    inst0->bits3.if_else.pop_count = 0;
+	 }
+      }
+   }
+   break;
+   case TGSI_OPCODE_BRA:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(4));
+      brw_save_call(p, label, p->nr_insn);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_RET:
+      brw_ADD(p, get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      break;
+   case TGSI_OPCODE_END:	
+      c->end_offset = p->nr_insn;
+      /* this instruction will get patched later (see post_vs_emit) to
+       * jump past subroutine code, etc. */
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_BGNSUB:
+      brw_save_label(p, p->nr_insn, p->nr_insn);
+      break;
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+   default:
+      debug_printf("Unsupported opcode %i (%s) in vertex shader",
+		   opcode, 
+		   tgsi_get_opcode_name(opcode));
+   }
+
+   /* Set the predication update on the last instruction of the native
+    * instruction sequence.
+    *
+    * This would be problematic if it was set on a math instruction,
+    * but that shouldn't be the case with the current GLSL compiler.
+    */
+#if 0
+   /* XXX: disabled
+    */
+   if (inst->CondUpdate) {
+      struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+      assert(hw_insn->header.destreg__conditionalmod == 0);
+      hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+   }
+#endif
+
+   release_tmps(c);
+}
+
+
+/* Entry point: translate the TGSI vertex program in c->vp->tokens into
+ * native instructions, followed by the vertex (URB) write. */
+void brw_vs_emit(struct brw_vs_compile *c)
+{
+   struct brw_compile *func = &c->func;
+   struct brw_instruction *end_insn, *write_start;
+   struct tgsi_parse_context parser;
+
+   if (BRW_DEBUG & DEBUG_VS)
+      tgsi_dump(c->vp->tokens, 0);
+
+   c->stack_index = brw_indirect(0, 0);
+
+   brw_set_compression_control(func, BRW_COMPRESSION_NONE);
+   brw_set_access_mode(func, BRW_ALIGN_16);
+
+   /* Registers are allocated statically, before any code is emitted. */
+   brw_vs_alloc_regs(c);
+
+   /* With flow control, load the stack index register with c->stack. */
+   if (c->vp->has_flow_control)
+      brw_MOV(func, get_addr_reg(c->stack_index), brw_address(c->stack));
+
+   /* Walk the token stream; only instruction tokens generate code. */
+   tgsi_parse_init(&parser, c->vp->tokens);
+   for (;;) {
+      if (tgsi_parse_end_of_tokens(&parser))
+         break;
+
+      tgsi_parse_token(&parser);
+
+      switch (parser.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         emit_insn(c, &parser.FullToken.FullInstruction);
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* nothing to emit */
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+   tgsi_parse_free(&parser);
+
+   /* Capture both positions now, before the vertex write code is
+    * appended to the instruction store. */
+   end_insn = &func->store[c->end_offset];
+   write_start = &func->store[func->nr_insn];
+
+   /* The END instruction will be patched to jump to this code */
+   emit_vertex_write(c);
+
+   post_vs_emit(c, end_insn, write_start);
+
+   if (BRW_DEBUG & DEBUG_VS) {
+      debug_printf("vs-native:\n");
+      brw_disasm(stderr, func->store, func->nr_insn);
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
new file mode 100644
index 0000000000..dadbb622e4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -0,0 +1,201 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+            
+#include "util/u_math.h"
+
+
+#include "brw_debug.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+struct brw_vs_unit_key {  /* lookup key for cached VS unit state (BRW_VS_UNIT) */
+   unsigned int total_grf;              /* from vs.prog_data->total_grf */
+   unsigned int urb_entry_read_length;  /* from vs.prog_data->urb_read_length */
+   unsigned int curb_entry_read_length; /* from vs.prog_data->curb_read_length */
+   /* curbe.clip_start when user clip planes are set, else curbe.vs_start */
+   unsigned int curbe_offset;
+   /* copied from urb.nr_vs_entries and urb.vsize */
+   unsigned int nr_urb_entries, urb_size;
+   /* copied from vs.nr_surfaces */
+   unsigned int nr_surfaces;
+};
+
+static void
+vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
+{
+   /* Start from all-zero bytes, then fill every field from the current
+    * context state.  Each group names the state flag it depends on.
+    */
+   memset(key, 0, sizeof(*key));
+
+   /* CACHE_NEW_VS_PROG */
+   key->curb_entry_read_length = brw->vs.prog_data->curb_read_length;
+   key->urb_entry_read_length = brw->vs.prog_data->urb_read_length;
+   key->total_grf = brw->vs.prog_data->total_grf;
+
+   /* BRW_NEW_URB_FENCE */
+   key->urb_size = brw->urb.vsize;
+   key->nr_urb_entries = brw->urb.nr_vs_entries;
+
+   /* BRW_NEW_NR_VS_SURFACES */
+   key->nr_surfaces = brw->vs.nr_surfaces;
+
+   /* PIPE_NEW_CLIP: when user clip planes are present they are read in as
+    * well, so the read starts at clip_start instead of vs_start.
+    */
+   if (brw->curr.ucp.nr == 0)
+      key->curbe_offset = brw->curbe.vs_start;
+   else
+      key->curbe_offset = brw->curbe.clip_start;
+}
+
+static enum pipe_error
+vs_unit_create_from_key(struct brw_context *brw,
+                        struct brw_vs_unit_key *key,
+                        struct brw_winsys_reloc *reloc,
+                        struct brw_winsys_buffer **bo_out)
+{
+   /* Fill in a brw_vs_unit_state from 'key' and upload it via the state
+    * cache together with its single relocation; buffer comes back in *bo_out.
+    */
+   struct brw_vs_unit_state unit;
+   enum pipe_error err;
+   int max_hw_threads = 16;
+
+   memset(&unit, 0, sizeof(unit));
+
+   /* thread0 */
+   unit.thread0.kernel_start_pointer = 0; /* reloc */
+   unit.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+
+   /* thread1 */
+   unit.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   /* Multiple program flow: we may get 2-vertex threads, which have the
+    * channel mask for dwords 4-7 enabled; those dwords are written to the
+    * second URB handle when we brw_urb_WRITE() results.
+    */
+   unit.thread1.single_program_flow = 0;
+   /* a zero entry count on IGDNG is a hardware requirement */
+   unit.thread1.binding_table_entry_count =
+      BRW_IS_IGDNG(brw) ? 0 : key->nr_surfaces;
+
+   /* thread3 */
+   unit.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   unit.thread3.urb_entry_read_offset = 0;
+   unit.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
+   unit.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+   unit.thread3.dispatch_grf_start_reg = 1;
+
+   /* thread4: IGDNG gets a quarter of the entry count */
+   unit.thread4.nr_urb_entries =
+      BRW_IS_IGDNG(brw) ? key->nr_urb_entries >> 2 : key->nr_urb_entries;
+   unit.thread4.urb_entry_allocation_size = key->urb_size - 1;
+
+   /* Thread limit per chipset; 16 unless one of the cases below applies. */
+   if (BRW_IS_IGDNG(brw))
+      max_hw_threads = 72;
+   else if (BRW_IS_G4X(brw))
+      max_hw_threads = 32;
+
+   /* clamp to [1, max_hw_threads], stored minus one */
+   unit.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
+                                    1, max_hw_threads) - 1;
+
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      unit.thread4.max_threads = 0;
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      unit.thread4.stats_enable = 1;
+
+   /* No samplers for ARB_vp programs; IGDNG needs 0 here as well. */
+   unit.vs5.sampler_count = 0;
+
+   /* Vertex program always enabled. */
+   unit.vs6.vs_enable = 1;
+
+   err = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
+                          key, sizeof(*key),
+                          reloc, 1,
+                          &unit, sizeof(unit),
+                          NULL, NULL,
+                          bo_out);
+
+   if (!err)
+      return PIPE_OK;
+
+   return err;
+}
+
+static int prepare_vs_unit(struct brw_context *brw)
+{
+   /* Make brw->vs.state_bo refer to a VS unit state matching the current
+    * key, reusing a cached one when the cache search succeeds. */
+   struct brw_winsys_reloc reloc[1];
+   struct brw_vs_unit_key key;
+   unsigned grf_reg_count;
+
+   vs_unit_populate_key(brw, &key);
+   /* same expression vs_unit_create_from_key() stores in thread0 */
+   grf_reg_count = align(key.total_grf, 16) / 16 - 1;
+
+   /* Relocation of the thread0 dword against the VS program buffer. */
+   make_reloc(&reloc[0], BRW_USAGE_STATE,
+              grf_reg_count << 1,
+              offsetof(struct brw_vs_unit_state, thread0),
+              brw->vs.prog_bo);
+
+   if (!brw_search_cache(&brw->cache, BRW_VS_UNIT,
+                         &key, sizeof(key),
+                         reloc, 1,
+                         NULL,
+                         &brw->vs.state_bo)) {
+      /* cache miss: build and upload a new state object */
+      enum pipe_error ret = vs_unit_create_from_key(brw, &key, reloc,
+                                                    &brw->vs.state_bo);
+      if (ret)
+         return ret;
+   }
+
+   return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_vs_unit = {
+   /* state flags this atom lists as dirty triggers for prepare_vs_unit() */
+   .dirty = {
+      .mesa  = PIPE_NEW_CLIP,
+      .brw   = (BRW_NEW_URB_FENCE | BRW_NEW_NR_VS_SURFACES |
+                BRW_NEW_CURBE_OFFSETS),
+      .cache = CACHE_NEW_VS_PROG,
+   },
+   .prepare = prepare_vs_unit,
+};
diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c
new file mode 100644
index 0000000000..424bb0d0df
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs_surface_state.c
@@ -0,0 +1,231 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_winsys.h"
+
+/* XXX: disabled true constant buffer functionality
+ */
+
+
+/* Creates a new VS constant buffer reflecting the current VS program's
+ * constants, if needed by the VS program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+#if 0
+static struct brw_winsys_buffer *
+brw_vs_update_constant_buffer(struct brw_context *brw)
+{
+   /* XXX: true constant buffers.  NOTE(review): compiled out; the bo_alloc
+    * and bo_subdata calls below do not match the brw_winsys.h signatures. */
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_VERTEX_PROGRAM */
+   if (!vp->use_const_buffer)
+      return NULL;
+
+   const_buffer = brw->sws->bo_alloc(brw->sws, 
+				     BRW_BUFFER_TYPE_SHADER_CONSTANTS,
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   brw->sws->bo_subdata(const_buffer, 0, size, params->ParameterValues,
+                        NULL, 0);
+
+   return const_buffer;
+}
+#endif
+
+/**
+ * Update the surface state for a VS constant buffer (currently compiled out).
+ *
+ * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
+ */
+#if 0
+static void   /* NOTE(review): the body returns enum pipe_error values */
+brw_update_vs_constant_surface( struct brw_context *brw,
+                                GLuint surf)
+{
+   struct brw_surface_key key;
+   struct pipe_resource *cb = brw->curr.vs_constants;
+   enum pipe_error ret;
+
+   assert(surf == 0);
+
+   /* If we're in this state update atom, we need to update VS constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   ret = brw_vs_update_constant_buffer(brw, &vp->const_buffer); /* NOTE(review): 'vp' is undeclared and the helper above takes only brw */
+   if (ret)
+      return ret;
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (vp->const_buffer == NULL) {
+      bo_reference(brw->vs.surf_bo[surf], NULL); /* NOTE(review): bo_reference() takes the address of the slot */
+      return PIPE_OK;
+   }
+
+   memset(&key, 0, sizeof(key));
+
+   key.format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+   key.bo = vp->const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters; /* NOTE(review): 'params' is undeclared here */
+   key.width = params->NumParameters;
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16;
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &key, sizeof(key),
+                        &key.bo, key.bo ? 1 : 0,
+                        NULL,
+                        &brw->vs.surf_bo[surf]))
+      return PIPE_OK;
+
+   ret = brw_create_constant_surface(brw, &key   /* NOTE(review): ',' missing after &key */
+                                     &brw->vs.surf_bo[surf]);
+   if (ret)
+      return ret;
+   
+   return PIPE_OK;
+}
+#endif
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ *
+ * The real implementation is compiled out: this currently returns PIPE_OK
+ * and leaves *bo_out untouched.
+ */
+static enum pipe_error
+brw_vs_get_binding_table(struct brw_context *brw,
+                         struct brw_winsys_buffer **bo_out)
+{
+#if 0
+   static GLuint data[BRW_VS_MAX_SURF]; /* always zero */
+   struct brw_winsys_reloc reloc[BRW_VS_MAX_SURF];
+   enum pipe_error ret;
+   int i;
+
+   /* Emit binding table relocations to surface state */
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      make_reloc(&reloc[i],
+                 BRW_USAGE_STATE,
+                 0,
+                 i * 4,
+                 brw->vs.surf_bo[i]);
+   }
+   /* One reloc per slot; 'data' is static storage and is never freed. */
+   ret = brw_cache_data( &brw->surface_cache,
+                         BRW_SS_SURF_BIND,
+                         NULL, 0,
+                         reloc, BRW_VS_MAX_SURF,
+                         data, sizeof data,
+                         NULL, NULL,
+                         bo_out);
+   return ret;
+#else
+   return PIPE_OK;
+#endif
+}
+
+/**
+ * Vertex shader surfaces (constant buffer).
+ *
+ * Meant to consume the constant buffer state updates and to produce
+ * BRW_NEW_NR_VS_SURFACES for the VS unit and CACHE_NEW_SURF_BIND for the
+ * binding table upload.  The surface update itself is compiled out for
+ * now, so the only live step is fetching the binding table into
+ * brw->vs.bind_bo when brw->vs.nr_surfaces is non-zero.
+ *
+ * Returns PIPE_OK, or the error reported for the binding table.
+ */
+static enum pipe_error prepare_vs_surfaces(struct brw_context *brw )
+{
+#if 0
+   int i;
+   int nr_surfaces = 0;
+
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      if (brw->vs.surf_bo[i] != NULL) {
+         nr_surfaces = i + 1;
+      }
+   }
+
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
+#endif
+
+   /* With no surface to point at, the bind_bo is simply left as it is.
+    * That should be relatively harmless: it only makes the working set
+    * slightly larger.
+    */
+   if (brw->vs.nr_surfaces == 0)
+      return PIPE_OK;
+
+   /* Otherwise (re)build the binding table; its status is our status. */
+   return brw_vs_get_binding_table(brw, &brw->vs.bind_bo);
+}
+
+const struct brw_tracked_state brw_vs_surfaces = {
+   .prepare = prepare_vs_surfaces,     /* no .emit hook is installed */
+   .dirty = {
+      .cache = 0,
+      .brw = 0,
+      .mesa = (PIPE_NEW_VERTEX_SHADER |
+               PIPE_NEW_VERTEX_CONSTANTS)
+   },
+};
+
+
+
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
new file mode 100644
index 0000000000..f30c7f1813
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -0,0 +1,293 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_WINSYS_H
+#define BRW_WINSYS_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+struct brw_winsys;
+struct pipe_fence_handle;
+
+/* Not sure why the winsys needs this:
+ */
+#define BRW_BATCH_SIZE (32*1024)
+
+struct brw_winsys_screen;
+
+/* Need a tiny bit of information inside the abstract buffer struct:
+ */
+struct brw_winsys_buffer {
+   struct pipe_reference reference; /* counted through bo_reference() */
+   struct brw_winsys_screen *sws;   /* screen whose bo_destroy releases this */
+   unsigned size;                   /* bo_map_read() maps offset 0, length size */
+};
+
+
+/* Should be possible to validate relocation usages (enum brw_buffer_usage)
+ * against the buffer creation types below:
+ */
+enum brw_buffer_type
+{
+   BRW_BUFFER_TYPE_TEXTURE,
+   BRW_BUFFER_TYPE_SCANOUT,          /**< a texture used for scanning out from */
+   BRW_BUFFER_TYPE_VERTEX,
+   BRW_BUFFER_TYPE_CURBE,
+   BRW_BUFFER_TYPE_QUERY,
+   BRW_BUFFER_TYPE_SHADER_CONSTANTS,
+   BRW_BUFFER_TYPE_SHADER_SCRATCH,
+   BRW_BUFFER_TYPE_BATCH,
+   BRW_BUFFER_TYPE_GENERAL_STATE,
+   BRW_BUFFER_TYPE_SURFACE_STATE,
+   BRW_BUFFER_TYPE_PIXEL,            /* image uploads, pbo's, etc */
+   BRW_BUFFER_TYPE_GENERIC,          /* unknown */
+   BRW_BUFFER_TYPE_MAX               /* Count of possible values */
+};
+
+
+/* Usage of a buffer in a relocation.  The DRM winsys translates these back
+ * to GEM read/write domain flags (presumably the "read, write" pairs noted
+ * against each value below -- TODO confirm). */
+enum brw_buffer_usage {
+   BRW_USAGE_STATE,         /* INSTRUCTION, 0 */
+   BRW_USAGE_QUERY_RESULT,  /* INSTRUCTION, INSTRUCTION */
+   BRW_USAGE_RENDER_TARGET, /* RENDER,      0 */
+   BRW_USAGE_DEPTH_BUFFER,  /* RENDER,      RENDER */
+   BRW_USAGE_BLIT_SOURCE,   /* RENDER,      0 */
+   BRW_USAGE_BLIT_DEST,     /* RENDER,      RENDER */
+   BRW_USAGE_SAMPLER,       /* SAMPLER,     0 */
+   BRW_USAGE_VERTEX,        /* VERTEX,      0 */
+   BRW_USAGE_SCRATCH,       /* 0,           0 */
+   BRW_USAGE_MAX
+};
+
+enum brw_buffer_data_type { /* passed to bo_subdata()/bo_map(); brw_dump_data() picks its decoder by it */
+   BRW_DATA_GS_CC_VP,
+   BRW_DATA_GS_CC_UNIT,
+   BRW_DATA_GS_WM_PROG,
+   BRW_DATA_GS_SAMPLER_DEFAULT_COLOR,
+   BRW_DATA_GS_SAMPLER,
+   BRW_DATA_GS_WM_UNIT,
+   BRW_DATA_GS_SF_PROG,
+   BRW_DATA_GS_SF_VP,
+   BRW_DATA_GS_SF_UNIT,
+   BRW_DATA_GS_VS_UNIT,
+   BRW_DATA_GS_VS_PROG,
+   BRW_DATA_GS_GS_UNIT,
+   BRW_DATA_GS_GS_PROG,
+   BRW_DATA_GS_CLIP_VP,
+   BRW_DATA_GS_CLIP_UNIT,
+   BRW_DATA_GS_CLIP_PROG,
+   BRW_DATA_SS_SURFACE,
+   BRW_DATA_SS_SURF_BIND,
+   BRW_DATA_CONSTANT_BUFFER,
+   BRW_DATA_BATCH_BUFFER,
+   BRW_DATA_OTHER,
+   BRW_DATA_MAX
+};
+
+
+/* Matches the i915_drm definitions:
+ */
+#define BRW_TILING_NONE  0
+#define BRW_TILING_X     1
+#define BRW_TILING_Y     2
+
+
+/* Relocations to be applied with subdata in a call to sws->bo_subdata, below.
+ *
+ * Effectively this encodes:
+ *
+ *    *(unsigned *)(subdata + offset) = bo->offset + delta
+ */
+struct brw_winsys_reloc {
+   enum brw_buffer_usage usage; /* debug only */
+   unsigned delta;              /* added to bo->offset to form the value */
+   unsigned offset;             /* where in the subdata the value is written */
+   struct brw_winsys_buffer *bo; /* buffer whose offset is being encoded */
+};
+
+static INLINE void make_reloc(struct brw_winsys_reloc *reloc,
+                              enum brw_buffer_usage usage,
+                              unsigned delta,
+                              unsigned offset,
+                              struct brw_winsys_buffer *bo)
+{
+   /* Plain field-by-field fill.  Note: not taking a reference on bo yet. */
+   reloc->bo = bo;
+   reloc->offset = offset;
+   reloc->delta = delta;
+   reloc->usage = usage;
+}
+
+
+struct brw_winsys_screen {
+   /* Table of winsys entry points.  Most per-buffer hooks take no screen
+    * argument; struct brw_winsys_buffer carries its own sws pointer. */
+   /**
+    * Buffer functions.
+    */
+
+   /*@{*/
+   /**
+    * Create a buffer.
+    */
+   enum pipe_error (*bo_alloc)(struct brw_winsys_screen *sws,
+                               enum brw_buffer_type type,
+                               unsigned size,
+                               unsigned alignment,
+                               struct brw_winsys_buffer **bo_out);
+
+   enum pipe_error (*bo_from_handle)(struct brw_winsys_screen *sws,
+                                     struct winsys_handle *whandle,
+                                     unsigned *stride,
+                                     unsigned *tiling,
+                                     struct brw_winsys_buffer **bo_out);
+
+   enum pipe_error (*bo_get_handle)(struct brw_winsys_buffer *buffer,
+                                    struct winsys_handle *whandle,
+                                    unsigned stride);
+
+   /* Destroy a buffer when our refcount goes to zero:
+    */
+   void (*bo_destroy)(struct brw_winsys_buffer *buffer);
+
+   /* delta -- added to b2->offset, and written into buffer
+    * offset -- location above value is written to within buffer
+    */
+   enum pipe_error (*bo_emit_reloc)(struct brw_winsys_buffer *buffer,
+                                    enum brw_buffer_usage usage,
+                                    unsigned delta,
+                                    unsigned offset,
+                                    struct brw_winsys_buffer *b2);
+
+   enum pipe_error (*bo_exec)(struct brw_winsys_buffer *buffer,
+                              unsigned bytes_used);
+
+   enum pipe_error (*bo_subdata)(struct brw_winsys_buffer *buffer,
+                                 enum brw_buffer_data_type data_type,
+                                 size_t offset,
+                                 size_t size,
+                                 const void *data,
+                                 const struct brw_winsys_reloc *reloc,
+                                 unsigned nr_reloc );
+
+   boolean (*bo_is_busy)(struct brw_winsys_buffer *buffer);
+   boolean (*bo_references)(struct brw_winsys_buffer *a,
+                            struct brw_winsys_buffer *b);
+
+   /* XXX: couldn't this be handled by returning true/false on
+    * bo_emit_reloc?
+    */
+   enum pipe_error (*check_aperture_space)(struct brw_winsys_screen *iws,
+                                           struct brw_winsys_buffer **buffers,
+                                           unsigned count);
+
+   /**
+    * Map a buffer.
+    */
+   void *(*bo_map)(struct brw_winsys_buffer *buffer,
+                   enum brw_buffer_data_type data_type,
+                   unsigned offset,
+                   unsigned length,
+                   boolean write,
+                   boolean discard,
+                   boolean flush_explicit);
+
+   void (*bo_flush_range)(struct brw_winsys_buffer *buffer,
+                          unsigned offset,
+                          unsigned length);
+
+   /**
+    * Unmap a buffer.
+    */
+   void (*bo_unmap)(struct brw_winsys_buffer *buffer);
+   /*@}*/
+
+   
+   /* Wait for buffer to go idle.  Similar to map+unmap, but doesn't
+    * mark buffer contents as dirty.
+    */
+   void (*bo_wait_idle)(struct brw_winsys_buffer *buffer);
+   
+   /**
+    * Destroy the winsys.
+    */
+   void (*destroy)(struct brw_winsys_screen *iws);
+};
+
+static INLINE void *
+bo_map_read(struct brw_winsys_screen *sws, struct brw_winsys_buffer *buf)
+{
+   /* Whole buffer, with write, discard and flush_explicit all FALSE. */
+   const unsigned whole = buf->size;
+
+   return sws->bo_map(buf, BRW_DATA_OTHER, 0, whole, FALSE, FALSE, FALSE);
+}
+
+/* Repoint *ptr at buf, destroying the old buffer when its count hits zero.
+ * Either may be NULL, so never take the address of a member of NULL. */
+static INLINE void
+bo_reference(struct brw_winsys_buffer **ptr, struct brw_winsys_buffer *buf)
+{
+   struct brw_winsys_buffer *old_buf = *ptr;
+   if (pipe_reference(old_buf ? &old_buf->reference : NULL,
+                      buf ? &buf->reference : NULL))
+      old_buf->sws->bo_destroy(old_buf);
+   *ptr = buf;
+}
+
+/**
+ * Create brw pipe_screen.
+ */
+struct pipe_screen *brw_create_screen(struct brw_winsys_screen *iws, unsigned pci_id);
+
+
+
+/*************************************************************************
+ * Cooperative dumping between winsys and driver.  TODO: make this
+ * driver-only by wrapping calls to winsys->bo_subdata().
+ */
+
+#ifdef DEBUG
+extern int BRW_DUMP;
+#else
+#define BRW_DUMP 0
+#endif 
+
+#define DUMP_ASM	        0x1
+#define DUMP_STATE	        0x2
+#define DUMP_BATCH	        0x4
+
+void brw_dump_data( unsigned pci_id,
+		    enum brw_buffer_data_type data_type,
+		    unsigned offset,
+		    const void *data,
+		    size_t size );
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_winsys_debug.c b/src/gallium/drivers/i965/brw_winsys_debug.c
new file mode 100644
index 0000000000..f8f6a539bc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys_debug.c
@@ -0,0 +1,87 @@
+#include "brw_winsys.h"
+#include "brw_disasm.h"
+#include "brw_structs_dump.h"
+#include "brw_structs.h"
+#include "intel_decode.h"
+
+
+/* Debug helper: decode and print 'data' according to 'data_type', gated by
+ * the BRW_DUMP bits.  DUMP_ASM disassembles the *_PROG types, DUMP_BATCH
+ * decodes batch buffers and DUMP_STATE prints the state structures; any
+ * other combination prints nothing.
+ */
+void brw_dump_data( unsigned pci_id,
+		    enum brw_buffer_data_type data_type,
+		    unsigned offset,
+		    const void *data,
+		    size_t size )
+{
+   /* Programs and batch buffers each have exactly one decoder, so they
+    * are finished with as soon as their own flag has been consulted.
+    */
+   switch (data_type) {
+   case BRW_DATA_GS_CLIP_PROG:
+   case BRW_DATA_GS_GS_PROG:
+   case BRW_DATA_GS_VS_PROG:
+   case BRW_DATA_GS_SF_PROG:
+   case BRW_DATA_GS_WM_PROG:
+      /* size is converted into a count of instructions */
+      if (BRW_DUMP & DUMP_ASM)
+         brw_disasm( stderr, data, size / sizeof(struct brw_instruction) );
+      return;
+   case BRW_DATA_BATCH_BUFFER:
+      /* size is converted into a count of 4-byte units */
+      if (BRW_DUMP & DUMP_BATCH)
+         intel_decode(data, size / 4, offset, pci_id);
+      return;
+   default:
+      break;
+   }
+
+   /* Everything left is at most a state structure. */
+   if (!(BRW_DUMP & DUMP_STATE))
+      return;
+
+   switch (data_type) {
+   case BRW_DATA_GS_CC_VP:
+      brw_dump_cc_viewport( data );
+      break;
+   case BRW_DATA_GS_CC_UNIT:
+      brw_dump_cc_unit_state( data );
+      break;
+   case BRW_DATA_GS_SAMPLER_DEFAULT_COLOR:
+      brw_dump_sampler_default_color( data );
+      break;
+   case BRW_DATA_GS_SAMPLER:
+      brw_dump_sampler_state( data );
+      break;
+   case BRW_DATA_GS_WM_UNIT:
+      brw_dump_wm_unit_state( data );
+      break;
+   case BRW_DATA_GS_SF_VP:
+      brw_dump_sf_viewport( data );
+      break;
+   case BRW_DATA_GS_SF_UNIT:
+      brw_dump_sf_unit_state( data );
+      break;
+   case BRW_DATA_GS_VS_UNIT:
+      brw_dump_vs_unit_state( data );
+      break;
+   case BRW_DATA_GS_GS_UNIT:
+      brw_dump_gs_unit_state( data );
+      break;
+   case BRW_DATA_GS_CLIP_VP:
+      brw_dump_clipper_viewport( data );
+      break;
+   case BRW_DATA_GS_CLIP_UNIT:
+      brw_dump_clip_unit_state( data );
+      break;
+   case BRW_DATA_SS_SURFACE:
+      brw_dump_surface_state( data );
+      break;
+   default:
+      /* surface bindings, constant buffers and "other": no printer */
+      break;
+   }
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
new file mode 100644
index 0000000000..5d66e61fbc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -0,0 +1,319 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_screen.h"
+#include "brw_wm.h"
+#include "brw_state.h"
+#include "brw_debug.h"
+#include "brw_resource.h"
+#include "brw_pipe_rast.h"
+
+
+/** Number of source register arguments taken by a WM or TGSI opcode. */
+GLuint brw_wm_nr_args( GLuint opcode )
+{
+   switch (opcode) {
+   /* WM_* opcodes have fixed argument counts. */
+   case WM_PIXELXY:
+   case WM_FRONTFACING:
+      return 0;
+   case WM_DELTAXY:
+   case WM_WPOSXY:
+   case WM_CINTERP:
+      return 1;
+   case WM_PIXELW:
+   case WM_LINTERP:
+      return 2;
+   case WM_PINTERP:
+   case WM_FB_WRITE:
+      return 3;
+   /* Texture opcodes: the sampler argument is an instruction field, not a
+    * real register, so it is not counted. */
+   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TEX:
+      return tgsi_get_opcode_info(opcode)->num_src - 1;
+   /* Everything else: ask the TGSI opcode table. */
+   default:
+      assert(opcode < MAX_OPCODE);
+      return tgsi_get_opcode_info(opcode)->num_src;
+   }
+}
+
+
+/* Returns 1 for the TGSI opcodes listed below, 0 for anything else. */
+GLuint brw_wm_is_scalar_result( GLuint opcode )
+{
+   switch (opcode) {
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_SIN:
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_DST:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
+
+/**
+ * Code generation for fragment shaders that contain no flow control.
+ * Such straight-line shaders can more readily be analysed for SSA-style
+ * optimizations.  Runs the passes in order and leaves total_grf and
+ * total_scratch filled in within c->prog_data.
+ */
+static void
+brw_wm_linear_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+   /* First augment the fragment program with the instructions needed for
+    * pre- and post-fragment-program tasks such as interpolation and
+    * fogging.
+    */
+   brw_wm_pass_fp(c);
+
+   /* Then translate to the intermediate representation, building the
+    * register usage chains.
+    */
+   brw_wm_pass0(c);
+
+   /* Dead code removal.
+    */
+   brw_wm_pass1(c);
+
+   /* Register allocation.  Only half of BRW_WM_MAX_GRF is made available:
+    * we operate on 16 pixels at a time, which takes two GRF entries for
+    * each logical shader register.
+    */
+   c->grf_limit = BRW_WM_MAX_GRF / 2;
+
+   brw_wm_pass2(c);
+
+   /* Number of general-purpose registers in use. */
+   c->prog_data.total_grf = c->max_wm_grf;
+
+   /* Register spills live in scratch space.  When any was used, 0x40 is
+    * added on top of c->last_scratch; otherwise no scratch is requested.
+    */
+   c->prog_data.total_scratch =
+      c->last_scratch ? c->last_scratch + 0x40 : 0;
+
+   /* Emit GEN4 code.
+    */
+   brw_wm_emit(c);
+}
+
+
+/**
+ * Compile fragment shader 'fp' for 'key' and upload the result to the
+ * program cache.  Only shaders without flow control can be compiled at
+ * present; the others get PIPE_ERROR_BAD_INPUT (rather than terminating
+ * the process) so that the caller's error path can deal with them.
+ *
+ * \param brw     context owning the cache and the reusable compile state
+ * \param fp      fragment shader to compile
+ * \param key     program key; copied into the compile state
+ * \param bo_out  receives the buffer holding the uploaded program
+ * \return PIPE_OK on success, otherwise a pipe_error code
+ */
+static enum pipe_error do_wm_prog( struct brw_context *brw,
+                                   struct brw_fragment_shader *fp,
+                                   struct brw_wm_prog_key *key,
+                                   struct brw_winsys_buffer **bo_out)
+{
+   struct brw_wm_compile *c;
+   const GLuint *program;
+   GLuint program_size;
+   enum pipe_error ret;
+
+   /* XXX: GLSL support.  Shaders using flow control need the (missing)
+    * branching code generator with dispatch_width 8; refuse them before
+    * touching any compile state.
+    */
+   if (fp->has_flow_control) {
+      debug_printf("%s: shaders with flow control are not supported\n",
+                   __FUNCTION__);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   /* The compile state is allocated once and reused afterwards. */
+   if (brw->wm.compile_data == NULL) {
+      brw->wm.compile_data = MALLOC(sizeof(*brw->wm.compile_data));
+      if (!brw->wm.compile_data)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   c = brw->wm.compile_data;
+   memset(c, 0, sizeof *c);
+
+   c->key = *key;
+   c->fp = fp;
+   c->env_param = NULL; /*brw->intel.ctx.FragmentProgram.Parameters;*/
+
+   brw_init_compile(brw, &c->func);
+
+   c->dispatch_width = 16;
+   brw_wm_linear_shader_emit(brw, c);
+
+   if (BRW_DEBUG & DEBUG_WM)
+      debug_printf("\n");
+
+   /* get the program */
+   ret = brw_get_program(&c->func, &program, &program_size);
+   if (ret)
+      return ret;
+
+   return brw_upload_cache( &brw->cache, BRW_WM_PROG,
+                            &c->key, sizeof(c->key),
+                            NULL, 0,
+                            program, program_size,
+                            &c->prog_data,
+                            &brw->wm.prog_data,
+                            bo_out );
+}
+
+
+
+static void brw_wm_populate_key( struct brw_context *brw,
+				 struct brw_wm_prog_key *key )
+{
+   const struct brw_fragment_shader *fs = brw->curr.fragment_shader;
+   unsigned iz_bits, aa_mode;
+   unsigned unit;
+
+   /* Start from an all-zero key. */
+   memset(key, 0, sizeof(*key));
+
+   /* PIPE_NEW_FRAGMENT_SHADER
+    * PIPE_NEW_DEPTH_STENCIL_ALPHA
+    */
+   iz_bits = brw->curr.zstencil->iz_lookup | fs->iz_lookup;
+
+   /* PIPE_NEW_RAST
+    * BRW_NEW_REDUCED_PRIMITIVE
+    * Points never get line antialiasing, lines get it exactly when
+    * line_smooth is set, and anything else uses the rasterizer state's
+    * unfilled_aa_line value.
+    */
+   if (brw->reduced_primitive == PIPE_PRIM_POINTS)
+      aa_mode = AA_NEVER;
+   else if (brw->reduced_primitive == PIPE_PRIM_LINES)
+      aa_mode = brw->curr.rast->templ.line_smooth ? AA_ALWAYS : AA_NEVER;
+   else
+      aa_mode = brw->curr.rast->unfilled_aa_line;
+
+   brw_wm_lookup_iz(aa_mode, iz_bits, fs->uses_depth, key);
+
+   /* PIPE_NEW_RAST */
+   key->flat_shade = brw->curr.rast->templ.flatshade;
+
+   /* PIPE_NEW_BOUND_TEXTURES: one bit per sampler view, set in the mask
+    * that matches its packed YUV format.
+    */
+   for (unit = 0; unit < brw->curr.num_fragment_sampler_views; unit++) {
+      const struct brw_texture *tex =
+         brw_texture(brw->curr.fragment_sampler_views[unit]->texture);
+
+      switch (tex->b.b.format) {
+      case PIPE_FORMAT_UYVY:
+         key->yuvtex_mask |= 1 << unit;
+         break;
+      case PIPE_FORMAT_YUYV:
+         key->yuvtex_swap_mask |= 1 << unit;
+         break;
+      default:
+         break;
+      }
+
+      /* XXX: shadow texture
+       */
+      /* key->shadowtex_mask |= 1<<unit; */
+   }
+
+   /* CACHE_NEW_VS_PROG */
+   key->vp_nr_outputs = brw->vs.prog_data->nr_outputs;
+
+   key->nr_cbufs = brw->curr.fb.nr_cbufs;
+   key->nr_inputs = fs->info.num_inputs;
+
+   /* The unique fragment program ID */
+   key->program_string_id = fs->id;
+}
+
+
+/* State atom hook: find or build the WM program for the current state. */
+static enum pipe_error brw_prepare_wm_prog(struct brw_context *brw)
+{
+   struct brw_fragment_shader *shader = brw->curr.fragment_shader;
+   struct brw_wm_prog_key key;
+
+   /* Build the key describing the WM program the current state needs. */
+   brw_wm_populate_key(brw, &key);
+
+   /* Make an early check for the key: the search is handed
+    * brw->wm.prog_data and brw->wm.prog_bo as its outputs, so a hit
+    * leaves nothing further to do here.
+    */
+   if (brw_search_cache(&brw->cache, BRW_WM_PROG,
+                        &key, sizeof(key),
+                        NULL, 0,
+                        &brw->wm.prog_data,
+                        &brw->wm.prog_bo))
+      return PIPE_OK;
+
+   /* Cache miss: compile and upload, passing any error code through. */
+   return do_wm_prog(brw, shader, &key, &brw->wm.prog_bo);
+}
+
+
+const struct brw_tracked_state brw_wm_prog = {
+   .prepare = brw_prepare_wm_prog,     /* no .emit hook is installed */
+   .dirty = {
+      .cache = CACHE_NEW_VS_PROG,
+      .brw   = (BRW_NEW_REDUCED_PRIMITIVE |
+                BRW_NEW_WM_INPUT_DIMENSIONS),
+      .mesa  = (PIPE_NEW_BOUND_TEXTURES |
+                PIPE_NEW_NR_CBUFS |
+                PIPE_NEW_RAST |
+                PIPE_NEW_DEPTH_STENCIL_ALPHA |
+                PIPE_NEW_FRAGMENT_SHADER),
+   },
+};
+
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
new file mode 100644
index 0000000000..f1ca9f6369
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -0,0 +1,344 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+              
+
+#ifndef BRW_WM_H
+#define BRW_WM_H
+
+#include "brw_context.h"
+#include "brw_eu.h"
+/* Extra flag bit tested alongside the writemask bits, e.g. "mask & SATURATE" in emit_ddxy. */
+#define SATURATE (1<<5)
+
+/* A big lookup table is used to figure out which and how many
+ * additional regs will be inserted before the main payload in the WM
+ * program execution.  These mainly relate to depth and stencil
+ * processing and the early-depth-test optimization.
+ */
+#define IZ_PS_KILL_ALPHATEST_BIT    0x1
+#define IZ_PS_COMPUTES_DEPTH_BIT    0x2
+#define IZ_DEPTH_WRITE_ENABLE_BIT   0x4
+#define IZ_DEPTH_TEST_ENABLE_BIT    0x8
+#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10
+#define IZ_STENCIL_TEST_ENABLE_BIT  0x20
+#define IZ_BIT_MAX                  0x40 /* one more than the six bits above OR'ed together (0x3f) */
+/* NOTE(review): presumably the values taken by brw_wm_lookup_iz()'s line_aa argument -- confirm */
+#define AA_NEVER     0
+#define AA_SOMETIMES 1
+#define AA_ALWAYS    2
+/* State a compiled WM program depends on; brw_prepare_wm_prog hands it to the cache as (&key, sizeof(key)). */
+struct brw_wm_prog_key {
+   GLuint source_depth_reg:3;
+   GLuint aa_dest_stencil_reg:3;
+   GLuint dest_depth_reg:3;
+   GLuint nr_depth_regs:3;
+   GLuint computes_depth:1;
+   GLuint source_depth_to_render_target:1;
+   GLuint flat_shade:1;	/* copied from the rasterizer template's flatshade */
+   GLuint runtime_check_aads_emit:1;
+
+   GLuint shadowtex_mask:16;	/* not filled in yet (see the XXX in brw_wm_populate_key) */
+   GLuint yuvtex_mask:16;	/* bit i set when fragment sampler view i is PIPE_FORMAT_UYVY */
+   GLuint yuvtex_swap_mask:16;	/* UV swapped: bit i set when view i is PIPE_FORMAT_YUYV */
+
+   GLuint vp_nr_outputs:6;	/* brw->vs.prog_data->nr_outputs */
+   GLuint nr_inputs:6;	/* the fragment shader's info.num_inputs */
+   GLuint nr_cbufs:3;	/* brw->curr.fb.nr_cbufs */
+   GLuint has_flow_control:1;
+
+   GLuint program_string_id;	/* the fragment shader's unique id */
+};
+
+
+/* A bit of a glossary:
+ *
+ * brw_wm_value: A computed value or program input.  Values are
+ * constant, they are created once and are never modified.  When a
+ * fragment program register is written or overwritten, new values are
+ * created fresh, preserving the rule that values are constant.
+ *
+ * brw_wm_ref: A reference to a value.  Wherever a value is used by an
+ * instruction or as a program output, that is tracked with an
+ * instance of this struct.  All references to a value occur after it
+ * is created.  After the last reference, a value is dead and can be
+ * discarded.
+ *
+ * brw_wm_grf: Represents a physical hardware register.  May be either
+ * empty or hold a value.  Register allocation is the process of
+ * assigning values to grf registers.  This occurs in pass2 and the
+ * brw_wm_grf struct is not used before that.
+ *
+ * Fragment program registers: These are time-varying constructs that
+ * are hard to reason about and which we translate away in pass0.  A
+ * single fragment program register element (eg. temp[0].x) will be
+ * translated to one or more brw_wm_value structs, one for each time
+ * that temp[0].x is written to during the program. 
+ */
+
+
+
+/* Used in pass2 to track register allocation.
+ */
+struct brw_wm_grf {
+   struct brw_wm_value *value;	/* value held by this register; presumably NULL when empty */
+   GLuint nextuse;	/* NOTE(review): presumably an instruction index -- confirm in pass2 */
+};
+/* A computed value or program input (see the glossary above). */
+struct brw_wm_value {
+   struct brw_reg hw_reg;	/* emitted to this reg, may not always be there */
+   struct brw_wm_ref *lastuse;	/* presumably the final reference; earlier ones chain via prevuse */
+   struct brw_wm_grf *resident; 	/* presumably the grf currently holding this value, if any */
+   GLuint contributes_to_output:1;
+   GLuint spill_slot:16;	/* if non-zero, spill immediately after calculation */
+};
+/* One use of a value by an instruction or program output (see the glossary above). */
+struct brw_wm_ref {
+   struct brw_reg hw_reg;	/* nr filled in in pass2, everything else, pass0 */
+   struct brw_wm_value *value;	/* the value being referenced */
+   struct brw_wm_ref *prevuse;	/* presumably the preceding reference to the same value */
+   GLuint unspill_reg:7;	/* unspill to reg */
+   GLuint emitted:1;
+   GLuint insn:24;	/* presumably the index of the using instruction -- TODO confirm */
+};
+/* Instruction as stored in brw_wm_compile::instruction[]. */
+struct brw_wm_instruction {
+   struct brw_wm_value *dst[4];	/* one slot per channel; NULL slots print as "#" */
+   struct brw_wm_ref *src[3][4];	/* [argument][channel]; NULL slots print as "%" */
+   GLuint opcode:8;
+   GLuint saturate:1;
+   GLuint writemask:4;
+   GLuint sampler:4;
+   GLuint tex_unit:4;   /* texture/sampler unit for texture instructions */
+   GLuint target:4;     /* TGSI_TEXTURE_x for texture instructions,
+                         * target binding table index for FB_WRITE
+                         */
+   GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
+};
+
+/* Compile-time limits; most of them size the arrays in struct brw_wm_compile. */
+#define BRW_WM_MAX_INSN  2048
+#define BRW_WM_MAX_GRF   128		/* hardware limit */
+#define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4) /* matches dst[4] per instruction */
+#define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12) /* matches src[3][4] per instruction */
+#define BRW_WM_MAX_PARAM 256
+#define BRW_WM_MAX_CONST 256
+#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS /* NOTE(review): not defined in this header -- confirm it is still available */
+#define BRW_WM_MAX_SUBROUTINE 16
+
+
+/* New opcodes to track internal operations required for WM unit.
+ * These are added early so that the registers used can be tracked,
+ * freed and reused like those of other instructions.
+ */
+#define MAX_OPCODE        TGSI_OPCODE_LAST /* presumably one past the last TGSI opcode -- confirm */
+#define WM_PIXELXY        (MAX_OPCODE)
+#define WM_DELTAXY        (MAX_OPCODE + 1)
+#define WM_PIXELW         (MAX_OPCODE + 2)
+#define WM_LINTERP        (MAX_OPCODE + 3)
+#define WM_PINTERP        (MAX_OPCODE + 4)
+#define WM_CINTERP        (MAX_OPCODE + 5)
+#define WM_WPOSXY         (MAX_OPCODE + 6)
+#define WM_FB_WRITE       (MAX_OPCODE + 7)
+#define WM_FRONTFACING    (MAX_OPCODE + 8)
+#define MAX_WM_OPCODE     (MAX_OPCODE + 9) /* one past the last WM opcode */
+
+#define BRW_FILE_PAYLOAD   (TGSI_FILE_COUNT) /* extra register file numbered after the TGSI ones */
+#define PAYLOAD_DEPTH      (PIPE_MAX_SHADER_INPUTS) /* ?? */
+/* Channel indices, e.g. BRW_GET_SWZ(swizzle, X).  NOTE(review): very short macro names; beware clashes in includers. */
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+/* Source operand of a brw_fp_instruction; the bitfields total 31 bits. */
+struct brw_fp_src {
+   unsigned file:4;	/* register file number (presumably TGSI_FILE_x or BRW_FILE_PAYLOAD) */
+   unsigned index:16;
+   unsigned swizzle:8;	/* printed via print_swizzle(); BRW_SWIZZLE_XYZW is the identity */
+   unsigned indirect:1;
+   unsigned negate:1;
+   unsigned abs:1;
+};
+/* Destination operand of a brw_fp_instruction; the bitfields total 26 bits. */
+struct brw_fp_dst {
+   unsigned file:4;	/* register file number (presumably TGSI_FILE_x or BRW_FILE_PAYLOAD) */
+   unsigned index:16;
+   unsigned writemask:4;	/* one bit per channel */
+   unsigned indirect:1;
+   unsigned saturate:1;	/* printed as an "_SAT" suffix on the opcode */
+};
+/* Instruction as stored in brw_wm_compile::fp_instructions[]; the trailing bitfields total 32 bits. */
+struct brw_fp_instruction {
+   struct brw_fp_dst dst;
+   struct brw_fp_src src[3];	/* the first brw_wm_nr_args(opcode) entries are printed */
+   unsigned opcode:8;
+   unsigned target:8; /* XXX: special usage for FB_WRITE */
+   unsigned tex_unit:4;
+   unsigned sampler:4;
+   unsigned pad:8;
+};
+
+/* State for one WM program compile; every brw_wm_pass* / brw_wm_emit / brw_wm_print_* entry point takes it. */
+struct brw_wm_compile {
+   struct brw_compile func;	/* instruction emitter state; emit helpers work on &c->func */
+   struct brw_wm_prog_key key;
+   struct brw_wm_prog_data prog_data;
+
+   struct brw_fragment_shader *fp;
+
+   GLfloat (*env_param)[4];
+
+   enum {
+      START,
+      PASS2_DONE
+   } state;	/* the print helpers show hardware registers once state >= PASS2_DONE */
+
+   /* Initial pass - translate fp instructions to fp instructions,
+    * simplifying and adding instructions for interpolation and
+    * framebuffer writes.
+    */
+   struct {
+      GLfloat v[4];
+      unsigned nr;
+   } immediate[BRW_WM_MAX_CONST+3];
+   GLuint nr_immediates;
+   
+   struct brw_fp_instruction fp_instructions[BRW_WM_MAX_INSN];
+   GLuint nr_fp_insns;	/* number of entries of fp_instructions[] walked by brw_wm_print_fp_program */
+   GLuint fp_temp;
+   GLuint fp_interp_emitted;
+   GLuint fp_fragcolor_emitted;
+   GLuint fp_first_internal_temp;
+
+   struct brw_fp_src fp_pixel_xy;
+   struct brw_fp_src fp_delta_xy;
+   struct brw_fp_src fp_pixel_w;
+
+
+   /* Subsequent passes using SSA representation:
+    */
+   struct brw_wm_value vreg[BRW_WM_MAX_VREG];	/* printed as "rN" */
+   GLuint nr_vreg;
+
+   struct brw_wm_value creg[BRW_WM_MAX_PARAM];	/* printed as "cN" */
+   GLuint nr_creg;
+
+   struct {
+      struct brw_wm_value depth[4]; /* includes r0/r1 */
+      struct brw_wm_value input_interp[PIPE_MAX_SHADER_INPUTS];
+   } payload;
+
+
+   const struct brw_wm_ref *pass0_fp_reg[BRW_FILE_PAYLOAD+1][256][4];
+
+   struct brw_wm_ref undef_ref;
+   struct brw_wm_value undef_value;	/* printed as "undef" */
+
+   struct brw_wm_ref refs[BRW_WM_MAX_REF];
+   GLuint nr_refs;
+
+   struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
+   GLuint nr_insns;	/* number of entries of instruction[] walked by brw_wm_print_program */
+
+   struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
+
+   GLuint grf_limit;
+   GLuint max_wm_grf;
+   GLuint last_scratch;
+
+   GLuint cur_inst;  /**< index of current instruction */
+
+   GLboolean out_of_regs;  /**< ran out of GRF registers? */
+
+   /** Mapping from Mesa registers to hardware registers */
+   struct {
+      GLboolean inited;
+      struct brw_reg reg;
+   } wm_regs[BRW_FILE_PAYLOAD+1][256][4];
+
+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
+   struct brw_reg stack;
+   struct brw_reg emit_mask_reg;
+   GLuint tmp_regs[BRW_WM_MAX_GRF];
+   GLuint tmp_index;
+   GLuint tmp_max;
+   GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+   GLuint dispatch_width;
+
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
+
+   GLuint error;
+};
+
+/* Opcode queries; the print routines loop over brw_wm_nr_args(opcode) source arguments. */
+GLuint brw_wm_nr_args( GLuint opcode );
+GLuint brw_wm_is_scalar_result( GLuint opcode );
+/* Compiler stages.  NOTE(review): presumably run in the order listed -- confirm against the caller. */
+int brw_wm_pass_fp( struct brw_wm_compile *c );
+void brw_wm_pass0( struct brw_wm_compile *c );
+void brw_wm_pass1( struct brw_wm_compile *c );
+void brw_wm_pass2( struct brw_wm_compile *c );
+void brw_wm_emit( struct brw_wm_compile *c );
+/* Debug dumps (brw_wm_debug.c); all output goes through debug_printf(). */
+void brw_wm_print_value( struct brw_wm_compile *c,
+			 struct brw_wm_value *value );
+
+void brw_wm_print_ref( struct brw_wm_compile *c,
+		       struct brw_wm_ref *ref );
+
+void brw_wm_print_insn( struct brw_wm_compile *c,
+			struct brw_wm_instruction *inst );
+
+void brw_wm_print_program( struct brw_wm_compile *c,
+			   const char *stage );
+
+void brw_wm_print_fp_program( struct brw_wm_compile *c,
+                              const char *stage );
+/* NOTE(review): presumably 'lookup' is a combination of the IZ_*_BIT flags and 'line_aa' one of AA_* -- confirm */
+void brw_wm_lookup_iz( GLuint line_aa,
+		       GLuint lookup,
+		       GLboolean ps_uses_depth,
+		       struct brw_wm_prog_key *key );
+
+void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c);
+/* Defined in brw_wm_emit.c; 'mask' may carry SATURATE in addition to the writemask bits. */
+void emit_ddxy(struct brw_compile *p,
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       GLboolean is_ddx,
+	       const struct brw_reg *arg0);
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_wm_constant_buffer.c b/src/gallium/drivers/i965/brw_wm_constant_buffer.c
new file mode 100644
index 0000000000..df5cd0398c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_constant_buffer.c
@@ -0,0 +1,165 @@
+/* XXX: Constant buffers disabled
+ */
+
+
+/**
+ * Create the constant buffer surface.  Vertex/fragment shader constants will be
+ * read from this buffer with Data Port Read instructions/messages.
+ * \param brw     context whose surface_cache the surface state is uploaded to
+ * \param key     surface description; width, pitch, cpp, tiling and bo are read
+ * \param bo_out  passed through to brw_upload_cache to receive the result
+ * \return PIPE_OK, or the error reported by brw_upload_cache
+ */
+enum pipe_error
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key,
+                             struct brw_winsys_buffer **bo_out )
+{
+   const GLint w = key->width - 1;
+   struct brw_surface_state surf;
+   struct brw_winsys_reloc reloc[1];
+   enum pipe_error ret;
+
+   /* Emit relocation to surface contents */
+   make_reloc(&reloc[0], BRW_USAGE_SAMPLER, 0,
+              offsetof(struct brw_surface_state, ss1), key->bo);
+
+   memset(&surf, 0, sizeof(surf));
+
+   surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   surf.ss0.surface_type = BRW_SURFACE_BUFFER;
+   surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   surf.ss1.base_addr = 0; /* reloc */
+
+   surf.ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
+   surf.ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
+   surf.ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
+   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
+   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
+
+   /* NOTE(review): confirm this argument list against brw_upload_cache's prototype */
+   ret = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+                          key, sizeof(*key),
+                          reloc, Elements(reloc),
+                          &surf, sizeof(surf),
+                          NULL, NULL,
+                          bo_out);
+   if (ret)
+      return ret;
+   return PIPE_OK;
+}
+
+
+
+/**
+ * Update the surface state for a WM constant buffer.
+ * The constant buffer will be (re)allocated here if needed.
+ * \param brw   context; reads curr.fragment_shader and curr.fragment_constants
+ * \param surf  index of the brw->wm.surf_bo[] slot to update
+ * \return PIPE_OK, or the first error reported by a helper
+ */
+static enum pipe_error
+brw_update_wm_constant_surface( struct brw_context *brw,
+                                GLuint surf)
+{
+   struct brw_surface_key key;
+   struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+   struct pipe_resource *cbuf = brw->curr.fragment_constants;
+   int pitch;
+   enum pipe_error ret;
+
+   /* If we're in this state update atom, we need to update WM constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   ret = brw_wm_update_constant_buffer(brw, &fp->const_buffer);
+   if (ret)
+      return ret;
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (cbuf == NULL) {
+      bo_reference(&brw->wm.surf_bo[surf], NULL);
+      return PIPE_OK;
+   }
+
+   /* cbuf is known to be non-NULL from here on: size in units of 4 floats */
+   pitch = cbuf->size / (4 * sizeof(float));
+
+   memset(&key, 0, sizeof(key));
+
+   key.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   key.ss0.surface_type = BRW_SURFACE_BUFFER;
+   key.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   key.bo = brw_buffer(cbuf)->bo;
+
+   key.ss2.width = (pitch-1) & 0x7f;            /* bits 6:0 of size or width */
+   key.ss2.height = ((pitch-1) >> 7) & 0x1fff;  /* bits 19:7 of size or width */
+   key.ss3.depth = ((pitch-1) >> 20) & 0x7f;    /* bits 26:20 of size or width */
+   key.ss3.pitch = (pitch * 4 * sizeof(float)) - 1; /* ignored?? */
+   /* NOTE(review): 'surf' is an index and 'key' is not a pointer here -- this call needs reworking */
+   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
+
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &key, sizeof(key),
+                        &key.bo, 1,
+                        NULL,
+                        &brw->wm.surf_bo[surf]))
+      return PIPE_OK;
+
+   ret = brw_create_constant_surface(brw, &key, &brw->wm.surf_bo[surf]);
+   if (ret)
+      return ret;
+
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+   return PIPE_OK;
+}
+
+/**
+ * Updates surface / buffer for fragment shader constant buffer, if
+ * one is required.
+ *
+ * This consumes the state updates for the constant buffer, and produces
+ * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
+ * inclusion in the binding table.
+ * \return PIPE_OK, or the first error reported by a helper
+ */
+static enum pipe_error prepare_wm_constant_surface(struct brw_context *brw )
+{
+   struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
+   enum pipe_error ret;
+
+   ret = brw_wm_update_constant_buffer(brw, &fp->const_buffer);
+   if (ret)
+      return ret;
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      if (brw->wm.surf_bo[surf] != NULL) {
+	 bo_reference(&brw->wm.surf_bo[surf], NULL);
+	 brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+      }
+      return PIPE_OK;
+   }
+
+   ret = brw_update_wm_constant_surface(brw, surf);
+   if (ret)
+      return ret;
+
+   return PIPE_OK;
+}
+/* Tracked-state atom whose .prepare hook is prepare_wm_constant_surface. */
+const struct brw_tracked_state brw_wm_constant_surface = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS), /* NOTE(review): brw_wm_prog uses PIPE_NEW_* flags in this slot -- confirm this name */
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_wm_constant_surface,
+};
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
new file mode 100644
index 0000000000..e2767264e7
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -0,0 +1,257 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_wm.h"
+/* Print a ".xyz"-style suffix for a partial writemask; a full mask prints nothing. */
+static void print_writemask( unsigned writemask )
+{
+   const char *x = (writemask & BRW_WRITEMASK_X) ? "x" : "";
+   const char *y = (writemask & BRW_WRITEMASK_Y) ? "y" : "";
+   const char *z = (writemask & BRW_WRITEMASK_Z) ? "z" : "";
+   const char *w = (writemask & BRW_WRITEMASK_W) ? "w" : "";
+   if (writemask != BRW_WRITEMASK_XYZW)
+      debug_printf(".%s%s%s%s", x, y, z, w);
+}
+/* Print a ".xyzw"-style suffix for any swizzle other than BRW_SWIZZLE_XYZW. */
+static void print_swizzle( unsigned swizzle )
+{
+   const char *swz = "xyzw";	/* const: points at a string literal */
+   if (swizzle != BRW_SWIZZLE_XYZW)
+      debug_printf(".%c%c%c%c", 
+		   swz[BRW_GET_SWZ(swizzle, X)],
+		   swz[BRW_GET_SWZ(swizzle, Y)],
+		   swz[BRW_GET_SWZ(swizzle, Z)],
+		   swz[BRW_GET_SWZ(swizzle, W)]);
+}
+/* Print the mnemonic of an opcode, without a trailing newline.  The
+ * WM-internal opcodes (WM_PIXELXY .. WM_FRONTFACING) are named from a local
+ * table; every other value is looked up through tgsi_get_opcode_info().
+ */
+static void print_opcode( unsigned opcode )
+{
+   /* Names of the WM-internal opcodes, indexed by (opcode - WM_PIXELXY).
+    * Those opcodes are consecutive, starting at WM_PIXELXY and ending just
+    * before MAX_WM_OPCODE.
+    */
+   static const char *const wm_names[MAX_WM_OPCODE - WM_PIXELXY] = {
+      [WM_PIXELXY     - WM_PIXELXY] = "PIXELXY",
+      [WM_DELTAXY     - WM_PIXELXY] = "DELTAXY",
+      [WM_PIXELW      - WM_PIXELXY] = "PIXELW",
+      [WM_WPOSXY      - WM_PIXELXY] = "WPOSXY",
+      [WM_PINTERP     - WM_PIXELXY] = "PINTERP",
+      [WM_LINTERP     - WM_PIXELXY] = "LINTERP",
+      [WM_CINTERP     - WM_PIXELXY] = "CINTERP",
+      [WM_FB_WRITE    - WM_PIXELXY] = "FB_WRITE",
+      [WM_FRONTFACING - WM_PIXELXY] = "FRONTFACING",
+   };
+   const char *name;
+
+   if (opcode >= WM_PIXELXY && opcode < MAX_WM_OPCODE) {
+      /* Driver-private opcode. */
+      name = wm_names[opcode - WM_PIXELXY];
+   }
+   else {
+      /* Anything else is handed to the TGSI opcode table. */
+      name = tgsi_get_opcode_info(opcode)->mnemonic;
+   }
+
+   /* The mnemonic is always passed as data, never as the format string. */
+   debug_printf("%s", name);
+
+}
+/* Print a short name for a value: its hardware register once pass2 is done,
+ * otherwise "undef", or rN / cN / iN / dN by the array of \p c it lives in.
+ */
+void brw_wm_print_value( struct brw_wm_compile *c,
+		       struct brw_wm_value *value )
+{
+   assert(value);
+   if (c->state >= PASS2_DONE) 
+      brw_print_reg(value->hw_reg);
+   else if( value == &c->undef_value )
+      debug_printf("undef");
+   else if (value - c->vreg >= 0 && value - c->vreg < BRW_WM_MAX_VREG)
+      debug_printf("r%d", (int) (value - c->vreg)); /* ptrdiff_t is not int */
+   else if (value - c->creg >= 0 && value - c->creg < BRW_WM_MAX_PARAM)
+      debug_printf("c%d", (int) (value - c->creg));
+   else if (value - c->payload.input_interp >= 0 &&
+	    value - c->payload.input_interp < PIPE_MAX_SHADER_INPUTS)
+      debug_printf("i%d", (int) (value - c->payload.input_interp));
+   else if (value - c->payload.depth >= 0 && /* bound is the real array length */
+	    value - c->payload.depth < (int) (sizeof c->payload.depth / sizeof c->payload.depth[0]))
+      debug_printf("d%d", (int) (value - c->payload.depth));
+   else 
+      debug_printf("?");
+}
+/* Print a reference: an optional "UNSPILL(slot)/" prefix, then the register or value. */
+void brw_wm_print_ref( struct brw_wm_compile *c,
+		       struct brw_wm_ref *ref )
+{
+   const struct brw_reg reg = ref->hw_reg;
+
+   if (ref->unspill_reg != 0)
+      debug_printf("UNSPILL(%x)/", ref->value->spill_slot);
+
+   /* From PASS2_DONE onwards the hardware register itself is printed. */
+   if (c->state >= PASS2_DONE) {
+      brw_print_reg(reg);
+      return;
+   }
+   debug_printf("%s", reg.negate ? "-" : "");
+   debug_printf("%s", reg.abs ? "abs/" : "");
+   brw_wm_print_value(c, ref->value);
+   if ((reg.nr & 1) != 0 || reg.subnr != 0)
+      debug_printf("->%d.%d", (reg.nr&1), reg.subnr);
+}
+/* Print one instruction on a single line, in the form
+ *    [dst,dst,dst,dst].mask = OPCODE[_SAT] [src,src,src,src] ...
+ * Missing destinations print as "#", missing sources as "%".
+ */
+void brw_wm_print_insn( struct brw_wm_compile *c,
+			struct brw_wm_instruction *inst )
+{
+   const GLuint nr_args = brw_wm_nr_args(inst->opcode);
+   GLuint chan, a;
+
+   /* Destinations, one slot per channel. */
+   debug_printf("[");
+   for (chan = 0; chan < 4; chan++) {
+      struct brw_wm_value *dst = inst->dst[chan];
+      if (!dst)
+	 debug_printf("#");
+      else {
+	 brw_wm_print_value(c, dst);
+	 if (dst->spill_slot)
+	    debug_printf("/SPILL(%x)", dst->spill_slot);
+      }
+      if (chan != 3)
+	 debug_printf(",");
+   }
+   debug_printf("]");
+   print_writemask(inst->writemask);
+
+   /* Operation. */
+   debug_printf(" = ");
+   print_opcode(inst->opcode);
+   if (inst->saturate)
+      debug_printf("_SAT");
+
+   /* Sources: one bracketed group per argument. */
+   for (a = 0; a < nr_args; a++) {
+      debug_printf(" [");
+      for (chan = 0; chan < 4; chan++) {
+	 struct brw_wm_ref *src = inst->src[a][chan];
+	 if (src)
+	    brw_wm_print_ref(c, src);
+	 else
+	    debug_printf("%%");
+	 debug_printf("%s", chan < 3 ? "," : "]");
+      }
+   }
+   debug_printf("\n");
+}
+/* Dump the first nr_insns entries of c->instruction[] under a "<stage>:" heading. */
+void brw_wm_print_program( struct brw_wm_compile *c,
+			   const char *stage )
+{
+   GLuint idx = 0;
+
+   debug_printf("%s:\n", stage);
+   while (idx < c->nr_insns)
+      brw_wm_print_insn(c, &c->instruction[idx++]);
+   debug_printf("\n");
+}
+/* Register-file names indexed by the 'file' field of brw_fp_src / brw_fp_dst. */
+static const char *file_strings[TGSI_FILE_COUNT+1] = {
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMPLER",
+   "ADDR",
+   "IMM",
+   "PRED",
+   "SV", /* NOTE(review): 11 names are listed, so "PAYLOAD" lands on BRW_FILE_PAYLOAD only if TGSI_FILE_COUNT == 10 -- confirm */
+   "PAYLOAD"
+};
+/* Print one entry of fp_instructions[] as "OPCODE[_SAT] dst, src, ..." plus a newline. */
+static void brw_wm_print_fp_insn( struct brw_wm_compile *c,
+                                  struct brw_fp_instruction *inst )
+{
+   const struct brw_fp_dst *dst = &inst->dst;
+   const GLuint nr_args = brw_wm_nr_args(inst->opcode);
+   GLuint a;
+
+   print_opcode(inst->opcode);
+   if (dst->saturate)
+      debug_printf("_SAT");
+   debug_printf(" ");
+
+   /* Destination; an indirect one is wrapped in an extra pair of brackets. */
+   if (dst->indirect)
+      debug_printf("[");
+   debug_printf("%s[%d]", file_strings[dst->file], dst->index);
+   print_writemask(dst->writemask);
+   if (dst->indirect)
+      debug_printf("]");
+   /* Separator before the sources, or end of line when there are none. */
+   if (nr_args)
+      debug_printf(", ");
+   else
+      debug_printf("\n");
+
+   for (a = 0; a < nr_args; a++) {
+      const struct brw_fp_src *src = &inst->src[a];
+      debug_printf("%s%s%s[%d]%s",
+                   src->negate ? "-" : "", src->abs ? "ABS(" : "",
+                   file_strings[src->file], src->index,
+                   src->abs ? ")" : "");
+      print_swizzle(src->swizzle);
+      debug_printf("%s", a == nr_args - 1 ? "\n" : ", ");
+   }
+}
+
+/* Dump the first nr_fp_insns entries of c->fp_instructions[] under a "<stage>:" heading. */
+void brw_wm_print_fp_program( struct brw_wm_compile *c,
+                              const char *stage )
+{
+   GLuint idx = 0;
+
+   debug_printf("%s:\n", stage);
+   while (idx < c->nr_fp_insns)
+      brw_wm_print_fp_insn(c, &c->fp_instructions[idx++]);
+   debug_printf("\n");
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
new file mode 100644
index 0000000000..8f983a60ae
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -0,0 +1,1521 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_math.h"
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_wm.h"
+#include "brw_debug.h"
+#include "brw_disasm.h"
+
+/* Return \p reg advanced by one register number when its vertical stride is
+ * non-zero, else \p reg unchanged.  The original author's caveat stands:
+ * the horizontal vs. vertical stride handling here is not fully understood.
+ */
+static INLINE struct brw_reg sechalf( struct brw_reg reg )
+{
+   reg.nr += reg.vstride ? 1 : 0;
+   return reg;
+}
+
+/* Payload R0:
+ *
+ * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 quads,
+ *         corresponding to each of the 16 execution channels.
+ * R0.1..8 -- ?
+ * R1.0 -- triangle vertex 0.X
+ * R1.1 -- triangle vertex 0.Y
+ * R1.2 -- quad 0 x,y coords (2 packed uwords)
+ * R1.3 -- quad 1 x,y coords (2 packed uwords)
+ * R1.4 -- quad 2 x,y coords (2 packed uwords)
+ * R1.5 -- quad 3 x,y coords (2 packed uwords)
+ * R1.6 -- ?
+ * R1.7 -- ?
+ * R1.8 -- ?
+ */
+
+/* Emit the adds that turn the packed per-quad x,y coords held in r1 into
+ * per-pixel X (dst[0]) and Y (dst[1]) values, for the channels in \p mask.
+ */
+static void emit_pixel_xy(struct brw_compile *p,
+			  const struct brw_reg *dst,
+			  GLuint mask)
+{
+   const struct brw_reg quad_xy =
+      retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+   /* The adds are emitted with compression off; it is switched back on below. */
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+   /* Calculate pixel centers by adding 1 or 0 to each of the
+    * micro-tile coordinates passed in r1.
+    */
+   if (mask & BRW_WRITEMASK_X) {
+      struct brw_reg x_out = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      struct brw_reg x_in = stride(suboffset(quad_xy, 4), 2, 4, 0);
+      brw_ADD(p, x_out, x_in, brw_imm_v(0x10101010));
+   }
+
+   if (mask & BRW_WRITEMASK_Y) {
+      struct brw_reg y_out = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
+      struct brw_reg y_in = stride(suboffset(quad_xy, 5), 2, 4, 0);
+      brw_ADD(p, y_out, y_in, brw_imm_v(0x11001100));
+   }
+   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+}
+
+
+/* Emit dst[0] = arg0[0] - r1.0 and dst[1] = arg0[1] - r1.1 for the X / Y
+ * bits set in \p mask.
+ */
+static void emit_delta_xy(struct brw_compile *p,
+			  const struct brw_reg *dst,
+			  GLuint mask,
+			  const struct brw_reg *arg0)
+{
+   /* Calc delta X,Y by subtracting the origin in r1 from the pixel
+    * centers; the Y origin sits one element after the X origin.
+    */
+   const struct brw_reg origin_x = brw_vec1_grf(1, 0);
+
+   if (mask & BRW_WRITEMASK_X) {
+      struct brw_reg px = retype(arg0[0], BRW_REGISTER_TYPE_UW);
+
+      brw_ADD(p, dst[0], px, negate(origin_x));
+   }
+
+   if (mask & BRW_WRITEMASK_Y) {
+      struct brw_reg py = retype(arg0[1], BRW_REGISTER_TYPE_UW);
+      struct brw_reg origin_y = suboffset(origin_x, 1);
+
+      brw_ADD(p, dst[1], py, negate(origin_y));
+   }
+}
+/* Emit the window-position X/Y channels selected by \p mask as copies of arg0[0] / arg0[1]. */
+static void emit_wpos_xy(struct brw_wm_compile *c,
+			 const struct brw_reg *dst,
+			 GLuint mask,
+			 const struct brw_reg *arg0)
+{
+   struct brw_compile *p = &c->func;
+
+   if (mask & BRW_WRITEMASK_X) {
+      /* X' = X */
+      brw_MOV(p,
+	      dst[0],
+	      retype(arg0[0], BRW_REGISTER_TYPE_W));
+   }
+
+   /* XXX: is this needed any more, or is this a NOOP?  The live branch is
+    * Y' = Y (no flip), so it must write dst[1] from arg0[1]. */
+   if (mask & BRW_WRITEMASK_Y) {
+#if 0
+      /* Y' = height - 1 - Y */
+      brw_ADD(p,
+	      dst[1],
+	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
+	      brw_imm_d(c->key.drawable_height - 1));
+#else
+      brw_MOV(p,
+	      dst[1],
+	      retype(arg0[1], BRW_REGISTER_TYPE_W));
+#endif
+   }
+}
+
+/* Emit W into dst[3]: interpolate 1/w, then take its inverse.
+ * Nothing is emitted unless \p mask includes the W channel.
+ */
+static void emit_pixel_w( struct brw_compile *p,
+			  const struct brw_reg *dst,
+			  GLuint mask,
+			  const struct brw_reg *arg0,
+			  const struct brw_reg *deltas)
+{
+   struct brw_reg wpos_w;
+
+   /* Don't need this if all you are doing is interpolating color, for
+    * instance. */
+   if (!(mask & BRW_WRITEMASK_W))
+      return;
+
+   wpos_w = brw_vec1_grf(arg0[0].nr+1, 4);
+
+   /* Calc 1/w - just linterp wpos[3], with the result put straight into a message reg. */
+   brw_LINE(p, brw_null_reg(), wpos_w, deltas[0]);
+   brw_MAC(p, brw_message_reg(2), suboffset(wpos_w, 1), deltas[1]);
+
+   /* Calc w (the 2 below is presumably the message register written above) */
+   brw_math_16(p, dst[3],
+	       BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE,
+	       2, brw_null_reg(), BRW_MATH_PRECISION_FULL);
+}
+
+
+/* For each channel set in \p mask, emit a LINE + MAC pair combining that
+ * channel's coefficients (located from arg0[0].nr) with deltas[0] and
+ * deltas[1]; the MAC writes dst[channel].
+ */
+static void emit_linterp( struct brw_compile *p, 
+			 const struct brw_reg *dst,
+			 GLuint mask,
+			 const struct brw_reg *arg0,
+			 const struct brw_reg *deltas )
+{
+   const GLuint base = arg0[0].nr;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      /* Channels 0,1 use register base, 2,3 use base+1; subregister 0 or 4. */
+      struct brw_reg coef = brw_vec1_grf(base + chan / 2, (chan & 1) * 4);
+
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_LINE(p, brw_null_reg(), coef, deltas[0]);
+      brw_MAC(p, dst[chan], suboffset(coef, 1), deltas[1]);
+   }
+}
+
+/* Same LINE + MAC sequence as emit_linterp for every channel in \p mask,
+ * followed by a multiply of each of those channels by w[3].
+ */
+static void emit_pinterp( struct brw_compile *p, 
+			  const struct brw_reg *dst,
+			  GLuint mask,
+			  const struct brw_reg *arg0,
+			  const struct brw_reg *deltas,
+			  const struct brw_reg *w)
+{
+   const GLuint base = arg0[0].nr;
+   GLuint chan;
+
+   /* First pass: LINE + MAC per channel (all of these precede the MULs). */
+   for (chan = 0; chan < 4; chan++) {
+      struct brw_reg coef = brw_vec1_grf(base + chan / 2, (chan & 1) * 4);
+
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_LINE(p, brw_null_reg(), coef, deltas[0]);
+      brw_MAC(p, dst[chan], suboffset(coef, 1), deltas[1]);
+   }
+
+   /* Second pass: scale by w. */
+   for (chan = 0; chan < 4; chan++) {
+      if (mask & (1 << chan))
+	 brw_MUL(p, dst[chan], dst[chan], w[3]);
+   }
+}
+
+/* For each channel set in \p mask, copy the element at offset 3 of that
+ * channel's coefficient group (located from arg0[0].nr) into dst[channel].
+ */
+static void emit_cinterp( struct brw_compile *p, 
+			 const struct brw_reg *dst,
+			 GLuint mask,
+			 const struct brw_reg *arg0 )
+{
+   const GLuint base = arg0[0].nr;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      struct brw_reg coef = brw_vec1_grf(base + chan / 2, (chan & 1) * 4);
+
+      if (!(mask & (1 << chan)))
+	 continue;
+
+      /* TODO: optimize away like other moves */
+      brw_MOV(p, dst[chan], suboffset(coef, 3));
+   }
+}
+/* Sets the destination channels in \p mask to 1.0 or 0.0 according to
+ * gl_FrontFacing: 0.0 is written first, then 1.0 after the front-face compare. */
+static void emit_frontfacing( struct brw_compile *p,
+			      const struct brw_reg *dst,
+			      GLuint mask )
+{
+   struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
+   GLuint i;
+
+   if (!(mask & BRW_WRITEMASK_XYZW))
+      return;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_MOV(p, dst[i], brw_imm_f(0.0));
+      }
+   }
+
+   /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
+    * us front face.  The constant is built unsigned: 1 << 31 overflows int.
+    */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1u << 31));
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_MOV(p, dst[i], brw_imm_f(1.0));
+      }
+   }
+   brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: q0.tl q0.tr q0.bl q0.br q1.tl q1.tr q1.bl q1.br
+ *
+ * and we're trying to produce:
+ *
+ *           DDX                     DDY
+ * dst: (q0.tr - q0.tl)     (q0.tl - q0.bl)
+ *      (q0.tr - q0.tl)     (q0.tr - q0.br)
+ *      (q0.br - q0.bl)     (q0.tl - q0.bl)
+ *      (q0.br - q0.bl)     (q0.tr - q0.br)
+ *      (q1.tr - q1.tl)     (q1.tl - q1.bl)
+ *      (q1.tr - q1.tl)     (q1.tr - q1.br)
+ *      (q1.br - q1.bl)     (q1.tl - q1.bl)
+ *      (q1.br - q1.bl)     (q1.tr - q1.br)
+ *
+ * and add two more quads if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horizontal stride = 0 gets us
+ * the same result for each pair, and vertstride = 2 jumps us 2 elements after
+ * processing a pair. But for DDY, it's harder, as we want to produce the pairs
+ * swizzled between each other.  We could probably do it like ddx and swizzle
+ * the right order later, but bail for now and just produce
+ * ((q0.tl - q0.bl)x4 (q1.tl - q1.bl)x4).  is_ddx selects DDX, else DDY.
+ */
+void emit_ddxy(struct brw_compile *p,
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       GLboolean is_ddx,
+	       const struct brw_reg *arg0)
+{
+   int i;
+   struct brw_reg src0, src1;
+   /* Saturation is enabled around the emitted ADDs when mask has SATURATE. */
+   if (mask & SATURATE)
+      brw_set_saturate(p, 1);
+   for (i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 if (is_ddx) { /* src0 starts at element 1, src1 at element 0; <2;2,0> regions */
+	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_2,
+			   BRW_WIDTH_2,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_2,
+			   BRW_WIDTH_2,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+	 } else { /* src0 starts at element 0, src1 at element 2; <4;4,0> regions */
+	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_4,
+			   BRW_WIDTH_4,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_4,
+			   BRW_WIDTH_4,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+	 }
+	 brw_ADD(p, dst[i], src0, negate(src1)); /* dst = src0 - src1 */
+      }
+   }
+   if (mask & SATURATE)
+      brw_set_saturate(p, 0);
+}
+
+/* Apply the one-source op `func` to each channel enabled in mask, with
+ * saturation switched on around the run when mask carries SATURATE. */
+static void emit_alu1( struct brw_compile *p,
+		       struct brw_instruction *(*func)(struct brw_compile *,
+						       struct brw_reg, struct brw_reg),
+		       const struct brw_reg *dst,
+		       GLuint mask,
+		       const struct brw_reg *arg0 )
+{
+   const GLuint sat = mask & SATURATE;
+   GLuint chan;
+
+   if (sat)
+      brw_set_saturate(p, 1);
+
+   for (chan = 0; chan < 4; chan++)
+      if (mask & (1 << chan))
+	 func(p, dst[chan], arg0[chan]);
+
+   if (sat)
+      brw_set_saturate(p, 0);
+}
+
+
+/* Apply the two-source op `func` to each channel enabled in mask, with
+ * saturation switched on around the run when mask carries SATURATE. */
+static void emit_alu2( struct brw_compile *p,
+		       struct brw_instruction *(*func)(struct brw_compile *,
+						       struct brw_reg,
+						       struct brw_reg, struct brw_reg),
+		       const struct brw_reg *dst,
+		       GLuint mask,
+		       const struct brw_reg *arg0,
+		       const struct brw_reg *arg1 )
+{
+   const GLuint sat = mask & SATURATE;
+   GLuint chan;
+
+   if (sat)
+      brw_set_saturate(p, 1);
+
+   for (chan = 0; chan < 4; chan++)
+      if (mask & (1 << chan))
+	 func(p, dst[chan], arg0[chan], arg1[chan]);
+
+   if (sat)
+      brw_set_saturate(p, 0);
+}
+
+
+/* MAD: dst = arg0 * arg1 + arg2 per enabled channel; only the final ADD saturates. */
+static void emit_mad( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1,
+		      const struct brw_reg *arg2 )
+{
+   const GLuint sat = (mask & SATURATE) ? 1 : 0;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_MUL(p, dst[chan], arg0[chan], arg1[chan]);
+      brw_set_saturate(p, sat);
+      brw_ADD(p, dst[chan], dst[chan], arg2[chan]);
+      brw_set_saturate(p, 0);
+   }
+}
+
+/* TRUNC: dst = RNDZ(arg0) on each channel enabled in mask.
+ *
+ * Goes through emit_alu1() so that a SATURATE flag in mask is honoured in
+ * the same way as for the other one-source opcodes (FRC, FLR, MOV); the
+ * open-coded loop this replaces silently dropped the flag.
+ */
+static void emit_trunc( struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0)
+{
+   emit_alu1(p, brw_RNDZ, dst, mask, arg0);
+}
+
+/* LRP: per enabled channel, dst = arg0 * arg1 + (1 - arg0) * arg2.
+ * dst doubles as scratch for (1 - arg0); the MUL targets the null register
+ * and the MAC after it (presumably accumulating onto that product) writes
+ * the final, optionally saturated, value. */
+static void emit_lrp( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1,
+		      const struct brw_reg *arg2 )
+{
+   const GLuint sat = (mask & SATURATE) ? 1 : 0;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(mask & (1 << chan)))
+	 continue;
+      /* Can I use the LINE instruction for this? */
+      brw_ADD(p, dst[chan], negate(arg0[chan]), brw_imm_f(1.0));
+      brw_MUL(p, brw_null_reg(), dst[chan], arg2[chan]);
+      brw_set_saturate(p, sat);
+      brw_MAC(p, dst[chan], arg0[chan], arg1[chan]);
+      brw_set_saturate(p, 0);
+   }
+}
+
+/* Set-on-compare: per enabled channel, dst = 0.0, CMP arg0 `cond` arg1, then
+ * dst = 1.0 (presumably predicated on the CMP); flag value reset to 0xff. */
+static void emit_sop( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask, GLuint cond,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_MOV(p, dst[chan], brw_imm_f(0));
+      brw_CMP(p, brw_null_reg(), cond, arg0[chan], arg1[chan]);
+      brw_MOV(p, dst[chan], brw_imm_f(1.0));
+      brw_set_predicate_control_flag_value(p, 0xff);
+   }
+}
+
+/* SLT: emit_sop() with the "less than" condition (arg0 < arg1). */
+static void emit_slt( struct brw_compile *p, const struct brw_reg *dst,
+		      GLuint mask, const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_L;
+   emit_sop(p, dst, mask, cond, arg0, arg1);
+}
+
+/* SLE: emit_sop() with the "less or equal" condition (arg0 <= arg1). */
+static void emit_sle( struct brw_compile *p, const struct brw_reg *dst,
+		      GLuint mask, const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_LE;
+   emit_sop(p, dst, mask, cond, arg0, arg1);
+}
+
+/* SGT: emit_sop() with the "greater than" condition (arg0 > arg1). */
+static void emit_sgt( struct brw_compile *p, const struct brw_reg *dst,
+		      GLuint mask, const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_G;
+   emit_sop(p, dst, mask, cond, arg0, arg1);
+}
+
+/* SGE: emit_sop() with the "greater or equal" condition (arg0 >= arg1). */
+static void emit_sge( struct brw_compile *p, const struct brw_reg *dst,
+		      GLuint mask, const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_GE;
+   emit_sop(p, dst, mask, cond, arg0, arg1);
+}
+
+/* SEQ: emit_sop() with the "equal" condition (arg0 == arg1). */
+static void emit_seq( struct brw_compile *p, const struct brw_reg *dst,
+		      GLuint mask, const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_EQ;
+   emit_sop(p, dst, mask, cond, arg0, arg1);
+}
+
+/* SNE: emit_sop() with the "not equal" condition (arg0 != arg1). */
+static void emit_sne( struct brw_compile *p, const struct brw_reg *dst,
+		      GLuint mask, const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint cond = BRW_CONDITIONAL_NEQ;
+   emit_sop(p, dst, mask, cond, arg0, arg1);
+}
+
+/* CMP: per enabled channel, arg2 is moved into dst, then after the arg0 < 0
+ * compare arg1 is moved over it; both moves saturate when SATURATE is set. */
+static void emit_cmp( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1,
+		      const struct brw_reg *arg2 )
+{
+   const GLuint sat = (mask & SATURATE) ? 1 : 0;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_set_saturate(p, sat);
+      brw_MOV(p, dst[chan], arg2[chan]);
+      brw_set_saturate(p, 0);
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[chan], brw_imm_f(0));
+      brw_set_saturate(p, sat);
+      brw_MOV(p, dst[chan], arg1[chan]);
+      brw_set_saturate(p, 0);
+      brw_set_predicate_control_flag_value(p, 0xff);
+   }
+}
+
+/* MAX: per enabled channel, dst takes arg0, then arg1 is moved over it after
+ * the arg0 < arg1 compare; both moves saturate when SATURATE is set. */
+static void emit_max( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint sat = (mask & SATURATE) ? 1 : 0;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_set_saturate(p, sat);
+      brw_MOV(p, dst[chan], arg0[chan]);
+      brw_set_saturate(p, 0);
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[chan], arg1[chan]);
+      brw_set_saturate(p, sat);
+      brw_MOV(p, dst[chan], arg1[chan]);
+      brw_set_saturate(p, 0);
+      brw_set_predicate_control_flag_value(p, 0xff);
+   }
+}
+
+/* MIN: per enabled channel, dst takes arg1, then arg0 is moved over it after
+ * the arg0 < arg1 compare; both moves saturate when SATURATE is set. */
+static void emit_min( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint sat = (mask & SATURATE) ? 1 : 0;
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_set_saturate(p, sat);
+      brw_MOV(p, dst[chan], arg1[chan]);
+      brw_set_saturate(p, 0);
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[chan], arg1[chan]);
+      brw_set_saturate(p, sat);
+      brw_MOV(p, dst[chan], arg0[chan]);
+      brw_set_saturate(p, 0);
+      brw_set_predicate_control_flag_value(p, 0xff);
+   }
+}
+
+
+/* DP3: MUL + MAC + MAC over elements 0..2 of arg0/arg1; the last MAC writes
+ * the single dst channel selected by mask and may saturate. */
+static void emit_dp3( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint chans = mask & BRW_WRITEMASK_XYZW;
+   const int out = ffs(chans) - 1;
+
+   if (!chans)
+      return; /* Do not emit dead code */
+
+   assert(util_is_power_of_two(chans));
+   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+   brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
+   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+   brw_MAC(p, dst[out], arg0[2], arg1[2]);
+   brw_set_saturate(p, 0);
+}
+
+
+/* DP4: MUL + three MACs over elements 0..3 of arg0/arg1; the last MAC writes
+ * the single dst channel selected by mask and may saturate. */
+static void emit_dp4( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint chans = mask & BRW_WRITEMASK_XYZW;
+   const int out = ffs(chans) - 1;
+
+   if (!chans)
+      return; /* Do not emit dead code */
+
+   assert(util_is_power_of_two(chans));
+   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+   brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
+   brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
+   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+   brw_MAC(p, dst[out], arg0[3], arg1[3]);
+   brw_set_saturate(p, 0);
+}
+
+
+/* DPH: MUL + MAC + MAC over elements 0..2 of arg0/arg1 into the selected dst
+ * channel, then an (optionally saturated) ADD of arg1[3] on top. */
+static void emit_dph( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint chans = mask & BRW_WRITEMASK_XYZW;
+   const int out = ffs(chans) - 1;
+
+   if (!chans)
+      return; /* Do not emit dead code */
+
+   assert(util_is_power_of_two(chans));
+   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+   brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
+   brw_MAC(p, dst[out], arg0[2], arg1[2]);
+   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+   brw_ADD(p, dst[out], dst[out], arg1[3]);
+   brw_set_saturate(p, 0);
+}
+
+
+/* XPD on X/Y/Z: for channel c, MUL(-arg0[(c+2)%3], arg1[(c+1)%3]) into null,
+ * then MAC(arg0[(c+1)%3], arg1[(c+2)%3]) into dst[c].  W is asserted off. */
+static void emit_xpd( struct brw_compile *p,
+		      const struct brw_reg *dst, GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+   const GLuint sat = (mask & SATURATE) ? 1 : 0;
+   GLuint chan;
+
+   assert((mask & BRW_WRITEMASK_W) != BRW_WRITEMASK_W);
+
+   for (chan = 0; chan < 3; chan++) {
+      const GLuint next = (chan + 1) % 3;
+      const GLuint prev = (chan + 2) % 3;
+      if (!(mask & (1 << chan)))
+	 continue;
+      brw_MUL(p, brw_null_reg(), negate(arg0[prev]), arg1[next]);
+      brw_set_saturate(p, sat);
+      brw_MAC(p, dst[chan], arg0[next], arg1[prev]);
+      brw_set_saturate(p, 0);
+   }
+}
+
+/* One-operand math op `function`: reads only arg0[0], writes the single dst channel enabled in mask. */
+static void emit_math1( struct brw_compile *p, 
+			GLuint function,
+			const struct brw_reg *dst,
+			GLuint mask,
+			const struct brw_reg *arg0 )
+{
+   int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1; /* -1 when no channel is set; not used in that case */
+
+   if (!(mask & BRW_WRITEMASK_XYZW))
+      return; /* Do not emit dead code */
+
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+   brw_MOV(p, brw_message_reg(2), arg0[0]); /* operand goes into m2 */
+
+   /* Send two messages to perform all 16 operations:
+    */
+   brw_math_16(p, 
+	       dst[dst_chan],
+	       function,
+	       (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	       2, /* presumably the message register loaded above -- confirm against brw_math_16 */
+	       brw_null_reg(),
+	       BRW_MATH_PRECISION_FULL);
+}
+
+/* Two-operand math op `function`: reads only arg0[0] and arg1[0], writes the single dst channel enabled in mask. */
+static void emit_math2( struct brw_compile *p, 
+			GLuint function,
+			const struct brw_reg *dst,
+			GLuint mask,
+			const struct brw_reg *arg0,
+			const struct brw_reg *arg1)
+{
+   int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1; /* -1 when no channel is set; not used in that case */
+
+   if (!(mask & BRW_WRITEMASK_XYZW))
+      return; /* Do not emit dead code */
+
+   assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+   brw_push_insn_state(p);
+   /* m2/m3 get arg0[0]/arg1[0]; m4/m5 get their sechalf() counterparts. */
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_MOV(p, brw_message_reg(2), arg0[0]);
+   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+   brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_MOV(p, brw_message_reg(3), arg1[0]);
+   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+   brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+
+   
+   /* Send two messages to perform all 16 operations:
+    */
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_math(p, 
+	    dst[dst_chan],
+	    function,
+	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	    2, /* first message: starts at m2 (presumably the msg reg number) */
+	    brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+
+   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+   brw_math(p, 
+	    offset(dst[dst_chan],1),
+	    function,
+	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	    4, /* second message: starts at m4, result goes to the next dst register */
+	    brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+   
+   brw_pop_insn_state(p);
+}
+		     
+
+
+/* TEX: emit a SIMD16 sampler message for inst->target.  Coordinates go in
+ * every second message register from m2 on; slots the target does not use
+ * get 0.0.  Shadow targets use the SAMPLE_COMPARE message and pass coord[2]
+ * in the fourth slot.  TGSI_TEXTURE_RECT is handled like TGSI_TEXTURE_2D
+ * (as emit_txb() does) instead of falling through to abort().
+ * dst_flags is unused; `sampler` is the sampler index.
+ */
+static void emit_tex( struct brw_wm_compile *c,
+		      const struct brw_wm_instruction *inst,
+		      struct brw_reg *dst,
+		      GLuint dst_flags,
+		      struct brw_reg *coord,
+		      GLuint sampler)
+{
+   struct brw_compile *p = &c->func;
+   GLuint msgLength, responseLength;
+   GLuint i, nr;
+   GLuint emit;
+   GLuint msg_type;
+   GLboolean shadow = FALSE;
+
+   /* How many input regs are there? */
+   switch (inst->target) {
+   case TGSI_TEXTURE_1D:
+      emit = BRW_WRITEMASK_X;
+      nr = 1;
+      break;
+   case TGSI_TEXTURE_SHADOW1D:
+      emit = BRW_WRITEMASK_XW;
+      nr = 4;
+      shadow = TRUE;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      emit = BRW_WRITEMASK_XY;
+      nr = 2;
+      break;
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      emit = BRW_WRITEMASK_XYW;
+      nr = 4;
+      shadow = TRUE;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      emit = BRW_WRITEMASK_XYZ;
+      nr = 3;
+      break;
+   default:
+      /* unexpected target */
+      abort();
+   }
+
+   msgLength = 1;
+   for (i = 0; i < nr; i++) {
+      static const GLuint swz[4] = {0,1,2,2};
+      if (emit & (1<<i))
+	 brw_MOV(p, brw_message_reg(msgLength+1), coord[swz[i]]);
+      else
+	 brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0));
+      msgLength += 2;
+   }
+   responseLength = 8;		/* always */
+
+   if (BRW_IS_IGDNG(p->brw))
+      msg_type = shadow ? BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG
+			: BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+   else
+      msg_type = shadow ? BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE
+			: BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+
+   brw_SAMPLE(p,
+	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+	      1,
+	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
+              BTI_TEXTURE(inst->tex_unit),
+	      sampler,          /* sampler index */
+	      inst->writemask,
+	      msg_type,
+	      responseLength,
+	      msgLength,
+	      0,
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);
+}
+
+/* TXB: SIMD16 SAMPLE_BIAS sampler message; coordinates go to m2/m4/m6 (0.0 for unused axes), coord[3] to m8. */
+static void emit_txb( struct brw_wm_compile *c,
+		      const struct brw_wm_instruction *inst,
+		      struct brw_reg *dst,
+		      GLuint dst_flags,
+		      struct brw_reg *coord,
+		      GLuint sampler )
+{
+   struct brw_compile *p = &c->func;
+   GLuint msgLength;
+   GLuint msg_type;
+   /* Shadow ignored for txb.
+    */
+   switch (inst->target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
+      brw_MOV(p, brw_message_reg(2), coord[0]);
+      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      brw_MOV(p, brw_message_reg(2), coord[0]);
+      brw_MOV(p, brw_message_reg(4), coord[1]);
+      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      brw_MOV(p, brw_message_reg(2), coord[0]);
+      brw_MOV(p, brw_message_reg(4), coord[1]);
+      brw_MOV(p, brw_message_reg(6), coord[2]);
+      break;
+   default:
+      /* unexpected target */
+      abort();
+   }
+
+   brw_MOV(p, brw_message_reg(8), coord[3]); /* presumably the LOD bias -- confirm */
+   msgLength = 9;
+
+   if (BRW_IS_IGDNG(p->brw))
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
+   else
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+
+   brw_SAMPLE(p, 
+	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+	      1,
+	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
+              BTI_TEXTURE(inst->tex_unit),
+	      sampler,          /* sampler index */
+	      inst->writemask,
+	      msg_type,
+	      8,		/* responseLength */
+	      msgLength,
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
+}
+
+/* LIT: writes only dst[1] (Y) and dst[2] (Z); X and W must be clear in mask (asserted). */
+static void emit_lit( struct brw_compile *p, 
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0 )
+{
+   assert((mask & BRW_WRITEMASK_XW) == 0);
+
+   if (mask & BRW_WRITEMASK_Y) { /* Y = arg0[0], optionally saturated */
+      brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+      brw_MOV(p, dst[1], arg0[0]);
+      brw_set_saturate(p, 0);
+   }
+
+   if (mask & BRW_WRITEMASK_Z) { /* Z = POW(arg0[1], arg0[3]) via emit_math2 */
+      emit_math2(p, BRW_MATH_FUNCTION_POW,
+		 &dst[2],
+		 BRW_WRITEMASK_X | (mask & SATURATE),
+		 &arg0[1],
+		 &arg0[3]);
+   }
+
+   /* Ordinarily you'd use an iff statement to skip or shortcircuit
+    * some of the POW calculations above, but 16-wide iff statements
+    * seem to lock c1 hardware, so this is a nasty workaround:
+    */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
+   {
+      if (mask & BRW_WRITEMASK_Y) 
+	 brw_MOV(p, dst[1], brw_imm_f(0));
+
+      if (mask & BRW_WRITEMASK_Z) 
+	 brw_MOV(p, dst[2], brw_imm_f(0)); 
+   }
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+
+/* Kill pixel - set execution mask to zero for those pixels which fail.
+ * Each of the four arg0 channels is compared >= 0 and the flag register
+ * is then ANDed into r0.0 (viewed as UW). */
+static void emit_kil( struct brw_wm_compile *c,
+		      struct brw_reg *arg0)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   GLuint i;
+   
+   /* XXX - usually won't need 4 compares!
+    */
+   for (i = 0; i < 4; i++) {
+      brw_push_insn_state(p);
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));   
+      brw_set_predicate_control_flag_value(p, 0xff);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_AND(p, r0uw, brw_flag_reg(), r0uw); /* r0.0 &= flag register */
+      brw_pop_insn_state(p);
+   }
+}
+
+/* KILLP kills the pixels that are currently executing, not based on a test
+ * of the arguments: r0.0 (UW) is ANDed with the complement of mask reg 1.
+ */
+static void emit_killp( struct brw_wm_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
+   brw_AND(p, r0uw, c->emit_mask_reg, r0uw); /* r0.0 &= ~IMASK, using emit_mask_reg as scratch */
+   brw_pop_insn_state(p);
+}
+
+static void fire_fb_write( struct brw_wm_compile *c,
+			   GLuint base_reg,
+			   GLuint nr,
+			   GLuint target,
+			   GLuint eot )
+{
+   struct brw_compile *p = &c->func;
+   /* Copies r1 into m(base_reg+1), then issues brw_fb_WRITE with base_reg, target, nr and eot. */
+   /* Pass through control information:
+    */
+/*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
+   {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, 
+	       brw_message_reg(base_reg + 1),
+	       brw_vec8_grf(1, 0));
+      brw_pop_insn_state(p);
+   }
+
+   /* Send framebuffer write message: */
+/*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
+   brw_fb_WRITE(p,
+		retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+		base_reg,
+		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+		target,		
+		nr,
+		0, 
+		eot);
+}
+
+/* Copy the arg1 register selected by key.aa_dest_stencil_reg (index / 2, offset % 2) into message register `reg`. */
+static void emit_aa( struct brw_wm_compile *c,
+		     struct brw_reg *arg1,
+		     GLuint reg )
+{
+   struct brw_compile *p = &c->func;
+   GLuint comp = c->key.aa_dest_stencil_reg / 2;
+   GLuint off = c->key.aa_dest_stencil_reg % 2;
+   struct brw_reg aa = offset(arg1[comp], off);
+
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
+   brw_MOV(p, brw_message_reg(reg), aa);
+   brw_pop_insn_state(p);
+}
+
+
+/* Post-fragment-program processing.  Send the results to the framebuffer.
+ * \param arg0  the fragment color
+ * \param arg1  the pass-through depth value
+ * \param arg2  the shader-computed depth value
+ * \param target, eot  passed through unchanged to fire_fb_write()
+ */
+static void emit_fb_write( struct brw_wm_compile *c,
+			   struct brw_reg *arg0,
+			   struct brw_reg *arg1,
+			   struct brw_reg *arg2,
+			   GLuint target,
+			   GLuint eot)
+{
+   struct brw_compile *p = &c->func;
+   GLuint nr = 2; /* next message register to fill */
+   GLuint channel;
+
+   /* Reserve a space for AA - may not be needed:
+    */
+   if (c->key.aa_dest_stencil_reg)
+      nr += 1;
+
+   /* I don't really understand how this achieves the color interleave
+    * (ie RGBARGBA) in the result:  [Do the saturation here]
+    */
+   {
+      brw_push_insn_state(p);
+      
+      for (channel = 0; channel < 4; channel++) {
+	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+	 brw_MOV(p,
+		 brw_message_reg(nr + channel),
+		 arg0[channel]);
+       
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	 brw_MOV(p,
+		 brw_message_reg(nr + channel + 4),
+		 sechalf(arg0[channel]));
+      }
+
+      /* skip over the regs populated above:
+       */
+      nr += 8;
+   
+      brw_pop_insn_state(p);
+   }
+
+   if (c->key.source_depth_to_render_target)
+   {
+      if (c->key.computes_depth) 
+	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
+      else 
+	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
+
+      nr += 2;
+   }
+
+   if (c->key.dest_depth_reg)
+   {
+      GLuint comp = c->key.dest_depth_reg / 2;
+      GLuint off = c->key.dest_depth_reg % 2;
+
+      if (off != 0) {
+         brw_push_insn_state(p);
+         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+         brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+         /* 2nd half? */
+         brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+         brw_pop_insn_state(p);
+      }
+      else {
+         brw_MOV(p, brw_message_reg(nr), arg1[comp]);
+      }
+      nr += 2;
+   }
+
+   if (!c->key.runtime_check_aads_emit) {
+      if (c->key.aa_dest_stencil_reg)
+	 emit_aa(c, arg1, 2);
+
+      fire_fb_write(c, 0, nr, target, eot);
+   }
+   else {
+      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+      struct brw_reg ip = brw_ip_reg();
+      struct brw_instruction *jmp;
+      /* Test bit 26 of r1.6 with conditional mod Z; the JMPI below is landed after the AA write. */
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+      brw_AND(p, 
+	      v1_null_ud, 
+	      get_element_ud(brw_vec8_grf(1,0), 6), 
+	      brw_imm_ud(1<<26)); 
+
+      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+      {
+	 emit_aa(c, arg1, 2);
+	 fire_fb_write(c, 0, nr, target, eot);
+	 /* note - thread killed in subroutine */
+      }
+      brw_land_fwd_jump(p, jmp);
+
+      /* ELSE: Shuffle up one register to fill in the hole left for AA:
+       */
+      fire_fb_write(c, 1, nr-1, target, eot);
+   }
+}
+
+
+/**
+ * Move a GPR to scratch memory: reg is copied to m2, then written to `slot`.
+ */
+static void emit_spill( struct brw_wm_compile *c,
+			struct brw_reg reg,
+			GLuint slot )
+{
+   struct brw_compile *p = &c->func;
+
+   /*
+     mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
+   */
+   brw_MOV(p, brw_message_reg(2), reg);
+
+   /*
+     mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
+     send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
+   */
+   brw_dp_WRITE_16(p, 
+		   retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
+		   slot);
+}
+
+
+/**
+ * Load a GPR from scratch memory slot `slot`; slot 0 just loads 0.0 into reg.
+ */
+static void emit_unspill( struct brw_wm_compile *c,
+			  struct brw_reg reg,
+			  GLuint slot )
+{
+   struct brw_compile *p = &c->func;
+
+   /* Slot 0 is the undef value.
+    */
+   if (slot == 0) {
+      brw_MOV(p, reg, brw_imm_f(0));
+      return;
+   }
+
+   /*
+     mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
+     send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
+   */
+
+   brw_dp_READ_16(p,
+		  retype(vec16(reg), BRW_REGISTER_TYPE_UW),
+		  slot);
+}
+
+
+/**
+ * Fill regs[0..3] with the hardware registers of the four channel refs in
+ * arg[]; missing (NULL) refs map to the null register.  A ref with a
+ * non-zero unspill_reg is first reloaded from its value's spill slot.
+ */
+static void get_argument_regs( struct brw_wm_compile *c,
+			       struct brw_wm_ref *arg[],
+			       struct brw_reg *regs )
+{
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      const struct brw_wm_ref *ref = arg[chan];
+
+      if (!ref) {
+	 regs[chan] = brw_null_reg();
+	 continue;
+      }
+      if (ref->unspill_reg)
+	 emit_unspill(c, brw_vec8_grf(ref->unspill_reg, 0),
+		      ref->value->spill_slot);
+      regs[chan] = ref->hw_reg;
+   }
+}
+
+
+/**
+ * Spill (via emit_spill) each of values[0..nr-1] whose spill_slot is non-zero.
+ */
+static void spill_values( struct brw_wm_compile *c,
+			  struct brw_wm_value *values,
+			  GLuint nr )
+{
+   const struct brw_wm_value *v, *const end = values + nr;
+
+   for (v = values; v != end; v++)
+      if (v->spill_slot)
+	 emit_spill(c, v->hw_reg, v->spill_slot);
+}
+
+
+/* Emit the fragment program instructions here: one emit_*() call per
+ * c->instruction[] entry, spilling payload and dst values that have a slot. */
+void brw_wm_emit( struct brw_wm_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   GLuint insn;
+
+   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+
+   /* Check if any of the payload regs need to be spilled:
+    */
+   spill_values(c, c->payload.depth, 4);
+   spill_values(c, c->creg, c->nr_creg);
+   spill_values(c, c->payload.input_interp, PIPE_MAX_SHADER_INPUTS);
+   
+
+   for (insn = 0; insn < c->nr_insns; insn++) {
+
+      struct brw_wm_instruction *inst = &c->instruction[insn];
+      struct brw_reg args[3][4], dst[4];
+      GLuint i, dst_flags;
+      
+      /* Get argument regs:
+       */
+      for (i = 0; i < 3; i++) 
+	 get_argument_regs(c, inst->src[i], args[i]);
+
+      /* Get dest regs:
+       */
+      for (i = 0; i < 4; i++)
+	 if (inst->dst[i])
+	    dst[i] = inst->dst[i]->hw_reg;
+	 else
+	    dst[i] = brw_null_reg();
+      
+      /* Flags: the writemask bits, plus SATURATE when the instruction saturates.
+       */
+      dst_flags = inst->writemask;
+      if (inst->saturate) 
+	 dst_flags |= SATURATE;
+
+      switch (inst->opcode) {
+	 /* Generated instructions for calculating triangle interpolants:
+	  */
+      case WM_PIXELXY:
+	 emit_pixel_xy(p, dst, dst_flags);
+	 break;
+
+      case WM_DELTAXY:
+	 emit_delta_xy(p, dst, dst_flags, args[0]);
+	 break;
+
+      case WM_WPOSXY:
+	 emit_wpos_xy(c, dst, dst_flags, args[0]);
+	 break;
+
+      case WM_PIXELW:
+	 emit_pixel_w(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case WM_LINTERP:
+	 emit_linterp(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case WM_PINTERP:
+	 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
+	 break;
+
+      case WM_CINTERP:
+	 emit_cinterp(p, dst, dst_flags, args[0]);
+	 break;
+
+      case WM_FB_WRITE:
+	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
+	 break;
+
+      case WM_FRONTFACING:
+	 emit_frontfacing(p, dst, dst_flags);
+	 break;
+
+	 /* Straightforward arithmetic:
+	  */
+      case TGSI_OPCODE_ADD:
+	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_FRC:
+	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_FLR:
+	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_DDX:
+	 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
+	 break;
+
+      case TGSI_OPCODE_DDY:
+	 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
+	 break;
+
+      case TGSI_OPCODE_DP3:
+	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_DP4:
+	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_DPH:
+	 emit_dph(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_TRUNC:
+	 emit_trunc(p, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_LRP:
+	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
+	 break;
+
+      case TGSI_OPCODE_MAD:	
+	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
+	 break;
+
+      case TGSI_OPCODE_MOV:
+	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_MUL:
+	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_XPD:
+	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+	 /* Higher math functions:
+	  */
+      case TGSI_OPCODE_RCP:
+	 emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_RSQ:
+	 emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_SIN:
+	 emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_COS:
+	 emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_EX2:
+	 emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_LG2:
+	 emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
+	 break;
+
+      case TGSI_OPCODE_SCS:
+	 /* There is an scs math function, but it would need some
+	  * fixup for 16-element execution.  Instead: COS into dst[0] and
+	  * SIN into dst[1], each as a separate single-channel math op.
+	  */
+	 if (dst_flags & BRW_WRITEMASK_X)
+	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_Y)
+	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+	 break;
+
+      case TGSI_OPCODE_POW:
+	 emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
+	 break;
+
+	 /* Comparisons:
+	  */
+      case TGSI_OPCODE_CMP:
+	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
+	 break;
+
+      case TGSI_OPCODE_MAX:
+	 emit_max(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_MIN:
+	 emit_min(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_SLT:
+	 emit_slt(p, dst, dst_flags, args[0], args[1]);
+	 break;
+
+      case TGSI_OPCODE_SLE:
+	 emit_sle(p, dst, dst_flags, args[0], args[1]);
+	break;
+      case TGSI_OPCODE_SGT:
+	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
+	break;
+      case TGSI_OPCODE_SGE:
+	 emit_sge(p, dst, dst_flags, args[0], args[1]);
+	 break;
+      case TGSI_OPCODE_SEQ:
+	 emit_seq(p, dst, dst_flags, args[0], args[1]);
+	break;
+      case TGSI_OPCODE_SNE:
+	 emit_sne(p, dst, dst_flags, args[0], args[1]);
+	break;
+
+      case TGSI_OPCODE_LIT:
+	 emit_lit(p, dst, dst_flags, args[0]);
+	 break;
+
+	 /* Texturing operations:
+	  */
+      case TGSI_OPCODE_TEX:
+	 emit_tex(c, inst, dst, dst_flags, args[0], inst->sampler);
+	 break;
+
+      case TGSI_OPCODE_TXB:
+	 emit_txb(c, inst, dst, dst_flags, args[0], inst->sampler);
+	 break;
+
+      case TGSI_OPCODE_KIL:
+	 emit_kil(c, args[0]);
+	 break;
+
+      case TGSI_OPCODE_KILP:
+	 emit_killp(c);
+	 break;
+
+      default: /* NOTE(review): tgsi_get_opcode_info() result is dereferenced unchecked -- confirm it cannot be NULL here */
+	 debug_printf("Unsupported opcode %i (%s) in fragment shader\n",
+		      inst->opcode, 
+		      tgsi_get_opcode_info(inst->opcode)->mnemonic);
+      }
+      
+      for (i = 0; i < 4; i++) /* write back any destination that lives in a spill slot */
+	if (inst->dst[i] && inst->dst[i]->spill_slot) 
+	   emit_spill(c, 
+		      inst->dst[i]->hw_reg, 
+		      inst->dst[i]->spill_slot);
+   }
+
+   if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("wm-native:\n");
+      brw_disasm(stderr, p->store, p->nr_insn);
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
new file mode 100644
index 0000000000..9c67759ad0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -0,0 +1,1223 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+               
+
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_util.h"
+
+#include "brw_wm.h"
+#include "brw_debug.h"
+
+
+/***********************************************************************
+ * Source regs
+ */
+
+/* Build a source register reference for (file, idx).  The result uses
+ * BRW_SWIZZLE_XYZW and has the indirect, negate and abs flags cleared.
+ */
+static struct brw_fp_src src_reg(GLuint file, GLuint idx)
+{
+   struct brw_fp_src src;
+
+   /* Modifiers: all off. */
+   src.abs = 0;
+   src.negate = 0;
+   src.indirect = 0;
+   src.swizzle = BRW_SWIZZLE_XYZW;
+
+   /* Register identity. */
+   src.index = idx;
+   src.file = file;
+
+   return src;
+}
+
+/* Source reference naming the same register as a destination.  Only the
+ * file and index are carried over; writemask and saturate are not.
+ */
+static struct brw_fp_src src_reg_from_dst(struct brw_fp_dst dst)
+{
+   struct brw_fp_src same_reg = src_reg(dst.file, dst.index);
+
+   return same_reg;
+}
+
+/* The "no operand" source: register 0 of TGSI_FILE_NULL. */
+static struct brw_fp_src src_undef( void )
+{
+   struct brw_fp_src none = src_reg(TGSI_FILE_NULL, 0);
+
+   return none;
+}
+
+/* True when the source lives in TGSI_FILE_NULL, i.e. it is the value
+ * produced by src_undef().
+ */
+static GLboolean src_is_undef(struct brw_fp_src src)
+{
+   const GLboolean is_null_file = (src.file == TGSI_FILE_NULL);
+
+   return is_null_file;
+}
+
+/* Apply a further swizzle (x, y, z, w) on top of the swizzle already
+ * stored in reg: each output channel selects, through the existing
+ * swizzle, the channel named by the corresponding argument.
+ */
+static struct brw_fp_src src_swizzle( struct brw_fp_src reg, int x, int y, int z, int w )
+{
+   const unsigned prev = reg.swizzle;
+   const unsigned sel_x = BRW_GET_SWZ(prev, x);
+   const unsigned sel_y = BRW_GET_SWZ(prev, y);
+   const unsigned sel_z = BRW_GET_SWZ(prev, z);
+   const unsigned sel_w = BRW_GET_SWZ(prev, w);
+
+   /* Two bits per channel, X in the low bits. */
+   reg.swizzle = ( (sel_x << 0) |
+                   (sel_y << 2) |
+                   (sel_z << 4) |
+                   (sel_w << 6) );
+
+   return reg;
+}
+
+/* Replicate a single channel of reg across all four channels. */
+static struct brw_fp_src src_scalar( struct brw_fp_src reg, int x )
+{
+   const int chan = x;
+
+   return src_swizzle(reg, chan, chan, chan, chan);
+}
+
+/* Return src as |src|: the abs flag is set and any negation is dropped. */
+static struct brw_fp_src src_abs( struct brw_fp_src src )
+{
+   struct brw_fp_src result = src;
+
+   result.abs = 1;
+   result.negate = 0;
+
+   return result;
+}
+
+/* Return src with its sign flipped.
+ *
+ * The negate flag is toggled and the abs flag is left alone, so that:
+ *    negate(x)    -> -x        (abs=0, negate=1)
+ *    negate(-x)   ->  x        (abs=0, negate=0)
+ *    negate(|x|)  -> -|x|      (abs=1, negate=1)
+ *    negate(-|x|) ->  |x|      (abs=1, negate=0)
+ * using the same abs/negate encoding translate_src() produces for the
+ * TGSI sign modes.  Forcing negate=1 and abs=0 instead would turn
+ * "SUB a, -b" into a - b and silently drop an absolute-value modifier.
+ */
+static struct brw_fp_src src_negate( struct brw_fp_src src )
+{
+   src.negate = !src.negate;
+   return src;
+}
+
+
+/* Try to express the nr values in v[] as a swizzle of the (up to four)
+ * values held in v2[0 .. *nr2-1], appending values to v2 as needed.
+ *
+ * On success returns TRUE, stores the 2-bits-per-channel selector for
+ * channels 0..nr-1 in *swizzle and updates v2 / *nr2 with any values that
+ * had to be added.
+ *
+ * On failure (the values do not fit in four slots) returns FALSE and
+ * leaves v2 and *nr2 untouched; *swizzle is set to 0.  New values are
+ * staged locally and only committed once the whole vector is known to
+ * fit, so a failed attempt no longer fills the immediate with values
+ * nobody references.
+ */
+static int match_or_expand_immediate( const float *v,
+                                      unsigned nr,
+                                      float *v2,
+                                      unsigned *nr2,
+                                      unsigned *swizzle )
+{
+   float pending[4];            /* values to append on success */
+   unsigned nr_pending = 0;
+   unsigned swz = 0;
+   unsigned i, j;
+
+   *swizzle = 0;
+
+   for (i = 0; i < nr; i++) {
+      boolean found = FALSE;
+
+      /* Look in the values already stored... */
+      for (j = 0; j < *nr2 && !found; j++) {
+         if (v[i] == v2[j]) {
+            swz |= j << (i * 2);
+            found = TRUE;
+         }
+      }
+
+      /* ...then in the ones staged by earlier channels of this call. */
+      for (j = 0; j < nr_pending && !found; j++) {
+         if (v[i] == pending[j]) {
+            swz |= (*nr2 + j) << (i * 2);
+            found = TRUE;
+         }
+      }
+
+      if (!found) {
+         if (*nr2 + nr_pending >= 4)
+            return FALSE;
+
+         pending[nr_pending] = v[i];
+         swz |= (*nr2 + nr_pending) << (i * 2);
+         nr_pending++;
+      }
+   }
+
+   /* Everything fits: commit the staged values. */
+   for (j = 0; j < nr_pending; j++)
+      v2[*nr2 + j] = pending[j];
+
+   *nr2 += nr_pending;
+   *swizzle = swz;
+
+   return TRUE;
+}
+
+
+
+/* Internally generated immediates: overkill...
+ *
+ * Return a source register that reads the nr floats in v[] from the
+ * compile's immediate table.  Existing immediates are tried first (and
+ * expanded when they have room); otherwise a new immediate slot is
+ * claimed.  If no slot is available c->error is set and an undefined
+ * source is returned.
+ */
+static struct brw_fp_src src_imm( struct brw_wm_compile *c, 
+				  const GLfloat *v, 
+				  unsigned nr)
+{
+   unsigned i, j;
+   unsigned swizzle;
+
+   /* Could do a first pass where we examine all existing immediates
+    * without expanding.
+    */
+
+   for (i = 0; i < c->nr_immediates; i++) {
+      if (match_or_expand_immediate( v, 
+                                     nr,
+                                     c->immediate[i].v,
+                                     &c->immediate[i].nr, 
+                                     &swizzle ))
+         goto out;
+   }
+
+   if (c->nr_immediates < Elements(c->immediate)) {
+      i = c->nr_immediates++;
+
+      /* A freshly claimed slot must start empty; do not rely on whatever
+       * element count was left in it.
+       */
+      c->immediate[i].nr = 0;
+
+      if (match_or_expand_immediate( v,
+                                     nr,
+                                     c->immediate[i].v,
+                                     &c->immediate[i].nr, 
+                                     &swizzle ))
+         goto out;
+   }
+
+   c->error = 1;
+   return src_undef();
+
+out:
+   /* Make sure that all referenced elements are from this immediate.
+    * Has the effect of making size-one immediates into scalars.
+    */
+   for (j = nr; j < 4; j++)
+      swizzle |= (swizzle & 0x3) << (j * 2);
+
+   return src_swizzle( src_reg( TGSI_FILE_IMMEDIATE, i ),
+		       BRW_GET_SWZ(swizzle, X),
+		       BRW_GET_SWZ(swizzle, Y),
+		       BRW_GET_SWZ(swizzle, Z),
+		       BRW_GET_SWZ(swizzle, W) );
+}
+
+
+
+/* Immediate source holding a single float; src_imm() is called with a
+ * count of 1.
+ */
+static struct brw_fp_src src_imm1f( struct brw_wm_compile *c,
+				    GLfloat f )
+{
+   const GLfloat value[1] = { f };
+
+   return src_imm(c, value, 1);
+}
+
+/* Immediate source holding the four-component vector (x, y, z, w). */
+static struct brw_fp_src src_imm4f( struct brw_wm_compile *c,
+				    GLfloat x,
+				    GLfloat y,
+				    GLfloat z,
+				    GLfloat w)
+{
+   GLfloat vec[4];
+
+   vec[0] = x;
+   vec[1] = y;
+   vec[2] = z;
+   vec[3] = w;
+
+   return src_imm(c, vec, 4);
+}
+
+
+
+/***********************************************************************
+ * Dest regs
+ */
+
+/* Build a destination register reference for (file, idx) with the
+ * BRW_WRITEMASK_XYZW writemask and the indirect and saturate flags
+ * cleared.
+ */
+static struct brw_fp_dst dst_reg(GLuint file, GLuint idx)
+{
+   struct brw_fp_dst dst;
+
+   /* Modifiers: all off, all channels written. */
+   dst.saturate = 0;
+   dst.indirect = 0;
+   dst.writemask = BRW_WRITEMASK_XYZW;
+
+   /* Register identity. */
+   dst.index = idx;
+   dst.file = file;
+
+   return dst;
+}
+
+/* Restrict a destination's writemask to the channels also present in
+ * mask (the two masks are ANDed, so channels are never added).
+ */
+static struct brw_fp_dst dst_mask( struct brw_fp_dst reg, int mask )
+{
+   struct brw_fp_dst narrowed = reg;
+
+   narrowed.writemask &= mask;
+
+   return narrowed;
+}
+
+/* The "no destination" register: register 0 of TGSI_FILE_NULL. */
+static struct brw_fp_dst dst_undef( void )
+{
+   struct brw_fp_dst none = dst_reg(TGSI_FILE_NULL, 0);
+
+   return none;
+}
+
+/* True when the destination lives in TGSI_FILE_NULL, i.e. it is the
+ * value produced by dst_undef().
+ */
+static boolean dst_is_undef( struct brw_fp_dst dst )
+{
+   const boolean is_null_file = (dst.file == TGSI_FILE_NULL);
+
+   return is_null_file;
+}
+
+/* Return reg with its saturate flag replaced by flag. */
+static struct brw_fp_dst dst_saturate( struct brw_fp_dst reg, boolean flag )
+{
+   struct brw_fp_dst result = reg;
+
+   result.saturate = flag;
+
+   return result;
+}
+
+/* Allocate an internal temporary register.
+ *
+ * c->fp_temp is a bitmask of the internal temporaries in use; bit n
+ * corresponds to TGSI_FILE_TEMPORARY index c->fp_first_internal_temp + n.
+ * The lowest clear bit is claimed.
+ *
+ * If every temporary is in use, a message is printed, c->error is set so
+ * the compile is reported as failed, and an undefined destination
+ * (dst_undef()) is returned.  Nothing is shifted by a negative amount and
+ * no register below the internal range is handed out.
+ */
+static struct brw_fp_dst get_temp( struct brw_wm_compile *c )
+{
+   int bit = ffs( ~c->fp_temp );
+
+   if (!bit) {
+      debug_printf("%s: out of temporaries\n", __FILE__);
+      c->error = 1;
+      return dst_undef();
+   }
+
+   c->fp_temp |= 1u << (bit-1);
+   return dst_reg(TGSI_FILE_TEMPORARY, c->fp_first_internal_temp+(bit-1));
+}
+
+
+/* Return a temporary obtained from get_temp() to the free pool.
+ *
+ * Registers that cannot have come from the pool -- the undefined
+ * destination returned when allocation failed, or an index outside the
+ * internal temporary range -- are ignored, so the shift below is always
+ * well defined.
+ */
+static void release_temp( struct brw_wm_compile *c, struct brw_fp_dst temp )
+{
+   unsigned slot;
+
+   if (dst_is_undef(temp))
+      return;
+
+   if (temp.index < c->fp_first_internal_temp)
+      return;
+
+   slot = temp.index - c->fp_first_internal_temp;
+   if (slot >= 8 * sizeof(unsigned))
+      return;
+
+   c->fp_temp &= ~(1u << slot);
+}
+
+
+/***********************************************************************
+ * Instructions 
+ */
+
+/* Hand out the next unused entry of c->fp_instructions and advance the
+ * instruction count.
+ *
+ * NOTE(review): no check is made here that the instruction array still
+ * has room -- confirm the array is large enough for any expanded shader.
+ */
+static struct brw_fp_instruction *get_fp_inst(struct brw_wm_compile *c)
+{
+   struct brw_fp_instruction *next = &c->fp_instructions[c->nr_fp_insns];
+
+   c->nr_fp_insns++;
+
+   return next;
+}
+
+/* Append one instruction to the fragment program and fill in every
+ * field: opcode, destination, texture unit/target/sampler and the three
+ * source operands.  Returns the new instruction.
+ *
+ * A non-zero tex_unit or target is only expected for the texture opcodes
+ * (TXP, TXB, TEX) and for WM_FB_WRITE; this is asserted.
+ */
+static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
+					     GLuint op,
+					     struct brw_fp_dst dest,
+					     GLuint tex_unit,
+					     GLuint target,
+					     GLuint sampler,
+					     struct brw_fp_src src0,
+					     struct brw_fp_src src1,
+					     struct brw_fp_src src2 )
+{
+   struct brw_fp_instruction *insn = get_fp_inst(c);
+
+   assert(!(tex_unit || target) ||
+          op == TGSI_OPCODE_TXP ||
+          op == TGSI_OPCODE_TXB ||
+          op == TGSI_OPCODE_TEX ||
+          op == WM_FB_WRITE);
+
+   /* Operands. */
+   insn->src[0] = src0;
+   insn->src[1] = src1;
+   insn->src[2] = src2;
+
+   /* Texture state. */
+   insn->sampler = sampler;
+   insn->target = target;
+   insn->tex_unit = tex_unit;
+
+   /* Operation and result. */
+   insn->dst = dest;
+   insn->opcode = op;
+
+   return insn;
+}
+   
+
+/* Emit a non-texture instruction with three source operands. */
+static INLINE void emit_op3(struct brw_wm_compile *c,
+			    GLuint op,
+			    struct brw_fp_dst dest,
+			    struct brw_fp_src src0,
+			    struct brw_fp_src src1,
+			    struct brw_fp_src src2 )
+{
+   /* tex_unit, target and sampler are all zero for ALU operations. */
+   (void) emit_tex_op(c, op, dest,
+                      0, 0, 0,
+                      src0, src1, src2);
+}
+
+
+/* Emit a non-texture instruction with two source operands; the third
+ * operand is left undefined.
+ */
+static INLINE void emit_op2(struct brw_wm_compile *c,
+			    GLuint op,
+			    struct brw_fp_dst dest,
+			    struct brw_fp_src src0,
+			    struct brw_fp_src src1)
+{
+   (void) emit_tex_op(c, op, dest,
+                      0, 0, 0,
+                      src0, src1, src_undef());
+}
+
+/* Emit a non-texture instruction with one source operand; the other two
+ * operands are left undefined.
+ */
+static INLINE void emit_op1(struct brw_wm_compile *c,
+			    GLuint op,
+			    struct brw_fp_dst dest,
+			    struct brw_fp_src src0)
+{
+   (void) emit_tex_op(c, op, dest,
+                      0, 0, 0,
+                      src0, src_undef(), src_undef());
+}
+
+/* Emit a non-texture instruction that takes no source operands. */
+static INLINE void emit_op0(struct brw_wm_compile *c,
+			   GLuint op,
+			   struct brw_fp_dst dest)
+{
+   (void) emit_tex_op(c, op, dest,
+                      0, 0, 0,
+                      src_undef(), src_undef(), src_undef());
+}
+
+
+
+/* Many opcodes produce the same value across all the result channels.
+ * We'd rather not have to support that splatting in the opcode implementations,
+ * and brw_wm_pass*.c wants to optimize them out by shuffling references around
+ * anyway.  We can easily get both by emitting the opcode to one channel, and
+ * then MOVing it to the others, which brw_wm_pass*.c already understands.
+ *
+ * Nothing is emitted when the destination writemask is empty.  That test
+ * is made before the first enabled channel is derived, because ffs(0) - 1
+ * is not a valid shift count.
+ */
+static void emit_scalar_insn(struct brw_wm_compile *c,
+			     unsigned opcode,
+			     struct brw_fp_dst dst,
+			     struct brw_fp_src src0,
+			     struct brw_fp_src src1,
+			     struct brw_fp_src src2 )
+{
+   unsigned first_chan;
+   unsigned first_mask;
+
+   if (dst.writemask == 0)
+      return;
+
+   /* Lowest enabled channel receives the real result. */
+   first_chan = ffs(dst.writemask) - 1;
+   first_mask = 1 << first_chan;
+
+   emit_op3( c, opcode,
+	     dst_mask(dst, first_mask),
+	     src0, src1, src2 );
+
+   /* Copy that channel to any other enabled channels. */
+   if (dst.writemask != first_mask) {
+      emit_op1(c, TGSI_OPCODE_MOV,
+	       dst_mask(dst, ~first_mask),
+	       src_scalar(src_reg_from_dst(dst), first_chan));
+   }
+}
+
+
+/***********************************************************************
+ * Special instructions for interpolation and other tasks
+ */
+
+/* Return the register holding the result of WM_PIXELXY, emitting that
+ * instruction into a fresh temporary the first time it is requested.
+ * The result is cached in c->fp_pixel_xy for later calls.
+ */
+static struct brw_fp_src get_pixel_xy( struct brw_wm_compile *c )
+{
+   struct brw_fp_dst xy_tmp;
+
+   /* Already computed earlier in this program. */
+   if (!src_is_undef(c->fp_pixel_xy))
+      return c->fp_pixel_xy;
+
+   xy_tmp = get_temp(c);
+
+   /* pixel_xy.xy = PIXELXY payload[PAYLOAD_DEPTH] */
+   emit_op1(c,
+            WM_PIXELXY,
+            dst_mask(xy_tmp, BRW_WRITEMASK_XY),
+            src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH));
+
+   c->fp_pixel_xy = src_reg_from_dst(xy_tmp);
+
+   return c->fp_pixel_xy;
+}
+
+/* Return the register holding the result of WM_DELTAXY, emitting that
+ * instruction (and, through get_pixel_xy(), WM_PIXELXY) the first time
+ * it is requested.  The result is cached in c->fp_delta_xy.
+ */
+static struct brw_fp_src get_delta_xy( struct brw_wm_compile *c )
+{
+   struct brw_fp_dst delta_tmp;
+   struct brw_fp_src xy;
+
+   if (!src_is_undef(c->fp_delta_xy))
+      return c->fp_delta_xy;
+
+   /* The temporary is claimed before the pixel position is requested,
+    * which may itself claim one.
+    */
+   delta_tmp = get_temp(c);
+   xy = get_pixel_xy(c);
+
+   /* deltas.xy = DELTAXY pixel_xy, payload[PAYLOAD_DEPTH] */
+   emit_op3(c,
+            WM_DELTAXY,
+            dst_mask(delta_tmp, BRW_WRITEMASK_XY),
+            xy,
+            src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH),
+            src_undef());
+
+   c->fp_delta_xy = src_reg_from_dst(delta_tmp);
+
+   return c->fp_delta_xy;
+}
+
+/* Return the register holding the result of WM_PIXELW, emitting that
+ * instruction (and the delta computation it depends on) the first time
+ * it is requested.  The result is cached in c->fp_pixel_w.
+ */
+static struct brw_fp_src get_pixel_w( struct brw_wm_compile *c )
+{
+   struct brw_fp_dst w_tmp;
+   struct brw_fp_src delta;
+
+   if (!src_is_undef(c->fp_pixel_w))
+      return c->fp_pixel_w;
+
+   /* Temporary first, then the deltas (which may claim more). */
+   w_tmp = get_temp(c);
+   delta = get_delta_xy(c);
+
+   /* XXX: assuming position is always first -- valid?
+    * The interpolation coefficients are read from payload register 0.
+    *
+    * pixel_w.w = PIXELW payload[0], deltas
+    */
+   emit_op3(c,
+            WM_PIXELW,
+            dst_mask(w_tmp, BRW_WRITEMASK_W),
+            src_reg(BRW_FILE_PAYLOAD, 0),
+            delta,
+            src_undef());
+
+   c->fp_pixel_w = src_reg_from_dst(w_tmp);
+
+   return c->fp_pixel_w;
+}
+
+
+/***********************************************************************
+ * Emit INTERP instructions ahead of first use of each attrib.
+ */
+
+/* Emit the instructions that compute fragment shader input `idx` from
+ * the payload.
+ *
+ * \param idx          input register index; also used as the payload
+ *                     register index holding its interpolation data
+ * \param semantic     TGSI_SEMANTIC_* of the input
+ * \param interp_mode  TGSI_INTERPOLATE_* mode declared for the input
+ *
+ * The result is written to TGSI_FILE_INPUT[idx].  The opcode(s) used
+ * depend on the semantic and, for colors and generic inputs, on the
+ * interpolation mode.  get_delta_xy() is called for every input, even
+ * those whose case does not use the deltas.
+ */
+static void emit_interp( struct brw_wm_compile *c,
+			 GLuint idx,
+			 GLuint semantic,
+			 GLuint interp_mode )
+{
+   struct brw_fp_dst dst = dst_reg(TGSI_FILE_INPUT, idx);
+   struct brw_fp_src interp = src_reg(BRW_FILE_PAYLOAD, idx);
+   struct brw_fp_src deltas = get_delta_xy(c);
+
+   /* Need to use PINTERP on attributes which have been
+    * multiplied by 1/W in the SF program, and LINTERP on those
+    * which have not:
+    */
+   switch (semantic) {
+   case TGSI_SEMANTIC_POSITION:
+      /* Have to treat wpos.xy specially:
+       */
+      emit_op1(c,
+	      WM_WPOSXY,
+	      dst_mask(dst, BRW_WRITEMASK_XY),
+	      get_pixel_xy(c));
+      
+      /* TGSI_FILE_INPUT.attr.zw = LINTERP payload.interp[attr], deltas
+       */
+      emit_op2(c,
+	       WM_LINTERP,
+	       dst_mask(dst, BRW_WRITEMASK_ZW),
+	       interp,
+	       deltas);
+      break;
+
+   case TGSI_SEMANTIC_COLOR:
+      /* Flat shading takes priority over the declared interpolation mode. */
+      if (c->key.flat_shade) {
+	 emit_op1(c,
+		 WM_CINTERP,
+		 dst,
+		 interp);
+      }
+      else if (interp_mode == TGSI_INTERPOLATE_LINEAR) {
+	 emit_op2(c,
+		  WM_LINTERP,
+		  dst,
+		  interp,
+		  deltas);
+      }
+      else {
+	 emit_op3(c,
+		  WM_PINTERP,
+		  dst,
+		  interp,
+		  deltas,
+		  get_pixel_w(c));
+      }
+
+      break;
+
+   case TGSI_SEMANTIC_FOG:
+      /* Interpolate the fog coordinate */
+      emit_op3(c,
+	      WM_PINTERP,
+	      dst_mask(dst, BRW_WRITEMASK_X),
+	      interp,
+	      deltas,
+	      get_pixel_w(c));
+
+      /* Remaining channels are fixed: yz = 0, w = 1 */
+      emit_op1(c,
+	       TGSI_OPCODE_MOV,
+	       dst_mask(dst, BRW_WRITEMASK_YZ),
+	       src_imm1f(c, 0.0));
+
+      emit_op1(c,
+	       TGSI_OPCODE_MOV,
+	       dst_mask(dst, BRW_WRITEMASK_W),
+	       src_imm1f(c, 1.0));
+      break;
+
+   case TGSI_SEMANTIC_FACE:
+      /* XXX review/test this case */
+      /* x = WM_FRONTFACING result, yz = 0, w = 1 */
+      emit_op0(c,
+	       WM_FRONTFACING,
+	       dst_mask(dst, BRW_WRITEMASK_X));
+      
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_YZ),
+	       src_imm1f(c, 0.0));
+
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_W),
+	       src_imm1f(c, 1.0));
+      break;
+
+   case TGSI_SEMANTIC_PSIZE:
+      /* XXX review/test this case */
+      /* xy = perspective interpolated, z = 0, w = 1 */
+      emit_op3(c,
+	       WM_PINTERP,
+	       dst_mask(dst, BRW_WRITEMASK_XY),
+	       interp,
+	       deltas,
+	       get_pixel_w(c));
+
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_Z),
+	      src_imm1f(c, 0.0f));
+
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_W),
+	      src_imm1f(c, 1.0f));
+      break;
+
+   default: 
+      /* Any other semantic: follow the declared interpolation mode.
+       * A mode not listed below emits nothing.
+       */
+      switch (interp_mode) {
+      case TGSI_INTERPOLATE_CONSTANT:
+	 emit_op1(c,
+		  WM_CINTERP,
+		  dst,
+		  interp);
+	 break;
+
+      case TGSI_INTERPOLATE_LINEAR:
+	 emit_op2(c,
+		  WM_LINTERP,
+		  dst,
+		  interp,
+		  deltas);
+	 break;
+
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+	 emit_op3(c,
+		  WM_PINTERP,
+		  dst,
+		  interp,
+		  deltas,
+		  get_pixel_w(c));
+	 break;
+      }
+      break;
+   }
+}
+
+
+/***********************************************************************
+ * Expand various instructions here to simpler forms.  
+ */
+/* Expand the TGSI DST opcode into per-channel instructions:
+ *
+ *    dst.x = 1.0
+ *    dst.y = src0.y * src1.y
+ *    dst.z = src0.z
+ *    dst.w = src1.w
+ *
+ * Each channel is only emitted when it is enabled in dst.writemask.  X
+ * and Z are tested separately, so a destination that writes only one of
+ * them does not get a MOV with an empty writemask for the other.
+ */
+static void precalc_dst( struct brw_wm_compile *c,
+			 struct brw_fp_dst dst,
+			 struct brw_fp_src src0,
+			 struct brw_fp_src src1 )
+{
+   if (dst.writemask & BRW_WRITEMASK_Y) {      
+      /* dst.y = mul src0.y, src1.y
+       */
+      emit_op2(c,
+	       TGSI_OPCODE_MUL,
+	       dst_mask(dst, BRW_WRITEMASK_Y),
+	       src0,
+	       src1);
+   }
+
+   if (dst.writemask & BRW_WRITEMASK_Z) {
+      /* dst.z = mov src0.zzzz
+       */
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_mask(dst, BRW_WRITEMASK_Z),
+	      src_scalar(src0, Z));
+   }
+
+   if (dst.writemask & BRW_WRITEMASK_X) {
+      /* dst.x = imm1f(1.0); the constant is written without saturation
+       */
+      emit_op1(c,
+	      TGSI_OPCODE_MOV,
+	      dst_saturate(dst_mask(dst, BRW_WRITEMASK_X), 0),
+	      src_imm1f(c, 1.0));
+   }
+
+   if (dst.writemask & BRW_WRITEMASK_W) {
+      /* dst.w = mov src1.w
+       */
+      emit_op1(c,
+	       TGSI_OPCODE_MOV,
+	       dst_mask(dst, BRW_WRITEMASK_W),
+	       src1);
+   }
+}
+
+
+/* Expand TGSI LIT: the X and W results are the constant 1.0 and are
+ * written with a plain MOV; only the Y and Z channels are left to the
+ * LIT opcode proper.
+ */
+static void precalc_lit( struct brw_wm_compile *c,
+			 struct brw_fp_dst dst,
+			 struct brw_fp_src src0 )
+{
+   const unsigned wants_const = dst.writemask & BRW_WRITEMASK_XW;
+   const unsigned wants_lit = dst.writemask & BRW_WRITEMASK_YZ;
+
+   if (wants_const) {
+      /* dst.xw = imm(1.0f), written without saturation */
+      struct brw_fp_dst const_dst =
+         dst_saturate(dst_mask(dst, BRW_WRITEMASK_XW), 0);
+
+      emit_op1(c, TGSI_OPCODE_MOV, const_dst, src_imm1f(c, 1.0f));
+   }
+
+   if (wants_lit) {
+      /* dst.yz = LIT src0 */
+      struct brw_fp_dst lit_dst = dst_mask(dst, BRW_WRITEMASK_YZ);
+
+      emit_op1(c, TGSI_OPCODE_LIT, lit_dst, src0);
+   }
+}
+
+
+/**
+ * Some TEX instructions require extra code, cube map coordinate
+ * normalization, or coordinate scaling for RECT textures, etc.
+ * This function emits those extra instructions and the TEX
+ * instruction itself.
+ *
+ * \param dst      destination of the texture result
+ * \param target   TGSI_TEXTURE_* target
+ * \param unit     texture unit; asserted to be below BRW_MAX_TEX_UNIT
+ * \param src0     texture coordinate
+ * \param sampler  sampler register; only its index is used
+ *
+ * Note that no coordinate scaling is actually emitted for RECT targets
+ * here (see the XXX below): their coordinate is passed through unchanged.
+ */
+static void precalc_tex( struct brw_wm_compile *c,
+			 struct brw_fp_dst dst,
+			 unsigned target,
+			 unsigned unit,
+			 struct brw_fp_src src0,
+			 struct brw_fp_src sampler )
+{
+   struct brw_fp_src coord = src_undef();
+   struct brw_fp_dst tmp = dst_undef();
+
+   assert(unit < BRW_MAX_TEX_UNIT);
+
+   /* Cubemap: find longest component of coord vector and normalize
+    * it.
+    */
+   if (target == TGSI_TEXTURE_CUBE) {
+      struct brw_fp_src tmpsrc;
+
+      /* This temporary holds the normalized coordinate and stays live
+       * until the end of the function.
+       */
+      tmp = get_temp(c);
+      tmpsrc = src_reg_from_dst(tmp);
+
+      /* tmp = abs(src0) */
+      emit_op1(c, 
+	       TGSI_OPCODE_MOV,
+	       tmp,
+	       src_abs(src0));
+
+      /* tmp.X = MAX(tmp.X, tmp.Y) */
+      emit_op2(c, TGSI_OPCODE_MAX,
+	       dst_mask(tmp, BRW_WRITEMASK_X),
+	       src_scalar(tmpsrc, X),
+	       src_scalar(tmpsrc, Y));
+
+      /* tmp.X = MAX(tmp.X, tmp.Z) */
+      emit_op2(c, TGSI_OPCODE_MAX,
+	       dst_mask(tmp, BRW_WRITEMASK_X),
+	       tmpsrc,
+	       src_scalar(tmpsrc, Z));
+
+      /* tmp.X = 1 / tmp.X */
+      emit_op1(c, TGSI_OPCODE_RCP,
+	      dst_mask(tmp, BRW_WRITEMASK_X),
+	      tmpsrc);
+
+      /* tmp = src0 * tmp.xxxx */
+      emit_op2(c, TGSI_OPCODE_MUL,
+	       tmp,
+	       src0,
+	       src_scalar(tmpsrc, X));
+
+      coord = tmpsrc;
+   }
+   else if (target == TGSI_TEXTURE_RECT ||
+	    target == TGSI_TEXTURE_SHADOWRECT) {
+      /* XXX: need a mechanism for internally generated constants.
+       */
+      coord = src0;
+   }
+   else {
+      coord = src0;
+   }
+
+   /* Need to emit YUV texture conversions by hand.  Probably need to
+    * do this here - the alternative is in brw_wm_emit.c, but the
+    * conversion requires allocating a temporary variable which we
+    * don't have the facility to do that late in the compilation.
+    */
+   if (c->key.yuvtex_mask & (1 << unit)) {
+      /* convert ycbcr to RGBA */
+      GLboolean  swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
+      /* NOTE: this `tmp` shadows the function-level `tmp`; it is a
+       * separate temporary, released at the end of this block.
+       */
+      struct brw_fp_dst tmp = get_temp(c);
+      struct brw_fp_src tmpsrc = src_reg_from_dst(tmp);
+      struct brw_fp_src C0 = src_imm4f( c,  -.5, -.0625, -.5, 1.164 );
+      struct brw_fp_src C1 = src_imm4f( c, 1.596, -0.813, 2.018, -.391 );
+     
+      /* tmp     = TEX ...
+       */
+      emit_tex_op(c, 
+                  TGSI_OPCODE_TEX,
+                  dst_saturate(tmp, dst.saturate),
+                  unit,
+                  target,
+                  sampler.index,
+                  coord,
+                  src_undef(),
+                  src_undef());
+
+      /* tmp.xyz =  ADD TMP, C0
+       */
+      emit_op2(c, TGSI_OPCODE_ADD,
+	       dst_mask(tmp, BRW_WRITEMASK_XYZ),
+	       tmpsrc,
+	       C0);
+
+      /* YUV.y   = MUL YUV.y, C0.w
+       */
+      emit_op2(c, TGSI_OPCODE_MUL,
+	       dst_mask(tmp, BRW_WRITEMASK_Y),
+	       tmpsrc,
+	       src_scalar(C0, W));
+
+      /* 
+       * if (UV swapped)
+       *     RGB.xyz = MAD YUV.zzx, C1, YUV.y
+       * else
+       *     RGB.xyz = MAD YUV.xxz, C1, YUV.y
+       */
+
+      emit_op3(c, TGSI_OPCODE_MAD,
+	       dst_mask(dst, BRW_WRITEMASK_XYZ),
+	       ( swap_uv ? 
+		 src_swizzle(tmpsrc, Z,Z,X,X) : 
+		 src_swizzle(tmpsrc, X,X,Z,Z)),
+	       C1,
+	       src_scalar(tmpsrc, Y));
+
+      /*  RGB.y   = MAD YUV.z, C1.w, RGB.y
+       */
+      emit_op3(c,
+	       TGSI_OPCODE_MAD,
+	       dst_mask(dst, BRW_WRITEMASK_Y),
+	       src_scalar(tmpsrc, Z),
+	       src_scalar(C1, W),
+	       src_scalar(src_reg_from_dst(dst), Y));
+
+      release_temp(c, tmp);
+   }
+   else {
+      /* ordinary RGBA tex instruction */
+      emit_tex_op(c, 
+                  TGSI_OPCODE_TEX,
+                  dst,
+                  unit,
+                  target,
+                  sampler.index,
+                  coord,
+                  src_undef(),
+                  src_undef());
+   }
+
+   /* XXX: add GL_EXT_texture_swizzle support to gallium -- by
+    * generating shader variants in mesa state tracker.
+    */
+
+   /* Release this temp if we ended up allocating it:
+    * (only the cube map path above allocates the function-level tmp)
+    */
+   if (!dst_is_undef(tmp))
+      release_temp(c, tmp);
+}
+
+
+/**
+ * Decide whether a TXP instruction really needs its coordinate divided
+ * by W.  Returns GL_FALSE when the divide can be skipped.
+ *
+ * Only the simplest cases are detected.  Cases such as RCP {1.0} or
+ * MUL x, {1.0} could be detected later; more complex ones typically only
+ * arise from user-provided fragment programs anyway.
+ */
+static GLboolean projtex( struct brw_wm_compile *c,
+			  unsigned target, 
+			  struct brw_fp_src src )
+{
+   if (target != TGSI_TEXTURE_CUBE) {
+      /* The divide is needed unless the coordinate is a shader input
+       * read with an unswizzled W channel and not interpolated with
+       * perspective correction.
+       */
+      if (src.file != TGSI_FILE_INPUT)
+         return GL_TRUE;
+
+      if (BRW_GET_SWZ(src.swizzle, W) != W)
+         return GL_TRUE;
+
+      if (c->fp->info.input_interpolate[src.index] == TGSI_INTERPOLATE_PERSPECTIVE)
+         return GL_TRUE;
+   }
+
+   /* Cube maps never project (ut2004 gun rendering !?!), and neither do
+    * the inputs that fell through the tests above.
+    */
+   return GL_FALSE;
+}
+
+
+/**
+ * Emit code for TXP: a texture lookup whose coordinate is first divided
+ * by its W component, unless projtex() says the divide is unnecessary.
+ */
+static void precalc_txp( struct brw_wm_compile *c,
+			 struct brw_fp_dst dst,
+			 unsigned target,
+			 unsigned unit,
+			 struct brw_fp_src src0,
+                         struct brw_fp_src sampler )
+{
+   struct brw_fp_dst proj;
+   struct brw_fp_src proj_src;
+
+   if (!projtex(c, target, src0)) {
+      /* No divide needed: dst = TEX src0 */
+      precalc_tex(c, dst, target, unit, src0, sampler);
+      return;
+   }
+
+   proj = get_temp(c);
+   proj_src = src_reg_from_dst(proj);
+
+   /* proj.w = RCP src0.w */
+   emit_op1(c,
+            TGSI_OPCODE_RCP,
+            dst_mask(proj, BRW_WRITEMASK_W),
+            src_scalar(src0, W));
+
+   /* proj.xyz = MUL src0, proj.wwww */
+   emit_op2(c,
+            TGSI_OPCODE_MUL,
+            dst_mask(proj, BRW_WRITEMASK_XYZ),
+            src0,
+            src_scalar(proj_src, W));
+
+   /* dst = TEX proj */
+   precalc_tex(c, dst, target, unit, proj_src, sampler);
+
+   release_temp(c, proj);
+}
+
+
+/* Find the shader output with the given semantic name and index and
+ * return it as a *source* register (XXX: note the src_reg return type).
+ *
+ * When no output matches, output register 0 is returned.  An arbitrary
+ * immediate such as src_imm1f(c, 1.0) would be a better fallback, but
+ * immediates currently end up generating extra curbe entries, as they
+ * would have in the original driver.
+ */
+static struct brw_fp_src
+find_output_by_semantic( struct brw_wm_compile *c,
+			 unsigned semantic,
+			 unsigned index )
+{
+   const struct tgsi_shader_info *info = &c->fp->info;
+   unsigned out;
+
+   for (out = 0; out < info->num_outputs; out++) {
+      if (info->output_semantic_name[out] != semantic)
+         continue;
+
+      if (info->output_semantic_index[out] != index)
+         continue;
+
+      return src_reg( TGSI_FILE_OUTPUT, out );
+   }
+
+   return src_reg( TGSI_FILE_OUTPUT, 0 ); /* src_imm1f(c, 1.0); */
+}
+
+
+/* Emit one WM_FB_WRITE per color buffer (c->key.nr_cbufs of them).
+ *
+ * Each write takes the matching COLOR output, the depth payload register
+ * and the Z channel of the POSITION output.  The write for the last
+ * color buffer carries the EOT marker.
+ *
+ * NOTE(review): with nr_cbufs == 0 the loop body never runs, so no
+ * instruction carrying EOT is emitted here -- confirm callers never
+ * compile with zero color buffers.
+ */
+static void emit_fb_write( struct brw_wm_compile *c )
+{
+   const struct brw_fp_src depth_payload =
+      src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
+   const struct brw_fp_src out_z =
+      src_scalar(find_output_by_semantic(c, TGSI_SEMANTIC_POSITION, 0), Z);
+   GLuint buf;
+
+   for (buf = 0 ; buf < c->key.nr_cbufs; buf++) {
+      const struct brw_fp_src color =
+         find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, buf);
+      const GLuint is_last = (buf == c->key.nr_cbufs - 1);
+
+      /* emit_tex_op is used so that the tex_unit and target fields can
+       * be set: they are abused to carry the EOT marker and the FB write
+       * target respectively.
+       */
+      emit_tex_op(c, WM_FB_WRITE,
+                  dst_undef(),
+                  is_last,      /* EOT */
+                  buf,          /* FB write target */
+                  0,            /* no sampler */
+                  color,
+                  depth_payload,
+                  out_z);
+   }
+}
+
+
+/* Convert a TGSI destination register into a brw_fp_dst.
+ *
+ * The saturate flag is set only for TGSI_SAT_ZERO_ONE.  Indirect
+ * destinations are asserted to be addressed through ADDRESS register 0.
+ */
+static struct brw_fp_dst translate_dst( struct brw_wm_compile *c,
+					const struct tgsi_full_dst_register *dst,
+					unsigned saturate )
+{
+   struct brw_fp_dst result;
+
+   result.saturate = (saturate == TGSI_SAT_ZERO_ONE);
+   result.indirect = dst->Register.Indirect;
+   result.writemask = dst->Register.WriteMask;
+   result.index = dst->Register.Index;
+   result.file = dst->Register.File;
+
+   if (result.indirect) {
+      assert(dst->Indirect.File == TGSI_FILE_ADDRESS);
+      assert(dst->Indirect.Index == 0);
+   }
+
+   return result;
+}
+
+
+/* Convert a TGSI source register into a brw_fp_src.
+ *
+ * The TGSI sign mode maps onto the abs/negate flags as follows:
+ *    SIGN_CLEAR  -> abs            ( |x| )
+ *    SIGN_SET    -> abs + negate   ( -|x| )
+ *    SIGN_TOGGLE -> negate         ( -x )
+ *    SIGN_KEEP   -> neither        ( x ), also used for unknown modes
+ *
+ * Indirect sources are asserted to be addressed through ADDRESS
+ * register 0.
+ */
+static struct brw_fp_src translate_src( struct brw_wm_compile *c,
+					const struct tgsi_full_src_register *src )
+{
+   struct brw_fp_src result;
+   unsigned use_abs = 0;
+   unsigned use_negate = 0;
+
+   result.file = src->Register.File;
+   result.index = src->Register.Index;
+   result.indirect = src->Register.Indirect;
+
+   /* Two bits per channel, X in the low bits. */
+   result.swizzle = ((src->Register.SwizzleX << 0) |
+                     (src->Register.SwizzleY << 2) |
+                     (src->Register.SwizzleZ << 4) |
+                     (src->Register.SwizzleW << 6));
+
+   switch (tgsi_util_get_full_src_register_sign_mode( src, 0 )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      use_abs = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      use_abs = 1;
+      use_negate = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      use_negate = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+   default:
+      break;
+   }
+
+   result.abs = use_abs;
+   result.negate = use_negate;
+
+   if (result.indirect) {
+      assert(src->Indirect.File == TGSI_FILE_ADDRESS);
+      assert(src->Indirect.Index == 0);
+   }
+
+   return result;
+}
+
+
+
+/* Translate one TGSI instruction into one or more brw_fp_instructions.
+ *
+ * A handful of opcodes are rewritten into simpler forms here (ABS, SUB,
+ * SCS, DST, LIT, the texture opcodes, XPD, KIL); END triggers the
+ * framebuffer writes.  Everything else is passed through unchanged,
+ * going through emit_scalar_insn() for scalar-result opcodes when the
+ * shader has no flow control.
+ *
+ * All three source slots start out as src_undef(), so operands beyond
+ * the instruction's NumSrcRegs are well defined when handed to the
+ * generic emit paths.  At most Elements(src) operands are translated.
+ */
+static void emit_insn( struct brw_wm_compile *c,
+		       const struct tgsi_full_instruction *inst )
+{
+   unsigned opcode = inst->Instruction.Opcode;
+   unsigned nr_src = inst->Instruction.NumSrcRegs;
+   struct brw_fp_dst dst;
+   struct brw_fp_src src[3];
+   unsigned i;
+
+   dst = translate_dst( c, &inst->Dst[0],
+			inst->Instruction.Saturate );
+
+   /* Unused operands must not be left uninitialised. */
+   for (i = 0; i < Elements(src); i++)
+      src[i] = src_undef();
+
+   /* Never write past the local operand array. */
+   if (nr_src > Elements(src))
+      nr_src = Elements(src);
+
+   for (i = 0; i < nr_src; i++)
+      src[i] = translate_src( c, &inst->Src[i] );
+   
+   switch (opcode) {
+   case TGSI_OPCODE_ABS:
+      /* ABS is a MOV with the abs source modifier. */
+      emit_op1(c, TGSI_OPCODE_MOV,
+	       dst, 
+	       src_abs(src[0]));
+      break;
+
+   case TGSI_OPCODE_SUB: 
+      /* SUB is an ADD with the second operand negated. */
+      emit_op2(c, TGSI_OPCODE_ADD,
+	       dst,
+	       src[0],
+	       src_negate(src[1]));
+      break;
+
+   case TGSI_OPCODE_SCS: 
+      emit_op1(c, TGSI_OPCODE_SCS,
+	       dst_mask(dst, BRW_WRITEMASK_XY),
+	       src[0]);
+      break;
+	 
+   case TGSI_OPCODE_DST:
+      precalc_dst(c, dst, src[0], src[1]);
+      break;
+
+   case TGSI_OPCODE_LIT:
+      precalc_lit(c, dst, src[0]);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      precalc_tex(c, dst,
+		  inst->Texture.Texture,
+		  src[1].index,	/* use sampler unit for tex idx */
+		  src[0],       /* coord */
+                  src[1]);      /* sampler */
+      break;
+
+   case TGSI_OPCODE_TXP:
+      precalc_txp(c, dst,
+		  inst->Texture.Texture,
+		  src[1].index,	/* use sampler unit for tex idx */
+		  src[0],       /* coord */
+                  src[1]);      /* sampler */
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* XXX: TXB not done
+       */
+      precalc_tex(c, dst,
+		  inst->Texture.Texture,
+		  src[1].index,	/* use sampler unit for tex idx*/
+		  src[0],
+                  src[1]);
+      break;
+
+   case TGSI_OPCODE_XPD: 
+      emit_op2(c, TGSI_OPCODE_XPD,
+	       dst_mask(dst, BRW_WRITEMASK_XYZ),
+	       src[0], 
+	       src[1]);
+      break;
+
+   case TGSI_OPCODE_KIL: 
+      /* KIL writes nothing: undefined destination, empty writemask. */
+      emit_op1(c, TGSI_OPCODE_KIL,
+	       dst_mask(dst_undef(), 0),
+	       src[0]);
+      break;
+
+   case TGSI_OPCODE_END:
+      emit_fb_write(c);
+      break;
+   default:
+      if (!c->key.has_flow_control &&
+	  brw_wm_is_scalar_result(opcode))
+	 emit_scalar_insn(c, opcode, dst, src[0], src[1], src[2]);
+      else
+	 emit_op3(c, opcode, dst, src[0], src[1], src[2]);
+      break;
+   }
+}
+
+/**
+ * Initial pass for fragment program code generation.
+ * This function is used by both the GLSL and non-GLSL paths.
+ *
+ * Walks the TGSI token stream of c->fp and:
+ *   - turns each input declaration into interpolation instructions,
+ *   - copies each immediate into the next free c->immediate[] slot,
+ *   - translates each instruction with emit_insn().
+ *
+ * \return c->error (non-zero if a helper flagged a failure), or
+ *         PIPE_ERROR_OUT_OF_MEMORY when the shader declares more
+ *         immediates than fit below BRW_WM_MAX_CONST.
+ */
+int brw_wm_pass_fp( struct brw_wm_compile *c )
+{
+   struct brw_fragment_shader *fs = c->fp;
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction *inst;
+   struct tgsi_full_declaration *decl;
+   const float *imm;
+   GLuint size;
+   GLuint i;
+
+   if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("pre-fp:\n");
+      tgsi_dump(fs->tokens, 0); 
+   }
+
+   c->fp_pixel_xy = src_undef();
+   c->fp_delta_xy = src_undef();
+   c->fp_pixel_w = src_undef();
+   c->nr_fp_insns = 0;
+   c->nr_immediates = 0;
+
+
+   /* Loop over all instructions doing assorted simplifications and
+    * transformations.
+    */
+   tgsi_parse_init( &parse, fs->tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 /* Turn input declarations into special WM_* instructions.
+	  *
+	  * XXX: For non-branching shaders, consider deferring variable
+	  * initialization as late as possible to minimize register
+	  * usage.  This is how the original BRW driver worked.
+	  *
+	  * In a branching shader, must preamble instructions at decl
+	  * time, as instruction order in the shader does not
+	  * correspond to the order instructions are executed in the
+	  * wild.
+	  *
+	  * This is where special instructions such as WM_CINTERP,
+	  * WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
+	  * compute shader inputs from the payload registers and pixel
+	  * position.
+	  */
+         decl = &parse.FullToken.FullDeclaration;
+         if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+            unsigned first, last;
+            unsigned attrib;
+
+            first = decl->Range.First;
+            last = decl->Range.Last;
+
+            for (attrib = first; attrib <= last; attrib++) {
+	       emit_interp(c, 
+			   attrib, 
+			   decl->Semantic.Name,
+			   decl->Declaration.Interpolate );
+            }
+         }
+	 
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+	 /* Unlike VS programs we can probably manage fine encoding
+	  * immediate values directly into the emitted EU
+	  * instructions, as we probably only need to reference one
+	  * float value per instruction.  Just save the data for now
+	  * and use directly later.
+	  */
+
+	 /* The values of this immediate always start at u[0]; the
+	  * running immediate count selects the destination slot only.
+	  */
+	 imm = &parse.FullToken.FullImmediate.u[0].Float;
+	 size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+
+	 /* Each slot holds at most four values. */
+	 if (size > 4)
+	    size = 4;
+
+	 if (c->nr_immediates >= BRW_WM_MAX_CONST)
+	    return PIPE_ERROR_OUT_OF_MEMORY;
+
+	 for (i = 0; i < size; i++)
+	    c->immediate[c->nr_immediates].v[i] = imm[i];
+
+	 /* Zero-fill the unused tail of the slot. */
+	 for (; i < 4; i++)
+	    c->immediate[c->nr_immediates].v[i] = 0.0;
+
+	 c->immediate[c->nr_immediates].nr = size;
+
+	 /* Claim the slot exactly once. */
+	 c->nr_immediates++;
+	 break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         inst = &parse.FullToken.FullInstruction;
+	 emit_insn(c, inst);
+	 break;
+      }
+   }
+
+   if (BRW_DEBUG & DEBUG_WM) {
+      brw_wm_print_fp_program( c, "pass_fp" );
+      debug_printf("\n");
+   }
+
+   return c->error;
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
new file mode 100644
index 0000000000..3b3afc39d3
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -0,0 +1,2032 @@
+#include "util/u_math.h"
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+
+
+/* Forward declaration: get_dst_reg() is defined further down in this file
+ * but is already called by prealloc_reg().
+ */
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+                                  const struct brw_fp_instruction *inst,
+                                  GLuint component);
+
+
+/* Forward declaration: reclaim_temps() is defined further down in this file
+ * but is already called by alloc_grf() when no free GRF is found.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/**
+ * Flag GRF register number \p r as occupied in the compile's usage table.
+ * The index is used as given; no range check is performed here.
+ */
+static void prealloc_grf(struct brw_wm_compile *c, int r)
+{
+   c->used_grf[r] = GL_TRUE;
+}
+
+
+/**
+ * Hand GRF register \p r back to the allocator: clear its in-use flag and
+ * lower the first_free_grf search hint if \p r lies below it, so that the
+ * next scan in alloc_grf() will see the register.
+ */
+static void release_grf(struct brw_wm_compile *c, int r)
+{
+   /* pull the search hint down to r when r is the lower of the two */
+   if (r < c->first_free_grf)
+      c->first_free_grf = r;
+
+   c->used_grf[r] = GL_FALSE;
+}
+
+
+/**
+ * Allocate one GRF register.
+ *
+ * The usage table is scanned upward from the first_free_grf hint.  If no
+ * free register is found, reclaim_temps() is called and the table is
+ * scanned a second time, starting from register 0.
+ *
+ * \return index of the register, now marked as used, or -1 if every
+ *         register is still in use after reclaiming.  A warning is printed
+ *         the first time that happens (tracked through c->out_of_regs).
+ */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+   GLuint r;
+   int pass;
+
+   for (pass = 0; pass < 2; pass++) {
+      if (pass == 1) {
+         /* no free temps on the first scan: try to reclaim some, then
+          * search the whole table again.
+          */
+         reclaim_temps(c);
+         c->first_free_grf = 0;
+      }
+
+      for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+         if (!c->used_grf[r]) {
+            c->used_grf[r] = GL_TRUE;
+            c->first_free_grf = r + 1;  /* a guess */
+            return r;
+         }
+      }
+   }
+
+   /* sanity check: the second scan started at 0, so nothing can be free */
+   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+      assert(c->used_grf[r]);
+   }
+
+   /* really, no free GRF regs found */
+   if (!c->out_of_regs) {
+      /* print warning once per compilation; terminate the line like the
+       * other debug messages in this file do.
+       */
+      debug_printf("%s: ran out of registers for fragment program\n",
+                   __FUNCTION__);
+      c->out_of_regs = GL_TRUE;
+   }
+
+   return -1;
+}
+
+
+/**
+ * Report how many GRF registers are in use, measured as one past the
+ * highest register index whose in-use flag is set (0 if none is set).
+ */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+   int count = BRW_WM_MAX_GRF;
+
+   /* strip unused registers off the top until a used one is reached */
+   while (count > 0 && !c->used_grf[count - 1])
+      count--;
+
+   return count;
+}
+
+
+
+/**
+ * Bind hardware register \p reg to one component of a program register,
+ * identified by (file, index, component), and flag the mapping as valid
+ * so that later lookups re-use it.
+ */
+static void set_reg(struct brw_wm_compile *c, int file, int index,
+                    int component, struct brw_reg reg)
+{
+    c->wm_regs[file][index][component].inited = GL_TRUE;
+    c->wm_regs[file][index][component].reg = reg;
+}
+
+/**
+ * Hand out the next scratch register from the tmp_regs[] list.
+ *
+ * tmp_regs[0 .. tmp_max-1] holds the GRF numbers obtained so far and
+ * tmp_index is the next entry to hand out.  When all recorded entries are
+ * taken, one more GRF is requested from alloc_grf() and appended.
+ *
+ * \return a vec8 view (sub-register 0) of the chosen GRF
+ */
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+    struct brw_reg tmp;
+
+    if (c->tmp_index == c->tmp_max) {
+       /* all known temps are handed out: grow the list by one GRF */
+       int grf = alloc_grf(c);
+
+       if (grf < 0)
+          grf = 50; /* XXX random register! */
+
+       c->tmp_regs[c->tmp_max] = grf;
+       c->tmp_max++;
+    }
+
+    /* form the GRF register and advance the cursor */
+    tmp = brw_vec8_grf(c->tmp_regs[c->tmp_index], 0);
+    c->tmp_index++;
+
+    assert(tmp.nr < BRW_WM_MAX_GRF);
+    return tmp;
+}
+
+/**
+ * Snapshot the scratch-register cursor (c->tmp_index).
+ * Hand the returned value to release_tmps() to rewind the cursor to this
+ * point again.
+ */
+static int mark_tmps(struct brw_wm_compile *c)
+{
+    const int mark = c->tmp_index;
+
+    return mark;
+}
+
+/**
+ * Build the vec8 register (sub-register 0) for entry \p index of the
+ * tmp_regs[] list.  Neither the list nor the cursor is modified, and
+ * \p index is not range checked.
+ */
+static struct brw_reg lookup_tmp(struct brw_wm_compile *c, int index)
+{
+    return brw_vec8_grf(c->tmp_regs[index], 0);
+}
+
+/**
+ * Rewind the scratch-register cursor to \p mark, a value previously
+ * obtained from mark_tmps().  The GRF numbers recorded in tmp_regs[] are
+ * kept, so later alloc_tmp() calls hand the same registers out again.
+ */
+static void release_tmps(struct brw_wm_compile *c, int mark)
+{
+    c->tmp_index = mark;
+}
+
+/**
+ * Translate one component of a program register into a hardware register.
+ *
+ * Each (file, index, component) triple gets its own hardware register the
+ * first time it is requested; the mapping is stored in c->wm_regs[] and
+ * re-used on later requests.
+ *
+ * \param file  register file; TGSI_FILE_NULL yields the null register,
+ *              TGSI_FILE_CONSTANT/TEMPORARY/INPUT/OUTPUT and
+ *              BRW_FILE_PAYLOAD are looked up, any other value is reported
+ *              and yields the null register
+ * \param index  register number within the file (asserted < 256)
+ * \param component  component of the register (asserted < 4)
+ * \param nr  not used by this function
+ * \param neg  bitmask; the result is negated when bit \p component is set
+ * \param abs  when non-zero, brw_abs() is applied to the result
+ */
+static struct brw_reg
+get_reg(struct brw_wm_compile *c, int file, int index, int component,
+        int nr, GLuint neg, GLuint abs)
+{
+    struct brw_reg reg;
+
+    if (file == TGSI_FILE_NULL)
+        return brw_null_reg();
+
+    if (file != TGSI_FILE_CONSTANT &&
+        file != TGSI_FILE_TEMPORARY &&
+        file != TGSI_FILE_INPUT &&
+        file != TGSI_FILE_OUTPUT &&
+        file != BRW_FILE_PAYLOAD) {
+        debug_printf("%s: Unexpected file type\n", __FUNCTION__);
+        return brw_null_reg();
+    }
+
+    assert(index < 256);
+    assert(component < 4);
+
+    if (!c->wm_regs[file][index][component].inited) {
+        /* first request for this component: give it a hardware register */
+        int grf = alloc_grf(c);
+
+        if (grf < 0) {
+            /* totally out of temps */
+            grf = 51; /* XXX random register! */
+        }
+
+        set_reg(c, file, index, component, brw_vec8_grf(grf, 0));
+    }
+
+    /* either the mapping just recorded or one from an earlier call */
+    reg = c->wm_regs[file][index][component].reg;
+
+    if (neg & (1 << component))
+        reg = negate(reg);
+
+    if (abs)
+        reg = brw_abs(reg);
+
+    return reg;
+}
+
+
+
+
+/**
+ * Find first/last instruction that references each temporary register.
+ *
+ * All MAX_PROGRAM_TEMPS entries of \p intBegin and \p intEnd are first
+ * reset to -1.  Then, for every instruction that reads or writes a
+ * PROGRAM_TEMPORARY register, update_interval() is called with the
+ * instruction's position; if the instruction lies inside a loop it is
+ * called again with the end instruction of the innermost enclosing loop,
+ * so the register stays live for the whole loop.
+ *
+ * \param instructions     the program's instructions
+ * \param numInstructions  number of entries in \p instructions
+ * \param intBegin         per-temporary interval start, as maintained by
+ *                         update_interval() (-1 if never referenced)
+ * \param intEnd           per-temporary interval end, as maintained by
+ *                         update_interval() (-1 if never referenced)
+ * \return GL_TRUE on success.  GL_FALSE if the program cannot be analyzed:
+ *         it contains a CAL, addresses a temporary relatively, nests loops
+ *         deeper than MAX_LOOP_NESTING, or has an ENDLOOP without a
+ *         matching BGNLOOP.  On GL_FALSE the output arrays are only
+ *         partially filled and must not be relied upon.
+ */
+GLboolean
+_mesa_find_temp_intervals(const struct prog_instruction *instructions,
+                          GLuint numInstructions,
+                          GLint intBegin[MAX_PROGRAM_TEMPS],
+                          GLint intEnd[MAX_PROGRAM_TEMPS])
+{
+   struct loop_info
+   {
+      GLuint Start, End;  /**< Start, end instructions of loop */
+   };
+   struct loop_info loopStack[MAX_LOOP_NESTING];
+   GLuint loopStackDepth = 0;
+   GLuint i;
+
+   for (i = 0; i < MAX_PROGRAM_TEMPS; i++){
+      intBegin[i] = intEnd[i] = -1;
+   }
+
+   /* Scan instructions looking for temporary registers */
+   for (i = 0; i < numInstructions; i++) {
+      const struct prog_instruction *inst = instructions + i;
+      if (inst->Opcode == OPCODE_BGNLOOP) {
+         /* refuse to write past the end of loopStack[] */
+         if (loopStackDepth >= MAX_LOOP_NESTING)
+            return GL_FALSE;
+         loopStack[loopStackDepth].Start = i;
+         loopStack[loopStackDepth].End = inst->BranchTarget;
+         loopStackDepth++;
+      }
+      else if (inst->Opcode == OPCODE_ENDLOOP) {
+         /* an unmatched ENDLOOP would wrap the unsigned depth counter and
+          * make the loopStack[] reads below go out of bounds
+          */
+         if (loopStackDepth == 0)
+            return GL_FALSE;
+         loopStackDepth--;
+      }
+      else if (inst->Opcode == OPCODE_CAL) {
+         /* subroutine calls are not analyzed */
+         return GL_FALSE;
+      }
+      else {
+         const GLuint numSrc = 3;
+         GLuint j;
+         for (j = 0; j < numSrc; j++) {
+            if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+               const GLuint index = inst->SrcReg[j].Index;
+               /* a relatively addressed temp could be any register */
+               if (inst->SrcReg[j].RelAddr)
+                  return GL_FALSE;
+               update_interval(intBegin, intEnd, index, i);
+               if (loopStackDepth > 0) {
+                  /* extend temp register's interval to end of loop */
+                  GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+                  update_interval(intBegin, intEnd, index, loopEnd);
+               }
+            }
+         }
+         if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+            const GLuint index = inst->DstReg.Index;
+            if (inst->DstReg.RelAddr)
+               return GL_FALSE;
+            update_interval(intBegin, intEnd, index, i);
+            if (loopStackDepth > 0) {
+               /* extend temp register's interval to end of loop */
+               GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+               update_interval(intBegin, intEnd, index, loopEnd);
+            }
+         }
+      }
+   }
+
+   return GL_TRUE;
+}
+
+
+/**
+ * This is called if we run out of GRF registers.  Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ *
+ * A temporary counts as dead when its interval end is known (not -1) and
+ * lies before c->cur_inst.  For each dead temporary, every component that
+ * has a hardware register mapped gets that register released and its
+ * mapping invalidated.
+ *
+ * If the interval analysis reports failure nothing is released: the
+ * intervals are then incomplete and could make a live register look dead.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+   /* NOTE(review): _mesa_find_temp_intervals() initializes
+    * MAX_PROGRAM_TEMPS entries of these arrays; confirm that
+    * BRW_WM_MAX_TEMPS is at least that large.
+    */
+   GLint intBegin[BRW_WM_MAX_TEMPS];
+   GLint intEnd[BRW_WM_MAX_TEMPS];
+   int index;
+
+   /*printf("Reclaim temps:\n");*/
+
+   if (!_mesa_find_temp_intervals(c->fp_instructions, c->nr_fp_insns,
+                                  intBegin, intEnd)) {
+      /* analysis bailed out part-way: do not trust the intervals */
+      return;
+   }
+
+   for (index = 0; index < BRW_WM_MAX_TEMPS; index++) {
+      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+         /* program temp[index] can be freed */
+         int component;
+         /*printf("  temp[%d] is dead\n", index);*/
+         for (component = 0; component < 4; component++) {
+            if (c->wm_regs[TGSI_FILE_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[TGSI_FILE_TEMPORARY][index][component].reg.nr;
+               release_grf(c, r);
+               /*
+               printf("  Reclaim temp %d, reg %d at inst %d\n",
+                      index, r, c->cur_inst);
+               */
+               /* force a fresh allocation if the temp is referenced again */
+               c->wm_regs[TGSI_FILE_TEMPORARY][index][component].inited = GL_FALSE;
+            }
+         }
+      }
+   }
+}
+
+
+
+
+/**
+ * Preallocate registers.  This sets up the Mesa to hardware register
+ * mapping for certain registers, such as constants (uniforms/state vars)
+ * and shader inputs.
+ *
+ * GRF numbers are handed out in increasing order through reg_index:
+ * depth payload registers, then (unless a constant buffer is used) the
+ * program constants, then the fragment shader inputs, then the emit mask
+ * register and the stack.  Everything below the final reg_index is marked
+ * as in use, as are GRF 126 and 127.  Finally the destinations of TEX/TXB
+ * instructions are allocated and, when a constant buffer is used, three
+ * registers for fetched constants.
+ */
+static void prealloc_reg(struct brw_wm_compile *c)
+{
+    int i, j;
+    struct brw_reg reg;
+    int urb_read_length = 0;
+    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
+    GLuint reg_index = 0;
+
+    /* start with an empty usage table */
+    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+    c->first_free_grf = 0;
+
+    /* depth payload: component i maps to GRF 2*i while i is below
+     * key.nr_depth_regs, and to GRF 0 otherwise
+     */
+    for (i = 0; i < 4; i++) {
+        if (i < c->key.nr_depth_regs) 
+            reg = brw_vec8_grf(i * 2, 0);
+        else
+            reg = brw_vec8_grf(0, 0);
+	set_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, i, reg);
+    }
+    reg_index += 2 * c->key.nr_depth_regs;
+
+    /* constants */
+    {
+        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
+
+        /* use a real constant buffer, or just use a section of the GRF? */
+        /* XXX this heuristic may need adjustment... */
+        if ((nr_params + nr_temps) * 4 + reg_index > 80)
+           c->fp->use_const_buffer = GL_TRUE;
+        else
+           c->fp->use_const_buffer = GL_FALSE;
+        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
+
+        if (c->fp->use_const_buffer) {
+           /* We'll use a real constant buffer and fetch constants from
+            * it with a dataport read message.
+            */
+
+           /* number of float constants in CURBE */
+           c->prog_data.nr_params = 0;
+        }
+        else {
+           const struct gl_program_parameter_list *plist = 
+              c->fp->program.Base.Parameters;
+           int index = 0;
+
+           /* number of float constants in CURBE */
+           c->prog_data.nr_params = 4 * nr_params;
+
+           /* loop over program constants (float[4]) */
+           for (i = 0; i < nr_params; i++) {
+              /* loop over XYZW channels */
+              for (j = 0; j < 4; j++, index++) {
+                 /* eight floats per GRF: float number 'index' lives in
+                  * register reg_index + index / 8, element index % 8
+                  */
+                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
+                 /* Save pointer to parameter/constant value.
+                  * Constants will be copied in prepare_constant_buffer()
+                  */
+                 c->prog_data.param[index] = &plist->ParameterValues[i][j];
+                 /* NOTE(review): get_reg() in this file does not accept
+                  * TGSI_FILE_STATE_VAR; confirm which file value readers of
+                  * this mapping use.
+                  */
+                 set_reg(c, TGSI_FILE_STATE_VAR, i, j, reg);
+              }
+           }
+           /* number of constant regs used (each reg is float[8]) */
+           c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
+           reg_index += c->nr_creg;
+        }
+    }
+
+    /* fragment shader inputs */
+    for (i = 0; i < VERT_RESULT_MAX; i++) {
+       int fp_input;
+
+       /* translate the vertex result slot into a fragment input slot;
+        * slots between TEX7 and VAR0 have no fragment input (-1)
+        */
+       if (i >= VERT_RESULT_VAR0)
+	  fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
+       else if (i <= VERT_RESULT_TEX7)
+	  fp_input = i;
+       else
+	  fp_input = -1;
+
+       if (fp_input >= 0 && inputs & (1 << fp_input)) {
+	  /* ends up holding the GRF index of the last input that is read */
+	  urb_read_length = reg_index;
+	  reg = brw_vec8_grf(reg_index, 0);
+	  /* all four components share the same register here */
+	  for (j = 0; j < 4; j++)
+	     set_reg(c, TGSI_FILE_PAYLOAD, fp_input, j, reg);
+       }
+       /* two GRFs are skipped for every slot below key.nr_vp_outputs,
+        * whether or not the fragment program reads it
+        */
+       if (c->key.nr_vp_outputs > i) {
+	  reg_index += 2;
+       }
+    }
+
+    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+    c->prog_data.urb_read_length = urb_read_length;
+    /* NOTE(review): c->nr_creg is only assigned above when no constant
+     * buffer is used; confirm its value on the constant-buffer path.
+     */
+    c->prog_data.curb_read_length = c->nr_creg;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index += 2;
+
+    /* mark GRF regs [0..reg_index-1] as in-use */
+    for (i = 0; i < reg_index; i++)
+       prealloc_grf(c, i);
+
+    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
+    prealloc_grf(c, 126);
+    prealloc_grf(c, 127);
+
+    for (i = 0; i < c->nr_fp_insns; i++) {
+	const struct brw_fp_instruction *inst = &c->fp_instructions[i];
+	struct brw_reg dst[4];
+
+	switch (inst->Opcode) {
+	case OPCODE_TEX:
+	case OPCODE_TXB:
+	    /* Allocate the channels of texture results contiguously,
+	     * since they are written out that way by the sampler unit.
+	     */
+	    for (j = 0; j < 4; j++) {
+		dst[j] = get_dst_reg(c, inst, j);
+		/* contiguity is only checked, not enforced */
+		if (j != 0)
+		    assert(dst[j].nr == dst[j - 1].nr + 1);
+	    }
+	    break;
+	default:
+	    break;
+	}
+    }
+
+    /* An instruction may reference up to three constants.
+     * They'll be found in these registers.
+     * XXX alloc these on demand!
+     */
+    if (c->fp->use_const_buffer) {
+       for (i = 0; i < 3; i++) {
+          /* index -1 means no constant has been fetched into the slot */
+          c->current_const[i].index = -1;
+          /* NOTE(review): alloc_grf() returns -1 when no register is free;
+           * that value is used unchecked here.
+           */
+          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
+       }
+    }
+#if 0
+    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
+    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
+#endif
+}
+
+
+/**
+ * Load the constants referenced by an instruction into the per-operand
+ * constant registers.
+ *
+ * For each of the three source operands whose file is TGSI_FILE_IMMEDIATE
+ * or TGSI_FILE_CONSTANT, the operand's index is recorded in
+ * c->current_const[] and a brw_dp_READ_4() is emitted that reads from byte
+ * offset 16 * Index of the fragment constant buffer surface into that
+ * operand's register.  The read is emitted on every call; no check is made
+ * for whether the slot already holds the same constant.
+ */
+static void fetch_constants(struct brw_wm_compile *c,
+                            const struct brw_fp_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   GLuint arg;
+
+   for (arg = 0; arg < 3; arg++) {
+      const struct prog_src_register *src = &inst->SrcReg[arg];
+
+      /* only immediate and constant operands need a fetch */
+      if (src->File != TGSI_FILE_IMMEDIATE &&
+          src->File != TGSI_FILE_CONSTANT)
+         continue;
+
+      c->current_const[arg].index = src->Index;
+
+      /* need to fetch the constant now */
+      brw_dp_READ_4(p,
+                    c->current_const[arg].reg,    /* writeback dest */
+                    src->RelAddr,                 /* relative indexing? */
+                    16 * src->Index,              /* byte offset */
+                    SURF_INDEX_FRAG_CONST_BUFFER  /* binding table index */
+                    );
+   }
+}
+
+
+/**
+ * Look up (allocating on first use) the hardware register that backs
+ * component \p component of an instruction's destination register.
+ * No negate or abs modifier is requested.
+ */
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+                                  const struct brw_fp_instruction *inst,
+                                  GLuint component)
+{
+    return get_reg(c,
+                   inst->DstReg.File,
+                   inst->DstReg.Index,
+                   component,
+                   1,   /* nr */
+                   0,   /* neg */
+                   0);  /* abs */
+}
+
+
+/**
+ * Form the register operand for one component of a constant that
+ * fetch_constants() has already placed in c->current_const[srcRegIndex].
+ *
+ * The returned register uses a (0, 1, 0) stride and a sub-register offset
+ * of component * 4 bytes, which selects the wanted float and smears it
+ * across all eight vector components.  The operand's Negate and Abs flags
+ * are applied to the result.
+ */
+static struct brw_reg
+get_src_reg_const(struct brw_wm_compile *c,
+                  const struct brw_fp_instruction *inst,
+                  GLuint srcRegIndex, GLuint component)
+{
+   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+   struct brw_reg smeared;
+
+   assert(component < 4);
+   assert(srcRegIndex < 3);
+   /* -1 marks a slot that no constant has been fetched into */
+   assert(c->current_const[srcRegIndex].index != -1);
+
+   /* extract desired float from the fetched register, and smear */
+   smeared = stride(c->current_const[srcRegIndex].reg, 0, 1, 0);
+   smeared.subnr = component * 4;
+
+   if (src->Negate)
+      smeared = negate(smeared);
+
+   if (src->Abs)
+      smeared = brw_abs(smeared);
+
+   return smeared;
+}
+
+
+/**
+ * Translate one channel of a source operand into a hardware register.
+ *
+ * The operand's swizzle selects the component for \p channel.  The
+ * extended swizzle terms ZERO and ONE become float immediates.  When a
+ * constant buffer is in use, STATE_VAR/CONSTANT/UNIFORM operands come from
+ * the fetched constant registers; every other operand goes through
+ * get_reg() with the operand's Negate and Abs values.
+ */
+static struct brw_reg get_src_reg(struct brw_wm_compile *c,
+                                  const struct brw_fp_instruction *inst,
+                                  GLuint srcRegIndex, GLuint channel)
+{
+    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+    const GLuint component = BRW_GET_SWZ(src->Swizzle, channel);
+
+    /* Extended swizzle terms */
+    if (component == SWIZZLE_ZERO)
+       return brw_imm_f(0.0F);
+
+    if (component == SWIZZLE_ONE)
+       return brw_imm_f(1.0F);
+
+    if (!c->fp->use_const_buffer ||
+        (src->File != TGSI_FILE_STATE_VAR &&
+         src->File != TGSI_FILE_CONSTANT &&
+         src->File != TGSI_FILE_UNIFORM)) {
+       /* not served from the constant buffer: ordinary register lookup */
+       return get_reg(c, src->File, src->Index, component, 1,
+                      src->Negate, src->Abs);
+    }
+
+    return get_src_reg_const(c, inst, srcRegIndex, component);
+}
+
+
+/**
+ * Same as \sa get_src_reg() but if the register is an immediate, emit
+ * a brw_reg encoding the immediate.
+ * Note that a brw instruction only allows one src operand to be an
+ * immediate.  For instructions with more than one operand, only the second
+ * can be an immediate.  This means that we treat some immediates as
+ * constants (which is why TGSI_FILE_IMMEDIATE is checked in
+ * fetch_constants()).
+ *
+ * For an immediate operand the value is read from the program's parameter
+ * list, using the component the swizzle selects for \p channel.  The
+ * operand modifiers are then folded into the value: the absolute value is
+ * taken first and the negation second, so an operand with both flags set
+ * yields -|value|.
+ */
+static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, 
+                                      const struct brw_fp_instruction *inst,
+                                      GLuint srcRegIndex, GLuint channel)
+{
+    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+    if (src->File == TGSI_FILE_IMMEDIATE) {
+       /* an immediate */
+       const int component = BRW_GET_SWZ(src->Swizzle, channel);
+       const GLfloat *param =
+          c->fp->program.Base.Parameters->ParameterValues[src->Index];
+       GLfloat value = param[component];
+       /* Negate is a post-Abs modifier: applying it before the absolute
+        * value would silently drop the negation when both are set.
+        */
+       if (src->Abs)
+          value = FABSF(value);
+       if (src->Negate)
+          value = -value;
+#if 0
+       printf("  form immed value %f for chan %d\n", value, channel);
+#endif
+       return brw_imm_f(value);
+    }
+    else {
+       return get_src_reg(c, inst, srcRegIndex, channel);
+    }
+}
+
+
+/**
+ * Subroutines are minimal support for reusable instruction sequences.
+ * They are implemented as simply as possible to minimise overhead: there
+ * is no explicit support for communication between the caller and callee
+ * other than saving the return address in a temporary register, nor is
+ * there any automatic local storage.  This implies that great care is
+ * required before attempting reentrancy or any kind of nested
+ * subroutine invocations.
+ *
+ * \param subroutine  slot in c->subroutines[] identifying the sequence;
+ *                    a zero entry means it has not been emitted yet
+ * \param emit        callback that emits the subroutine body; only called
+ *                    the first time the subroutine is invoked
+ *
+ * Instruction counts are shifted left by 4 (multiplied by 16) to form the
+ * values added to the IP register - presumably the size of one
+ * instruction in bytes.
+ */
+static void invoke_subroutine( struct brw_wm_compile *c,
+			       enum _subroutine subroutine,
+			       void (*emit)( struct brw_wm_compile * ) )
+{
+    struct brw_compile *p = &c->func;
+
+    assert( subroutine < BRW_WM_MAX_SUBROUTINE );
+    
+    if( c->subroutines[ subroutine ] ) {
+	/* subroutine previously emitted: reuse existing instructions */
+
+	int mark = mark_tmps( c );
+	struct brw_reg return_address = retype( alloc_tmp( c ),
+						BRW_REGISTER_TYPE_UD );
+	/* index of the first instruction emitted below */
+	int here = p->nr_insn;
+	
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	/* return address = IP + 2 instructions, i.e. the instruction that
+	 * follows the jump emitted next
+	 */
+	brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
+
+	/* jump: add the distance from the jump instruction (here + 1) to
+	 * the recorded start of the subroutine body to IP
+	 */
+	brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
+		 brw_imm_d( ( c->subroutines[ subroutine ] -
+			      here - 1 ) << 4 ) );
+	brw_pop_insn_state(p);
+
+	release_tmps( c, mark );
+    } else {
+	/* previously unused subroutine: emit, and mark for later reuse */
+	
+	int mark = mark_tmps( c );
+	struct brw_reg return_address = retype( alloc_tmp( c ),
+						BRW_REGISTER_TYPE_UD );
+	struct brw_instruction *calc;
+	/* index of the return-address instruction emitted below */
+	int base = p->nr_insn;
+	
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	/* the offset is not known yet: emit with 0 and patch it below */
+	calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
+	brw_pop_insn_state(p);
+	
+	/* remember where the body starts so later invocations can jump here */
+	c->subroutines[ subroutine ] = p->nr_insn;
+
+	emit( c );
+	
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	/* return: load the saved address back into IP */
+	brw_MOV( p, brw_ip_reg(), return_address );
+	brw_pop_insn_state(p);
+
+	/* patch the return-address offset: number of instructions from the
+	 * calc instruction to the one following the return MOV
+	 */
+	brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
+	
+	release_tmps( c, mark );
+    }
+}
+
+/**
+ * Emit one RNDZ per enabled destination channel, from the matching channel
+ * of source operand 0.  Saturation follows the instruction's SaturateMode
+ * and is switched off again afterwards.
+ */
+static void emit_trunc( struct brw_wm_compile *c,
+                        const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    int chan;
+
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, src;
+
+        if (!(mask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        src = get_src_reg(c, inst, 0, chan);
+        brw_RNDZ(p, dst, src);
+    }
+
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit one MOV per enabled destination channel, from the matching channel
+ * of source operand 0.  Saturation follows the instruction's SaturateMode
+ * and is switched off again afterwards.
+ */
+static void emit_mov( struct brw_wm_compile *c,
+                      const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    int chan;
+
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, src;
+
+        if (!(mask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        /* XXX some moves from immediate value don't work reliably!!!
+         * That is why get_src_reg() is used here rather than
+         * get_src_reg_imm().
+         */
+        src = get_src_reg(c, inst, 0, chan);
+        brw_MOV(p, dst, src);
+    }
+
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Calculate pixel centers by adding 1 or 0 to each of the micro-tile
+ * coordinates passed in r1.  The X result goes to destination channel 0
+ * and the Y result to channel 1, each only if enabled in the write mask;
+ * both destination registers are looked up regardless of the mask.
+ */
+static void emit_pixel_xy(struct brw_wm_compile *c,
+                          const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    struct brw_reg payload_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+    struct brw_reg dst_x, dst_y;
+
+    dst_x = get_dst_reg(c, inst, 0);
+    dst_y = get_dst_reg(c, inst, 1);
+
+    if (mask & WRITEMASK_X) {
+        brw_ADD(p,
+                vec8(retype(dst_x, BRW_REGISTER_TYPE_UW)),
+                stride(suboffset(payload_uw, 4), 2, 4, 0),
+                brw_imm_v(0x10101010));
+    }
+
+    if (mask & WRITEMASK_Y) {
+        brw_ADD(p,
+                vec8(retype(dst_y, BRW_REGISTER_TYPE_UW)),
+                stride(suboffset(payload_uw, 5), 2, 4, 0),
+                brw_imm_v(0x11001100));
+    }
+}
+
+/**
+ * Calc delta X,Y by subtracting the origin in r1 from the pixel centers
+ * given in channels 0 and 1 of source operand 0.  Each delta is written
+ * only if its channel is enabled in the write mask; all four registers
+ * are looked up regardless of the mask.
+ */
+static void emit_delta_xy(struct brw_wm_compile *c,
+                          const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    struct brw_reg origin = brw_vec1_grf(1, 0);
+    struct brw_reg dst_x, dst_y, pix_x, pix_y;
+
+    dst_x = get_dst_reg(c, inst, 0);
+    dst_y = get_dst_reg(c, inst, 1);
+    pix_x = get_src_reg(c, inst, 0, 0);
+    pix_y = get_src_reg(c, inst, 0, 1);
+
+    if (mask & WRITEMASK_X) {
+        brw_ADD(p,
+                dst_x,
+                retype(pix_x, BRW_REGISTER_TYPE_UW),
+                negate(origin));
+    }
+
+    if (mask & WRITEMASK_Y) {
+        brw_ADD(p,
+                dst_y,
+                retype(pix_y, BRW_REGISTER_TYPE_UW),
+                negate(suboffset(origin, 1)));
+    }
+}
+
+/**
+ * Finish a framebuffer write: copy GRF 1 into message register
+ * base_reg + 1 with masking disabled (pass-through control information),
+ * then emit the framebuffer write message itself.
+ *
+ * \param base_reg  first message register of the write
+ * \param nr        handed to brw_fb_WRITE() as the argument after target
+ * \param target    handed to brw_fb_WRITE() as the argument after src0
+ * \param eot       handed to brw_fb_WRITE() as its last argument
+ */
+static void fire_fb_write( struct brw_wm_compile *c,
+                           GLuint base_reg,
+                           GLuint nr,
+                           GLuint target,
+                           GLuint eot)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg null_uw, r0_uw;
+
+    /* Pass through control information:
+     *   mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask }
+     */
+    brw_push_insn_state(p);
+    brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+    brw_MOV(p, brw_message_reg(base_reg + 1), brw_vec8_grf(1, 0));
+    brw_pop_insn_state(p);
+
+    /* Send framebuffer write message: */
+    null_uw = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+    r0_uw = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+    brw_fb_WRITE(p, null_uw, base_reg, r0_uw, target, nr, 0, eot);
+}
+
+/**
+ * Emit a framebuffer write for one instruction.
+ *
+ * The four channels of source operand 0 are moved into consecutive
+ * message registers, optionally followed by source depth and destination
+ * depth values, and the message is then sent through fire_fb_write().
+ * 'nr' tracks the next message register to fill and is finally passed on
+ * as the message length.  The render target and the end-of-thread flag
+ * are both decoded from inst->Aux.
+ */
+static void emit_fb_write(struct brw_wm_compile *c,
+                          const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    int nr = 2;
+    int channel;
+    GLuint target, eot;
+    struct brw_reg src0;
+
+    /* Reserve a space for AA - may not be needed:
+     */
+    if (c->key.aa_dest_stencil_reg)
+	nr += 1;
+
+    brw_push_insn_state(p);
+    for (channel = 0; channel < 4; channel++) {
+        src0 = get_src_reg(c,  inst, 0, channel);
+        /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+        /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+        brw_MOV(p, brw_message_reg(nr + channel), src0);
+    }
+    /* skip over the regs populated above: */
+    /* NOTE(review): only four registers are written by the loop above but
+     * nr advances by eight - presumably the other four are reserved for
+     * the second half mentioned in the comments above; confirm.
+     */
+    nr += 8;
+    brw_pop_insn_state(p);
+
+    if (c->key.source_depth_to_render_target) {
+       /* source depth: taken from operand 2, channel 2 when the key says
+        * the program computes depth, else from operand 1, channel 1
+        */
+       if (c->key.computes_depth) {
+          src0 = get_src_reg(c, inst, 2, 2);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+       else {
+          src0 = get_src_reg(c, inst, 1, 1);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+
+       nr += 2;
+    }
+
+    if (c->key.dest_depth_reg) {
+        /* split the register number into a channel of operand 1 (comp)
+         * and an odd/even flag (off)
+         */
+        const GLuint comp = c->key.dest_depth_reg / 2;
+        const GLuint off = c->key.dest_depth_reg % 2;
+
+        if (off != 0) {
+            /* XXX this code needs review/testing */
+            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
+            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
+
+            brw_push_insn_state(p);
+            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
+            /* 2nd half? */
+            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
+            brw_pop_insn_state(p);
+        }
+        else
+        {
+            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
+            brw_MOV(p, brw_message_reg(nr), src);
+        }
+        nr += 2;
+   }
+
+    /* Aux packs both values: bit 0 is eot, the remaining bits the target */
+    target = inst->Aux >> 1;
+    eot = inst->Aux & 1;
+    fire_fb_write(c, 0, nr, target, eot);
+}
+
+/**
+ * Compute W into destination channel 3; does nothing unless W is enabled
+ * in the write mask.
+ *
+ * 1/w is produced by a LINE/MAC pair over the coefficients held in the
+ * register following source operand 0 (starting at element 4), using
+ * channels 0 and 1 of operand 1 as the deltas, and is written straight
+ * into message register 2.  A full-precision INV math operation on that
+ * message register then yields w.
+ */
+static void emit_pixel_w( struct brw_wm_compile *c,
+                          const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg dst, src0, delta0, delta1, interp3;
+
+    if (!(inst->DstReg.WriteMask & WRITEMASK_W))
+        return;
+
+    dst = get_dst_reg(c, inst, 3);
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+
+    interp3 = brw_vec1_grf(src0.nr + 1, 4);
+
+    /* Calc 1/w - just linterp wpos[3] optimized by putting the
+     * result straight into a message reg.
+     */
+    brw_LINE(p, brw_null_reg(), interp3, delta0);
+    brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
+
+    /* Calc w */
+    brw_math_16(p, dst,
+                BRW_MATH_FUNCTION_INV,
+                BRW_MATH_SATURATE_NONE,
+                2, brw_null_reg(),
+                BRW_MATH_PRECISION_FULL);
+}
+
+/**
+ * Emit a LINE/MAC pair for every enabled destination channel.
+ *
+ * The coefficients of channel i are taken from the two registers starting
+ * at source operand 0: register nr + i / 2, element (i % 2) * 4.
+ * Channels 0 and 1 of source operand 1 serve as the two deltas.
+ */
+static void emit_linterp(struct brw_wm_compile *c,
+                         const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    struct brw_reg src0, delta0, delta1;
+    GLuint base, chan;
+
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+    base = src0.nr;
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg coef, dst;
+
+        if (!(mask & (1 << chan)))
+            continue;
+
+        /* two channels per register, four elements apart */
+        coef = brw_vec1_grf(base + chan / 2, (chan % 2) * 4);
+        dst = get_dst_reg(c, inst, chan);
+        brw_LINE(p, brw_null_reg(), coef, delta0);
+        brw_MAC(p, dst, suboffset(coef, 1), delta1);
+    }
+}
+
+/**
+ * Emit a MOV for every enabled destination channel, copying element 3 of
+ * that channel's coefficient group.
+ *
+ * The group of channel i starts in register nr + i / 2 at element
+ * (i % 2) * 4, where nr is the register number of source operand 0.
+ */
+static void emit_cinterp(struct brw_wm_compile *c,
+                         const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    struct brw_reg src0;
+    GLuint base, chan;
+
+    src0 = get_src_reg(c, inst, 0, 0);
+    base = src0.nr;
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg coef, dst;
+
+        if (!(mask & (1 << chan)))
+            continue;
+
+        /* two channels per register, four elements apart */
+        coef = brw_vec1_grf(base + chan / 2, (chan % 2) * 4);
+        dst = get_dst_reg(c, inst, chan);
+        brw_MOV(p, dst, suboffset(coef, 3));
+    }
+}
+
+/**
+ * Like emit_linterp(), with one more step: after the LINE/MAC pair each
+ * enabled destination channel is multiplied by channel 3 of source
+ * operand 2.
+ *
+ * The coefficients of channel i start in register nr + i / 2 at element
+ * (i % 2) * 4, where nr is the register number of source operand 0.
+ * Channels 0 and 1 of source operand 1 serve as the two deltas.
+ */
+static void emit_pinterp(struct brw_wm_compile *c,
+                         const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    struct brw_reg src0, delta0, delta1, w;
+    GLuint base, chan;
+
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+    w = get_src_reg(c, inst, 2, 3);
+    base = src0.nr;
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg coef, dst;
+
+        if (!(mask & (1 << chan)))
+            continue;
+
+        /* two channels per register, four elements apart */
+        coef = brw_vec1_grf(base + chan / 2, (chan % 2) * 4);
+        dst = get_dst_reg(c, inst, chan);
+        brw_LINE(p, brw_null_reg(), coef, delta0);
+        brw_MAC(p, dst, suboffset(coef, 1), delta1);
+        brw_MUL(p, dst, dst, w);
+    }
+}
+
+/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing.
+ *
+ * Every enabled channel is first written with 0.0.  A compare of r1.6
+ * (as UD) against 1 << 31 follows, after which every enabled channel is
+ * written with 1.0; the predicate control flag value is reset to 0xff at
+ * the end.
+ */
+static void emit_frontfacing(struct brw_wm_compile *c,
+			     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    struct brw_reg face_bits = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
+    int chan;
+
+    /* default result: 0.0 */
+    for (chan = 0; chan < 4; chan++) {
+        if (!(mask & (1 << chan)))
+            continue;
+        brw_MOV(p, get_dst_reg(c, inst, chan), brw_imm_f(0.0));
+    }
+
+    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
+     * us front face
+     */
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, face_bits, brw_imm_ud(1 << 31));
+
+    for (chan = 0; chan < 4; chan++) {
+        if (!(mask & (1 << chan)))
+            continue;
+        brw_MOV(p, get_dst_reg(c, inst, chan), brw_imm_f(1.0));
+    }
+
+    brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+/**
+ * Emit a MUL/MAC pair for every enabled destination channel.
+ *
+ * With next = (i + 1) % 3 and prev = (i + 2) % 3, the MUL (null
+ * destination) multiplies the negated src0[prev] by src1[next], and the
+ * MAC writes the destination from src0[next] and src1[prev] - presumably
+ * adding to the accumulated MUL result, which gives the cross product
+ * term for channel i.  Saturation follows the instruction's SaturateMode
+ * and is only enabled around the MAC.
+ */
+static void emit_xpd(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    int chan;
+
+    for (chan = 0; chan < 4; chan++) {
+        const GLuint next = (chan + 1) % 3;
+        const GLuint prev = (chan + 2) % 3;
+        struct brw_reg lhs, rhs, dst;
+
+        if (!(mask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+
+        lhs = negate(get_src_reg(c, inst, 0, prev));
+        rhs = get_src_reg_imm(c, inst, 1, next);
+        brw_MUL(p, brw_null_reg(), lhs, rhs);
+
+        lhs = get_src_reg(c, inst, 0, next);
+        rhs = get_src_reg_imm(c, inst, 1, prev);
+        brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+        brw_MAC(p, dst, lhs, rhs);
+        brw_set_saturate(p, 0);
+    }
+
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit a three-term MUL/MAC/MAC chain over channels 0..2 of source
+ * operands 0 and 1.  Only the last MAC writes a real destination: the
+ * lowest channel enabled in the write mask, which is asserted to be the
+ * only one.  Nothing is emitted when no XYZW channel is enabled.
+ * Saturation follows the instruction's SaturateMode and is applied to the
+ * final MAC only.
+ */
+static void emit_dp3(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint enabled = inst->DstReg.WriteMask & WRITEMASK_XYZW;
+    struct brw_reg a[3], b[3], dst;
+    int term, dst_chan;
+
+    if (!enabled)
+        return;
+
+    assert(is_power_of_two(enabled));
+    dst_chan = ffs(enabled) - 1;
+
+    for (term = 0; term < 3; term++) {
+        a[term] = get_src_reg(c, inst, 0, term);
+        b[term] = get_src_reg_imm(c, inst, 1, term);
+    }
+
+    dst = get_dst_reg(c, inst, dst_chan);
+
+    brw_MUL(p, brw_null_reg(), a[0], b[0]);
+    brw_MAC(p, brw_null_reg(), a[1], b[1]);
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    brw_MAC(p, dst, a[2], b[2]);
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit code for DP4: the four-component dot product of src0 and src1.
+ *
+ * Nothing is emitted when no channel of the writemask is enabled.
+ * Otherwise exactly one channel must be enabled (asserted); the scalar
+ * result is written to that channel.  Saturation, if requested, is applied
+ * to the final MAC only.
+ */
+static void emit_dp4(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask & WRITEMASK_XYZW;
+    struct brw_reg a[4], b[4], result;
+    int k;
+
+    if (writemask == 0)
+        return;
+
+    assert(is_power_of_two(writemask));
+
+    for (k = 0; k < 4; k++) {
+        a[k] = get_src_reg(c, inst, 0, k);
+        b[k] = get_src_reg_imm(c, inst, 1, k);
+    }
+
+    /* ffs() is 1-based, channel numbers are 0-based */
+    result = get_dst_reg(c, inst, ffs(writemask) - 1);
+
+    brw_MUL(p, brw_null_reg(), a[0], b[0]);
+    brw_MAC(p, brw_null_reg(), a[1], b[1]);
+    brw_MAC(p, brw_null_reg(), a[2], b[2]);
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    brw_MAC(p, result, a[3], b[3]);
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit code for DPH (homogeneous dot product):
+ *    dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z + src1.w
+ *
+ * Nothing is emitted when no channel of the writemask is enabled.
+ * Otherwise exactly one channel must be enabled (asserted).  The
+ * three-component sum is first written to dst, then src1.w is added on
+ * top; saturation, if requested, is applied to that final ADD only.
+ */
+static void emit_dph(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask & WRITEMASK_XYZW;
+    struct brw_reg a[4], b[4], result;
+    int k;
+
+    if (writemask == 0)
+        return;
+
+    assert(is_power_of_two(writemask));
+
+    /* All four components of both operands are looked up, although
+     * a[3] is not referenced by the instructions emitted below.
+     */
+    for (k = 0; k < 4; k++) {
+        a[k] = get_src_reg(c, inst, 0, k);
+        b[k] = get_src_reg_imm(c, inst, 1, k);
+    }
+
+    /* ffs() is 1-based, channel numbers are 0-based */
+    result = get_dst_reg(c, inst, ffs(writemask) - 1);
+
+    brw_MUL(p, brw_null_reg(), a[0], b[0]);
+    brw_MAC(p, brw_null_reg(), a[1], b[1]);
+    brw_MAC(p, result, a[2], b[2]);
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    brw_ADD(p, result, result, b[3]);
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit a one-operand math instruction (used for RCP, RSQ, SIN, COS, EX2
+ * and LG2).
+ *
+ * \param func  math function selector handed through to brw_math()
+ *
+ * The operand is channel 0 of source 0.  Nothing is emitted when no
+ * channel of the writemask is enabled; otherwise exactly one channel must
+ * be enabled (asserted) and the result is written to that channel.
+ */
+static void emit_math1(struct brw_wm_compile *c,
+                       const struct brw_fp_instruction *inst, GLuint func)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask & WRITEMASK_XYZW;
+    struct brw_reg result, operand;
+
+    if (writemask == 0)
+        return;
+
+    assert(is_power_of_two(writemask));
+
+    /* ffs() is 1-based, channel numbers are 0-based */
+    result = get_dst_reg(c, inst, ffs(writemask) - 1);
+    operand = get_src_reg(c, inst, 0, 0);
+
+    /* the operand travels in message register 2, which is also the
+     * msg_reg_nr argument given to brw_math() below
+     */
+    brw_MOV(p, brw_message_reg(2), operand);
+    brw_math(p,
+             result,
+             func,
+             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+             2,
+             brw_null_reg(),
+             BRW_MATH_DATA_VECTOR,
+             BRW_MATH_PRECISION_FULL);
+}
+
+/** RCP: forwards to emit_math1() with BRW_MATH_FUNCTION_INV. */
+static void emit_rcp(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
+}
+
+/** RSQ: forwards to emit_math1() with BRW_MATH_FUNCTION_RSQ. */
+static void emit_rsq(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
+}
+
+/** SIN: forwards to emit_math1() with BRW_MATH_FUNCTION_SIN. */
+static void emit_sin(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
+}
+
+/** COS: forwards to emit_math1() with BRW_MATH_FUNCTION_COS. */
+static void emit_cos(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
+}
+
+/** EX2: forwards to emit_math1() with BRW_MATH_FUNCTION_EXP. */
+static void emit_ex2(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
+}
+
+/** LG2: forwards to emit_math1() with BRW_MATH_FUNCTION_LOG. */
+static void emit_lg2(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
+}
+
+/**
+ * Emit code for ADD: dst[i] = src0[i] + src1[i] for every channel enabled
+ * in the writemask.  The saturate state is set from inst->SaturateMode
+ * around the whole sequence and cleared again afterwards.
+ */
+static void emit_add(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, lhs, rhs;
+
+        if (!(writemask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        lhs = get_src_reg(c, inst, 0, chan);
+        rhs = get_src_reg_imm(c, inst, 1, chan);
+        brw_ADD(p, dst, lhs, rhs);
+    }
+
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit code for ARL: move channel 0 of source 0 into the address
+ * register (BRW_ARF_ADDRESS, sub-register 0, addressed as a uw8 register).
+ * The saturate state is set from inst->SaturateMode for the MOV and
+ * cleared afterwards.
+ */
+static void emit_arl(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg address, value;
+
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+
+    address = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                          BRW_ARF_ADDRESS, 0);
+    value = get_src_reg(c, inst, 0, 0);
+    brw_MOV(p, address, value);
+
+    brw_set_saturate(p, 0);
+}
+
+
+
+/**
+ * Emit code for MUL: dst[i] = src0[i] * src1[i] for every channel enabled
+ * in the writemask.  The saturate state is set from inst->SaturateMode
+ * around the whole sequence and cleared again afterwards.
+ */
+static void emit_mul(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, lhs, rhs;
+
+        if (!(writemask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        lhs = get_src_reg(c, inst, 0, chan);
+        rhs = get_src_reg_imm(c, inst, 1, chan);
+        brw_MUL(p, dst, lhs, rhs);
+    }
+
+    brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit code for FRC: one brw_FRC per channel enabled in the writemask.
+ * The saturate state is set from inst->SaturateMode before the loop and,
+ * only when saturation was requested, cleared again afterwards.
+ */
+static void emit_frc(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, operand;
+
+        if (!(writemask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        operand = get_src_reg_imm(c, inst, 0, chan);
+        brw_FRC(p, dst, operand);
+    }
+
+    if (inst->SaturateMode != SATURATE_OFF)
+        brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit code for FLR: one brw_RNDD per channel enabled in the writemask.
+ * The saturate state is set from inst->SaturateMode around the whole
+ * sequence and cleared again afterwards.
+ */
+static void emit_flr(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, operand;
+
+        if (!(writemask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        operand = get_src_reg_imm(c, inst, 0, chan);
+        brw_RNDD(p, dst, operand);
+    }
+
+    brw_set_saturate(p, 0);
+}
+
+
+
+/**
+ * Emit code for MIN and MAX (selected by inst->Opcode).
+ *
+ * For every channel enabled in the writemask:
+ *    MOV dst, src0
+ *    CMP src1 < src0  (MIN)   or   CMP src1 > src0  (MAX)
+ *    MOV dst, src1            (predicated on the comparison)
+ *
+ * The first MOV would destroy an operand that the CMP and the second MOV
+ * still need whenever the destination register is also one of the
+ * sources.  In that case the result is built in a temporary and copied to
+ * the real destination at the end.
+ *
+ * Saturation, if requested, is applied to both MOVs into dst but not to
+ * the final copy out of the temporary.
+ */
+static void emit_min_max(struct brw_wm_compile *c,
+                         const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint mask = inst->DstReg.WriteMask;
+    const int mark = mark_tmps(c);
+    int i;
+    brw_push_insn_state(p);
+    for (i = 0; i < 4; i++) {
+	if (mask & (1<<i)) {
+            struct brw_reg real_dst = get_dst_reg(c, inst, i);
+	    struct brw_reg src0 = get_src_reg(c, inst, 0, i);
+	    struct brw_reg src1 = get_src_reg(c, inst, 1, i);
+            struct brw_reg dst;
+            /* If the real destination is src0 or src1 we need a temp reg.
+             * The test must be made against real_dst: dst has not been
+             * assigned yet at this point.
+             */
+            GLboolean use_temp = brw_same_reg(real_dst, src0) ||
+                                 brw_same_reg(real_dst, src1);
+            if (use_temp)
+               dst = alloc_tmp(c);
+            else
+               dst = real_dst;
+
+	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+	    brw_MOV(p, dst, src0);
+	    brw_set_saturate(p, 0);
+
+            if (inst->Opcode == OPCODE_MIN)
+               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+            else
+               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
+
+            /* overwrite with src1 only where the comparison held */
+	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	    brw_MOV(p, dst, src1);
+	    brw_set_saturate(p, 0);
+	    brw_set_predicate_control_flag_value(p, 0xff);
+            if (use_temp)
+               brw_MOV(p, real_dst, dst);
+	}
+    }
+    brw_pop_insn_state(p);
+    release_tmps(c, mark);
+}
+
+/**
+ * Emit code for POW.
+ *
+ * The operands are channel 0 of source 0 and channel 0 of source 1; they
+ * are placed in message registers 2 and 3 and brw_math() is invoked with
+ * BRW_MATH_FUNCTION_POW.  Nothing is emitted when no channel of the
+ * writemask is enabled; otherwise exactly one channel must be enabled
+ * (asserted) and the result is written to that channel.
+ */
+static void emit_pow(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask & WRITEMASK_XYZW;
+    struct brw_reg result, base, exponent;
+
+    if (writemask == 0)
+        return;
+
+    assert(is_power_of_two(writemask));
+
+    /* ffs() is 1-based, channel numbers are 0-based */
+    result = get_dst_reg(c, inst, ffs(writemask) - 1);
+    base = get_src_reg_imm(c, inst, 0, 0);
+    exponent = get_src_reg_imm(c, inst, 1, 0);
+
+    brw_MOV(p, brw_message_reg(2), base);
+    brw_MOV(p, brw_message_reg(3), exponent);
+
+    brw_math(p,
+             result,
+             BRW_MATH_FUNCTION_POW,
+             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+             2,
+             brw_null_reg(),
+             BRW_MATH_DATA_VECTOR,
+             BRW_MATH_PRECISION_FULL);
+}
+
+/**
+ * Emit code for LRP: dst = src0 * src1 + (1 - src0) * src2, per channel
+ * enabled in the writemask.
+ *
+ * dst is used as scratch for (1 - src0), so src1 and src2 are first copied
+ * to temporaries when their register number equals that of dst.
+ * Temporaries are released at the end of every loop iteration.
+ *
+ * NOTE(review): src0 is read again by the final MAC after dst has been
+ * overwritten; no copy is made when src0 shares a register with dst.
+ * Confirm whether that aliasing can occur.
+ */
+static void emit_lrp(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    const int tmp_mark = mark_tmps(c);
+    int chan;
+
+    for (chan = 0; chan < 4; chan++) {
+        if (writemask & (1 << chan)) {
+            struct brw_reg dst, weight, a, b;
+
+            dst = get_dst_reg(c, inst, chan);
+            weight = get_src_reg(c, inst, 0, chan);
+
+            a = get_src_reg_imm(c, inst, 1, chan);
+            if (a.nr == dst.nr) {
+                struct brw_reg copy = alloc_tmp(c);
+                brw_MOV(p, copy, a);
+                a = copy;
+            }
+
+            b = get_src_reg(c, inst, 2, chan);
+            if (b.nr == dst.nr) {
+                struct brw_reg copy = alloc_tmp(c);
+                brw_MOV(p, copy, b);
+                b = copy;
+            }
+
+            /* dst = 1 - weight; then (1 - weight) * b, then + weight * a */
+            brw_ADD(p, dst, negate(weight), brw_imm_f(1.0));
+            brw_MUL(p, brw_null_reg(), dst, b);
+            brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+            brw_MAC(p, dst, weight, a);
+            brw_set_saturate(p, 0);
+        }
+        release_tmps(c, tmp_mark);
+    }
+}
+
+/**
+ * Emit code for an unconditional KIL (as used by GLSL shaders; it may of
+ * course sit inside an IF/ENDIF structure).
+ *
+ * With mask control disabled, the complement of mask register 1 is written
+ * to c->emit_mask_reg and then ANDed into the first element of g0 viewed
+ * as an unsigned word.
+ * NOTE(review): presumably g0.0 carries the live-pixel mask that later
+ * stages consult -- confirm against the hardware documentation.
+ */
+static void emit_kil(struct brw_wm_compile *c)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg g0_uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+    brw_push_insn_state(p);
+    brw_set_mask_control(p, BRW_MASK_DISABLE);
+    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
+    brw_AND(p, g0_uw, c->emit_mask_reg, g0_uw);
+    brw_pop_insn_state(p);
+}
+
+/**
+ * Emit code for MAD: dst = src0 * src1 + src2, per channel enabled in the
+ * writemask.  The product is written to dst first and src2 is then added
+ * on top; saturation, if requested, is applied to the ADD only.
+ *
+ * NOTE(review): src2 is read after dst has been overwritten by the MUL;
+ * confirm that dst can never share a register with src2.
+ */
+static void emit_mad(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, factor0, factor1, addend;
+
+        if (!(writemask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        factor0 = get_src_reg(c, inst, 0, chan);
+        factor1 = get_src_reg_imm(c, inst, 1, chan);
+        addend = get_src_reg_imm(c, inst, 2, chan);
+
+        brw_MUL(p, dst, factor0, factor1);
+
+        brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+        brw_ADD(p, dst, dst, addend);
+        brw_set_saturate(p, 0);
+    }
+}
+
+/**
+ * Emit a "set on comparison" instruction (shared by SLT, SLE, SGT, SGE,
+ * SEQ and SNE): dst[i] = (src0[i] <cond> src1[i]) ? 1.0 : 0.0 for every
+ * channel enabled in the writemask.
+ *
+ * \param cond  conditional modifier handed to brw_CMP()
+ *
+ * Per channel: CMP, an unpredicated MOV of 0.0, then a predicated MOV of
+ * 1.0.  The instruction state is saved and restored around each channel.
+ */
+static void emit_sop(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst, GLuint cond)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+
+    for (chan = 0; chan < 4; chan++) {
+        struct brw_reg dst, lhs, rhs;
+
+        if (!(writemask & (1 << chan)))
+            continue;
+
+        dst = get_dst_reg(c, inst, chan);
+        lhs = get_src_reg(c, inst, 0, chan);
+        rhs = get_src_reg_imm(c, inst, 1, chan);
+
+        brw_push_insn_state(p);
+        brw_CMP(p, brw_null_reg(), cond, lhs, rhs);
+
+        /* default result, written to all channels */
+        brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+        brw_MOV(p, dst, brw_imm_f(0.0));
+
+        /* overwritten where the comparison held */
+        brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+        brw_MOV(p, dst, brw_imm_f(1.0));
+        brw_pop_insn_state(p);
+    }
+}
+
+/** SLT: forwards to emit_sop() with BRW_CONDITIONAL_L. */
+static void emit_slt(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_L);
+}
+
+/** SLE: forwards to emit_sop() with BRW_CONDITIONAL_LE. */
+static void emit_sle(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_LE);
+}
+
+/** SGT: forwards to emit_sop() with BRW_CONDITIONAL_G. */
+static void emit_sgt(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_G);
+}
+
+/** SGE: forwards to emit_sop() with BRW_CONDITIONAL_GE. */
+static void emit_sge(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_GE);
+}
+
+/** SEQ: forwards to emit_sop() with BRW_CONDITIONAL_EQ. */
+static void emit_seq(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_EQ);
+}
+
+/** SNE: forwards to emit_sop() with BRW_CONDITIONAL_NEQ. */
+static void emit_sne(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
+}
+
+/**
+ * Return \p reg retyped to BRW_REGISTER_TYPE_W, advanced by one element
+ * via suboffset(), with stride arguments (0, 8, 2).
+ * NOTE(review): presumably this addresses the upper 16-bit half of each
+ * 32-bit element -- confirm against the stride() helper's definition.
+ */
+static INLINE struct brw_reg high_words( struct brw_reg reg )
+{
+    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
+		   0, 8, 2 );
+}
+
+/**
+ * Return \p reg retyped to BRW_REGISTER_TYPE_W with stride arguments
+ * (0, 8, 2) and no sub-offset.
+ * NOTE(review): presumably this addresses the lower 16-bit half of each
+ * 32-bit element -- confirm against the stride() helper's definition.
+ */
+static INLINE struct brw_reg low_words( struct brw_reg reg )
+{
+    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
+}
+
+/**
+ * Return \p reg retyped to BRW_REGISTER_TYPE_B with stride arguments
+ * (0, 16, 2) and no sub-offset.
+ * NOTE(review): presumably this addresses every other byte starting at
+ * byte 0 -- confirm against the stride() helper's definition.
+ */
+static INLINE struct brw_reg even_bytes( struct brw_reg reg )
+{
+    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
+}
+
+/**
+ * Return \p reg retyped to BRW_REGISTER_TYPE_B, advanced by one element
+ * via suboffset(), with stride arguments (0, 16, 2).
+ * NOTE(review): presumably this addresses every other byte starting at
+ * byte 1 -- confirm against the stride() helper's definition.
+ */
+static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
+{
+    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
+		   0, 16, 2 );
+}
+
+
+    
+/**
+ * Emit code for WM_WPOSXY: compute the window position into the X and Y
+ * destination channels.
+ *
+ *    X' = X
+ *    Y' = (drawable_height - 1) - Y
+ *
+ * The source channels are read as BRW_REGISTER_TYPE_W.  Only the X and Y
+ * bits of the writemask are honoured; the destination and source
+ * registers for both channels are looked up regardless of the mask.
+ */
+static void emit_wpos_xy(struct brw_wm_compile *c,
+                         const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg out_x, out_y, in_x, in_y;
+
+    out_x = get_dst_reg(c, inst, 0);
+    out_y = get_dst_reg(c, inst, 1);
+
+    in_x = get_src_reg(c, inst, 0, 0);
+    in_y = get_src_reg(c, inst, 0, 1);
+
+    if (writemask & WRITEMASK_X) {
+        /* X passes through unchanged */
+        brw_MOV(p, out_x, retype(in_x, BRW_REGISTER_TYPE_W));
+    }
+
+    if (writemask & WRITEMASK_Y) {
+        /* Y is flipped against the drawable height */
+        brw_ADD(p,
+                out_y,
+                negate(retype(in_y, BRW_REGISTER_TYPE_W)),
+                brw_imm_d(c->key.drawable_height - 1));
+    }
+}
+
+/**
+ * Emit code for TXB (texture sample with LOD bias).
+ *
+ * Message registers are filled as follows before brw_SAMPLE() is issued
+ * with msg_reg_nr 1:
+ *    m2..m4  s, t, r coordinates (unused ones are set to 0.0)
+ *    m5      bias, taken from the W channel of source 0
+ *    m6      0.0 (reference value, marked "unused?" below)
+ *
+ * TODO: BIAS on SIMD8 not working yet...
+ * NOTE(review): msg_length is passed as 4 although registers up to m6 are
+ * written, and the non-IGDNG path selects a SIMD16 message type together
+ * with BRW_SAMPLER_SIMD_MODE_SIMD8 -- possibly related to the TODO above;
+ * confirm against the sampler message documentation.
+ *
+ * An unknown inst->tex_target aborts the process.
+ */
+static void emit_txb(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg dst[4], src[4], payload_reg;
+    /* Note: tex_unit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->tex_unit;
+    GLuint i;
+    GLuint msg_type;
+
+    assert(unit < BRW_MAX_TEX_UNIT);
+
+    payload_reg = get_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
+    /* all four destination and source channels are looked up; only dst[0]
+     * is handed to brw_SAMPLE() below
+     */
+    for (i = 0; i < 4; i++) 
+	dst[i] = get_dst_reg(c, inst, i);
+    for (i = 0; i < 4; i++)
+	src[i] = get_src_reg(c, inst, 0, i);
+
+    switch (inst->tex_target) {
+	case TEXTURE_1D_INDEX:
+	    brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
+	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
+	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
+	    break;
+	case TEXTURE_2D_INDEX:
+	case TEXTURE_RECT_INDEX:
+	    brw_MOV(p, brw_message_reg(2), src[0]);
+	    brw_MOV(p, brw_message_reg(3), src[1]);
+	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+	    break;
+	case TEXTURE_3D_INDEX:
+	case TEXTURE_CUBE_INDEX:
+	    brw_MOV(p, brw_message_reg(2), src[0]);
+	    brw_MOV(p, brw_message_reg(3), src[1]);
+	    brw_MOV(p, brw_message_reg(4), src[2]);
+	    break;
+	default:
+            /* invalid target */
+            abort();
+    }
+    brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
+    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
+
+    if (BRW_IS_IGDNG(p->brw)) {
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
+    } else {
+        /* Does it work well on SIMD8? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+    }
+
+    brw_SAMPLE(p,
+               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
+               1,                                           /* msg_reg_nr */
+               retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
+               SURF_INDEX_TEXTURE(unit),
+               unit,                                        /* sampler */
+               inst->DstReg.WriteMask,                      /* writemask */
+               msg_type,                                    /* msg_type */
+               4,                                           /* response_length */
+               4,                                           /* msg_length */
+               0,                                           /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
+}
+
+
+/**
+ * Emit code for TEX (plain texture sample, optionally a shadow compare).
+ *
+ * The number of coordinates loaded depends on inst->tex_target: one for
+ * 1D, two for 2D/RECT, three for 3D/CUBE; they go into consecutive
+ * message registers starting at m2.  For units flagged in
+ * c->key.shadowtex_mask, m5 is set to 0.0 (lod / bias) and m6 to the
+ * Z channel of source 0 (reference value), and the W destination channel
+ * is forced to 1.0 after sampling.
+ *
+ * NOTE(review): msg_length passed to brw_SAMPLE() is the literal 4 (or 6
+ * for shadow) rather than the msg_len counted while loading coordinates,
+ * so for 1D/2D targets it covers message registers this function did not
+ * write -- confirm that this is intended.
+ *
+ * An unknown inst->tex_target aborts the process.
+ */
+static void emit_tex(struct brw_wm_compile *c,
+                     const struct brw_fp_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg dst[4], src[4], payload_reg;
+    /* Note: tex_unit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->tex_unit;
+    GLuint msg_len;
+    GLuint i, nr;
+    GLuint emit;
+    GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
+    GLuint msg_type;
+
+    assert(unit < BRW_MAX_TEX_UNIT);
+
+    payload_reg = get_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
+    for (i = 0; i < 4; i++) 
+	dst[i] = get_dst_reg(c, inst, i);
+    for (i = 0; i < 4; i++)
+	src[i] = get_src_reg(c, inst, 0, i);
+
+    /* emit: which coordinates come from the source; nr: how many to load */
+    switch (inst->tex_target) {
+	case TEXTURE_1D_INDEX:
+	    emit = WRITEMASK_X;
+	    nr = 1;
+	    break;
+	case TEXTURE_2D_INDEX:
+	case TEXTURE_RECT_INDEX:
+	    emit = WRITEMASK_XY;
+	    nr = 2;
+	    break;
+	case TEXTURE_3D_INDEX:
+	case TEXTURE_CUBE_INDEX:
+	    emit = WRITEMASK_XYZ;
+	    nr = 3;
+	    break;
+	default:
+           /* invalid target */
+           abort();
+    }
+    msg_len = 1;
+
+    /* move/load S, T, R coords */
+    for (i = 0; i < nr; i++) {
+	static const GLuint swz[4] = {0,1,2,2};
+	if (emit & (1<<i))
+	    brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
+	else
+	    brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
+	msg_len += 1;
+    }
+
+    if (shadow) {
+       brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
+       brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
+    }
+
+    if (BRW_IS_IGDNG(p->brw)) {
+        if (shadow)
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
+        else
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
+    } else {
+        /* Does it work for shadow on SIMD8 ? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+    }
+    
+    brw_SAMPLE(p,
+               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
+               1,                                          /* msg_reg_nr */
+               retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
+               SURF_INDEX_TEXTURE(unit),
+               unit,                                       /* sampler */
+               inst->DstReg.WriteMask,                     /* writemask */
+               msg_type,                                   /* msg_type */
+               4,                                          /* response_length */
+               shadow ? 6 : 4,                             /* msg_length */
+               0,                                          /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
+
+    /* shadow lookups always report 1.0 in the W channel */
+    if (shadow)
+	brw_MOV(p, dst[3], brw_imm_f(1.0));
+}
+
+
+/**
+ * Resolve subroutine calls after code emit is done.
+ * Forwards to brw_resolve_cals() on the compile's instruction store
+ * (c->func).
+ */
+static void post_wm_emit( struct brw_wm_compile *c )
+{
+    brw_resolve_cals(&c->func);
+}
+
+/**
+ * Look up the registers of one source operand.
+ *
+ * \param index  which source operand of \p inst to fetch
+ * \param regs   output array of four registers, indexed by channel
+ * \param mask   bitmask of channels to fetch; entries of \p regs whose
+ *               bit is clear are left untouched
+ */
+static void
+get_argument_regs(struct brw_wm_compile *c,
+		  const struct brw_fp_instruction *inst,
+		  int index,
+		  struct brw_reg *regs,
+		  int mask)
+{
+    int chan;
+
+    for (chan = 0; chan < 4; chan++) {
+        if (!(mask & (1 << chan)))
+            continue;
+
+        regs[chan] = get_src_reg(c, inst, index, chan);
+    }
+}
+
+/**
+ * Generate native code for a fragment program that uses flow control.
+ *
+ * Walks c->fp_instructions[0 .. c->nr_fp_insns) and dispatches every
+ * instruction to its emit_*() helper.  Flow control is handled inline:
+ *
+ *  - IF/ELSE/ENDIF keep their pending instructions on if_inst[], at most
+ *    MAX_IF_DEPTH deep.
+ *  - BGNLOOP/ENDLOOP keep the DO instructions on loop_inst[], at most
+ *    MAX_LOOP_DEPTH deep.  At ENDLOOP every BREAK and CONTINUE emitted
+ *    since the matching DO gets its jump count patched.
+ *  - CAL stores a return address through the address register initialised
+ *    from c->stack and advances it by 4; RET steps it back by 4 and loads
+ *    the stored value into the instruction pointer.  Call targets are
+ *    recorded with brw_save_call() and resolved by post_wm_emit() once
+ *    all code has been emitted.
+ *
+ * Nesting depth and unbalanced ELSE/ENDIF/ENDLOOP are checked with
+ * assert() before the corresponding array is indexed.
+ *
+ * \param brw  context, consulted for the IGDNG jump-count scaling
+ * \param c    compile state; c->func receives the generated code
+ */
+static void brw_wm_emit_branching_shader(struct brw_context *brw, struct brw_wm_compile *c)
+{
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
+    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+    GLuint i, if_depth = 0, loop_depth = 0;
+    struct brw_compile *p = &c->func;
+    struct brw_indirect stack_index = brw_indirect(0, 0);
+
+    c->out_of_regs = GL_FALSE;
+
+    prealloc_reg(c);
+    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+    /* point the call-stack address register at the start of c->stack */
+    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+
+    for (i = 0; i < c->nr_fp_insns; i++) {
+        const struct brw_fp_instruction *inst = &c->fp_instructions[i];
+	int dst_flags;
+	struct brw_reg args[3][4], dst[4];
+	int j;
+
+        c->cur_inst = i;
+
+#if 0
+        debug_printf("Inst %d: ", i);
+        _mesa_print_instruction(inst);
+#endif
+
+        /* fetch any constants that this instruction needs */
+        if (c->fp->use_const_buffer)
+           fetch_constants(c, inst);
+
+	if (inst->CondUpdate)
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	else
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
+	/* only consumed by the DDX/DDY case below */
+	dst_flags = inst->DstReg.WriteMask;
+	if (inst->SaturateMode == SATURATE_ZERO_ONE)
+	    dst_flags |= SATURATE;
+
+	switch (inst->Opcode) {
+	    case WM_PIXELXY:
+		emit_pixel_xy(c, inst);
+		break;
+	    case WM_DELTAXY: 
+		emit_delta_xy(c, inst);
+		break;
+	    case WM_PIXELW:
+		emit_pixel_w(c, inst);
+		break;	
+	    case WM_LINTERP:
+		emit_linterp(c, inst);
+		break;
+	    case WM_PINTERP:
+		emit_pinterp(c, inst);
+		break;
+	    case WM_CINTERP:
+		emit_cinterp(c, inst);
+		break;
+	    case WM_WPOSXY:
+		emit_wpos_xy(c, inst);
+		break;
+	    case WM_FB_WRITE:
+		emit_fb_write(c, inst);
+		break;
+	    case WM_FRONTFACING:
+		emit_frontfacing(c, inst);
+		break;
+	    case OPCODE_ADD:
+		emit_add(c, inst);
+		break;
+	    case OPCODE_ARL:
+		emit_arl(c, inst);
+		break;
+	    case OPCODE_FRC:
+		emit_frc(c, inst);
+		break;
+	    case OPCODE_FLR:
+		emit_flr(c, inst);
+		break;
+	    case OPCODE_LRP:
+		emit_lrp(c, inst);
+		break;
+	    case OPCODE_TRUNC:
+		emit_trunc(c, inst);
+		break;
+	    case OPCODE_MOV:
+		emit_mov(c, inst);
+		break;
+	    case OPCODE_DP3:
+		emit_dp3(c, inst);
+		break;
+	    case OPCODE_DP4:
+		emit_dp4(c, inst);
+		break;
+	    case OPCODE_XPD:
+		emit_xpd(c, inst);
+		break;
+	    case OPCODE_DPH:
+		emit_dph(c, inst);
+		break;
+	    case OPCODE_RCP:
+		emit_rcp(c, inst);
+		break;
+	    case OPCODE_RSQ:
+		emit_rsq(c, inst);
+		break;
+	    case OPCODE_SIN:
+		emit_sin(c, inst);
+		break;
+	    case OPCODE_COS:
+		emit_cos(c, inst);
+		break;
+	    case OPCODE_EX2:
+		emit_ex2(c, inst);
+		break;
+	    case OPCODE_LG2:
+		emit_lg2(c, inst);
+		break;
+	    case OPCODE_MIN:	
+	    case OPCODE_MAX:	
+		emit_min_max(c, inst);
+		break;
+	    case OPCODE_DDX:
+	    case OPCODE_DDY:
+		/* channels that are not written get the null register */
+		for (j = 0; j < 4; j++) {
+		    if (inst->DstReg.WriteMask & (1 << j))
+			dst[j] = get_dst_reg(c, inst, j);
+		    else
+			dst[j] = brw_null_reg();
+		}
+		get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
+		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
+			  args[0]);
+                break;
+	    case OPCODE_SLT:
+		emit_slt(c, inst);
+		break;
+	    case OPCODE_SLE:
+		emit_sle(c, inst);
+		break;
+	    case OPCODE_SGT:
+		emit_sgt(c, inst);
+		break;
+	    case OPCODE_SGE:
+		emit_sge(c, inst);
+		break;
+	    case OPCODE_SEQ:
+		emit_seq(c, inst);
+		break;
+	    case OPCODE_SNE:
+		emit_sne(c, inst);
+		break;
+	    case OPCODE_MUL:
+		emit_mul(c, inst);
+		break;
+	    case OPCODE_POW:
+		emit_pow(c, inst);
+		break;
+	    case OPCODE_MAD:
+		emit_mad(c, inst);
+		break;
+	    case OPCODE_TEX:
+		emit_tex(c, inst);
+		break;
+	    case OPCODE_TXB:
+		emit_txb(c, inst);
+		break;
+	    case OPCODE_KIL_NV:
+		emit_kil(c);
+		break;
+	    case OPCODE_IF:
+		assert(if_depth < MAX_IF_DEPTH);
+		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
+		break;
+	    case OPCODE_ELSE:
+		/* an ELSE without an open IF would index if_inst[-1] */
+		assert(if_depth > 0);
+		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
+		break;
+	    case OPCODE_ENDIF:
+		assert(if_depth > 0);
+		brw_ENDIF(p, if_inst[--if_depth]);
+		break;
+	    case OPCODE_BGNSUB:
+		brw_save_label(p, inst->Comment, p->nr_insn);
+		break;
+	    case OPCODE_ENDSUB:
+		/* no-op */
+		break;
+	    case OPCODE_CAL: 
+		brw_push_insn_state(p);
+		brw_set_mask_control(p, BRW_MASK_DISABLE);
+                brw_set_access_mode(p, BRW_ALIGN_1);
+                /* store ip + 3*16 in the current stack slot */
+                brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+                brw_set_access_mode(p, BRW_ALIGN_16);
+                /* advance the stack pointer by one 4-byte slot */
+                brw_ADD(p, get_addr_reg(stack_index),
+                         get_addr_reg(stack_index), brw_imm_d(4));
+		/* record the jump emitted next so it can be resolved later */
+		brw_save_call(&c->func, inst->label, p->nr_insn);
+                brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+                brw_pop_insn_state(p);
+		break;
+
+	    case OPCODE_RET:
+		brw_push_insn_state(p);
+		brw_set_mask_control(p, BRW_MASK_DISABLE);
+                /* step the stack pointer back, then load ip from the slot */
+                brw_ADD(p, get_addr_reg(stack_index),
+                        get_addr_reg(stack_index), brw_imm_d(-4));
+                brw_set_access_mode(p, BRW_ALIGN_1);
+                brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
+                brw_set_access_mode(p, BRW_ALIGN_16);
+		brw_pop_insn_state(p);
+
+		break;
+	    case OPCODE_BGNLOOP:
+                /* XXX may need to invalidate the current_constant regs */
+		/* deeper nesting would write past the end of loop_inst[] */
+		assert(loop_depth < MAX_LOOP_DEPTH);
+		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+		break;
+	    case OPCODE_BRK:
+		brw_BREAK(p);
+		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+		break;
+	    case OPCODE_CONT:
+		brw_CONT(p);
+		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+		break;
+	    case OPCODE_ENDLOOP: 
+               {
+                  struct brw_instruction *inst0, *inst1;
+                  GLuint br = 1;
+
+                  /* jump counts are scaled by two on IGDNG */
+                  if (BRW_IS_IGDNG(brw))
+                     br = 2;
+ 
+                  /* an ENDLOOP without an open loop would underflow */
+                  assert(loop_depth > 0);
+                  loop_depth--;
+                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+                  /* patch all the BREAK/CONT instructions from last BGNLOOP */
+                  while (inst0 > loop_inst[loop_depth]) {
+                     inst0--;
+                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+			inst0->bits3.if_else.pop_count = 0;
+                     }
+                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+                        inst0->bits3.if_else.pop_count = 0;
+                     }
+                  }
+               }
+               break;
+	    default:
+		debug_printf("unsupported IR in fragment shader %d\n",
+			inst->Opcode);
+		break;
+	}
+
+	/* predicate state carried into the next instruction */
+	if (inst->CondUpdate)
+	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	else
+	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    }
+    post_wm_emit(c);
+
+    if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("wm-native:\n");
+      brw_disasm(stderr, p->store, p->nr_insn);
+    }
+}
+
+/**
+ * Do GPU code generation for shaders that use GLSL features such as
+ * flow control.
+ * NOTE(review): the original sentence "Other shaders will be compiled
+ * with the" was cut off here; presumably it referred to the
+ * non-branching code generator -- confirm and complete.
+ *
+ * Runs brw_wm_pass_fp() followed by brw_wm_emit_branching_shader(), then
+ * records the number of GRF registers used in c->prog_data.total_grf and
+ * sets c->prog_data.total_scratch to 0.
+ */
+void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+    if (BRW_DEBUG & DEBUG_WM) {
+       debug_printf("%s:\n", __FUNCTION__);
+    }
+
+    /* initial instruction translation/simplification */
+    brw_wm_pass_fp(c);
+
+    /* actual code generation */
+    brw_wm_emit_branching_shader(brw, c);
+
+    if (BRW_DEBUG & DEBUG_WM) {
+        brw_wm_print_program(c, "brw_wm_branching_shader_emit done");
+    }
+
+    c->prog_data.total_grf = num_grf_used(c);
+    c->prog_data.total_scratch = 0;
+}
diff --git a/src/gallium/drivers/i965/brw_wm_iz.c b/src/gallium/drivers/i965/brw_wm_iz.c
new file mode 100644
index 0000000000..6f1e9fcc3c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_iz.c
@@ -0,0 +1,156 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+                
+
+#include "brw_wm.h"
+
+
+/* Short names for the 'mode' column of wm_iz_table[] below. */
+#undef P			/* promoted depth */
+#undef C			/* computed */
+#undef N			/* non-promoted? */
+
+#define P 0
+#define C 1
+#define N 2
+
+/**
+ * Depth/stencil payload configuration, indexed by the bitmask of IZ_*
+ * flags that brw_wm_lookup_iz() receives as 'lookup' (IZ_BIT_MAX
+ * entries).
+ */
+const struct {
+   GLuint mode:2;		/* P, C or N; not read by brw_wm_lookup_iz() */
+   GLuint sd_present:1;		/* reserve key->source_depth_reg */
+   GLuint sd_to_rt:1;		/* set key->source_depth_to_render_target */
+   GLuint dd_present:1;		/* reserve key->dest_depth_reg */
+   GLuint ds_present:1;		/* reserve key->aa_dest_stencil_reg */
+} wm_iz_table[IZ_BIT_MAX] =
+{
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { N, 0, 1, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { C, 0, 1, 1, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 1 }, 
+ { N, 0, 1, 0, 1 }, 
+ { N, 0, 1, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { N, 1, 1, 0, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 0, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 0, 1, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 1, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { C, 0, 1, 0, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { P, 0, 0, 0, 0 }, 
+ { C, 1, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 }, 
+ { C, 0, 1, 1, 1 } 
+};
+
+/**
+ * Fill in the depth/stencil related fields of a WM program key.
+ *
+ * Register numbers are handed out sequentially starting at 2: source
+ * depth (two registers), AA dest stencil (one register), dest depth (two
+ * registers), each only when required.  key->nr_depth_regs is then set to
+ * (next free register + 1) / 2.
+ *
+ * \param line_aa        AA_NEVER, AA_ALWAYS or AA_SOMETIMES
+ * \param lookup         bitmask of IZ_* flags, index into wm_iz_table[]
+ * \param ps_uses_depth  forces a source depth register even when the
+ *                       table entry does not ask for one
+ * \param key            program key to update
+ */
+void brw_wm_lookup_iz( GLuint line_aa,
+		       GLuint lookup,
+		       GLboolean ps_uses_depth,
+		       struct brw_wm_prog_key *key )
+{
+   GLuint next_reg = 2;
+   GLuint src_depth, src_depth_to_rt, dst_depth, dst_stencil;
+
+   assert (lookup < IZ_BIT_MAX);
+
+   /* table lookups happen only after the index has been validated */
+   src_depth       = wm_iz_table[lookup].sd_present;
+   src_depth_to_rt = wm_iz_table[lookup].sd_to_rt;
+   dst_depth       = wm_iz_table[lookup].dd_present;
+   dst_stencil     = wm_iz_table[lookup].ds_present;
+
+   if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
+      key->computes_depth = 1;
+
+   if (src_depth || ps_uses_depth) {
+      key->source_depth_reg = next_reg;
+      next_reg += 2;
+   }
+
+   if (src_depth_to_rt)
+      key->source_depth_to_render_target = 1;
+
+   if (dst_stencil || line_aa != AA_NEVER) {
+      key->aa_dest_stencil_reg = next_reg;
+      /* a runtime check is needed only when the register was reserved
+       * for line AA alone and line AA is not always on
+       */
+      key->runtime_check_aads_emit = (!dst_stencil &&
+				      line_aa == AA_SOMETIMES);
+      next_reg++;
+   }
+
+   if (dst_depth) {
+      key->dest_depth_reg = next_reg;
+      next_reg += 2;
+   }
+
+   key->nr_depth_regs = (next_reg + 1) / 2;
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
new file mode 100644
index 0000000000..0bacad2b0f
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -0,0 +1,366 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "brw_debug.h"
+#include "brw_wm.h"
+
+
+
+/***********************************************************************
+ */
+/** Return the next unused slot of c->refs and bump c->nr_refs. */
+static struct brw_wm_ref *get_ref( struct brw_wm_compile *c )
+{
+   assert(c->nr_refs < BRW_WM_MAX_REF);   /* ref pool must not be exhausted */
+   return c->refs + c->nr_refs++;
+}
+/** Return the next unused slot of c->vreg and bump c->nr_vreg. */
+static struct brw_wm_value *get_value( struct brw_wm_compile *c)
+{
+   assert(c->nr_vreg < BRW_WM_MAX_VREG);   /* guard the counter that is indexed */
+   return &c->vreg[c->nr_vreg++];
+}
+
+/** Return a pointer to the next unused slot of c->instruction. */
+static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
+{
+   assert(c->nr_insns < BRW_WM_MAX_INSN);   /* instruction pool exhausted? */
+   return c->instruction + c->nr_insns++;
+}
+
+/***********************************************************************
+ */
+
+/** Set up c->undef_ref, the ref handed out for registers with no known value. */
+static void pass0_init_undef( struct brw_wm_compile *c)
+{
+   /* It wraps the shared undef value and is not on any use chain. */
+   c->undef_ref.value = &c->undef_value;
+   c->undef_ref.hw_reg = brw_vec8_grf(0, 0);
+   c->undef_ref.insn = 0;
+   c->undef_ref.prevuse = NULL;
+}
+
+/** Bind one component of a FP register to \p value: a fresh ref wrapping
+ *  the value is allocated and stored in the pass0_fp_reg table. */
+static void pass0_set_fpreg_value( struct brw_wm_compile *c,
+                                   GLuint file, GLuint idx, GLuint component,
+                                   struct brw_wm_value *value )
+{
+   struct brw_wm_ref *fresh = get_ref(c);
+
+   c->pass0_fp_reg[file][idx][component] = fresh;
+   fresh->value = value;
+   fresh->hw_reg = brw_vec8_grf(0, 0);
+   fresh->insn = 0;
+   fresh->prevuse = NULL;
+}
+
+/** Point one component of a FP register at an existing ref.  Nothing is
+ *  allocated or copied; the table entry simply aliases \p src_ref. */
+static void pass0_set_fpreg_ref( struct brw_wm_compile *c,
+                                 GLuint file, GLuint idx,
+                                 GLuint component,
+                                 const struct brw_wm_ref *src_ref )
+{
+   c->pass0_fp_reg[file][idx][component] = src_ref;
+}
+
+/** Build a ref for scalar parameter (idx, component).  Returns NULL and
+ *  sets c->prog_data.error when the slot is >= BRW_WM_MAX_PARAM. */
+static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
+                                               unsigned idx,
+                                               unsigned component)
+{
+   const GLuint slot = idx * 4 + component;   /* four components per index */
+   struct brw_wm_ref *param;
+
+   if (slot >= BRW_WM_MAX_PARAM) {
+      debug_printf("%s: out of params\n", __FUNCTION__);
+      c->prog_data.error = 1;
+      return NULL;
+   }
+
+   /* Each creg value covers 16 slots; grow the count to include this one. */
+   c->nr_creg = MAX2(c->nr_creg, slot / 16 + 1);
+
+   /* hw_reg carries only offsets for now (0 or 1, and slot % 8); they are
+    * added to the real register number once pass2 has allocated one. */
+   param = get_ref(c);
+   param->value = &c->creg[slot / 16];
+   param->hw_reg = brw_vec1_grf((slot & 8) ? 1 : 0, slot % 8);
+   param->insn = 0;
+   param->prevuse = NULL;
+   return param;
+}
+
+
+
+
+/* Look up the ref currently bound to one component of a FP register.
+ * Constants and immediates get their ref created on first use and the
+ * result is remembered in the table.  Never returns NULL: a register
+ * that cannot be resolved yields &c->undef_ref.
+ */
+static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
+                                               GLuint file,
+                                               GLuint idx,
+                                               GLuint component )
+{
+   const struct brw_wm_ref *found = c->pass0_fp_reg[file][idx][component];
+
+   if (found)
+      return found;
+
+   switch (file) {
+   case TGSI_FILE_CONSTANT:
+      /* constants are numbered after the immediates */
+      found = get_param_ref(c, c->fp->info.immediate_count + idx, component);
+      break;
+
+   case TGSI_FILE_IMMEDIATE:
+      /* immediates occupy the first parameter slots */
+      found = get_param_ref(c, idx, component);
+      break;
+
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_OUTPUT:
+   case BRW_FILE_PAYLOAD:
+      /* should already be done?? */
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   /* Remember the outcome, even if it is still NULL. */
+   c->pass0_fp_reg[file][idx][component] = found;
+
+   return found ? found : &c->undef_ref;
+}
+
+
+
+/***********************************************************************
+ * Straight translation to internal instruction format
+ */
+
+/* Give every channel enabled in writemask a fresh value, and bind the
+ * matching channel of the destination register to it for later reads. */
+static void pass0_set_dst( struct brw_wm_compile *c,
+                           struct brw_wm_instruction *out,
+                           const struct brw_fp_instruction *inst,
+                           GLuint writemask )
+{
+   GLuint chan;
+   out->writemask = writemask;
+   for (chan = 0; chan < 4; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      out->dst[chan] = get_value(c);
+      pass0_set_fpreg_value(c, inst->dst.file, inst->dst.index,
+                            chan, out->dst[chan]);
+   }
+}
+
+/* Resolve channel i of a source operand, through its swizzle, to a ref. */
+static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
+                                                    struct brw_fp_src src, GLuint i )
+{
+   const GLuint component = BRW_GET_SWZ(src.swizzle, i);
+   return pass0_get_reg(c, src.file, src.index, component);
+}
+
+/* Make a new ref to channel i of src, copying value and hw_reg from the
+ * register's current ref and applying the operand's negate/abs modifiers.
+ * A non-NULL insn also links the ref at the head of the value's use chain. */
+static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
+                                       struct brw_fp_src src,
+                                       GLuint i,
+                                       struct brw_wm_instruction *insn)
+{
+   const struct brw_wm_ref *cur = get_fp_src_reg_ref(c, src, i);
+   struct brw_wm_ref *use = get_ref(c);
+
+   use->value = cur->value;
+   use->hw_reg = cur->hw_reg;
+
+   if (src.abs) {                      /* abs wins over any negation */
+      use->hw_reg.negate = 0;
+      use->hw_reg.abs = 1;
+   }
+   else if (src.negate)
+      use->hw_reg.negate ^= 1;
+
+   if (insn) {
+      use->insn = insn - c->instruction;
+      use->prevuse = use->value->lastuse;
+      use->value->lastuse = use;
+   }
+   return use;
+}
+
+
+/* Append one internal instruction translated from fp instruction inst:
+ * header fields are copied, each source channel gets its own ref linked to
+ * the new instruction, and the written channels get fresh values. */
+static void
+translate_insn(struct brw_wm_compile *c,
+               const struct brw_fp_instruction *inst)
+{
+   struct brw_wm_instruction *out = get_instruction(c);
+   const GLuint nr_args = brw_wm_nr_args(inst->opcode);
+   GLuint arg, chan;
+
+   /* Header fields taken over unchanged. */
+   out->opcode = inst->opcode;
+   out->saturate = inst->dst.saturate;
+   out->tex_unit = inst->tex_unit;
+   out->target = inst->target;
+
+   /* Nasty hack: the eot flag is set for an FB write whose tex_unit
+    * field is non-zero.
+    */
+   out->eot = 0;
+   if (inst->opcode == WM_FB_WRITE && inst->tex_unit != 0)
+      out->eot = 1;
+
+   /* Sources: four channel refs per argument. */
+   for (arg = 0; arg < nr_args; arg++)
+      for (chan = 0; chan < 4; chan++)
+         out->src[arg][chan] = get_new_ref(c, inst->src[arg], chan, out);
+
+   /* Destination: done last, so a source naming the same register has
+    * already been resolved against the previous binding.
+    */
+   pass0_set_dst(c, out, inst, inst->dst.writemask);
+}
+
+
+
+/***********************************************************************
+ * Optimize moves and swizzles away: no instruction is emitted for the MOV,
+ * the destination's table entries are simply redirected to refs of the source.
+ */
+static void pass0_precalc_mov( struct brw_wm_compile *c,
+                               const struct brw_fp_instruction *inst )
+{
+   const GLuint writemask = inst->dst.writemask;
+   struct brw_wm_ref *swizzled[4];
+   GLuint chan;
+
+   /* Two phases: first resolve all four source channels, only then
+    * rebind the destination.  Doing both in a single loop would break
+    * "in-place" swizzles such as:
+    *   MOV t, t.xxyx
+    * because a later channel would read an entry already overwritten.
+    * Refs are made for all four channels even if some are masked out.
+    */
+   for (chan = 0; chan < 4; chan++)
+      swizzled[chan] = get_new_ref(c, inst->src[0], chan, NULL);
+
+   for (chan = 0; chan < 4; chan++) {
+      if (writemask & (1 << chan))
+         pass0_set_fpreg_ref(c, inst->dst.file, inst->dst.index,
+                             chan, swizzled[chan]);
+   }
+}
+
+
+/* Bind the payload "registers" to their values: four depth channels under
+ * PAYLOAD_DEPTH, then channel 0 of register n for each interpolated input. */
+static void pass0_init_payload( struct brw_wm_compile *c )
+{
+   GLuint n;
+
+   for (n = 0; n < 4; n++) {
+      /* channels at or past nr_depth_regs fall back to depth[0] */
+      const GLuint src = (n < c->key.nr_depth_regs) ? n : 0;
+      pass0_set_fpreg_value(c, BRW_FILE_PAYLOAD, PAYLOAD_DEPTH, n,
+                            &c->payload.depth[src]);
+   }
+   for (n = 0; n < c->key.nr_inputs; n++)
+      pass0_set_fpreg_value(c, BRW_FILE_PAYLOAD, n, 0,
+                            &c->payload.input_interp[n]);
+}
+
+
+/***********************************************************************
+ * PASS 0
+ *
+ * Walk the fp instructions front to back and translate each one from
+ * its brw_fp_instruction struct into our internal brw_wm_instruction
+ * representation, handing out a fresh value for every channel written.
+ *
+ * Swizzles are translated away while doing so, and non-saturating MOVs
+ * are eliminated by rebinding registers instead of emitting code.
+ *
+ * The vreg and instruction counters are restarted from zero here.
+ */
+void brw_wm_pass0( struct brw_wm_compile *c )
+{
+   GLuint n;
+
+   /* Start with empty value and instruction pools. */
+   c->nr_insns = 0;
+   c->nr_vreg = 0;
+
+   /* Seed the register table: undef ref first, then the payload. */
+   pass0_init_undef(c);
+   pass0_init_payload(c);
+
+   for (n = 0; n < c->nr_fp_insns; n++) {
+      const struct brw_fp_instruction *inst = &c->fp_instructions[n];
+      const GLboolean plain_mov = (inst->opcode == TGSI_OPCODE_MOV &&
+                                   !inst->dst.saturate);
+
+      /* A MOV without saturate needs no instruction of its own; a
+       * saturating MOV does real work and is translated like any
+       * other opcode.
+       */
+      if (plain_mov) {
+         pass0_precalc_mov(c, inst);
+      }
+      else {
+         translate_insn(c, inst);
+      }
+   }
+
+   /* Optional dump of the freshly translated program. */
+   if (BRW_DEBUG & DEBUG_WM) {
+      brw_wm_print_program(c, "pass0");
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
new file mode 100644
index 0000000000..005747f00b
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -0,0 +1,292 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+                  
+
+#include "brw_wm.h"
+#include "brw_debug.h"
+
+/* Drop from inst every written channel whose value does not contribute to
+ * the output (clearing its writemask bit and dst slot); return what is left. */
+static GLuint get_tracked_mask(struct brw_wm_compile *c,
+                               struct brw_wm_instruction *inst)
+{
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      const GLuint bit = 1 << chan;
+      if ((inst->writemask & bit) && !inst->dst[chan]->contributes_to_output) {
+         inst->dst[chan] = NULL;
+         inst->writemask &= ~bit;
+      }
+   }
+   return inst->writemask;
+}
+
+/* Take ref out of the singly linked use chain of its value.  The chain
+ * starts at value->lastuse and continues through prevuse; ref must be on it.
+ */
+static void unlink_ref(struct brw_wm_ref *ref)
+{
+   /* link is whichever pointer currently points at ref: the chain head
+    * or the prevuse field of the ref linked in front of it.
+    */
+   struct brw_wm_ref **link = &ref->value->lastuse;
+
+   while (*link != ref)
+      link = &(*link)->prevuse;
+
+   *link = ref->prevuse;
+}
+
+/* For each channel of source arg: if it is in readmask, flag its value as
+ * contributing to the output; otherwise unlink the ref and drop it. */
+static void track_arg(struct brw_wm_compile *c,
+                      struct brw_wm_instruction *inst,
+                      GLuint arg, GLuint readmask)
+{
+   GLuint chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      struct brw_wm_ref *ref = inst->src[arg][chan];
+      if (!ref)
+         continue;              /* no ref in this channel */
+      if (readmask & (1 << chan))
+         ref->value->contributes_to_output = 1;
+      else {
+         unlink_ref(ref);
+         inst->src[arg][chan] = NULL;
+      }
+   }
+}
+
+/* Which coordinate channels a texture lookup reads for the given TGSI
+ * texture target.  Unknown targets assert and yield an empty mask. */
+static GLuint get_texcoord_mask( GLuint tex_idx )
+{
+   switch (tex_idx) {
+   case TGSI_TEXTURE_1D:
+      return BRW_WRITEMASK_X;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      return BRW_WRITEMASK_XY;
+
+   case TGSI_TEXTURE_SHADOW1D:
+      return BRW_WRITEMASK_XZ;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      return BRW_WRITEMASK_XYZ;
+   }
+   assert(0);
+   return 0;
+}
+
+
+/* Step two: Basically this is dead code elimination.
+ * Iterate backwards over instructions, noting which values
+ * contribute to the final result.  Adjust writemasks to only
+ * calculate these values; source channels an instruction does not
+ * read are unlinked from their value's use chain by track_arg().
+ */
+void brw_wm_pass1( struct brw_wm_compile *c )
+{
+   GLint insn;
+
+   for (insn = c->nr_insns-1; insn >= 0; insn--) {
+      struct brw_wm_instruction *inst = &c->instruction[insn];
+      GLuint writemask;
+      GLuint read0, read1, read2;
+
+      if (inst->opcode == TGSI_OPCODE_KIL) {
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); /* All four channels of arg 0 contribute to final */
+	 continue;
+      }
+
+      if (inst->opcode == WM_FB_WRITE) {
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); 
+	 track_arg(c, inst, 1, BRW_WRITEMASK_XYZW); 
+	 if (c->key.source_depth_to_render_target &&
+	     c->key.computes_depth)
+	    track_arg(c, inst, 2, BRW_WRITEMASK_Z); /* only .z of arg 2 is kept */
+	 else
+	    track_arg(c, inst, 2, 0); /* arg 2 is dropped entirely */
+	 continue;
+      }
+
+      /* Lookup all the registers which were written by this
+       * instruction and get a mask of those that contribute to the output
+       * (this also trims inst->writemask and inst->dst accordingly):
+       */
+      writemask = get_tracked_mask(c, inst);
+      if (!writemask) { /* no result is needed: release every source ref */
+	 GLuint arg;
+	 for (arg = 0; arg < 3; arg++)
+	    track_arg(c, inst, arg, 0);
+	 continue;
+      }
+
+      read0 = 0;
+      read1 = 0;
+      read2 = 0;
+
+      /* Mark all inputs which contribute to the marked outputs;
+       * readN is the set of channels read from argument N: */
+      switch (inst->opcode) {
+      case TGSI_OPCODE_ABS: /* one source, read in the channels written */
+      case TGSI_OPCODE_FLR:
+      case TGSI_OPCODE_FRC:
+      case TGSI_OPCODE_MOV:
+      case TGSI_OPCODE_TRUNC:
+	 read0 = writemask;
+	 break;
+
+      case TGSI_OPCODE_SUB: /* two sources, read in the channels written */
+      case TGSI_OPCODE_SLT:
+      case TGSI_OPCODE_SLE:
+      case TGSI_OPCODE_SGE:
+      case TGSI_OPCODE_SGT:
+      case TGSI_OPCODE_SEQ:
+      case TGSI_OPCODE_SNE:
+      case TGSI_OPCODE_ADD:
+      case TGSI_OPCODE_MAX:
+      case TGSI_OPCODE_MIN:
+      case TGSI_OPCODE_MUL:
+	 read0 = writemask;
+	 read1 = writemask;
+	 break;
+
+      case TGSI_OPCODE_DDX:
+      case TGSI_OPCODE_DDY:
+	 read0 = writemask;
+	 break;
+
+      case TGSI_OPCODE_MAD:	/* three sources, read in the channels written */
+      case TGSI_OPCODE_CMP:
+      case TGSI_OPCODE_LRP:
+	 read0 = writemask;
+	 read1 = writemask;	
+	 read2 = writemask;	
+	 break;
+
+      case TGSI_OPCODE_XPD: /* each written channel reads the other two of X/Y/Z, from both sources */
+	 if (writemask & BRW_WRITEMASK_X) read0 |= BRW_WRITEMASK_YZ;	 
+	 if (writemask & BRW_WRITEMASK_Y) read0 |= BRW_WRITEMASK_XZ;	 
+	 if (writemask & BRW_WRITEMASK_Z) read0 |= BRW_WRITEMASK_XY;
+	 read1 = read0;
+	 break;
+
+      case TGSI_OPCODE_COS: /* only .x of arg 0 is read, whatever is written */
+      case TGSI_OPCODE_EX2:
+      case TGSI_OPCODE_LG2:
+      case TGSI_OPCODE_RCP:
+      case TGSI_OPCODE_RSQ:
+      case TGSI_OPCODE_SIN:
+      case TGSI_OPCODE_SCS:
+      case WM_CINTERP:
+      case WM_PIXELXY:
+	 read0 = BRW_WRITEMASK_X;
+	 break;
+
+      case TGSI_OPCODE_POW:
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_X;
+	 break;
+
+      case TGSI_OPCODE_TEX: /* coordinate channels depend on the texture target */
+      case TGSI_OPCODE_TXP:
+	 read0 = get_texcoord_mask(inst->target);
+	 break;
+
+      case TGSI_OPCODE_TXB: /* as TEX, plus .w */
+	 read0 = get_texcoord_mask(inst->target) | BRW_WRITEMASK_W;
+	 break;
+
+      case WM_WPOSXY:
+	 read0 = writemask & BRW_WRITEMASK_XY;
+	 break;
+
+      case WM_DELTAXY:
+	 read0 = writemask & BRW_WRITEMASK_XY;
+	 read1 = BRW_WRITEMASK_X;
+	 break;
+
+      case WM_PIXELW:
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
+	 break;
+
+      case WM_LINTERP:
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
+	 break;
+
+      case WM_PINTERP:
+	 read0 = BRW_WRITEMASK_X; /* interpolant */
+	 read1 = BRW_WRITEMASK_XY; /* deltas */
+	 read2 = BRW_WRITEMASK_W; /* pixel w */
+	 break;
+
+      case TGSI_OPCODE_DP3:	/* dot products read fixed channel sets */
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZ;
+	 break;
+
+      case TGSI_OPCODE_DPH:
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZW;
+	 break;
+
+      case TGSI_OPCODE_DP4:
+	 read0 = BRW_WRITEMASK_XYZW;
+	 read1 = BRW_WRITEMASK_XYZW;
+	 break;
+
+      case TGSI_OPCODE_LIT: 
+	 read0 = BRW_WRITEMASK_XYW;
+	 break;
+
+      case TGSI_OPCODE_DST: /* these, and any opcode not listed, leave all read masks empty */
+      case WM_FRONTFACING:
+      case TGSI_OPCODE_KILP:
+      default:
+	 break;
+      }
+
+      track_arg(c, inst, 0, read0);
+      track_arg(c, inst, 1, read1);
+      track_arg(c, inst, 2, read2);
+   }
+
+   if (BRW_DEBUG & DEBUG_WM) {
+      brw_wm_print_program(c, "pass1");
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c
new file mode 100644
index 0000000000..19248b4519
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_pass2.c
@@ -0,0 +1,334 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+                   
+
+#include "brw_debug.h"
+#include "brw_wm.h"
+
+
+/* Use these to force spilling so that that functionality can be
+ * tested with known-good examples rather than having to construct new
+ * tests.
+ */
+#define TEST_PAYLOAD_SPILLS 0
+#define TEST_DST_SPILLS 0
+
+static void spill_value(struct brw_wm_compile *c,
+			struct brw_wm_value *value);
+
+/* Pin a payload value into register slot reg before allocation starts.
+ * Values that are never used (no lastuse) are skipped, leaving the slot free. */
+static void prealloc_reg(struct brw_wm_compile *c,
+                         struct brw_wm_value *value,
+                         GLuint reg)
+{
+   struct brw_wm_grf *slot;
+
+   if (!value->lastuse)
+      return;
+   slot = &c->pass2_grf[reg];
+   slot->value = value;
+   slot->nextuse = 0;   /* placeholder, corrected by update_register_usage() */
+   value->resident = slot;
+   value->hw_reg = brw_vec8_grf(reg * 2, 0);
+   if (TEST_PAYLOAD_SPILLS)
+      spill_value(c, value);
+}
+
+
+/* Reset the register file and pre-assign the incoming payload.  Slots are
+ * handed out in order: depth payload, constants (creg), one skipped slot,
+ * then the interpolated inputs.  Also derives the prog_data read lengths.
+ */
+static void init_registers( struct brw_wm_compile *c )
+{
+   GLuint next = 0;
+   GLuint n;
+
+   /* Every slot starts out with the "very distant" next use of a free reg. */
+   for (n = 0; n < c->grf_limit; n++)
+      c->pass2_grf[n].nextuse = BRW_WM_MAX_INSN;
+
+   for (n = 0; n < c->key.nr_depth_regs; n++, next++)
+      prealloc_reg(c, &c->payload.depth[n], next);
+
+   for (n = 0; n < c->nr_creg; n++, next++)
+      prealloc_reg(c, &c->creg[n], next);
+
+   next++;                      /* XXX: skip over position output */
+
+   /* XXX: currently just hope the VS outputs line up with FS inputs:
+    */
+   for (n = 0; n < c->key.nr_inputs; n++, next++)
+      prealloc_reg(c, &c->payload.input_interp[n], next);
+
+   /* Slot counts are doubled here, as each slot maps to hw reg 2*slot. */
+   c->prog_data.first_curbe_grf = 2 * c->key.nr_depth_regs;
+   c->prog_data.curb_read_length = 2 * c->nr_creg;
+   c->prog_data.urb_read_length = 2 * (c->key.nr_inputs + 1);
+
+   /* High-water mark of hardware registers in use so far. */
+   c->max_wm_grf = 2 * next;
+}
+
+
+/* Refresh the nextuse field of every register slot for instruction
+ * thisinsn.  Slots whose value has no use at or after thisinsn are freed.
+ * Slot 0 is never touched. */
+static void update_register_usage(struct brw_wm_compile *c,
+                                  GLuint thisinsn)
+{
+   GLuint n;
+
+   for (n = 1; n < c->grf_limit; n++) {
+      struct brw_wm_grf *slot = &c->pass2_grf[n];
+      const struct brw_wm_ref *use;
+
+      /* A nextuse at or beyond thisinsn is still accurate; free slots
+       * carry BRW_WM_MAX_INSN and so are skipped too.
+       */
+      if (slot->nextuse >= thisinsn)
+         continue;
+
+      use = slot->value->lastuse;
+      if (use->insn < thisinsn) {
+         /* Even the last use is behind us: release the slot. */
+         slot->value->resident = 0;
+         slot->value = 0;
+         slot->nextuse = BRW_WM_MAX_INSN;
+         continue;
+      }
+
+      /* Walk back from the last use to the earliest one not before thisinsn. */
+      while (use->prevuse && use->prevuse->insn >= thisinsn)
+         use = use->prevuse;
+      slot->nextuse = use->insn;
+   }
+}
+
+
+/* Evict value from its register: give it a scratch slot (once) and mark
+ * the register it occupied as free.  value must currently be resident. */
+static void spill_value(struct brw_wm_compile *c,
+                        struct brw_wm_value *value)
+{
+   struct brw_wm_grf *home = value->resident;
+
+   /* Scratch slots are 0x40 apart and start at 0x40: offset zero is
+    * reserved to mean "undef" in brw_wm_emit.c.  A value spilled before
+    * keeps its slot. */
+   if (value->spill_slot == 0)
+      value->spill_slot = (c->last_scratch += 0x40);
+
+   /* No store is emitted here; brw_wm_emit.c spills right after the value
+    * is calculated, so the register can simply be taken. */
+   value->resident = NULL;
+   home->nextuse = BRW_WM_MAX_INSN;
+   home->value = NULL;
+}
+
+
+
+/* Search for the contiguous run of nr register slots whose nearest
+ * next use is the most distant.  Free regs count as very distant.
+ * Values still living in the chosen run are spilled; the index of the
+ * run's first slot is returned.
+ *
+ * TODO: implement spill-to-reg so that we can rearrange discontigous
+ * free regs and then spill the oldest non-free regs in sequence.
+ * This would mean inserting instructions in this pass.
+ */
+static GLuint search_contiguous_regs(struct brw_wm_compile *c,
+                                     GLuint nr,
+                                     GLuint thisinsn)
+{
+   struct brw_wm_grf *grf = c->pass2_grf;
+   GLuint furthest = 0;
+   GLuint reg = 0;
+   GLuint i, j;
+
+   /* Start search at 1: r0 is special and can't be used or spilled.
+    * Only start positions whose whole run lies below grf_limit are
+    * considered, so grf[i+j] never reads past the register file.
+    */
+   for (i = 1; i + nr <= c->grf_limit && furthest < BRW_WM_MAX_INSN; i++) {
+      GLuint group_nextuse = BRW_WM_MAX_INSN;
+
+      for (j = 0; j < nr; j++) {
+         if (grf[i+j].nextuse < group_nextuse)
+            group_nextuse = grf[i+j].nextuse;
+      }
+      if (group_nextuse > furthest) {
+         furthest = group_nextuse;
+         reg = i;
+      }
+   }
+   assert(furthest != thisinsn);
+
+   /* Any non-empty regs will need to be spilled: */
+   for (j = 0; j < nr; j++)
+      if (grf[reg+j].value)
+         spill_value(c, grf[reg+j].value);
+   return reg;
+}
+
+
+/* Place nr destination values in the consecutive register slots picked by
+ * search_contiguous_regs(); NULL dst entries get an untracked dummy value. */
+static void alloc_contiguous_dest(struct brw_wm_compile *c,
+                                  struct brw_wm_value *dst[],
+                                  GLuint nr,
+                                  GLuint thisinsn)
+{
+   GLuint reg = search_contiguous_regs(c, nr, thisinsn);
+   GLuint i;
+
+   for (i = 0; i < nr; i++) {
+      struct brw_wm_grf *slot = &c->pass2_grf[reg+i];
+      if (!dst[i]) {
+         /* Need to grab a dummy value in TEX case.  Don't introduce
+          * it into the tracking scheme.  Guard the vreg pool against
+          * overflow before taking a slot from it. */
+         assert(c->nr_vreg < BRW_WM_MAX_VREG);
+         dst[i] = &c->vreg[c->nr_vreg++];
+      }
+      else {
+         assert(!dst[i]->resident);
+         assert(slot->nextuse != thisinsn);
+         slot->value = dst[i];
+         slot->nextuse = thisinsn;
+         dst[i]->resident = slot;
+      }
+      dst[i]->hw_reg = brw_vec8_grf((reg+i)*2, 0);
+   }
+   if ((reg+nr)*2 > c->max_wm_grf)
+      c->max_wm_grf = (reg+nr) * 2;   /* raise the high-water mark */
+}
+
+
+/* Make sure every source value of inst sits in a register, and turn the
+ * offsets stored in each ref's hw_reg into real register numbers. */
+static void load_args(struct brw_wm_compile *c,
+                      struct brw_wm_instruction *inst)
+{
+   const GLuint thisinsn = inst - c->instruction;
+   GLuint arg, chan;
+
+   for (arg = 0; arg < 3; arg++) {
+      for (chan = 0; chan < 4; chan++) {
+         struct brw_wm_ref *ref = inst->src[arg][chan];
+         struct brw_wm_value *value;
+
+         if (!ref)
+            continue;
+         value = ref->value;
+
+         if (!value->resident) {
+            /* Value is not in a register.  Only the register is chosen
+             * here and the ref is marked as requiring a fill; the fill
+             * code itself is generated in brw_wm_emit.c.
+             */
+            const GLuint reg = search_contiguous_regs(c, 1, thisinsn);
+
+            value->resident = &c->pass2_grf[reg];
+            value->resident->value = value;
+            value->resident->nextuse = thisinsn;
+            ref->unspill_reg = reg * 2;
+         }
+
+         /* Point hw_reg at the value's current location (2 * slot index). */
+         assert(value == value->resident->value);
+         ref->hw_reg.nr += (value->resident - c->pass2_grf) * 2;
+      }
+   }
+}
+
+
+
+/* Step 3: register allocation.  Instructions are visited in program
+ * order; for each one the slot usage is refreshed, its sources are
+ * brought into registers, and registers are picked for its results.
+ * Texture opcodes get four contiguous result slots; everything else
+ * gets one independent slot per written channel.
+ */
+void brw_wm_pass2( struct brw_wm_compile *c )
+{
+   GLuint n;
+   GLuint chan;
+
+   init_registers(c);
+
+   for (n = 0; n < c->nr_insns; n++) {
+      struct brw_wm_instruction *inst = &c->instruction[n];
+      const GLboolean is_tex = (inst->opcode == TGSI_OPCODE_TEX ||
+                                inst->opcode == TGSI_OPCODE_TXB ||
+                                inst->opcode == TGSI_OPCODE_TXP);
+
+      /* Free slots whose values are dead by now, refresh the others. */
+      update_register_usage(c, n);
+
+      /* Sources first: some may need a register and a fill. */
+      load_args(c, inst);
+
+      /* Then the results. */
+      if (is_tex) {
+         /* all four, side by side, whatever the writemask says */
+         alloc_contiguous_dest(c, inst->dst, 4, n);
+      }
+      else {
+         for (chan = 0; chan < 4; chan++) {
+            if (!(inst->writemask & (1 << chan)))
+               continue;
+            assert(inst->dst[chan]);
+            alloc_contiguous_dest(c, &inst->dst[chan], 1, n);
+         }
+      }
+
+      /* Test aid: immediately spill every result again. */
+      if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY) {
+         for (chan = 0; chan < 4; chan++) {
+            if (inst->dst[chan])
+               spill_value(c, inst->dst[chan]);
+         }
+      }
+   }
+
+   /* Dump once before and once after flagging the pass as done. */
+   if (BRW_DEBUG & DEBUG_WM) {
+      brw_wm_print_program(c, "pass2");
+   }
+
+   c->state = PASS2_DONE;
+
+   if (BRW_DEBUG & DEBUG_WM) {
+      brw_wm_print_program(c, "pass2/done");
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c
new file mode 100644
index 0000000000..8406a1a9e2
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c
@@ -0,0 +1,228 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+                   
+#include "util/u_math.h"
+#include "util/u_format.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_resource.h"
+
+
+/* Samplers aren't strictly wm state from the hardware's perspective,
+ * but that is the only situation in which we use them in this driver.
+ */
+
+
+
+/* Put a 4-component default (border) color into the state cache as a
+ * BRW_SAMPLER_DEFAULT_COLOR item; bo_out is handed on to brw_cache_data()
+ * to receive the buffer.  Returns whatever brw_cache_data() reports.
+ */
+static enum pipe_error
+upload_default_color( struct brw_context *brw,
+                      const GLfloat *color,
+                      struct brw_winsys_buffer **bo_out )
+{
+   struct brw_sampler_default_color sdc;
+
+   /* sdc.color receives the four components of color. */
+   COPY_4V(sdc.color, color);
+
+   return brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
+                          NULL, 0, bo_out );
+}
+
+/* Cache key for the sampler state array; the sampler[] part is also the data that gets uploaded. */
+struct wm_sampler_key {
+   int sampler_count;   /* number of entries of sampler[] in use */
+   struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT];   /* one entry per unit; zero past sampler_count */
+};
+
+
+/** Sets up the cache key for sampler state for all texture units: zeroes *key, then fills one entry per active unit */
+static void
+brw_wm_sampler_populate_key(struct brw_context *brw,
+			    struct wm_sampler_key *key)
+{
+   int i;
+
+   memset(key, 0, sizeof(*key));
+
+   key->sampler_count = MIN2(brw->curr.num_fragment_sampler_views,
+			    brw->curr.num_samplers);
+
+   for (i = 0; i < key->sampler_count; i++) {
+      const struct brw_texture *tex = brw_texture(brw->curr.fragment_sampler_views[i]->texture);
+      const struct brw_sampler *sampler = brw->curr.sampler[i];
+      struct brw_sampler_state *entry = &key->sampler[i];
+
+      entry->ss0 = sampler->ss0;
+      entry->ss1 = sampler->ss1;
+      entry->ss2.default_color_pointer = 0; /* reloc: kept zero in the key, a relocation targets this word */
+      entry->ss3 = sampler->ss3;
+
+      /* Cube-maps on 965 and later must use the same wrap mode for all 3
+       * coordinate dimensions.  Further, only CUBE and CLAMP are valid.
+       * (CUBE is compiled out by the FALSE below, so CLAMP is always used.) */
+      if (tex->b.b.target == PIPE_TEXTURE_CUBE) {
+	 if (FALSE &&
+	     (sampler->ss0.min_filter != BRW_MAPFILTER_NEAREST || 
+	      sampler->ss0.mag_filter != BRW_MAPFILTER_NEAREST)) {
+	    entry->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	    entry->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	    entry->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	 } else {
+	    entry->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	    entry->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	    entry->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	 }
+      } else if (tex->b.b.target == PIPE_TEXTURE_1D) {
+	 /* There's a bug in 1D texture sampling - it actually pays
+	  * attention to the wrap_t value, though it should not.
+	  * Override the wrap_t value here to GL_REPEAT
+	  * (BRW_TEXCOORDMODE_WRAP) to keep any nonexistent border
+	  * pixels from floating in. */
+	 entry->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+      }
+   }
+}
+
+
+/* Upload the border color of every active sampler unit as that unit's
+ * default color buffer, brw->wm.sdc_bo[unit].  Depth/stencil formats get
+ * the first component replicated into all four.  Stops at the first
+ * failing upload and returns its error code.
+ */
+static enum pipe_error
+brw_wm_sampler_update_default_colors(struct brw_context *brw)
+{
+   const int count = MIN2(brw->curr.num_fragment_sampler_views,
+                          brw->curr.num_samplers);
+   int unit;
+
+   for (unit = 0; unit < count; unit++) {
+      const struct brw_texture *tex =
+         brw_texture(brw->curr.fragment_sampler_views[unit]->texture);
+      const struct brw_sampler *sampler = brw->curr.sampler[unit];
+      /* default: the sampler's border color as it is */
+      const float *color = sampler->border_color;
+      float splat[4];
+      enum pipe_error ret;
+
+      /* GL specs that border color for depth textures is taken from the
+       * R channel, while the hardware uses A.  Spam R into all the
+       * channels for safety.
+       */
+      if (util_format_is_depth_or_stencil(tex->b.b.format)) {
+         splat[0] = splat[1] = splat[2] = splat[3] = sampler->border_color[0];
+         color = splat;
+      }
+
+      ret = upload_default_color(brw,
+                                 color,
+                                 &brw->wm.sdc_bo[unit]);
+      if (ret)
+         return ret;
+   }
+
+   return PIPE_OK;
+}
+
+
+
+/* All samplers must be uploaded in a single contiguous array.
+ * Returns PIPE_OK on success or the pipe_error of the step that failed. */
+static int upload_wm_samplers( struct brw_context *brw )
+{
+   struct wm_sampler_key key;
+   struct brw_winsys_reloc reloc[BRW_MAX_TEX_UNIT];
+   enum pipe_error ret;
+   int i;
+
+   /* The relocations below point at the default color buffers, so a
+    * failed upload must not be ignored. */
+   ret = brw_wm_sampler_update_default_colors(brw);
+   if (ret)
+      return ret;
+
+   brw_wm_sampler_populate_key(brw, &key);
+
+   if (brw->wm.sampler_count != key.sampler_count) {
+      brw->wm.sampler_count = key.sampler_count;
+      brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
+   }
+
+   /* No active samplers: clear the sampler_bo reference and stop. */
+   if (brw->wm.sampler_count == 0) {
+      bo_reference(&brw->wm.sampler_bo, NULL);
+      return PIPE_OK;
+   }
+
+   /* Emit SDC relocations: one per sampler, at the offset of its ss2 */
+   for (i = 0; i < key.sampler_count; i++) {
+      make_reloc( &reloc[i],
+                  BRW_USAGE_SAMPLER,
+                  0,
+                  i * sizeof(struct brw_sampler_state) +
+                  offsetof(struct brw_sampler_state, ss2),
+                  brw->wm.sdc_bo[i]);
+   }
+
+   if (brw_search_cache(&brw->cache, BRW_SAMPLER,
+                        &key, sizeof(key),
+                        reloc, key.sampler_count,
+                        NULL,
+                        &brw->wm.sampler_bo))
+      return PIPE_OK;
+
+   /* If we didn't find it in the cache, upload the sampler array and
+    * put it in the cache.
+    */
+   return brw_upload_cache(&brw->cache, BRW_SAMPLER,
+                           &key, sizeof(key),
+                           reloc, key.sampler_count,
+                           &key.sampler, sizeof(key.sampler),
+                           NULL, NULL,
+                           &brw->wm.sampler_bo);
+}
+
+/* Tracked-state entry: prepare hook plus the dirty flags it is tied to. */
+const struct brw_tracked_state brw_wm_samplers = {
+   .prepare = upload_wm_samplers,
+   .dirty = {
+      .mesa = PIPE_NEW_SAMPLERS | PIPE_NEW_BOUND_TEXTURES,
+      .brw = 0, .cache = 0
+   },
+};
+
+
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
new file mode 100644
index 0000000000..efc2d96be1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -0,0 +1,340 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+                   
+#include "util/u_math.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_wm.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
+
+/***********************************************************************
+ * WM unit - fragment programs and rasterization
+ */
+
+struct brw_wm_unit_key {   /* cache key for the WM unit state, filled by wm_unit_populate_key() */
+   unsigned int total_grf, total_scratch;
+   unsigned int urb_entry_read_length;
+   unsigned int curb_entry_read_length;
+   unsigned int dispatch_grf_start_reg;
+   /* Taken from brw->curbe.wm_start and brw->urb.vsize respectively: */
+   unsigned int curbe_offset;
+   unsigned int urb_size;
+   /* Thread limit picked per chipset (1 with DEBUG_SINGLE_THREAD): */
+   unsigned int max_threads;
+   /* Copies of brw->wm.nr_surfaces and brw->wm.sampler_count: */
+   unsigned int nr_surfaces, sampler_count;
+   GLboolean uses_depth, computes_depth, uses_kill, has_flow_control;
+   GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
+   GLfloat offset_units, offset_factor;
+};
+
+static void
+wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
+{
+   /* Gathers into *key all the context state that determines the WM
+    * unit state.  The key is cleared first, so any member that is not
+    * assigned below is left as zero.
+    */
+   const struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+
+   memset(key, 0, sizeof(*key));
+
+   /* WM maximum threads is number of EUs times number of threads per EU,
+    * unless DEBUG_SINGLE_THREAD pins it to a single thread.
+    */
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      key->max_threads = 1;
+   else if (BRW_IS_IGDNG(brw))
+      key->max_threads = 12 * 6;
+   else if (BRW_IS_G4X(brw))
+      key->max_threads = 10 * 5;
+   else
+      key->max_threads = 8 * 4;
+
+   /* CACHE_NEW_WM_PROG: values taken from the compiled WM program;
+    * the scratch size goes through align(..., 1024).
+    */
+   key->total_scratch = align(brw->wm.prog_data->total_scratch, 1024);
+   key->total_grf = brw->wm.prog_data->total_grf;
+   key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
+   key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
+   key->curb_entry_read_length = brw->wm.prog_data->curb_read_length;
+
+   /* BRW_NEW_URB_FENCE (NOTE(review): not among brw_wm_unit's dirty bits) */
+   key->urb_size = brw->urb.vsize;
+   /* BRW_NEW_CURBE_OFFSETS */
+   key->curbe_offset = brw->curbe.wm_start;
+
+   /* BRW_NEW_NR_WM_SURFACES, CACHE_NEW_SAMPLER */
+   key->nr_surfaces = brw->wm.nr_surfaces;
+   key->sampler_count = brw->wm.sampler_count;
+
+   /* PIPE_NEW_RAST: stipple enables and depth offset parameters.  The
+    * offset counts as enabled if any of point, line or tri has it.
+    */
+   key->polygon_stipple = brw->curr.rast->templ.poly_stipple_enable;
+   key->line_stipple = brw->curr.rast->templ.line_stipple_enable;
+   key->offset_units = brw->curr.rast->templ.offset_units;
+   key->offset_factor = brw->curr.rast->templ.offset_scale;
+   key->offset_enable = (brw->curr.rast->templ.offset_tri ||
+                         brw->curr.rast->templ.offset_line ||
+                         brw->curr.rast->templ.offset_point);
+
+   /* PIPE_NEW_FRAGMENT_SHADER */
+   key->uses_depth = fp->uses_depth;
+   key->has_flow_control = fp->has_flow_control;
+
+   /* temporary sanity check assertion */
+   assert(fp->has_flow_control == 0);
+
+   /* PIPE_NEW_DEPTH_BUFFER
+    *
+    * The shader's depth write is only reported while a depth buffer is
+    * bound; the NULL depthbuffer override is required by the Pixel
+    * Shader Computed Depth field.
+    */
+   key->computes_depth = brw->curr.fb.zsbuf ? fp->info.writes_z : 0;
+
+   /* PIPE_NEW_DEPTH_STENCIL_ALPHA: kill is flagged when the shader uses
+    * it or when cc3.alpha_test is set.
+    */
+   key->uses_kill = (brw->curr.zstencil->cc3.alpha_test ||
+                     fp->info.uses_kill);
+
+   /* PIPE_NEW_QUERY */
+   key->stats_wm = (brw->query.stats_wm != 0);
+}
+
+/**
+ * Setup wm hardware state.  See page 225 of Volume 2
+ */
+static enum pipe_error
+wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
+                        struct brw_winsys_reloc *reloc,
+                        unsigned nr_reloc,
+                        struct brw_winsys_buffer **bo_out)
+{
+   /* Translates *key into a brw_wm_unit_state image and passes it,
+    * with the key and the caller's relocations, to brw_upload_cache();
+    * that call's status is returned and bo_out is its output slot.
+    *
+    * The image starts out zeroed, so every field that is not assigned
+    * below stays zero; thread3.urb_entry_read_offset,
+    * wm5.legacy_line_rast, wm5.legacy_global_depth_bias and
+    * wm5.line_aa_region_width are deliberately left at 0 that way.
+    *
+    * NOTE(review): key->urb_size is not consumed anywhere in here.
+    */
+   struct brw_wm_unit_state wm;
+
+   memset(&wm, 0, sizeof(wm));
+
+   /* Covered by the caller's relocations, so only placeholders here: */
+   wm.thread0.kernel_start_pointer = 0;
+   wm.thread2.scratch_space_base_pointer = 0;
+   wm.wm4.sampler_state_pointer = 0;
+
+   /* thread0 / thread1 */
+   wm.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+   wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   wm.thread1.depth_coef_urb_read_offset = 1;
+   if (!BRW_IS_IGDNG(brw))
+      wm.thread1.binding_table_entry_count = key->nr_surfaces;
+   /* else: stays 0, hardware requirement */
+
+   /* thread2: scratch size in units of 1024 bytes, minus one */
+   if (key->total_scratch != 0)
+      wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1;
+
+   /* thread3 */
+   wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
+   wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
+   wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+
+   /* wm4 */
+   if (!BRW_IS_IGDNG(brw))
+      wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+   /* else: stays 0, hardware requirement */
+
+   /* Statistics: requested by the key or forced by DEBUG_STATS */
+   if (key->stats_wm || (BRW_DEBUG & DEBUG_STATS))
+      wm.wm4.stats_enable = 1;
+
+   /* wm5 */
+   wm.wm5.program_uses_depth = key->uses_depth;
+   wm.wm5.program_computes_depth = key->computes_depth;
+   wm.wm5.program_uses_killpixel = key->uses_kill;
+   wm.wm5.polygon_stipple = key->polygon_stipple;
+   wm.wm5.line_stipple = key->line_stipple;
+   wm.wm5.max_threads = key->max_threads - 1;
+   wm.wm5.thread_dispatch_enable = 1;        /* AKA: color_write */
+   wm.wm5.early_depth_test = 1;              /* never need to disable */
+   wm.wm5.line_endcap_aa_region_width = 1;
+
+   /* enable_8_pix for programs with flow control, else enable_16_pix */
+   if (!key->has_flow_control)
+      wm.wm5.enable_16_pix = 1;
+   else
+      wm.wm5.enable_8_pix = 1;
+
+   if (key->offset_enable) {
+      wm.wm5.depth_offset = 1;
+
+      /* Something weird going on with legacy_global_depth_bias,
+       * offset_constant, scaling and MRD.  This value passes glean
+       * but gives some odd results elsewhere (eg. the
+       * quad-offset-units test).
+       */
+      wm.global_depth_offset_constant = key->offset_units * 2;
+
+      /* This is the only value that passes glean:
+       */
+      wm.global_depth_offset_scale = key->offset_factor;
+   }
+
+   return brw_upload_cache(&brw->cache, BRW_WM_UNIT,
+                           key, sizeof(*key),
+                           reloc, nr_reloc,
+                           &wm, sizeof(wm),
+                           NULL, NULL,
+                           bo_out);
+}
+
+
+static enum pipe_error upload_wm_unit( struct brw_context *brw )
+{
+   /* Prepare hook of the brw_wm_unit atom.  Builds the key for the
+    * current state, makes sure a large enough scratch buffer exists,
+    * assembles up to three relocations (program, scratch, sampler
+    * table) and then searches the cache for a matching unit state,
+    * creating it on a miss.  brw->wm.state_bo is the output slot.
+    * Returns PIPE_OK, or the error from allocation / creation.
+    */
+   struct brw_winsys_reloc reloc[3];
+   struct brw_wm_unit_key key;
+   unsigned nr_reloc = 0;
+   unsigned grf_blocks, scratch_enc, stats, sampler_enc;
+   enum pipe_error ret;
+
+   wm_unit_populate_key(brw, &key);
+
+   /* Allocate the necessary scratch space if we haven't already.  Don't
+    * bother reducing the allocation later, since we use scratch so
+    * rarely.
+    */
+   assert(key.total_scratch <= 12 * 1024);
+   if (key.total_scratch) {
+      const GLuint needed = key.total_scratch * key.max_threads;
+
+      /* An existing buffer that is too small is released first: */
+      if (brw->wm.scratch_bo && brw->wm.scratch_bo->size < needed)
+         bo_reference(&brw->wm.scratch_bo, NULL);
+
+      if (!brw->wm.scratch_bo) {
+         ret = brw->sws->bo_alloc(brw->sws,
+                                  BRW_BUFFER_TYPE_SHADER_SCRATCH,
+                                  needed,
+                                  4096,
+                                  &brw->wm.scratch_bo);
+         if (ret)
+            return ret;
+      }
+   }
+
+   /* XXX: temporary: these repeat the field encodings used in
+    * wm_unit_create_from_key() and are passed as the third argument
+    * of make_reloc() below.  scratch_enc wraps around when there is no
+    * scratch space, but it is only used when total_scratch != 0.
+    */
+   grf_blocks = (align(key.total_grf, 16) / 16 - 1);
+   scratch_enc = key.total_scratch / 1024 - 1;
+   stats = (BRW_DEBUG & DEBUG_STATS) || key.stats_wm;
+   sampler_enc = BRW_IS_IGDNG(brw) ? 0 : (key.sampler_count + 1) / 4;
+
+   /* Emit WM program relocation */
+   make_reloc(&reloc[nr_reloc++],
+              BRW_USAGE_STATE,
+              grf_blocks << 1,
+              offsetof(struct brw_wm_unit_state, thread0),
+              brw->wm.prog_bo);
+
+   /* Emit scratch space relocation */
+   if (key.total_scratch != 0) {
+      make_reloc(&reloc[nr_reloc++],
+                 BRW_USAGE_SCRATCH,
+                 scratch_enc,
+                 offsetof(struct brw_wm_unit_state, thread2),
+                 brw->wm.scratch_bo);
+   }
+
+   /* Emit sampler state relocation */
+   if (key.sampler_count != 0) {
+      make_reloc(&reloc[nr_reloc++],
+                 BRW_USAGE_STATE,
+                 stats | (sampler_enc << 2),
+                 offsetof(struct brw_wm_unit_state, wm4),
+                 brw->wm.sampler_bo);
+   }
+
+   /* Reuse a cached unit state when one matches, else build it. */
+   if (brw_search_cache(&brw->cache, BRW_WM_UNIT,
+                        &key, sizeof(key),
+                        reloc, nr_reloc,
+                        NULL,
+                        &brw->wm.state_bo))
+      return PIPE_OK;
+
+   return wm_unit_create_from_key(brw, &key,
+                                  reloc, nr_reloc,
+                                  &brw->wm.state_bo);
+}
+
+const struct brw_tracked_state brw_wm_unit = {
+   /* Dirty bits that trigger upload_wm_unit(); they are meant to cover
+    * the state read by wm_unit_populate_key().
+    */
+   .dirty = {
+      .mesa = (PIPE_NEW_QUERY |
+               PIPE_NEW_DEPTH_STENCIL_ALPHA |
+               PIPE_NEW_RAST |
+               PIPE_NEW_DEPTH_BUFFER |
+               PIPE_NEW_FRAGMENT_SHADER),
+      .brw = (BRW_NEW_NR_WM_SURFACES |
+              BRW_NEW_CURBE_OFFSETS),
+      .cache = (CACHE_NEW_SAMPLER | CACHE_NEW_WM_PROG)
+   },
+   .prepare = upload_wm_unit,
+};
+
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
new file mode 100644
index 0000000000..0d80a0114a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -0,0 +1,293 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+                   
+#include "pipe/p_format.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_resource.h"
+
+
+
+
+static enum pipe_error
+brw_update_texture_surface( struct brw_context *brw,
+                            struct brw_texture *tex,
+                            struct brw_winsys_buffer **bo_out)
+{
+   /* Looks up, or uploads on a miss, the surface state for a sampled
+    * texture.  The texture's own tex->ss serves both as the cache key
+    * and as the uploaded data; bo_out is the output slot given to the
+    * cache calls.  Returns PIPE_OK or the upload error.
+    */
+   struct brw_winsys_reloc tex_reloc[1];
+
+   /* Emit relocation to surface contents: the ss1 member of the
+    * surface state is tied to the texture's buffer. */
+   make_reloc(&tex_reloc[0],
+              BRW_USAGE_SAMPLER,
+              0,
+              offsetof(struct brw_surface_state, ss1),
+              tex->bo);
+
+   if (brw_search_cache(&brw->surface_cache, BRW_SS_SURFACE,
+                        &tex->ss, sizeof tex->ss,
+                        tex_reloc, Elements(tex_reloc),
+                        NULL,
+                        bo_out))
+      return PIPE_OK;
+
+   return brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+                           &tex->ss, sizeof tex->ss,
+                           tex_reloc, Elements(tex_reloc),
+                           &tex->ss, sizeof tex->ss,
+                           NULL, NULL,
+                           bo_out);
+}
+
+
+
+
+
+
+
+
+/**
+ * Sets up a surface state structure to point at the given region.
+ * While it is only used for the front/back buffer currently, it should be
+ * usable for further buffers when doing ARB_draw_buffer support.
+ */
+static enum pipe_error
+brw_update_render_surface(struct brw_context *brw,
+                          struct brw_surface *surface,
+                          struct brw_winsys_buffer **bo_out)
+{
+   /* Looks up, or uploads on a miss, the surface state used to render
+    * to `surface`: a copy of surface->ss with the color_blend and
+    * writedisable_* bits of the bound blend state mixed into ss0.
+    * That copy is both the cache key and the uploaded data; bo_out is
+    * the output slot given to the cache calls.
+    */
+   const struct brw_surf_ss0 *blend = &brw->curr.blend->ss0;
+   struct brw_winsys_reloc rt_reloc[1];
+   struct brw_surface_state ss;
+
+   /* Surfaces are potentially shared between contexts, so can't
+    * scribble the in-place ss0 value in the surface.
+    */
+   memcpy(&ss, &surface->ss, sizeof ss);
+
+   ss.ss0.writedisable_alpha = blend->writedisable_alpha;
+   ss.ss0.writedisable_red   = blend->writedisable_red;
+   ss.ss0.writedisable_green = blend->writedisable_green;
+   ss.ss0.writedisable_blue  = blend->writedisable_blue;
+   ss.ss0.color_blend        = blend->color_blend;
+
+   /* XXX: we will only be rendering to this surface:
+    */
+   make_reloc(&rt_reloc[0],
+              BRW_USAGE_RENDER_TARGET,
+              0,
+              offsetof(struct brw_surface_state, ss1),
+              surface->bo);
+
+   if (brw_search_cache(&brw->surface_cache,
+                        BRW_SS_SURFACE,
+                        &ss, sizeof(ss),
+                        rt_reloc, Elements(rt_reloc),
+                        NULL,
+                        bo_out))
+      return PIPE_OK;
+
+   return brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+                           &ss, sizeof ss,
+                           rt_reloc, Elements(rt_reloc),
+                           &ss, sizeof ss,
+                           NULL, NULL,
+                           bo_out);
+}
+
+
+/**
+ * Constructs the binding table for the WM surface state, which maps unit
+ * numbers to surface state objects.
+ */
+static enum pipe_error
+brw_wm_get_binding_table(struct brw_context *brw,
+                         struct brw_winsys_buffer **bo_out )
+{
+   /* Produces the WM binding table: brw->wm.nr_surfaces 32-bit entries,
+    * uploaded as zeroes, with one relocation per bound surface state
+    * buffer at byte offset (slot * sizeof(GLuint)).  Slots without a
+    * buffer get no relocation.  bo_out is the output slot given to the
+    * cache calls; returns PIPE_OK or the upload error.
+    */
+   struct brw_winsys_reloc reloc[BRW_WM_MAX_SURF];
+   uint32_t data[BRW_WM_MAX_SURF];
+   GLuint data_size = brw->wm.nr_surfaces * sizeof data[0];
+   GLuint nr_relocs = 0;
+   int slot;
+
+   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
+   assert(brw->wm.nr_surfaces > 0);
+
+   /* Emit binding table relocations to surface state
+    */
+   for (slot = 0; slot < brw->wm.nr_surfaces; slot++) {
+      if (brw->wm.surf_bo[slot] == NULL)
+         continue;
+      make_reloc(&reloc[nr_relocs++],
+                 BRW_USAGE_STATE,
+                 0,
+                 slot * sizeof(GLuint),
+                 brw->wm.surf_bo[slot]);
+   }
+
+   /* Note there is no key for this search beyond the values in the
+    * relocation array:
+    */
+   if (brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+                        NULL, 0,
+                        reloc, nr_relocs,
+                        NULL,
+                        bo_out))
+      return PIPE_OK;
+
+   /* Upload zero data; the entries that have a relocation are meant
+    * to be overwritten with relocation offsets:
+    */
+   memset(data, 0, data_size);
+
+   return brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+                            NULL, 0,
+                            reloc, nr_relocs,
+                            data, data_size,
+                            NULL, NULL,
+                            bo_out);
+}
+
+static enum pipe_error prepare_wm_surfaces(struct brw_context *brw )
+{
+   /* Prepare hook of the brw_wm_surfaces atom.  Refreshes the surface
+    * state buffers in brw->wm.surf_bo[] for the bound colour buffers
+    * and fragment sampler views, drops the entries that are no longer
+    * in use, updates brw->wm.nr_surfaces and finally (re)builds the
+    * binding table in brw->wm.bind_bo.  The first error aborts.
+    */
+   enum pipe_error ret;
+   struct brw_winsys_buffer **slots = brw->wm.surf_bo;
+   int nr_surfaces = 0;
+   GLuint n;
+
+   /* PIPE_NEW_COLOR_BUFFERS | PIPE_NEW_BLEND
+    *
+    * Update surfaces for drawing buffers.  Mixes in colormask and
+    * blend state.
+    *
+    * XXX: no color buffer case
+    */
+   for (n = 0; n < brw->curr.fb.nr_cbufs; n++) {
+      ret = brw_update_render_surface(brw,
+                                      brw_surface(brw->curr.fb.cbufs[n]),
+                                      &slots[BTI_COLOR_BUF(n)]);
+      if (ret)
+         return ret;
+
+      nr_surfaces = BTI_COLOR_BUF(n) + 1;
+   }
+
+   /* PIPE_NEW_FRAGMENT_CONSTANTS
+    */
+#if 0
+   if (brw->curr.fragment_constants) {
+      ret = brw_update_fragment_constant_surface(
+         brw, 
+         brw->curr.fragment_constants, 
+         &brw->wm.surf_bo[BTI_FRAGMENT_CONSTANTS]);
+
+      if (ret)
+         return ret;
+
+      nr_surfaces = BTI_FRAGMENT_CONSTANTS + 1;
+   }
+   else {
+      bo_reference(&brw->wm.surf_bo[SURF_FRAG_CONSTANTS], NULL);      
+   }
+#endif
+
+   /* PIPE_NEW_TEXTURE -- NOTE(review): every view below the count is
+    * dereferenced without a NULL check. */
+   for (n = 0; n < brw->curr.num_fragment_sampler_views; n++) {
+      ret = brw_update_texture_surface(brw,
+                                       brw_texture(brw->curr.fragment_sampler_views[n]->texture),
+                                       &slots[BTI_TEXTURE(n)]);
+      if (ret)
+         return ret;
+
+      nr_surfaces = BTI_TEXTURE(n) + 1;
+   }
+
+   /* Clear any inactive entries:
+    */
+   for (n = brw->curr.fb.nr_cbufs; n < BRW_MAX_DRAW_BUFFERS; n++)
+      bo_reference(&slots[BTI_COLOR_BUF(n)], NULL);
+
+   if (!brw->curr.fragment_constants)
+      bo_reference(&slots[BTI_FRAGMENT_CONSTANTS], NULL);
+
+   /* XXX: no pipe_max_textures define?? */
+   for (n = brw->curr.num_fragment_sampler_views; n < PIPE_MAX_SAMPLERS; n++)
+      bo_reference(&slots[BTI_TEXTURE(n)], NULL);
+
+   if (brw->wm.nr_surfaces != nr_surfaces) {
+      brw->wm.nr_surfaces = nr_surfaces;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   }
+
+   return brw_wm_get_binding_table(brw, &brw->wm.bind_bo);
+}
+
+const struct brw_tracked_state brw_wm_surfaces = {
+   /* Dirty bits that trigger prepare_wm_surfaces(): */
+   .dirty = {
+      .mesa = (PIPE_NEW_BLEND |
+               PIPE_NEW_FRAGMENT_CONSTANTS |
+               PIPE_NEW_BOUND_TEXTURES |
+               PIPE_NEW_COLOR_BUFFERS),
+      .brw = (BRW_NEW_WM_SURFACES | BRW_NEW_CONTEXT),
+      /* .cache is left zero-initialised */
+   },
+   .prepare = prepare_wm_surfaces,
+};
+
+
+
diff --git a/src/gallium/drivers/i965/intel_decode.c b/src/gallium/drivers/i965/intel_decode.c
new file mode 100644
index 0000000000..bd8b9174a8
--- /dev/null
+++ b/src/gallium/drivers/i965/intel_decode.c
@@ -0,0 +1,1791 @@
+/* -*- c-basic-offset: 4 -*- */
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file intel_decode.c
+ * This file contains code to print out batchbuffer contents in a
+ * human-readable format.
+ *
+ * The current version only supports i915 packets, and only pretty-prints a
+ * subset of them.  The intention is for it to make just a best attempt to
+ * decode, but never crash in the process.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "util/u_string.h"
+#include "intel_decode.h"
+
+/*#include "intel_chipset.h"*/
+#define IS_965(x) 1             /* XXX */
+#define IS_9XX(x) 1             /* XXX */
+/* Report a too-short buffer on `out`, bump *failures and return the caller's `count` (not _count). */
+#define BUFFER_FAIL(_count, _len, _name) do {			\
+    fprintf(out, "Buffer size too small in %s (%d < %d)\n",	\
+	    (_name), (_count), (_len));				\
+    (*failures)++;						\
+    return count;						\
+} while (0)
+/* File-scope decoder state: the output stream, and saved_s2/saved_s4 with their *_set flags. */
+static FILE *out;
+static uint32_t saved_s2 = 0, saved_s4 = 0;
+static char saved_s2_set = 0, saved_s4_set = 0;
+
+static float
+int_as_float(uint32_t intval)
+{
+    /* Reinterprets the 32 bits of intval as a float; this is a bit
+     * copy, not a numeric conversion.
+     */
+    float fval;
+
+    memcpy(&fval, &intval, sizeof(fval));
+    return fval;
+}
+
+static void
+instr_out(const uint32_t *data, uint32_t hw_offset, unsigned int index,
+          char *fmt, ...)
+{
+    /* Prints "<address>: <dword>:" for data[index], then the printf-style message. */
+    const char *pad = (index != 0) ? "  " : "";   /* extra indent after dword 0 */
+    va_list args;
+    fprintf(out, "0x%08x: 0x%08x:%s ", hw_offset + index * 4, data[index], pad);
+    va_start(args, fmt);
+    vfprintf(out, fmt, args);
+    va_end(args);
+}
+
+
+static int
+decode_mi(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+    /* Decodes the MI_* command at data[0]: prints its name and one line
+     * per following dword to `out`, and returns the packet length in
+     * dwords.  An unknown opcode bumps *failures and consumes one dword;
+     * a packet running past `count` dwords ends in BUFFER_FAIL.  For
+     * entries with max_len > 1 the length is (data[0] & len_mask) + 2.
+     */
+    static const struct {
+        uint32_t opcode;
+        int len_mask, min_len, max_len;
+        const char *name;
+    } opcodes_mi[] = {
+        { 0x08, 0, 1, 1, "MI_ARB_ON_OFF" },
+        { 0x0a, 0, 1, 1, "MI_BATCH_BUFFER_END" },
+        { 0x31, 0x3f, 2, 2, "MI_BATCH_BUFFER_START" },
+        { 0x14, 0x3f, 3, 3, "MI_DISPLAY_BUFFER_INFO" },
+        { 0x04, 0, 1, 1, "MI_FLUSH" },
+        /* len_mask was 0: the length always came out as 2 and was flagged bad */
+        { 0x22, 0x3f, 3, 3, "MI_LOAD_REGISTER_IMM" },
+        { 0x13, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" },
+        { 0x12, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_INCL" },
+        { 0x00, 0, 1, 1, "MI_NOOP" },
+        { 0x11, 0x3f, 2, 2, "MI_OVERLAY_FLIP" },
+        { 0x07, 0, 1, 1, "MI_REPORT_HEAD" },
+        { 0x18, 0x3f, 2, 2, "MI_SET_CONTEXT" },
+        { 0x20, 0x3f, 3, 4, "MI_STORE_DATA_IMM" },
+        { 0x21, 0x3f, 3, 4, "MI_STORE_DATA_INDEX" },
+        { 0x24, 0x3f, 3, 3, "MI_STORE_REGISTER_MEM" },
+        { 0x02, 0, 1, 1, "MI_USER_INTERRUPT" },
+        { 0x03, 0, 1, 1, "MI_WAIT_FOR_EVENT" },
+    };
+    const uint32_t mi_opcode = (data[0] & 0x1f800000) >> 23;
+    unsigned int opcode;
+
+    for (opcode = 0; opcode < sizeof(opcodes_mi) / sizeof(opcodes_mi[0]);
+         opcode++) {
+        unsigned int len = 1, i;
+
+        if (mi_opcode != opcodes_mi[opcode].opcode)
+            continue;
+        instr_out(data, hw_offset, 0, "%s\n", opcodes_mi[opcode].name);
+        if (opcodes_mi[opcode].max_len > 1) {
+            len = (data[0] & opcodes_mi[opcode].len_mask) + 2;
+            if (len < opcodes_mi[opcode].min_len ||
+                len > opcodes_mi[opcode].max_len)
+                fprintf(out, "Bad length (%d) in %s, [%d, %d]\n",
+                        len, opcodes_mi[opcode].name,
+                        opcodes_mi[opcode].min_len,
+                        opcodes_mi[opcode].max_len);
+        }
+        for (i = 1; i < len; i++) {
+            if (i >= count)
+                BUFFER_FAIL(count, len, opcodes_mi[opcode].name);
+            instr_out(data, hw_offset, i, "dword %d\n", i);
+        }
+        return len;
+    }
+
+    instr_out(data, hw_offset, 0, "MI UNKNOWN\n");
+    (*failures)++;
+    return 1;
+}
+
+static int
+decode_2d(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+    /* Decodes the 2D (blitter) command at data[0] and returns its length
+     * in dwords.  XY_COLOR_BLT and XY_SRC_COPY_BLT are printed field by
+     * field; any other opcode found in the table is printed as its name
+     * followed by raw dwords.  An unknown opcode bumps *failures and
+     * consumes one dword; a packet running past `count` dwords ends in
+     * BUFFER_FAIL.  The length is the low byte of data[0] plus two.
+     */
+
+    /* Text for the two bit format field, bits 25:24 of dword 1 */
+    static const char *const blt_formats[4] = { "8", "565", "1555", "8888" };
+    unsigned int opcode, len;
+    const char *format;
+
+    /* Opcode (bits 28:22 of data[0]) with the accepted length range in
+     * dwords and the name to print.
+     */
+    static const struct {
+        uint32_t opcode;
+        int min_len;
+        int max_len;
+        const char *name;
+    } opcodes_2d[] = {
+        { 0x40, 5, 5, "COLOR_BLT" },
+        { 0x43, 6, 6, "SRC_COPY_BLT" },
+        { 0x01, 8, 8, "XY_SETUP_BLT" },
+        { 0x11, 9, 9, "XY_SETUP_MONO_PATTERN_SL_BLT" },
+        { 0x03, 3, 3, "XY_SETUP_CLIP_BLT" },
+        { 0x24, 2, 2, "XY_PIXEL_BLT" },
+        { 0x25, 3, 3, "XY_SCANLINES_BLT" },
+        /* Name corrected, it used to be printed as "Y_TEXT_BLT" */
+        { 0x26, 4, 4, "XY_TEXT_BLT" },
+        { 0x31, 5, 134, "XY_TEXT_IMMEDIATE_BLT" },
+        { 0x50, 6, 6, "XY_COLOR_BLT" },
+        { 0x51, 6, 6, "XY_PAT_BLT" },
+        { 0x76, 8, 8, "XY_PAT_CHROMA_BLT" },
+        { 0x72, 7, 135, "XY_PAT_BLT_IMMEDIATE" },
+        { 0x77, 9, 137, "XY_PAT_CHROMA_BLT_IMMEDIATE" },
+        { 0x52, 9, 9, "XY_MONO_PAT_BLT" },
+        { 0x59, 7, 7, "XY_MONO_PAT_FIXED_BLT" },
+        { 0x53, 8, 8, "XY_SRC_COPY_BLT" },
+        { 0x54, 8, 8, "XY_MONO_SRC_COPY_BLT" },
+        { 0x71, 9, 137, "XY_MONO_SRC_COPY_IMMEDIATE_BLT" },
+        { 0x55, 9, 9, "XY_FULL_BLT" },
+        /* Was listed as a second 0x55 entry, which could never match
+         * behind XY_FULL_BLT; the immediate pattern variant is 0x74.
+         */
+        { 0x74, 9, 137, "XY_FULL_IMMEDIATE_PATTERN_BLT" },
+        { 0x56, 9, 9, "XY_FULL_MONO_SRC_BLT" },
+        { 0x75, 10, 138, "XY_FULL_MONO_SRC_IMMEDIATE_PATTERN_BLT" },
+        { 0x57, 12, 12, "XY_FULL_MONO_PATTERN_BLT" },
+        { 0x58, 12, 12, "XY_FULL_MONO_PATTERN_MONO_SRC_BLT" },
+    };
+
+    switch ((data[0] & 0x1fc00000) >> 22) {
+    case 0x50:
+        /* XY_COLOR_BLT: decoded field by field, 6 dwords */
+        instr_out(data, hw_offset, 0,
+                  "XY_COLOR_BLT (rgb %sabled, alpha %sabled, dst tile %d)\n",
+                  (data[0] & (1 << 20)) ? "en" : "dis",
+                  (data[0] & (1 << 21)) ? "en" : "dis",
+                  (data[0] >> 11) & 1);
+
+        len = (data[0] & 0x000000ff) + 2;
+        /* A bad length is only reported; the fixed layout is still printed */
+        if (len != 6)
+            fprintf(out, "Bad count in XY_COLOR_BLT\n");
+        if (count < 6)
+            BUFFER_FAIL(count, len, "XY_COLOR_BLT");
+
+        format = blt_formats[(data[1] >> 24) & 0x3];
+
+        instr_out(data, hw_offset, 1, "format %s, pitch %d, "
+                  "clipping %sabled\n", format,
+                  (short)(data[1] & 0xffff),
+                  data[1] & (1 << 30) ? "en" : "dis");
+        instr_out(data, hw_offset, 2, "(%d,%d)\n",
+                  data[2] & 0xffff, data[2] >> 16);
+        instr_out(data, hw_offset, 3, "(%d,%d)\n",
+                  data[3] & 0xffff, data[3] >> 16);
+        instr_out(data, hw_offset, 4, "offset 0x%08x\n", data[4]);
+        instr_out(data, hw_offset, 5, "color\n");
+        return len;
+
+    case 0x53:
+        /* XY_SRC_COPY_BLT: decoded field by field, 8 dwords */
+        instr_out(data, hw_offset, 0,
+                  "XY_SRC_COPY_BLT (rgb %sabled, alpha %sabled, "
+                  "src tile %d, dst tile %d)\n",
+                  (data[0] & (1 << 20)) ? "en" : "dis",
+                  (data[0] & (1 << 21)) ? "en" : "dis",
+                  (data[0] >> 15) & 1,
+                  (data[0] >> 11) & 1);
+
+        len = (data[0] & 0x000000ff) + 2;
+        /* A bad length is only reported; the fixed layout is still printed */
+        if (len != 8)
+            fprintf(out, "Bad count in XY_SRC_COPY_BLT\n");
+        if (count < 8)
+            BUFFER_FAIL(count, len, "XY_SRC_COPY_BLT");
+
+        format = blt_formats[(data[1] >> 24) & 0x3];
+
+        instr_out(data, hw_offset, 1, "format %s, dst pitch %d, "
+                  "clipping %sabled\n", format,
+                  (short)(data[1] & 0xffff),
+                  data[1] & (1 << 30) ? "en" : "dis");
+        instr_out(data, hw_offset, 2, "dst (%d,%d)\n",
+                  data[2] & 0xffff, data[2] >> 16);
+        instr_out(data, hw_offset, 3, "dst (%d,%d)\n",
+                  data[3] & 0xffff, data[3] >> 16);
+        instr_out(data, hw_offset, 4, "dst offset 0x%08x\n", data[4]);
+        instr_out(data, hw_offset, 5, "src (%d,%d)\n",
+                  data[5] & 0xffff, data[5] >> 16);
+        instr_out(data, hw_offset, 6, "src pitch %d\n",
+                  (short)(data[6] & 0xffff));
+        instr_out(data, hw_offset, 7, "src offset 0x%08x\n", data[7]);
+        return len;
+    }
+
+    /* Everything else: name from the table, then raw dwords.  Opcodes
+     * 0x50 and 0x53 never get here, the switch above returns for them.
+     */
+    for (opcode = 0; opcode < sizeof(opcodes_2d) / sizeof(opcodes_2d[0]);
+         opcode++) {
+        if ((data[0] & 0x1fc00000) >> 22 == opcodes_2d[opcode].opcode) {
+            unsigned int i;
+
+            len = 1;
+            instr_out(data, hw_offset, 0, "%s\n", opcodes_2d[opcode].name);
+            if (opcodes_2d[opcode].max_len > 1) {
+                len = (data[0] & 0x000000ff) + 2;
+                if (len < opcodes_2d[opcode].min_len ||
+                    len > opcodes_2d[opcode].max_len)
+                {
+                    fprintf(out, "Bad count in %s\n", opcodes_2d[opcode].name);
+                }
+            }
+
+            for (i = 1; i < len; i++) {
+                if (i >= count)
+                    BUFFER_FAIL(count, len, opcodes_2d[opcode].name);
+                instr_out(data, hw_offset, i, "dword %d\n", i);
+            }
+
+            return len;
+        }
+    }
+
+    /* No table entry matched */
+    instr_out(data, hw_offset, 0, "2D UNKNOWN\n");
+    (*failures)++;
+    return 1;
+}
+
+static int
+decode_3d_1c(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+    /* Decodes a single-dword 3D packet, selected by bits 23:19 of
+     * data[0], by printing its name.  Always returns 1; an unrecognised
+     * value is printed as "3D UNKNOWN" and bumps *failures.  `count` is
+     * not used here.
+     */
+    const char *name = NULL;
+
+    switch ((data[0] & 0x00f80000) >> 19) {
+    case 0x11: name = "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE\n"; break;
+    case 0x10: name = "3DSTATE_SCISSOR_ENABLE\n"; break;
+    case 0x01: name = "3DSTATE_MAP_COORD_SET_I830\n"; break;
+    case 0x0a: name = "3DSTATE_MAP_CUBE_I830\n"; break;
+    case 0x05: name = "3DSTATE_MAP_TEX_STREAM_I830\n"; break;
+    }
+
+    if (name == NULL) {
+        (*failures)++;
+        name = "3D UNKNOWN\n";
+    }
+    instr_out(data, hw_offset, 0, "%s", name);
+    return 1;
+}
+
+/** Sets the string dstname to describe the destination of the PS instruction */
+static void
+i915_get_instruction_dst(const uint32_t *data, int i, char *dstname, int do_mask)
+{
+    /* Builds the destination operand text for the instruction dword
+     * data[i] in dstname: a register name chosen by bits 21:19 and the
+     * register number in bits 17:14, followed -- only when do_mask is
+     * non-zero -- by the write mask and the ".sat" suffix.  Register
+     * numbers outside the accepted range are reported on `out`.
+     * dstname must be large enough; nothing is bounds checked here.
+     */
+    static const char channels[4] = { 'x', 'y', 'z', 'w' };
+    const uint32_t a0 = data[i];
+    const int dst_nr = (a0 >> 14) & 0xf;
+    const char *sat = "";
+    char dstmask[8];
+    int len = 0, chan;
+
+    if (do_mask) {
+        /* Bits 13:10 are the per-channel write mask; a full mask is
+         * left implicit, anything else is spelled out as ".xyz" etc.
+         */
+        if (((a0 >> 10) & 0xf) != 0xf) {
+            dstmask[len++] = '.';
+            for (chan = 0; chan < 4; chan++) {
+                if (a0 & (1 << (10 + chan)))
+                    dstmask[len++] = channels[chan];
+            }
+        }
+
+        /* Bit 22 adds the ".sat" suffix */
+        if (a0 & (1 << 22))
+            sat = ".sat";
+    }
+    dstmask[len] = 0;
+
+    switch ((a0 >> 19) & 0x7) {
+    case 0:
+        if (dst_nr > 15)
+            fprintf(out, "bad destination reg R%d\n", dst_nr);
+        sprintf(dstname, "R%d%s%s", dst_nr, dstmask, sat);
+        break;
+    case 4:
+        if (dst_nr > 0)
+            fprintf(out, "bad destination reg oC%d\n", dst_nr);
+        sprintf(dstname, "oC%s%s", dstmask, sat);
+        break;
+    case 5:
+        if (dst_nr > 0)
+            fprintf(out, "bad destination reg oD%d\n", dst_nr);
+        sprintf(dstname, "oD%s%s",  dstmask, sat);
+        break;
+    case 6:
+        if (dst_nr > 2)
+            fprintf(out, "bad destination reg U%d\n", dst_nr);
+        sprintf(dstname, "U%d%s%s", dst_nr, dstmask, sat);
+        break;
+    default:
+        sprintf(dstname, "RESERVED");
+        break;
+    }
+}
+
+static char *
+i915_get_channel_swizzle(uint32_t select)
+{
+    /* Maps a 4 bit channel select to its text: bits 2:0 choose among
+     * "x", "y", "z", "w", "0" and "1" (6 and 7 give "bad") and bit 3
+     * selects the variant with a leading '-'.
+     */
+    static char *const names[8][2] = {
+        { "x",   "-x"   },
+        { "y",   "-y"   },
+        { "z",   "-z"   },
+        { "w",   "-w"   },
+        { "0",   "-0"   },
+        { "1",   "-1"   },
+        { "bad", "-bad" },
+        { "bad", "-bad" },
+    };
+
+    return names[select & 0x7][(select & 8) ? 1 : 0];
+}
+
+/**
+ * Writes the printable name of a source register into name.
+ *
+ * src_type selects the register file and src_nr the register inside it.
+ * Register numbers outside the range accepted for the file, and unknown
+ * register files, are reported through fprintf(out, ...).
+ */
+static void
+i915_get_instruction_src_name(uint32_t src_type, uint32_t src_nr, char *name)
+{
+    uint32_t highest_nr;
+
+    switch (src_type) {
+    case 0:
+	sprintf(name, "R%d", src_nr);
+	highest_nr = 15;
+	break;
+    case 2:
+	sprintf(name, "C%d", src_nr);
+	highest_nr = 31;
+	break;
+    case 6:
+	sprintf(name, "U%d", src_nr);
+	highest_nr = 2;
+	break;
+    case 1:
+	/* Numbers 0-7 are T registers; 8-10 have dedicated names. */
+	if (src_nr < 8) {
+	    sprintf(name, "T%d", src_nr);
+	} else if (src_nr == 8) {
+	    sprintf(name, "DIFFUSE");
+	} else if (src_nr == 9) {
+	    sprintf(name, "SPECULAR");
+	} else if (src_nr == 10) {
+	    sprintf(name, "FOG");
+	} else {
+	    fprintf(out, "bad src reg T%d\n", src_nr);
+	    sprintf(name, "RESERVED");
+	}
+	return;
+    case 4:
+	sprintf(name, "oC");
+	if (src_nr > 0)
+	    fprintf(out, "bad src reg oC%d\n", src_nr);
+	return;
+    case 5:
+	sprintf(name, "oD");
+	if (src_nr > 0)
+	    fprintf(out, "bad src reg oD%d\n", src_nr);
+	return;
+    default:
+	fprintf(out, "bad src reg type %d\n", src_type);
+	sprintf(name, "RESERVED");
+	return;
+    }
+
+    /* Shared range check for the numbered files (R, C and U). */
+    if (src_nr > highest_nr)
+	fprintf(out, "bad src reg %s\n", name);
+}
+
+/**
+ * Formats source operand 0 of the instruction starting at data[i] into
+ * srcname.  The register fields come from the first dword and the four
+ * channel selects from the top half of the second dword.  The swizzle suffix
+ * is appended only when it differs from the identity ".xyzw".
+ */
+static void
+i915_get_instruction_src0(const uint32_t *data, int i, char *srcname)
+{
+    const uint32_t a0 = data[i];
+    const uint32_t a1 = data[i + 1];
+    char swizzle[100];
+
+    i915_get_instruction_src_name((a0 >> 7) & 0x7, (a0 >> 2) & 0x1f, srcname);
+
+    util_snprintf(swizzle, sizeof(swizzle), ".%s%s%s%s",
+		  i915_get_channel_swizzle((a1 >> 28) & 0xf),
+		  i915_get_channel_swizzle((a1 >> 24) & 0xf),
+		  i915_get_channel_swizzle((a1 >> 20) & 0xf),
+		  i915_get_channel_swizzle((a1 >> 16) & 0xf));
+
+    /* An identity swizzle adds no information, so leave it off. */
+    if (strcmp(swizzle, ".xyzw") == 0)
+	return;
+    strcat(srcname, swizzle);
+}
+
+/**
+ * Formats source operand 1 of the instruction starting at data[i] into
+ * srcname.  The register fields and the x/y selects come from the second
+ * dword, the z/w selects from the top byte of the third dword.  The swizzle
+ * suffix is appended only when it differs from the identity ".xyzw".
+ */
+static void
+i915_get_instruction_src1(const uint32_t *data, int i, char *srcname)
+{
+    const uint32_t a1 = data[i + 1];
+    const uint32_t a2 = data[i + 2];
+    char swizzle[100];
+
+    i915_get_instruction_src_name((a1 >> 13) & 0x7, (a1 >> 8) & 0x1f, srcname);
+
+    util_snprintf(swizzle, sizeof(swizzle), ".%s%s%s%s",
+		  i915_get_channel_swizzle((a1 >> 4) & 0xf),
+		  i915_get_channel_swizzle((a1 >> 0) & 0xf),
+		  i915_get_channel_swizzle((a2 >> 28) & 0xf),
+		  i915_get_channel_swizzle((a2 >> 24) & 0xf));
+
+    /* An identity swizzle adds no information, so leave it off. */
+    if (strcmp(swizzle, ".xyzw") == 0)
+	return;
+    strcat(srcname, swizzle);
+}
+
+/**
+ * Formats source operand 2 of the instruction starting at data[i] into
+ * srcname.  Every field comes from the third dword.  The swizzle suffix is
+ * appended only when it differs from the identity ".xyzw".
+ */
+static void
+i915_get_instruction_src2(const uint32_t *data, int i, char *srcname)
+{
+    const uint32_t a2 = data[i + 2];
+    char swizzle[100];
+
+    i915_get_instruction_src_name((a2 >> 21) & 0x7, (a2 >> 16) & 0x1f, srcname);
+
+    util_snprintf(swizzle, sizeof(swizzle), ".%s%s%s%s",
+		  i915_get_channel_swizzle((a2 >> 12) & 0xf),
+		  i915_get_channel_swizzle((a2 >> 8) & 0xf),
+		  i915_get_channel_swizzle((a2 >> 4) & 0xf),
+		  i915_get_channel_swizzle((a2 >> 0) & 0xf));
+
+    /* An identity swizzle adds no information, so leave it off. */
+    if (strcmp(swizzle, ".xyzw") == 0)
+	return;
+    strcat(srcname, swizzle);
+}
+
+/**
+ * Writes the printable name of a texture-instruction address register into
+ * name.  Only register files 0 (R), 1 (T / DIFFUSE / SPECULAR / FOG), 4 (oC)
+ * and 5 (oD) are accepted here; anything else is reported through
+ * fprintf(out, ...) and named "RESERVED".
+ */
+static void
+i915_get_instruction_addr(uint32_t src_type, uint32_t src_nr, char *name)
+{
+    if (src_type == 0) {
+	sprintf(name, "R%d", src_nr);
+	if (src_nr > 15)
+	    fprintf(out, "bad src reg %s\n", name);
+    } else if (src_type == 1) {
+	/* Numbers 0-7 are T registers; 8-10 have dedicated names. */
+	if (src_nr < 8) {
+	    sprintf(name, "T%d", src_nr);
+	} else if (src_nr == 8) {
+	    sprintf(name, "DIFFUSE");
+	} else if (src_nr == 9) {
+	    sprintf(name, "SPECULAR");
+	} else if (src_nr == 10) {
+	    sprintf(name, "FOG");
+	} else {
+	    fprintf(out, "bad src reg T%d\n", src_nr);
+	    sprintf(name, "RESERVED");
+	}
+    } else if (src_type == 4) {
+	sprintf(name, "oC");
+	if (src_nr > 0)
+	    fprintf(out, "bad src reg oC%d\n", src_nr);
+    } else if (src_type == 5) {
+	sprintf(name, "oD");
+	if (src_nr > 0)
+	    fprintf(out, "bad src reg oD%d\n", src_nr);
+    } else {
+	fprintf(out, "bad src reg type %d\n", src_type);
+	sprintf(name, "RESERVED");
+    }
+}
+
+/**
+ * Prints a one-source ALU instruction occupying dwords i..i+2 of data.
+ * The disassembly goes on the first dword's line; the remaining two dwords
+ * are printed with just the instruction prefix.
+ */
+static void
+i915_decode_alu1(const uint32_t *data, uint32_t hw_offset,
+		 int i, char *instr_prefix, char *op_name)
+{
+    char dst[100];
+    char src0[100];
+    int dword;
+
+    i915_get_instruction_dst(data, i, dst, 1);
+    i915_get_instruction_src0(data, i, src0);
+
+    instr_out(data, hw_offset, i, "%s: %s %s, %s\n", instr_prefix,
+	      op_name, dst, src0);
+    for (dword = i + 1; dword <= i + 2; dword++)
+	instr_out(data, hw_offset, dword, "%s\n", instr_prefix);
+}
+
+/**
+ * Prints a two-source ALU instruction occupying dwords i..i+2 of data.
+ * The disassembly goes on the first dword's line; the remaining two dwords
+ * are printed with just the instruction prefix.
+ */
+static void
+i915_decode_alu2(const uint32_t *data, uint32_t hw_offset,
+		 int i, char *instr_prefix, char *op_name)
+{
+    char dst[100];
+    char src0[100];
+    char src1[100];
+    int dword;
+
+    i915_get_instruction_dst(data, i, dst, 1);
+    i915_get_instruction_src0(data, i, src0);
+    i915_get_instruction_src1(data, i, src1);
+
+    instr_out(data, hw_offset, i, "%s: %s %s, %s, %s\n", instr_prefix,
+	      op_name, dst, src0, src1);
+    for (dword = i + 1; dword <= i + 2; dword++)
+	instr_out(data, hw_offset, dword, "%s\n", instr_prefix);
+}
+
+/**
+ * Prints a three-source ALU instruction occupying dwords i..i+2 of data.
+ * The disassembly goes on the first dword's line; the remaining two dwords
+ * are printed with just the instruction prefix.
+ */
+static void
+i915_decode_alu3(const uint32_t *data, uint32_t hw_offset,
+		 int i, char *instr_prefix, char *op_name)
+{
+    char dst[100];
+    char src0[100];
+    char src1[100];
+    char src2[100];
+    int dword;
+
+    i915_get_instruction_dst(data, i, dst, 1);
+    i915_get_instruction_src0(data, i, src0);
+    i915_get_instruction_src1(data, i, src1);
+    i915_get_instruction_src2(data, i, src2);
+
+    instr_out(data, hw_offset, i, "%s: %s %s, %s, %s, %s\n", instr_prefix,
+	      op_name, dst, src0, src1, src2);
+    for (dword = i + 1; dword <= i + 2; dword++)
+	instr_out(data, hw_offset, dword, "%s\n", instr_prefix);
+}
+
+/**
+ * Prints a texture instruction occupying dwords i..i+2 of data.
+ *
+ * The destination is formatted without write mask or saturate suffix, the
+ * sampler number is the low nibble of the first dword, and the address
+ * register is taken from bits 24-26 (file) and 17-20 (number) of the second
+ * dword.
+ */
+static void
+i915_decode_tex(const uint32_t *data, uint32_t hw_offset, int i, char *instr_prefix,
+		char *tex_name)
+{
+    const uint32_t t0 = data[i];
+    const uint32_t t1 = data[i + 1];
+    const int sampler_nr = t0 & 0xf;
+    char dst_name[100];
+    char addr_name[100];
+    int dword;
+
+    i915_get_instruction_dst(data, i, dst_name, 0);
+    i915_get_instruction_addr((t1 >> 24) & 0x7, (t1 >> 17) & 0xf, addr_name);
+
+    instr_out(data, hw_offset, i, "%s: %s %s, S%d, %s\n", instr_prefix,
+	      tex_name, dst_name, sampler_nr, addr_name);
+    for (dword = i + 1; dword <= i + 2; dword++)
+	instr_out(data, hw_offset, dword, "%s\n", instr_prefix);
+}
+
+/**
+ * Prints a DCL (declaration) instruction occupying dwords i..i+2 of data.
+ *
+ * Bits 19-20 of the first dword select what is declared: 1 is a T-file
+ * register (T0-T7, DIFFUSE, SPECULAR, FOG) with a channel mask from bits
+ * 10-13, 3 is a sampler with its type in bits 22-23.  Anything else is
+ * printed as reserved.  Suspicious masks and register numbers are reported
+ * through fprintf(out, ...).
+ *
+ * Every path prints exactly three dwords, so the trailing prefix-only lines
+ * always land on dwords i+1 and i+2.
+ */
+static void
+i915_decode_dcl(const uint32_t *data, uint32_t hw_offset, int i, char *instr_prefix)
+{
+    uint32_t d0 = data[i];
+    char *sampletype;
+    int dcl_nr = (d0 >> 14) & 0xf;
+    char *dcl_x = d0 & (1 << 10) ? "x" : "";
+    char *dcl_y = d0 & (1 << 11) ? "y" : "";
+    char *dcl_z = d0 & (1 << 12) ? "z" : "";
+    char *dcl_w = d0 & (1 << 13) ? "w" : "";
+    char dcl_mask[10];
+
+    switch ((d0 >> 19) & 0x3) {
+    case 1:
+	util_snprintf(dcl_mask, sizeof(dcl_mask), ".%s%s%s%s", dcl_x, dcl_y, dcl_z, dcl_w);
+	if (strcmp(dcl_mask, ".") == 0)
+	    fprintf(out, "bad (empty) dcl mask\n");
+
+	if (dcl_nr > 10)
+	    fprintf(out, "bad T%d dcl register number\n", dcl_nr);
+	if (dcl_nr < 8) {
+	    if (strcmp(dcl_mask, ".x") != 0 &&
+		strcmp(dcl_mask, ".xy") != 0 &&
+		strcmp(dcl_mask, ".xz") != 0 &&
+		strcmp(dcl_mask, ".w") != 0 &&
+		strcmp(dcl_mask, ".xyzw") != 0) {
+		/* dcl_mask already starts with '.', so do not add another. */
+		fprintf(out, "bad T%d%s dcl mask\n", dcl_nr, dcl_mask);
+	    }
+	    instr_out(data, hw_offset, i++, "%s: DCL T%d%s\n", instr_prefix,
+		      dcl_nr, dcl_mask);
+	} else {
+	    if (strcmp(dcl_mask, ".xz") == 0)
+		fprintf(out, "errataed bad dcl mask %s\n", dcl_mask);
+	    else if (strcmp(dcl_mask, ".xw") == 0)
+		fprintf(out, "errataed bad dcl mask %s\n", dcl_mask);
+	    else if (strcmp(dcl_mask, ".xzw") == 0)
+		fprintf(out, "errataed bad dcl mask %s\n", dcl_mask);
+
+	    if (dcl_nr == 8) {
+		instr_out(data, hw_offset, i++, "%s: DCL DIFFUSE%s\n", instr_prefix,
+			  dcl_mask);
+	    } else if (dcl_nr == 9) {
+		instr_out(data, hw_offset, i++, "%s: DCL SPECULAR%s\n", instr_prefix,
+			  dcl_mask);
+	    } else if (dcl_nr == 10) {
+		instr_out(data, hw_offset, i++, "%s: DCL FOG%s\n", instr_prefix,
+			  dcl_mask);
+	    } else {
+		/* Register numbers 11-15 were already reported as bad above.
+		 * The first dword must still be printed and consumed here,
+		 * otherwise the two lines below would be emitted for dwords
+		 * i and i+1 instead of i+1 and i+2.
+		 */
+		instr_out(data, hw_offset, i++, "%s: DCL T%d%s\n", instr_prefix,
+			  dcl_nr, dcl_mask);
+	    }
+	}
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	break;
+    case 3:
+	switch ((d0 >> 22) & 0x3) {
+	case 0:
+	    sampletype = "2D";
+	    break;
+	case 1:
+	    sampletype = "CUBE";
+	    break;
+	case 2:
+	    sampletype = "3D";
+	    break;
+	default:
+	    sampletype = "RESERVED";
+	    break;
+	}
+	if (dcl_nr > 15)
+	    fprintf(out, "bad S%d dcl register number\n", dcl_nr);
+	instr_out(data, hw_offset, i++, "%s: DCL S%d %s\n", instr_prefix,
+		  dcl_nr, sampletype);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	break;
+    default:
+	instr_out(data, hw_offset, i++, "%s: DCL RESERVED%d\n", instr_prefix, dcl_nr);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+    }
+}
+
+/**
+ * Prints one three-dword pixel shader instruction starting at data[i].
+ *
+ * The opcode is read from bits 24-28 of the first dword and dispatched to
+ * the matching ALU / texture / DCL printer.  Opcodes without a dedicated
+ * printer are shown as "unknown"; every path prints all three dwords.
+ */
+static void
+i915_decode_instruction(const uint32_t *data, uint32_t hw_offset,
+			int i, char *instr_prefix)
+{
+    switch ((data[i] >> 24) & 0x1f) {
+    case 0x0:
+	instr_out(data, hw_offset, i++, "%s: NOP\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	break;
+    case 0x01:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "ADD");
+	break;
+    case 0x02:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "MOV");
+	break;
+    case 0x03:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "MUL");
+	break;
+    case 0x04:
+	i915_decode_alu3(data, hw_offset, i, instr_prefix, "MAD");
+	break;
+    case 0x05:
+	i915_decode_alu3(data, hw_offset, i, instr_prefix, "DP2ADD");
+	break;
+    case 0x06:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "DP3");
+	break;
+    case 0x07:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "DP4");
+	break;
+    case 0x08:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "FRC");
+	break;
+    case 0x09:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "RCP");
+	break;
+    case 0x0a:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "RSQ");
+	break;
+    case 0x0b:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "EXP");
+	break;
+    case 0x0c:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "LOG");
+	break;
+    case 0x0d:
+	/* CMP selects between src1 and src2 based on src0, so it has three
+	 * sources; decoding it as a two-source op would drop the third.
+	 */
+	i915_decode_alu3(data, hw_offset, i, instr_prefix, "CMP");
+	break;
+    case 0x0e:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "MIN");
+	break;
+    case 0x0f:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "MAX");
+	break;
+    case 0x10:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "FLR");
+	break;
+    case 0x11:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "MOD");
+	break;
+    case 0x12:
+	i915_decode_alu1(data, hw_offset, i, instr_prefix, "TRC");
+	break;
+    case 0x13:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "SGE");
+	break;
+    case 0x14:
+	i915_decode_alu2(data, hw_offset, i, instr_prefix, "SLT");
+	break;
+    case 0x15:
+	i915_decode_tex(data, hw_offset, i, instr_prefix, "TEXLD");
+	break;
+    case 0x16:
+	i915_decode_tex(data, hw_offset, i, instr_prefix, "TEXLDP");
+	break;
+    case 0x17:
+	i915_decode_tex(data, hw_offset, i, instr_prefix, "TEXLDB");
+	break;
+    case 0x19:
+	i915_decode_dcl(data, hw_offset, i, instr_prefix);
+	break;
+    default:
+	instr_out(data, hw_offset, i++, "%s: unknown\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+	break;
+    }
+}
+
+/**
+ * Decodes one instruction of the 3D opcode group 0x1d starting at data[0].
+ *
+ * data      - dwords to decode; data[0] is the instruction header
+ * count     - number of dwords available in data
+ * hw_offset - offset forwarded to instr_out() for every dword printed
+ * failures  - incremented for each malformed or unknown instruction
+ * i830      - non-zero to accept the table entries flagged as i830-only
+ *
+ * Sub-opcodes with a variable layout are handled by the switch; everything
+ * else is looked up in the opcodes_3d_1d table and dumped dword by dword.
+ * Returns the instruction length in dwords as encoded in the header (1 for
+ * an unknown instruction).  A buffer that is too short is handed to
+ * BUFFER_FAIL().
+ */
+static int
+decode_3d_1d(const uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830)
+{
+    unsigned int len, i, c, opcode, word, map, sampler, instr;
+    char *format;
+
+    struct {
+	uint32_t opcode;
+	int i830_only;
+	int min_len;
+	int max_len;
+	char *name;
+    } opcodes_3d_1d[] = {
+	{ 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" },
+	{ 0x86, 0, 4, 4, "3DSTATE_CHROMA_KEY" },
+	{ 0x9c, 0, 1, 1, "3DSTATE_CLEAR_PARAMETERS" },
+	{ 0x88, 0, 2, 2, "3DSTATE_CONSTANT_BLEND_COLOR" },
+	{ 0x99, 0, 2, 2, "3DSTATE_DEFAULT_DIFFUSE" },
+	{ 0x9a, 0, 2, 2, "3DSTATE_DEFAULT_SPECULAR" },
+	{ 0x98, 0, 2, 2, "3DSTATE_DEFAULT_Z" },
+	{ 0x97, 0, 2, 2, "3DSTATE_DEPTH_OFFSET_SCALE" },
+	{ 0x85, 0, 2, 2, "3DSTATE_DEST_BUFFER_VARIABLES" },
+	{ 0x80, 0, 5, 5, "3DSTATE_DRAWING_RECTANGLE" },
+	{ 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" },
+	{ 0x9d, 0, 65, 65, "3DSTATE_FILTER_COEFFICIENTS_4X4" },
+	{ 0x9e, 0, 4, 4, "3DSTATE_MONO_FILTER" },
+	{ 0x89, 0, 4, 4, "3DSTATE_FOG_MODE" },
+	{ 0x8f, 0, 2, 16, "3DSTATE_MAP_PALLETE_LOAD_32" },
+	{ 0x81, 0, 3, 3, "3DSTATE_SCISSOR_RECTANGLE" },
+	{ 0x83, 0, 2, 2, "3DSTATE_SPAN_STIPPLE" },
+	{ 0x8c, 1, 2, 2, "3DSTATE_MAP_COORD_TRANSFORM_I830" },
+	{ 0x8b, 1, 2, 2, "3DSTATE_MAP_VERTEX_TRANSFORM_I830" },
+	{ 0x8d, 1, 3, 3, "3DSTATE_W_STATE_I830" },
+	{ 0x01, 1, 2, 2, "3DSTATE_COLOR_FACTOR_I830" },
+	{ 0x02, 1, 2, 2, "3DSTATE_MAP_COORD_SETBIND_I830" },
+    };
+
+    /* In the bounds checks below, printing N dwords starting at index i
+     * needs indices i..i+N-1 to be valid, i.e. i + N <= count.
+     */
+    switch ((data[0] & 0x00ff0000) >> 16) {
+    case 0x07:
+	/* This instruction is unusual.  A 0 length means just 1 DWORD instead of
+	 * 2.  The 0 length is specified in one place to be unsupported, but
+	 * stated to be required in another, and 0 length LOAD_INDIRECTs appear
+	 * to cause no harm at least.
+	 */
+	instr_out(data, hw_offset, 0, "3DSTATE_LOAD_INDIRECT\n");
+	len = (data[0] & 0x000000ff) + 1;
+	i = 1;
+	if (data[0] & (0x01 << 8)) {
+	    if (i + 2 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+	    instr_out(data, hw_offset, i++, "SIS.0\n");
+	    instr_out(data, hw_offset, i++, "SIS.1\n");
+	}
+	if (data[0] & (0x02 << 8)) {
+	    if (i + 1 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+	    instr_out(data, hw_offset, i++, "DIS.0\n");
+	}
+	if (data[0] & (0x04 << 8)) {
+	    if (i + 2 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+	    instr_out(data, hw_offset, i++, "SSB.0\n");
+	    instr_out(data, hw_offset, i++, "SSB.1\n");
+	}
+	if (data[0] & (0x08 << 8)) {
+	    if (i + 2 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+	    instr_out(data, hw_offset, i++, "MSB.0\n");
+	    instr_out(data, hw_offset, i++, "MSB.1\n");
+	}
+	if (data[0] & (0x10 << 8)) {
+	    if (i + 2 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+	    instr_out(data, hw_offset, i++, "PSP.0\n");
+	    instr_out(data, hw_offset, i++, "PSP.1\n");
+	}
+	if (data[0] & (0x20 << 8)) {
+	    if (i + 2 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+	    instr_out(data, hw_offset, i++, "PSC.0\n");
+	    instr_out(data, hw_offset, i++, "PSC.1\n");
+	}
+	if (len != i) {
+	    fprintf(out, "Bad count in 3DSTATE_LOAD_INDIRECT\n");
+	    (*failures)++;
+	}
+	return len;
+    case 0x04:
+	instr_out(data, hw_offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_1\n");
+	len = (data[0] & 0x0000000f) + 2;
+	i = 1;
+	for (word = 0; word <= 7; word++) {
+	    if (data[0] & (1 << (4 + word))) {
+		if (i >= count)
+		    BUFFER_FAIL(count, len, "3DSTATE_LOAD_STATE_IMMEDIATE_1");
+
+		/* save vertex state for decode */
+		if (word == 2) {
+		    saved_s2_set = 1;
+		    saved_s2 = data[i];
+		}
+		if (word == 4) {
+		    saved_s4_set = 1;
+		    saved_s4 = data[i];
+		}
+
+		instr_out(data, hw_offset, i++, "S%d\n", word);
+	    }
+	}
+	if (len != i) {
+	    fprintf(out, "Bad count in 3DSTATE_LOAD_STATE_IMMEDIATE_1\n");
+	    (*failures)++;
+	}
+	return len;
+    case 0x00:
+	instr_out(data, hw_offset, 0, "3DSTATE_MAP_STATE\n");
+	len = (data[0] & 0x0000003f) + 2;
+	/* The mask dword is read below, so it has to be present. */
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_MAP_STATE");
+	instr_out(data, hw_offset, 1, "mask\n");
+
+	i = 2;
+	for (map = 0; map <= 15; map++) {
+	    if (data[1] & (1 << map)) {
+		if (i + 3 > count)
+		    BUFFER_FAIL(count, len, "3DSTATE_MAP_STATE");
+		instr_out(data, hw_offset, i++, "map %d MS2\n", map);
+		instr_out(data, hw_offset, i++, "map %d MS3\n", map);
+		instr_out(data, hw_offset, i++, "map %d MS4\n", map);
+	    }
+	}
+	if (len != i) {
+	    fprintf(out, "Bad count in 3DSTATE_MAP_STATE\n");
+	    (*failures)++;
+	}
+	return len;
+    case 0x06:
+	instr_out(data, hw_offset, 0, "3DSTATE_PIXEL_SHADER_CONSTANTS\n");
+	len = (data[0] & 0x000000ff) + 2;
+	/* The constant mask in data[1] is read below. */
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_PIXEL_SHADER_CONSTANTS");
+
+	i = 2;
+	for (c = 0; c <= 31; c++) {
+	    /* Unsigned constant: c reaches 31 and 1 << 31 overflows int. */
+	    if (data[1] & (1u << c)) {
+		if (i + 4 > count)
+		    BUFFER_FAIL(count, len, "3DSTATE_PIXEL_SHADER_CONSTANTS");
+		instr_out(data, hw_offset, i, "C%d.X = %f\n",
+			  c, int_as_float(data[i]));
+		i++;
+		instr_out(data, hw_offset, i, "C%d.Y = %f\n",
+			  c, int_as_float(data[i]));
+		i++;
+		instr_out(data, hw_offset, i, "C%d.Z = %f\n",
+			  c, int_as_float(data[i]));
+		i++;
+		instr_out(data, hw_offset, i, "C%d.W = %f\n",
+			  c, int_as_float(data[i]));
+		i++;
+	    }
+	}
+	if (len != i) {
+	    fprintf(out, "Bad count in 3DSTATE_PIXEL_SHADER_CONSTANTS\n");
+	    (*failures)++;
+	}
+	return len;
+    case 0x05:
+	instr_out(data, hw_offset, 0, "3DSTATE_PIXEL_SHADER_PROGRAM\n");
+	len = (data[0] & 0x000000ff) + 2;
+	if ((len - 1) % 3 != 0 || len > 370) {
+	    fprintf(out, "Bad count in 3DSTATE_PIXEL_SHADER_PROGRAM\n");
+	    (*failures)++;
+	}
+	i = 1;
+	/* Each shader instruction is three dwords long. */
+	for (instr = 0; instr < (len - 1) / 3; instr++) {
+	    char instr_prefix[10];
+
+	    if (i + 3 > count)
+		BUFFER_FAIL(count, len, "3DSTATE_PIXEL_SHADER_PROGRAM");
+	    util_snprintf(instr_prefix, sizeof(instr_prefix), "PS%03d", instr);
+	    i915_decode_instruction(data, hw_offset, i, instr_prefix);
+	    i += 3;
+	}
+	return len;
+    case 0x01:
+	/* On i830 this opcode is looked up in the table below instead. */
+	if (i830)
+	    break;
+	instr_out(data, hw_offset, 0, "3DSTATE_SAMPLER_STATE\n");
+	len = (data[0] & 0x0000003f) + 2;
+	/* The mask dword is read below, so it has to be present. */
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_SAMPLER_STATE");
+	instr_out(data, hw_offset, 1, "mask\n");
+	i = 2;
+	for (sampler = 0; sampler <= 15; sampler++) {
+	    if (data[1] & (1 << sampler)) {
+		if (i + 3 > count)
+		    BUFFER_FAIL(count, len, "3DSTATE_SAMPLER_STATE");
+		instr_out(data, hw_offset, i++, "sampler %d SS2\n",
+			  sampler);
+		instr_out(data, hw_offset, i++, "sampler %d SS3\n",
+			  sampler);
+		instr_out(data, hw_offset, i++, "sampler %d SS4\n",
+			  sampler);
+	    }
+	}
+	if (len != i) {
+	    fprintf(out, "Bad count in 3DSTATE_SAMPLER_STATE\n");
+	    (*failures)++;
+	}
+	return len;
+    case 0x85:
+	len = (data[0] & 0x0000000f) + 2;
+
+	if (len != 2)
+	    fprintf(out, "Bad count in 3DSTATE_DEST_BUFFER_VARIABLES\n");
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_DEST_BUFFER_VARIABLES");
+
+	instr_out(data, hw_offset, 0,
+		  "3DSTATE_DEST_BUFFER_VARIABLES\n");
+
+	switch ((data[1] >> 8) & 0xf) {
+	case 0x0: format = "g8"; break;
+	case 0x1: format = "x1r5g5b5"; break;
+	case 0x2: format = "r5g6b5"; break;
+	case 0x3: format = "a8r8g8b8"; break;
+	case 0x4: format = "ycrcb_swapy"; break;
+	case 0x5: format = "ycrcb_normal"; break;
+	case 0x6: format = "ycrcb_swapuv"; break;
+	case 0x7: format = "ycrcb_swapuvy"; break;
+	case 0x8: format = "a4r4g4b4"; break;
+	case 0x9: format = "a1r5g5b5"; break;
+	case 0xa: format = "a2r10g10b10"; break;
+	default: format = "BAD"; break;
+	}
+	instr_out(data, hw_offset, 1, "%s format, early Z %sabled\n",
+		  format,
+		  (data[1] & (1u << 31)) ? "en" : "dis");
+	return len;
+    }
+
+    /* Fixed-layout instructions: look the opcode up and dump the payload. */
+    for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]);
+	 opcode++)
+    {
+	if (opcodes_3d_1d[opcode].i830_only && !i830)
+	    continue;
+
+	if (((data[0] & 0x00ff0000) >> 16) == opcodes_3d_1d[opcode].opcode) {
+	    len = 1;
+
+	    instr_out(data, hw_offset, 0, "%s\n", opcodes_3d_1d[opcode].name);
+	    if (opcodes_3d_1d[opcode].max_len > 1) {
+		len = (data[0] & 0x0000ffff) + 2;
+		if (len < opcodes_3d_1d[opcode].min_len ||
+		    len > opcodes_3d_1d[opcode].max_len)
+		{
+		    fprintf(out, "Bad count in %s\n",
+			    opcodes_3d_1d[opcode].name);
+		    (*failures)++;
+		}
+	    }
+
+	    for (i = 1; i < len; i++) {
+		if (i >= count)
+		    BUFFER_FAIL(count, len,  opcodes_3d_1d[opcode].name);
+		instr_out(data, hw_offset, i, "dword %d\n", i);
+	    }
+
+	    return len;
+	}
+    }
+
+    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    (*failures)++;
+    return 1;
+}
+
+/**
+ * Decodes a 3DPRIMITIVE instruction starting at data[0].
+ *
+ * data      - dwords to decode; data[0] is the instruction header
+ * count     - number of dwords available in data
+ * hw_offset - offset forwarded to instr_out() for every dword printed
+ * failures  - incremented when an index list has no terminator
+ *
+ * Three encodings are handled:
+ *  - inline vertices (bit 23 clear): the vertex data follows the header and
+ *    is broken down using the S2/S4 words saved by
+ *    3DSTATE_LOAD_STATE_IMMEDIATE_1 when both have been seen;
+ *  - random indirect (bits 23 and 17 set): 16-bit vertex indices packed two
+ *    per dword, either a fixed count or terminated by 0xffff;
+ *  - sequential indirect (bit 23 set, bit 17 clear): a single start dword.
+ *
+ * Returns the number of dwords consumed, including the header.
+ */
+static int
+decode_3d_primitive(const uint32_t *data, int count, uint32_t hw_offset,
+		    int *failures)
+{
+    char immediate = (data[0] & (1 << 23)) == 0;
+    unsigned int len, i;
+    char *primtype;
+
+    switch ((data[0] >> 18) & 0xf) {
+    case 0x0: primtype = "TRILIST"; break;
+    case 0x1: primtype = "TRISTRIP"; break;
+    case 0x2: primtype = "TRISTRIP_REVERSE"; break;
+    case 0x3: primtype = "TRIFAN"; break;
+    case 0x4: primtype = "POLYGON"; break;
+    case 0x5: primtype = "LINELIST"; break;
+    case 0x6: primtype = "LINESTRIP"; break;
+    case 0x7: primtype = "RECTLIST"; break;
+    case 0x8: primtype = "POINTLIST"; break;
+    case 0x9: primtype = "DIB"; break;
+    case 0xa: primtype = "CLEAR_RECT"; break;
+    default: primtype = "unknown"; break;
+    }
+
+    /* XXX: 3DPRIM_DIB not supported */
+    if (immediate) {
+	len = (data[0] & 0x0003ffff) + 2;
+	instr_out(data, hw_offset, 0, "3DPRIMITIVE inline %s\n", primtype);
+	if (count < len)
+	    BUFFER_FAIL(count, len,  "3DPRIMITIVE inline");
+	if (!saved_s2_set || !saved_s4_set) {
+	    /* Without S2/S4 the layout is unknown: dump raw floats. */
+	    fprintf(out, "unknown vertex format\n");
+	    for (i = 1; i < len; i++) {
+		instr_out(data, hw_offset, i,
+			  "           vertex data (%f float)\n",
+			  int_as_float(data[i]));
+	    }
+	} else {
+	    unsigned int vertex = 0;
+	    for (i = 1; i < len;) {
+		unsigned int tc;
+
+/* Prints one vertex component and advances i.  The arguments (which read
+ * data[i]) are only evaluated while i is still inside the instruction.
+ */
+#define VERTEX_OUT(fmt, ...) do {					\
+    if (i < len)							\
+	instr_out(data, hw_offset, i, " V%d."fmt"\n", vertex, __VA_ARGS__); \
+    else								\
+	fprintf(out, " missing data in V%d\n", vertex);			\
+    i++;								\
+} while (0)
+
+		VERTEX_OUT("X = %f", int_as_float(data[i]));
+		VERTEX_OUT("Y = %f", int_as_float(data[i]));
+		switch (saved_s4 >> 6 & 0x7) {
+		case 0x1:
+		    VERTEX_OUT("Z = %f", int_as_float(data[i]));
+		    break;
+		case 0x2:
+		    VERTEX_OUT("Z = %f", int_as_float(data[i]));
+		    VERTEX_OUT("W = %f", int_as_float(data[i]));
+		    break;
+		case 0x3:
+		    break;
+		case 0x4:
+		    VERTEX_OUT("W = %f", int_as_float(data[i]));
+		    break;
+		default:
+		    fprintf(out, "bad S4 position mask\n");
+		}
+
+		if (saved_s4 & (1 << 10)) {
+		    VERTEX_OUT("color = (A=0x%02x, R=0x%02x, G=0x%02x, "
+			       "B=0x%02x)",
+			       data[i] >> 24,
+			       (data[i] >> 16) & 0xff,
+			       (data[i] >> 8) & 0xff,
+			       data[i] & 0xff);
+		}
+		if (saved_s4 & (1 << 11)) {
+		    VERTEX_OUT("spec = (A=0x%02x, R=0x%02x, G=0x%02x, "
+			       "B=0x%02x)",
+			       data[i] >> 24,
+			       (data[i] >> 16) & 0xff,
+			       (data[i] >> 8) & 0xff,
+			       data[i] & 0xff);
+		}
+		if (saved_s4 & (1 << 12))
+		    VERTEX_OUT("width = 0x%08x", data[i]);
+
+		/* One 4-bit format field per texture coordinate set in S2. */
+		for (tc = 0; tc <= 7; tc++) {
+		    switch ((saved_s2 >> (tc * 4)) & 0xf) {
+		    case 0x0:
+			VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+			VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i]));
+			break;
+		    case 0x1:
+			VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+			VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i]));
+			VERTEX_OUT("T%d.Z = %f", tc, int_as_float(data[i]));
+			break;
+		    case 0x2:
+			VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+			VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i]));
+			VERTEX_OUT("T%d.Z = %f", tc, int_as_float(data[i]));
+			VERTEX_OUT("T%d.W = %f", tc, int_as_float(data[i]));
+			break;
+		    case 0x3:
+			VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+			break;
+		    case 0x4:
+			VERTEX_OUT("T%d.XY = 0x%08x half-float", tc, data[i]);
+			break;
+		    case 0x5:
+			VERTEX_OUT("T%d.XY = 0x%08x half-float", tc, data[i]);
+			VERTEX_OUT("T%d.ZW = 0x%08x half-float", tc, data[i]);
+			break;
+		    case 0xf:
+			break;
+		    default:
+			fprintf(out, "bad S2.T%d format\n", tc);
+		    }
+		}
+		vertex++;
+	    }
+	}
+    } else {
+	/* indirect vertices */
+	len = data[0] & 0x0000ffff; /* index count */
+	if (data[0] & (1 << 17)) {
+	    /* random vertex access */
+	    if (count < (len + 1) / 2 + 1) {
+		BUFFER_FAIL(count, (len + 1) / 2 + 1,
+			    "3DPRIMITIVE random indirect");
+	    }
+	    instr_out(data, hw_offset, 0,
+		      "3DPRIMITIVE random indirect %s (%d)\n", primtype, len);
+	    if (len == 0) {
+		/* vertex indices continue until 0xffff is found */
+		for (i = 1; i < count; i++) {
+		    if ((data[i] & 0xffff) == 0xffff) {
+			instr_out(data, hw_offset, i,
+				  "            indices: (terminator)\n");
+			/* Dwords 0..i were consumed, terminator included. */
+			return i + 1;
+		    } else if ((data[i] >> 16) == 0xffff) {
+			instr_out(data, hw_offset, i,
+				  "            indices: 0x%04x, "
+				  "(terminator)\n",
+				  data[i] & 0xffff);
+			return i + 1;
+		    } else {
+			instr_out(data, hw_offset, i,
+				  "            indices: 0x%04x, 0x%04x\n",
+				  data[i] & 0xffff, data[i] >> 16);
+		    }
+		}
+		fprintf(out,
+			"3DPRIMITIVE: no terminator found in index buffer\n");
+		(*failures)++;
+		return count;
+	    } else {
+		/* fixed size vertex index buffer: i counts indices, dword is
+		 * the position of the pair holding index i (the first pair
+		 * follows the header at dword 1).
+		 */
+		unsigned int dword = 1;
+
+		for (i = 0; i < len; i += 2, dword++) {
+		    if (i == len - 1) {
+			/* odd count: only the low half of the last dword */
+			instr_out(data, hw_offset, dword,
+				  "            indices: 0x%04x\n",
+				  data[dword] & 0xffff);
+		    } else {
+			instr_out(data, hw_offset, dword,
+				  "            indices: 0x%04x, 0x%04x\n",
+				  data[dword] & 0xffff, data[dword] >> 16);
+		    }
+		}
+	    }
+	    return (len + 1) / 2 + 1;
+	} else {
+	    /* sequential vertex access */
+	    if (count < 2)
+		BUFFER_FAIL(count, 2, "3DPRIMITIVE seq indirect");
+	    instr_out(data, hw_offset, 0,
+		      "3DPRIMITIVE sequential indirect %s, %d starting from "
+		      "%d\n", primtype, len, data[1] & 0xffff);
+	    instr_out(data, hw_offset, 1, "           start\n");
+	    return 2;
+	}
+    }
+
+    return len;
+}
+
+/**
+ * Decodes one pre-965 3D instruction starting at data[0].
+ *
+ * Bits 24-28 of the header select the opcode group: 0x1f, 0x1d and 0x1c are
+ * forwarded to their dedicated decoders, the rest is looked up in a small
+ * table of single-dword state instructions.  Returns the instruction length
+ * in dwords; unknown opcodes count as a failure and return 1.
+ */
+static int
+decode_3d(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+    static const struct {
+	uint32_t opcode;
+	int min_len;
+	int max_len;
+	char *name;
+    } opcodes_3d[] = {
+	{ 0x06, 1, 1, "3DSTATE_ANTI_ALIASING" },
+	{ 0x08, 1, 1, "3DSTATE_BACKFACE_STENCIL_OPS" },
+	{ 0x09, 1, 1, "3DSTATE_BACKFACE_STENCIL_MASKS" },
+	{ 0x16, 1, 1, "3DSTATE_COORD_SET_BINDINGS" },
+	{ 0x15, 1, 1, "3DSTATE_FOG_COLOR" },
+	{ 0x0b, 1, 1, "3DSTATE_INDEPENDENT_ALPHA_BLEND" },
+	{ 0x0d, 1, 1, "3DSTATE_MODES_4" },
+	{ 0x0c, 1, 1, "3DSTATE_MODES_5" },
+	{ 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" },
+    };
+    const uint32_t group = (data[0] >> 24) & 0x1f;
+    unsigned int entry;
+
+    if (group == 0x1f)
+	return decode_3d_primitive(data, count, hw_offset, failures);
+    if (group == 0x1d)
+	return decode_3d_1d(data, count, hw_offset, failures, 0);
+    if (group == 0x1c)
+	return decode_3d_1c(data, count, hw_offset, failures);
+
+    for (entry = 0; entry < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
+	 entry++) {
+	unsigned int len = 1, i;
+
+	if (group != opcodes_3d[entry].opcode)
+	    continue;
+
+	instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[entry].name);
+
+	/* Only multi-dword instructions carry a length field. */
+	if (opcodes_3d[entry].max_len > 1) {
+	    len = (data[0] & 0xff) + 2;
+	    if (len < opcodes_3d[entry].min_len ||
+		len > opcodes_3d[entry].max_len)
+	    {
+		fprintf(out, "Bad count in %s\n", opcodes_3d[entry].name);
+	    }
+	}
+
+	for (i = 1; i < len; i++) {
+	    if (i >= count)
+		BUFFER_FAIL(count, len, opcodes_3d[entry].name);
+	    instr_out(data, hw_offset, i, "dword %d\n", i);
+	}
+	return len;
+    }
+
+    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    (*failures)++;
+    return 1;
+}
+
+/**
+ * Returns the printable name of a 965 surface type value, or "unknown" for
+ * values without a name.
+ */
+static const char *
+get_965_surfacetype(unsigned int surfacetype)
+{
+    static const char *const surface_names[] = {
+	"1D", "2D", "3D", "CUBE", "BUFFER", "unknown", "unknown", "NULL",
+    };
+
+    if (surfacetype >= sizeof(surface_names) / sizeof(surface_names[0]))
+	return "unknown";
+    return surface_names[surfacetype];
+}
+
+/**
+ * Returns the printable name of a 965 depth buffer format value, or
+ * "unknown" for values without a name.
+ */
+static const char *
+get_965_depthformat(unsigned int depthformat)
+{
+    if (depthformat == 0)
+	return "s8_z24float";
+    if (depthformat == 1)
+	return "z32float";
+    if (depthformat == 2)
+	return "z24s8";
+    if (depthformat == 5)
+	return "z16";
+    return "unknown";
+}
+
+/**
+ * Describes how one component of a vertex element is sourced.
+ *
+ * The 3-bit control for the component is read from data at bit
+ * 16 + (3 - component) * 4, so component 0 sits at bit 28 and component 3 at
+ * bit 16.  Control 1 yields the component's own letter (X/Y/Z/W); the other
+ * named controls yield fixed strings; everything else is "fail".
+ */
+static const char *
+get_965_element_component(uint32_t data, int component)
+{
+    static const char *const axis_names[] = { "X", "Y", "Z", "W" };
+    /* Indexed by control value; slot 1 is handled separately below. */
+    static const char *const control_names[] = {
+	"nostore", "fail", "0.0", "1.0", "0x1", "VID",
+    };
+    const uint32_t control = (data >> (16 + (3 - component) * 4)) & 0x7;
+
+    if (control == 1) {
+	if (component >= 0 && component <= 3)
+	    return axis_names[component];
+	return "fail";
+    }
+    if (control < sizeof(control_names) / sizeof(control_names[0]))
+	return control_names[control];
+    return "fail";
+}
+
+/**
+ * Returns the printable primitive topology encoded in bits 10-14 of data,
+ * or "fail" for values without a name.
+ */
+static const char *
+get_965_prim_type(uint32_t data)
+{
+    /* Indexed by the 5-bit topology value; 0 has no name. */
+    static const char *const prim_names[] = {
+	"fail",
+	"point list",
+	"line list",
+	"line strip",
+	"tri list",
+	"tri strip",
+	"tri fan",
+	"quad list",
+	"quad strip",
+	"line list adj",
+	"line strip adj",
+	"tri list adj",
+	"tri strip adj",
+	"tri strip reverse",
+	"polygon",
+	"rect list",
+	"line loop",
+	"point list bf",
+	"line strip cont",
+	"line strip bf",
+	"line strip cont bf",
+	"tri fan no stipple",
+    };
+    const uint32_t primtype = (data >> 10) & 0x1f;
+
+    if (primtype >= sizeof(prim_names) / sizeof(prim_names[0]))
+	return "fail";
+    return prim_names[primtype];
+}
+
+/**
+ * Decodes a single i965 3D pipeline command starting at data[0].
+ *
+ * Returns the number of DWORDs to advance past the command.  Apart from
+ * data[0], no DWORD is read unless its index was first checked against
+ * count, so a malformed length field cannot walk the decoder off the end of
+ * the buffer (this relies on BUFFER_FAIL leaving the function -- TODO confirm).
+ */
+static int
+decode_3d_965(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+    unsigned int opcode, len;
+    int i;
+
+    static const struct {
+	uint32_t opcode;
+	int min_len;
+	int max_len;
+	const char *name;
+    } opcodes_3d[] = {
+	{ 0x6000, 3, 3, "URB_FENCE" },
+	{ 0x6001, 2, 2, "CS_URB_STATE" },
+	{ 0x6002, 2, 2, "CONSTANT_BUFFER" },
+	{ 0x6101, 6, 6, "STATE_BASE_ADDRESS" },
+	{ 0x6102, 2, 2, "STATE_SIP" },
+	{ 0x6104, 1, 1, "3DSTATE_PIPELINE_SELECT" },
+	{ 0x680b, 1, 1, "3DSTATE_VF_STATISTICS" },
+	{ 0x6904, 1, 1, "3DSTATE_PIPELINE_SELECT" },
+	{ 0x7800, 7, 7, "3DSTATE_PIPELINED_POINTERS" },
+	{ 0x7801, 6, 6, "3DSTATE_BINDING_TABLE_POINTERS" },
+	{ 0x780b, 1, 1, "3DSTATE_VF_STATISTICS" },
+	{ 0x7808, 5, 257, "3DSTATE_VERTEX_BUFFERS" },
+	{ 0x7809, 3, 256, "3DSTATE_VERTEX_ELEMENTS" },
+	{ 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" },
+	{ 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" },
+	{ 0x7901, 5, 5, "3DSTATE_CONSTANT_COLOR" },
+	{ 0x7905, 5, 7, "3DSTATE_DEPTH_BUFFER" },
+	{ 0x7906, 2, 2, "3DSTATE_POLY_STIPPLE_OFFSET" },
+	{ 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" },
+	{ 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" },
+	{ 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" },
+	{ 0x790a, 3, 3, "3DSTATE_AA_LINE_PARAMETERS" },
+	{ 0x7b00, 6, 6, "3DPRIMITIVE" },
+    };
+
+    len = (data[0] & 0x0000ffff) + 2;
+
+    switch ((data[0] & 0xffff0000) >> 16) {
+    case 0x6101:
+	if (len != 6)
+	    fprintf(out, "Bad count in STATE_BASE_ADDRESS\n");
+	if (count < 6)
+	    BUFFER_FAIL(count, len, "STATE_BASE_ADDRESS");
+
+	instr_out(data, hw_offset, 0, "STATE_BASE_ADDRESS\n");
+
+	if (data[1] & 1) {
+	    instr_out(data, hw_offset, 1, "General state at 0x%08x\n",
+		      data[1] & ~1);
+	} else
+	    instr_out(data, hw_offset, 1, "General state not updated\n");
+
+	if (data[2] & 1) {
+	    instr_out(data, hw_offset, 2, "Surface state at 0x%08x\n",
+		      data[2] & ~1);
+	} else
+	    instr_out(data, hw_offset, 2, "Surface state not updated\n");
+
+	if (data[3] & 1) {
+	    instr_out(data, hw_offset, 3, "Indirect state at 0x%08x\n",
+		      data[3] & ~1);
+	} else
+	    instr_out(data, hw_offset, 3, "Indirect state not updated\n");
+
+	if (data[4] & 1) {
+	    instr_out(data, hw_offset, 4, "General state upper bound 0x%08x\n",
+		      data[4] & ~1);
+	} else
+	    instr_out(data, hw_offset, 4, "General state not updated\n");
+
+	if (data[5] & 1) {
+	    instr_out(data, hw_offset, 5, "Indirect state upper bound 0x%08x\n",
+		      data[5] & ~1);
+	} else
+	    instr_out(data, hw_offset, 5, "Indirect state not updated\n");
+
+	return len;
+    case 0x7800:
+	if (len != 7)
+	    fprintf(out, "Bad count in 3DSTATE_PIPELINED_POINTERS\n");
+	if (count < 7)
+	    BUFFER_FAIL(count, len, "3DSTATE_PIPELINED_POINTERS");
+
+	instr_out(data, hw_offset, 0, "3DSTATE_PIPELINED_POINTERS\n");
+	instr_out(data, hw_offset, 1, "VS state\n");
+	instr_out(data, hw_offset, 2, "GS state\n");
+	instr_out(data, hw_offset, 3, "Clip state\n");
+	instr_out(data, hw_offset, 4, "SF state\n");
+	instr_out(data, hw_offset, 5, "WM state\n");
+	instr_out(data, hw_offset, 6, "CC state\n");
+	return len;
+    case 0x7801:
+	if (len != 6)
+	    fprintf(out, "Bad count in 3DSTATE_BINDING_TABLE_POINTERS\n");
+	if (count < 6)
+	    BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS");
+
+	instr_out(data, hw_offset, 0, "3DSTATE_BINDING_TABLE_POINTERS\n");
+	instr_out(data, hw_offset, 1, "VS binding table\n");
+	instr_out(data, hw_offset, 2, "GS binding table\n");
+	instr_out(data, hw_offset, 3, "Clip binding table\n");
+	instr_out(data, hw_offset, 4, "SF binding table\n");
+	instr_out(data, hw_offset, 5, "WM binding table\n");
+
+	return len;
+
+    case 0x7808:
+	len = (data[0] & 0xff) + 2;
+	if ((len - 1) % 4 != 0)
+	    fprintf(out, "Bad count in 3DSTATE_VERTEX_BUFFERS\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_VERTEX_BUFFERS");
+	instr_out(data, hw_offset, 0, "3DSTATE_VERTEX_BUFFERS\n");
+
+	/* Whole 4-DWORD entries only; a bad count leaves a raw partial tail. */
+	for (i = 1; i + 3 < (int)len; i += 4) {
+	    instr_out(data, hw_offset, i, "buffer %d: %s, pitch %db\n",
+		      data[i] >> 27,
+		      data[i] & (1 << 26) ? "random" : "sequential",
+		      data[i] & 0x07ff);
+	    instr_out(data, hw_offset, i + 1, "buffer address\n");
+	    instr_out(data, hw_offset, i + 2, "max index\n");
+	    instr_out(data, hw_offset, i + 3, "mbz\n");
+	}
+	for (; i < (int)len; i++)
+	    instr_out(data, hw_offset, i, "dword %d\n", i);
+	return len;
+
+    case 0x7809:
+	len = (data[0] & 0xff) + 2;
+	if ((len + 1) % 2 != 0)
+	    fprintf(out, "Bad count in 3DSTATE_VERTEX_ELEMENTS\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_VERTEX_ELEMENTS");
+	instr_out(data, hw_offset, 0, "3DSTATE_VERTEX_ELEMENTS\n");
+
+	/* Elements are DWORD pairs; never start a pair that overruns len. */
+	for (i = 1; i + 1 < (int)len; i += 2) {
+	    instr_out(data, hw_offset, i, "buffer %d: %svalid, type 0x%04x, "
+		      "src offset 0x%04x bytes\n",
+		      data[i] >> 27,
+		      data[i] & (1 << 26) ? "" : "in",
+		      (data[i] >> 16) & 0x1ff,
+		      data[i] & 0x07ff);
+	    instr_out(data, hw_offset, i + 1, "(%s, %s, %s, %s), "
+		      "dst offset 0x%02x bytes\n",
+		      get_965_element_component(data[i + 1], 0),
+		      get_965_element_component(data[i + 1], 1),
+		      get_965_element_component(data[i + 1], 2),
+		      get_965_element_component(data[i + 1], 3),
+		      (data[i + 1] & 0xff) * 4);
+	}
+	for (; i < (int)len; i++)
+	    instr_out(data, hw_offset, i, "dword %d\n", i);
+	return len;
+
+    case 0x780a:
+	len = (data[0] & 0xff) + 2;
+	if (len != 3)
+	    fprintf(out, "Bad count in 3DSTATE_INDEX_BUFFER\n");
+	/* DWORDs 1-2 are printed whatever len says, so require them too. */
+	if (count < 3 || count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_INDEX_BUFFER");
+	instr_out(data, hw_offset, 0, "3DSTATE_INDEX_BUFFER\n");
+	instr_out(data, hw_offset, 1, "beginning buffer address\n");
+	instr_out(data, hw_offset, 2, "ending buffer address\n");
+	return len;
+
+    case 0x7900:
+	if (len != 4)
+	    fprintf(out, "Bad count in 3DSTATE_DRAWING_RECTANGLE\n");
+	if (count < 4)
+	    BUFFER_FAIL(count, len, "3DSTATE_DRAWING_RECTANGLE");
+
+	instr_out(data, hw_offset, 0, "3DSTATE_DRAWING_RECTANGLE\n");
+	instr_out(data, hw_offset, 1, "top left: %d,%d\n",
+		  data[1] & 0xffff, (data[1] >> 16) & 0xffff);
+	instr_out(data, hw_offset, 2, "bottom right: %d,%d\n",
+		  data[2] & 0xffff, (data[2] >> 16) & 0xffff);
+	instr_out(data, hw_offset, 3, "origin: %d,%d\n",
+		  (int)data[3] & 0xffff, ((int)data[3] >> 16) & 0xffff);
+
+	return len;
+
+    case 0x7905:
+	if (len != 5 && len != 6)
+	    fprintf(out, "Bad count in 3DSTATE_DEPTH_BUFFER\n");
+	if (count < 5 || count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_DEPTH_BUFFER");
+
+	instr_out(data, hw_offset, 0, "3DSTATE_DEPTH_BUFFER\n");
+	instr_out(data, hw_offset, 1, "%s, %s, pitch = %d bytes, %stiled\n",
+		  get_965_surfacetype(data[1] >> 29),
+		  get_965_depthformat((data[1] >> 18) & 0x7),
+		  (data[1] & 0x0001ffff) + 1,
+		  data[1] & (1 << 27) ? "" : "not ");
+	instr_out(data, hw_offset, 2, "depth offset\n");
+	instr_out(data, hw_offset, 3, "%dx%d\n",
+		  ((data[3] & 0x0007ffc0) >> 6) + 1,
+		  ((data[3] & 0xfff80000) >> 19) + 1);
+	instr_out(data, hw_offset, 4, "volume depth\n");
+	if (len == 6)
+	    instr_out(data, hw_offset, 5, "\n");
+
+	return len;
+
+    case 0x7b00:
+	len = (data[0] & 0xff) + 2;
+	if (len != 6)
+	    fprintf(out, "Bad count in 3DPRIMITIVE\n");
+	if (count < 6 || count < len)
+	    BUFFER_FAIL(count, len, "3DPRIMITIVE");
+
+	instr_out(data, hw_offset, 0, "3DPRIMITIVE: %s %s\n",
+		  get_965_prim_type(data[0]),
+		  (data[0] & (1 << 15)) ? "random" : "sequential");
+	instr_out(data, hw_offset, 1, "vertex count\n");
+	instr_out(data, hw_offset, 2, "start vertex\n");
+	instr_out(data, hw_offset, 3, "instance count\n");
+	instr_out(data, hw_offset, 4, "start instance\n");
+	instr_out(data, hw_offset, 5, "index bias\n");
+	return len;
+    }
+
+    for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
+	 opcode++) {
+	if ((data[0] & 0xffff0000) >> 16 == opcodes_3d[opcode].opcode) {
+	    len = 1;
+
+	    instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
+	    if (opcodes_3d[opcode].max_len > 1) {
+		len = (data[0] & 0xff) + 2;
+		if (len < opcodes_3d[opcode].min_len ||
+		    len > opcodes_3d[opcode].max_len)
+		    fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+	    }
+
+	    for (i = 1; i < (int)len; i++) {
+		if (i >= count)
+		    BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+		instr_out(data, hw_offset, i, "dword %d\n", i);
+	    }
+	    return len;
+	}
+    }
+
+    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    (*failures)++;
+    return 1;
+}
+
+/* Decodes one i830-class 3D command: types 0x1f, 0x1d and 0x1c go to their
+ * dedicated decoders, anything else is looked up in the table below.
+ * Returns 1 (and bumps *failures) when the command is not recognised. */
+static int
+decode_3d_i830(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+    unsigned int entry;
+
+    struct {
+	uint32_t opcode;
+	int min_len;
+	int max_len;
+	char *name;
+    } opcodes_3d[] = {
+	{ 0x02, 1, 1, "3DSTATE_MODES_3" },
+	{ 0x03, 1, 1, "3DSTATE_ENABLES_1"},
+	{ 0x04, 1, 1, "3DSTATE_ENABLES_2"},
+	{ 0x05, 1, 1, "3DSTATE_VFT0"},
+	{ 0x06, 1, 1, "3DSTATE_AA"},
+	{ 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" },
+	{ 0x08, 1, 1, "3DSTATE_MODES_1" },
+	{ 0x09, 1, 1, "3DSTATE_STENCIL_TEST" },
+	{ 0x0a, 1, 1, "3DSTATE_VFT1"},
+	{ 0x0b, 1, 1, "3DSTATE_INDPT_ALPHA_BLEND" },
+	{ 0x0c, 1, 1, "3DSTATE_MODES_5" },
+	{ 0x0d, 1, 1, "3DSTATE_MAP_BLEND_OP" },
+	{ 0x0e, 1, 1, "3DSTATE_MAP_BLEND_ARG" },
+	{ 0x0f, 1, 1, "3DSTATE_MODES_2" },
+	{ 0x15, 1, 1, "3DSTATE_FOG_COLOR" },
+	{ 0x16, 1, 1, "3DSTATE_MODES_4" },
+    };
+    const uint32_t type = (data[0] & 0x1f000000) >> 24;
+    const unsigned int n_ops = sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
+
+    if (type == 0x1f)
+	return decode_3d_primitive(data, count, hw_offset, failures);
+    if (type == 0x1d)
+	return decode_3d_1d(data, count, hw_offset, failures, 1);
+    if (type == 0x1c)
+	return decode_3d_1c(data, count, hw_offset, failures);
+
+    for (entry = 0; entry < n_ops; entry++) {
+	unsigned int len = 1, dw;
+
+	if (type != opcodes_3d[entry].opcode)
+	    continue;
+	instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[entry].name);
+	if (opcodes_3d[entry].max_len > 1) {
+	    len = (data[0] & 0xff) + 2;
+	    if (len < opcodes_3d[entry].min_len ||
+		len > opcodes_3d[entry].max_len)
+		fprintf(out, "Bad count in %s\n", opcodes_3d[entry].name);
+	}
+
+	for (dw = 1; dw < len; dw++) {
+	    if (dw >= count)
+		BUFFER_FAIL(count, len, opcodes_3d[entry].name);
+	    instr_out(data, hw_offset, dw, "dword %d\n", dw);
+	}
+	return len;
+    }
+
+    instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+    (*failures)++;
+    return 1;
+}
+
+/**
+ * Decodes an Intel (i830 through i965) batch buffer, writing the output to
+ * stderr.
+ *
+ * \param data batch buffer contents
+ * \param count number of DWORDs to decode in the batch buffer
+ * \param hw_offset hardware address for the buffer
+ * \param devid device id; selects which 3D command decoder is used
+ * \return the number of decode failures counted
+ */
+int
+intel_decode(const uint32_t *data, int count, uint32_t hw_offset, uint32_t devid)
+{
+    int failures = 0;
+    int index = 0;
+
+    out = stderr;
+
+    while (index < count) {
+	/* Bits 31:29 of the first DWORD pick the decoder; each decoder
+	 * returns how many DWORDs to advance by. */
+	const uint32_t *cmd = data + index;
+	const int left = count - index;
+	const uint32_t addr = hw_offset + index * 4;
+	const uint32_t client = (data[index] & 0xe0000000) >> 29;
+
+	if (client == 0x0) {
+	    index += decode_mi(cmd, left, addr, &failures);
+	} else if (client == 0x2) {
+	    index += decode_2d(cmd, left, addr, &failures);
+	} else if (client == 0x3) {
+	    if (IS_965(devid))
+		index += decode_3d_965(cmd, left, addr, &failures);
+	    else if (IS_9XX(devid))
+		index += decode_3d(cmd, left, addr, &failures);
+	    else
+		index += decode_3d_i830(cmd, left, addr, &failures);
+	} else {
+	    /* No decoder for this value: report it and skip one DWORD. */
+	    instr_out(data, hw_offset, index, "UNKNOWN\n");
+	    failures++;
+	    index++;
+	}
+	fflush(out);
+    }
+
+    return failures;
+}
+
+void intel_decode_context_reset(void)
+{
+    saved_s4_set = 1;	/* NOTE(review): S4 is flagged as set while S2 is */
+    saved_s2_set = 0;	/* cleared -- confirm the asymmetry is intended.  */
+}
+
diff --git a/src/gallium/drivers/i965/intel_decode.h b/src/gallium/drivers/i965/intel_decode.h
new file mode 100644
index 0000000000..7683097b86
--- /dev/null
+++ b/src/gallium/drivers/i965/intel_decode.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+int intel_decode(const uint32_t *data, int count, uint32_t hw_offset, uint32_t devid);
+void intel_decode_context_reset(void);
diff --git a/src/gallium/drivers/i965/intel_structs.h b/src/gallium/drivers/i965/intel_structs.h
new file mode 100644
index 0000000000..522e3bd92c
--- /dev/null
+++ b/src/gallium/drivers/i965/intel_structs.h
@@ -0,0 +1,132 @@
+#ifndef INTEL_STRUCTS_H
+#define INTEL_STRUCTS_H
+
+struct br0 {                 /* 32 bits of fields; first member of the xy_*_blit packets below */
+   GLuint length:8;
+   GLuint pad0:3;
+   GLuint dst_tiled:1;
+   GLuint pad1:8;
+   GLuint write_rgb:1;
+   GLuint write_alpha:1;
+   GLuint opcode:7;          /* presumably one of the OPCODE_XY_* values below -- confirm */
+   GLuint client:3;          /* presumably CLIENT_2D for blits -- confirm */
+};
+
+   
+struct br13 {                /* 32 bits of fields; second member of the xy_*_blit packets below */
+   GLint dest_pitch:16;      /* declared signed, unlike the other fields of this struct */
+   GLuint rop:8;
+   GLuint color_depth:2;
+   GLuint pad1:3;
+   GLuint mono_source_transparency:1;
+   GLuint clipping_enable:1;
+   GLuint pad0:1;
+};
+
+
+
+/* This is an attempt to move some of the 2D interaction in this
+ * driver to using structs for packets rather than a bunch of #defines
+ * and dwords.
+ */
+struct xy_color_blit {       /* packet layout: br0, br13, two coordinate pairs, address, colour */
+   struct br0 br0;
+   struct br13 br13;
+
+   struct {                  /* first destination corner, 16 bits per coordinate */
+      GLuint dest_x1:16;
+      GLuint dest_y1:16;
+   } dw2;
+
+   struct {                  /* second destination corner, 16 bits per coordinate */
+      GLuint dest_x2:16;
+      GLuint dest_y2:16;
+   } dw3;
+   
+   GLuint dest_base_addr;
+   GLuint color;
+};
+
+struct xy_src_copy_blit {    /* packet layout: destination fields as in xy_color_blit, then source */
+   struct br0 br0;
+   struct br13 br13;
+
+   struct {                  /* first destination corner, 16 bits per coordinate */
+      GLuint dest_x1:16;
+      GLuint dest_y1:16;
+   } dw2;
+
+   struct {                  /* second destination corner, 16 bits per coordinate */
+      GLuint dest_x2:16;
+      GLuint dest_y2:16;
+   } dw3;
+   
+   GLuint dest_base_addr;
+
+   struct {                  /* source corner, 16 bits per coordinate */
+      GLuint src_x1:16;
+      GLuint src_y1:16;
+   } dw5;
+
+   struct {
+      GLint src_pitch:16;    /* declared signed, like br13.dest_pitch */
+      GLuint pad:16;
+   } dw6;
+   
+   GLuint src_base_addr;
+};
+
+struct xy_setup_blit {       /* packet layout: br0, br13, clip rectangle, address, colours, pattern */
+   struct br0 br0;
+   struct br13 br13;
+
+   struct {                  /* first clip corner, 16 bits per coordinate */
+      GLuint clip_x1:16;
+      GLuint clip_y1:16;
+   } dw2;
+
+   struct {                  /* second clip corner, 16 bits per coordinate */
+      GLuint clip_x2:16;
+      GLuint clip_y2:16;
+   } dw3;
+      
+   GLuint dest_base_addr;
+   GLuint background_color;
+   GLuint foreground_color;
+   GLuint pattern_base_addr;
+};
+
+
+struct xy_text_immediate_blit {   /* fixed part of the packet; it does not embed br0/br13 */
+   struct {                  /* 32 bits of fields, laid out differently from struct br0 */
+      GLuint length:8;
+      GLuint pad2:3;
+      GLuint dst_tiled:1;
+      GLuint pad1:4;
+      GLuint byte_packed:1;
+      GLuint pad0:5;
+      GLuint opcode:7;       /* presumably OPCODE_XY_TEXT_IMMEDIATE_BLT -- confirm */
+      GLuint client:3;
+   } dw0;
+
+   struct {                  /* first destination corner, 16 bits per coordinate */
+      GLuint dest_x1:16;
+      GLuint dest_y1:16;
+   } dw1;
+
+   struct {                  /* second destination corner, 16 bits per coordinate */
+      GLuint dest_x2:16;
+      GLuint dest_y2:16;
+   } dw2;   
+
+   /* Src bitmap data follows as inline dwords.
+    */
+};
+
+
+#define CLIENT_2D 0x2                       /* presumably a value for the 3-bit client field -- confirm */
+#define OPCODE_XY_SETUP_BLT 0x1             /* presumably values for the 7-bit opcode field -- confirm */
+#define OPCODE_XY_COLOR_BLT 0x50
+#define OPCODE_XY_TEXT_IMMEDIATE_BLT 0x31
+
+#endif
diff --git a/src/gallium/drivers/identity/Makefile b/src/gallium/drivers/identity/Makefile
new file mode 100644
index 0000000000..e32b9102e5
--- /dev/null
+++ b/src/gallium/drivers/identity/Makefile
@@ -0,0 +1,12 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+# Build settings for the identity driver; presumably read by the template below.
+LIBNAME = identity
+# C sources that make up the driver.
+C_SOURCES = \
+	id_objects.c \
+	id_context.c \
+	id_screen.c \
+	id_drm.c
+# Shared build rules; expected to consume LIBNAME and C_SOURCES -- TODO confirm.
+include ../../Makefile.template
diff --git a/src/gallium/drivers/identity/SConscript b/src/gallium/drivers/identity/SConscript
new file mode 100644
index 0000000000..2a68891c28
--- /dev/null
+++ b/src/gallium/drivers/identity/SConscript
@@ -0,0 +1,14 @@
+Import('*')
+
+env = env.Clone()
+
+identity = env.ConvenienceLibrary(
+	target = 'identity',
+	source = [
+		'id_context.c',
+		'id_drm.c',
+		'id_objects.c',
+		'id_screen.c',
+	])
+
+Export('identity')
diff --git a/src/gallium/drivers/identity/id_context.c b/src/gallium/drivers/identity/id_context.c
new file mode 100644
index 0000000000..67be895b38
--- /dev/null
+++ b/src/gallium/drivers/identity/id_context.c
@@ -0,0 +1,952 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+
+#include "id_context.h"
+#include "id_objects.h"
+
+
+/* Destroys the wrapped context, then frees the identity wrapper. */
+static void
+identity_destroy(struct pipe_context *_pipe)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->destroy(ctx);
+   FREE(id_ctx);
+}
+
+/*
+ * Forwards a non-indexed draw to the wrapped context unchanged.
+ */
+static void
+identity_draw_arrays(struct pipe_context *_pipe,
+                     unsigned prim,
+                     unsigned start,
+                     unsigned count)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->draw_arrays(ctx, prim, start, count);
+}
+
+/*
+ * Forwards an indexed draw to the wrapped context.  The index resource is
+ * replaced by the resource held inside its identity wrapper; every other
+ * argument is passed through as given.
+ */
+static void
+identity_draw_elements(struct pipe_context *_pipe,
+                       struct pipe_resource *_indexResource,
+                       unsigned indexSize,
+                       int indexBias,
+                       unsigned prim,
+                       unsigned start,
+                       unsigned count)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct identity_resource *id_index = identity_resource(_indexResource);
+   struct pipe_context *ctx = id_ctx->pipe;
+   struct pipe_resource *index_buf = id_index->resource;
+
+   ctx->draw_elements(ctx, index_buf, indexSize, indexBias,
+                      prim, start, count);
+}
+
+/*
+ * Forwards a ranged indexed draw to the wrapped context.  The index
+ * resource is replaced by the resource held inside its identity wrapper;
+ * every other argument is passed through as given.
+ */
+static void
+identity_draw_range_elements(struct pipe_context *_pipe,
+                             struct pipe_resource *_indexResource,
+                             unsigned indexSize,
+                             int indexBias,
+                             unsigned minIndex,
+                             unsigned maxIndex,
+                             unsigned mode,
+                             unsigned start,
+                             unsigned count)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct identity_resource *id_index = identity_resource(_indexResource);
+   struct pipe_context *ctx = id_ctx->pipe;
+   struct pipe_resource *index_buf = id_index->resource;
+
+   ctx->draw_range_elements(ctx, index_buf,
+                            indexSize, indexBias,
+                            minIndex, maxIndex,
+                            mode, start, count);
+}
+
+/* Creates a query on the wrapped context and returns it as-is. */
+static struct pipe_query *
+identity_create_query(struct pipe_context *_pipe,
+                      unsigned query_type)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_query(ctx, query_type);
+}
+
+/* Forwards destroy_query to the wrapped context. */
+static void
+identity_destroy_query(struct pipe_context *_pipe,
+                       struct pipe_query *query)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->destroy_query(ctx, query);
+}
+
+/* Forwards begin_query to the wrapped context. */
+static void
+identity_begin_query(struct pipe_context *_pipe,
+                     struct pipe_query *query)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->begin_query(ctx, query);
+}
+
+/* Forwards end_query to the wrapped context. */
+static void
+identity_end_query(struct pipe_context *_pipe,
+                   struct pipe_query *query)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->end_query(ctx, query);
+}
+
+/*
+ * Forwards get_query_result to the wrapped context and returns its answer.
+ */
+static boolean
+identity_get_query_result(struct pipe_context *_pipe,
+                          struct pipe_query *query,
+                          boolean wait,
+                          void *result)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->get_query_result(ctx, query, wait, result);
+}
+
+/* Creates blend state on the wrapped context and returns its handle. */
+static void *
+identity_create_blend_state(struct pipe_context *_pipe,
+                            const struct pipe_blend_state *blend)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_blend_state(ctx, blend);
+}
+
+/* Forwards bind_blend_state to the wrapped context. */
+static void
+identity_bind_blend_state(struct pipe_context *_pipe,
+                          void *blend)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_blend_state(ctx, blend);
+}
+
+/* Forwards delete_blend_state to the wrapped context. */
+static void
+identity_delete_blend_state(struct pipe_context *_pipe,
+                            void *blend)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_blend_state(ctx, blend);
+}
+
+/* Creates sampler state on the wrapped context and returns its handle. */
+static void *
+identity_create_sampler_state(struct pipe_context *_pipe,
+                              const struct pipe_sampler_state *sampler)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_sampler_state(ctx, sampler);
+}
+
+/* Forwards the fragment sampler state array to the wrapped context;
+ * the state pointers are passed through without unwrapping. */
+static void
+identity_bind_fragment_sampler_states(struct pipe_context *_pipe,
+                                      unsigned num_samplers,
+                                      void **samplers)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_fragment_sampler_states(ctx, num_samplers, samplers);
+}
+
+/* Forwards the vertex sampler state array to the wrapped context;
+ * the state pointers are passed through without unwrapping. */
+static void
+identity_bind_vertex_sampler_states(struct pipe_context *_pipe,
+                                    unsigned num_samplers,
+                                    void **samplers)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_vertex_sampler_states(ctx, num_samplers, samplers);
+}
+
+/* Forwards delete_sampler_state to the wrapped context. */
+static void
+identity_delete_sampler_state(struct pipe_context *_pipe,
+                              void *sampler)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_sampler_state(ctx, sampler);
+}
+
+/* Creates rasterizer state on the wrapped context and returns its handle. */
+static void *
+identity_create_rasterizer_state(struct pipe_context *_pipe,
+                                 const struct pipe_rasterizer_state *rasterizer)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_rasterizer_state(ctx, rasterizer);
+}
+
+/* Forwards bind_rasterizer_state to the wrapped context. */
+static void
+identity_bind_rasterizer_state(struct pipe_context *_pipe,
+                               void *rasterizer)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_rasterizer_state(ctx, rasterizer);
+}
+
+/* Forwards delete_rasterizer_state to the wrapped context. */
+static void
+identity_delete_rasterizer_state(struct pipe_context *_pipe,
+                                 void *rasterizer)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_rasterizer_state(ctx, rasterizer);
+}
+
+/* Creates depth/stencil/alpha state on the wrapped context; returns its handle. */
+static void *
+identity_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                          const struct pipe_depth_stencil_alpha_state *depth_stencil_alpha)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_depth_stencil_alpha_state(ctx, depth_stencil_alpha);
+}
+
+/* Forwards bind_depth_stencil_alpha_state to the wrapped context. */
+static void
+identity_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                        void *depth_stencil_alpha)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_depth_stencil_alpha_state(ctx, depth_stencil_alpha);
+}
+
+/* Forwards delete_depth_stencil_alpha_state to the wrapped context. */
+static void
+identity_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                          void *depth_stencil_alpha)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_depth_stencil_alpha_state(ctx, depth_stencil_alpha);
+}
+
+/* Creates fragment shader state on the wrapped context; returns its handle. */
+static void *
+identity_create_fs_state(struct pipe_context *_pipe,
+                         const struct pipe_shader_state *fs)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_fs_state(ctx, fs);
+}
+
+/* Forwards bind_fs_state to the wrapped context. */
+static void
+identity_bind_fs_state(struct pipe_context *_pipe,
+                       void *fs)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_fs_state(ctx, fs);
+}
+
+/* Forwards delete_fs_state to the wrapped context. */
+static void
+identity_delete_fs_state(struct pipe_context *_pipe,
+                         void *fs)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_fs_state(ctx, fs);
+}
+
+/* Creates vertex shader state on the wrapped context; returns its handle. */
+static void *
+identity_create_vs_state(struct pipe_context *_pipe,
+                         const struct pipe_shader_state *vs)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_vs_state(ctx, vs);
+}
+
+/* Forwards bind_vs_state to the wrapped context. */
+static void
+identity_bind_vs_state(struct pipe_context *_pipe,
+                       void *vs)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_vs_state(ctx, vs);
+}
+
+/* Forwards delete_vs_state to the wrapped context. */
+static void
+identity_delete_vs_state(struct pipe_context *_pipe,
+                         void *vs)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_vs_state(ctx, vs);
+}
+
+
+/* Creates vertex-elements state on the wrapped context and returns
+ * the handle it produced. */
+static void *
+identity_create_vertex_elements_state(struct pipe_context *_pipe,
+                                      unsigned num_elements,
+                                      const struct pipe_vertex_element *vertex_elements)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   return ctx->create_vertex_elements_state(ctx, num_elements, vertex_elements);
+}
+
+/* Forwards bind_vertex_elements_state to the wrapped context. */
+static void
+identity_bind_vertex_elements_state(struct pipe_context *_pipe,
+                                    void *velems)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->bind_vertex_elements_state(ctx, velems);
+}
+
+/* Forwards delete_vertex_elements_state to the wrapped context. */
+static void
+identity_delete_vertex_elements_state(struct pipe_context *_pipe,
+                                      void *velems)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->delete_vertex_elements_state(ctx, velems);
+}
+
+/* Forwards set_blend_color to the wrapped context. */
+static void
+identity_set_blend_color(struct pipe_context *_pipe,
+                         const struct pipe_blend_color *blend_color)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->set_blend_color(ctx, blend_color);
+}
+
+/* Forwards set_stencil_ref to the wrapped context. */
+static void
+identity_set_stencil_ref(struct pipe_context *_pipe,
+                         const struct pipe_stencil_ref *stencil_ref)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->set_stencil_ref(ctx, stencil_ref);
+}
+
+/* Forwards set_clip_state to the wrapped context. */
+static void
+identity_set_clip_state(struct pipe_context *_pipe,
+                        const struct pipe_clip_state *clip)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->set_clip_state(ctx, clip);
+}
+
+/* Forwards set_sample_mask to the wrapped context. */
+static void
+identity_set_sample_mask(struct pipe_context *_pipe,
+                         unsigned sample_mask)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->set_sample_mask(ctx, sample_mask);
+}
+
+/*
+ * Binds a constant buffer on the wrapped context.  A non-NULL resource is
+ * run through identity_resource_unwrap() first; NULL is forwarded as NULL.
+ * (The original code flagged the unwrap step with "XXX hmm?".)
+ */
+static void
+identity_set_constant_buffer(struct pipe_context *_pipe,
+                             uint shader,
+                             uint index,
+                             struct pipe_resource *_resource)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+   struct pipe_resource *buf;
+
+   if (_resource != NULL)
+      buf = identity_resource_unwrap(_resource);
+   else
+      buf = NULL;
+
+   ctx->set_constant_buffer(ctx, shader, index, buf);
+}
+
+/* Forwards framebuffer state to the wrapped context using a local copy
+ * whose colour and zs surfaces are unwrapped; NULL is forwarded as NULL. */
+static void
+identity_set_framebuffer_state(struct pipe_context *_pipe,
+                               const struct pipe_framebuffer_state *_state)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+   struct pipe_framebuffer_state local;
+   unsigned n;
+
+   if (_state == NULL) {
+      ctx->set_framebuffer_state(ctx, NULL);
+      return;
+   }
+   memcpy(&local, _state, sizeof(local));
+   for (n = 0; n < _state->nr_cbufs; n++)
+      local.cbufs[n] = identity_surface_unwrap(_state->cbufs[n]);
+   while (n < PIPE_MAX_COLOR_BUFS)
+      local.cbufs[n++] = NULL;
+   local.zsbuf = identity_surface_unwrap(_state->zsbuf);
+
+   ctx->set_framebuffer_state(ctx, &local);
+}
+
+/* Forwards set_polygon_stipple to the wrapped context. */
+static void
+identity_set_polygon_stipple(struct pipe_context *_pipe,
+                             const struct pipe_poly_stipple *poly_stipple)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->set_polygon_stipple(ctx, poly_stipple);
+}
+
+/* Forwards set_scissor_state to the wrapped context. */
+static void
+identity_set_scissor_state(struct pipe_context *_pipe,
+                           const struct pipe_scissor_state *scissor)
+{
+   struct identity_context *id_ctx = identity_context(_pipe);
+   struct pipe_context *ctx = id_ctx->pipe;
+
+   ctx->set_scissor_state(ctx, scissor);
+}
+
+/* Pass-through: hands the viewport state pointer to the wrapped context
+ * exactly as received. */
+static void
+identity_set_viewport_state(struct pipe_context *_pipe,
+                            const struct pipe_viewport_state *viewport)
+{
+   struct pipe_context *wrapped = identity_context(_pipe)->pipe;
+
+   wrapped->set_viewport_state(wrapped, viewport);
+}
+
+/* Bind fragment sampler views on the wrapped context.  When _views is given,
+ * its first num entries go through identity_sampler_view_unwrap() into a
+ * local array and the remaining slots of that array are set to NULL. */
+static void
+identity_set_fragment_sampler_views(struct pipe_context *_pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **_views)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_sampler_view *local[PIPE_MAX_SAMPLERS];
+   unsigned slot = 0;
+
+   if (!_views) {
+      pipe->set_fragment_sampler_views(pipe, num, NULL);
+      return;
+   }
+   for (; slot < num; slot++)
+      local[slot] = identity_sampler_view_unwrap(_views[slot]);
+   while (slot < PIPE_MAX_SAMPLERS)
+      local[slot++] = NULL;
+   pipe->set_fragment_sampler_views(pipe, num, local);
+}
+
+/* Bind vertex sampler views on the wrapped context.  When _views is given,
+ * its first num entries go through identity_sampler_view_unwrap() into a
+ * local array and the remaining slots of that array are set to NULL. */
+static void
+identity_set_vertex_sampler_views(struct pipe_context *_pipe,
+                                  unsigned num,
+                                  struct pipe_sampler_view **_views)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_sampler_view *local[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned slot = 0;
+
+   if (!_views) {
+      pipe->set_vertex_sampler_views(pipe, num, NULL);
+      return;
+   }
+   for (; slot < num; slot++)
+      local[slot] = identity_sampler_view_unwrap(_views[slot]);
+   while (slot < PIPE_MAX_VERTEX_SAMPLERS)
+      local[slot++] = NULL;
+   pipe->set_vertex_sampler_views(pipe, num, local);
+}
+
+/* Set vertex buffers on the wrapped context.  The descriptors are copied to
+ * a local array so that each .buffer can be replaced by the result of
+ * identity_resource_unwrap() without touching the caller's array. */
+static void
+identity_set_vertex_buffers(struct pipe_context *_pipe,
+                            unsigned num_buffers,
+                            const struct pipe_vertex_buffer *_buffers)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_vertex_buffer local[PIPE_MAX_SHADER_INPUTS];
+   unsigned n;
+
+   if (num_buffers == 0) {
+      pipe->set_vertex_buffers(pipe, num_buffers, NULL);
+      return;
+   }
+
+   memcpy(local, _buffers, num_buffers * sizeof(*_buffers));
+   for (n = 0; n < num_buffers; n++)
+      local[n].buffer = identity_resource_unwrap(_buffers[n].buffer);
+   pipe->set_vertex_buffers(pipe, num_buffers, local);
+}
+/**
+ * Forward a resource_copy_region request to the wrapped context.
+ *
+ * \param _dst    wrapper of the destination resource
+ * \param subdst  destination subresource, forwarded unchanged
+ * \param _src    wrapper of the source resource
+ * \param subsrc  source subresource, forwarded unchanged
+ *
+ * The coordinates and the width/height are forwarded unchanged as well.
+ * Neither resource pointer is checked for NULL before it is dereferenced.
+ */
+static void
+identity_resource_copy_region(struct pipe_context *_pipe,
+                              struct pipe_resource *_dst,
+                              struct pipe_subresource subdst,
+                              unsigned dstx,
+                              unsigned dsty,
+                              unsigned dstz,
+                              struct pipe_resource *_src,
+                              struct pipe_subresource subsrc,
+                              unsigned srcx,
+                              unsigned srcy,
+                              unsigned srcz,
+                              unsigned width,
+                              unsigned height)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_resource *real_dst = identity_resource(_dst)->resource;
+   struct pipe_resource *real_src = identity_resource(_src)->resource;
+
+   /* only the two wrappers are swapped for the resources they hold */
+   pipe->resource_copy_region(pipe,
+                              real_dst, subdst, dstx, dsty, dstz,
+                              real_src, subsrc, srcx, srcy, srcz,
+                              width, height);
+}
+
+/**
+ * Pass-through for the clear hook of pipe_context.
+ * buffers, rgba, depth and stencil are forwarded to the wrapped context
+ * without modification.
+ */
+static void
+identity_clear(struct pipe_context *_pipe,
+               unsigned buffers,
+               const float *rgba,
+               double depth,
+               unsigned stencil)
+{
+   struct pipe_context *wrapped = identity_context(_pipe)->pipe;
+
+   wrapped->clear(wrapped, buffers, rgba, depth, stencil);
+}
+
+/**
+ * Forward a clear_render_target request to the wrapped context.
+ *
+ * _dst is an identity_surface wrapper; the surface it holds is what gets
+ * passed down, and it is not checked for NULL.  rgba, dstx/dsty and
+ * width/height are forwarded unchanged.
+ */
+static void
+identity_clear_render_target(struct pipe_context *_pipe,
+                             struct pipe_surface *_dst,
+                             const float *rgba,
+                             unsigned dstx, unsigned dsty,
+                             unsigned width, unsigned height)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_surface *real_dst = identity_surface(_dst)->surface;
+
+   pipe->clear_render_target(pipe, real_dst, rgba,
+                             dstx, dsty, width, height);
+}
+/**
+ * Forward a clear_depth_stencil request to the wrapped context.
+ *
+ * _dst is an identity_surface wrapper; the surface it holds is what gets
+ * passed down, and it is not checked for NULL.  clear_flags, depth, stencil,
+ * dstx/dsty and width/height are forwarded unchanged.
+ */
+static void
+identity_clear_depth_stencil(struct pipe_context *_pipe,
+                             struct pipe_surface *_dst,
+                             unsigned clear_flags,
+                             double depth,
+                             unsigned stencil,
+                             unsigned dstx, unsigned dsty,
+                             unsigned width, unsigned height)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_surface *real_dst;
+
+   real_dst = identity_surface(_dst)->surface;
+
+   pipe->clear_depth_stencil(pipe, real_dst,
+                             clear_flags, depth, stencil,
+                             dstx, dsty, width, height);
+}
+
+/* Pass-through: flags and the fence pointer are handed to the wrapped
+ * context's flush hook exactly as received; nothing is wrapped or
+ * unwrapped here. */
+static void
+identity_flush(struct pipe_context *_pipe,
+               unsigned flags,
+               struct pipe_fence_handle **fence)
+{
+   struct pipe_context *wrapped = identity_context(_pipe)->pipe;
+
+   wrapped->flush(wrapped, flags, fence);
+}
+
+/**
+ * Forward is_resource_referenced to the wrapped context.  _resource is an
+ * identity_resource wrapper and is replaced by the resource it holds; face
+ * and level are forwarded unchanged and the result is returned as is.
+ */
+static unsigned int
+identity_is_resource_referenced(struct pipe_context *_pipe,
+                                struct pipe_resource *_resource,
+                                unsigned face,
+                                unsigned level)
+{
+   struct pipe_context *pipe = identity_context(_pipe)->pipe;
+   struct pipe_resource *real = identity_resource(_resource)->resource;
+
+   return pipe->is_resource_referenced(pipe, real, face, level);
+}
+
+/* Create a sampler view on the wrapped context for the resource held by
+ * _resource and wrap the result with identity_sampler_view_create().
+ * Returns NULL when the wrapped context returns NULL. */
+static struct pipe_sampler_view *
+identity_context_create_sampler_view(struct pipe_context *_pipe,
+                                     struct pipe_resource *_resource,
+                                     const struct pipe_sampler_view *templ)
+{
+   struct identity_context *id_context = identity_context(_pipe);
+   struct identity_resource *id_resource = identity_resource(_resource);
+   struct pipe_sampler_view *view;
+
+   view = id_context->pipe->create_sampler_view(id_context->pipe,
+                                                id_resource->resource,
+                                                templ);
+
+   return view ? identity_sampler_view_create(id_context, id_resource, view)
+               : NULL;
+}
+
+static void /* thin hook over identity_sampler_view_destroy() */
+identity_context_sampler_view_destroy(struct pipe_context *_pipe,
+                                      struct pipe_sampler_view *_view)
+{
+   struct identity_sampler_view *id_view = identity_sampler_view(_view);
+   identity_sampler_view_destroy(identity_context(_pipe), id_view);
+}
+
+/**
+ * Obtain a transfer from the wrapped context for the resource held by
+ * _resource and wrap it via identity_transfer_create().  sr, usage and box
+ * are forwarded unchanged.  Returns NULL when the wrapped context does.
+ */
+static struct pipe_transfer *
+identity_context_get_transfer(struct pipe_context *_context,
+                              struct pipe_resource *_resource,
+                              struct pipe_subresource sr,
+                              unsigned usage,
+                              const struct pipe_box *box)
+{
+   struct identity_context *id_context = identity_context(_context);
+   struct identity_resource *id_resource = identity_resource(_resource);
+   struct pipe_context *wrapped = id_context->pipe;
+   struct pipe_transfer *xfer;
+
+   xfer = wrapped->get_transfer(wrapped, id_resource->resource,
+                                sr, usage, box);
+   if (!xfer)
+      return NULL;
+   return identity_transfer_create(id_context, id_resource, xfer);
+}
+
+static void /* thin hook over identity_transfer_destroy() */
+identity_context_transfer_destroy(struct pipe_context *_pipe,
+                                  struct pipe_transfer *_transfer)
+{
+   struct identity_transfer *id_transfer = identity_transfer(_transfer);
+   identity_transfer_destroy(identity_context(_pipe), id_transfer);
+}
+
+/* Map a transfer on the wrapped context.  _transfer is an identity_transfer
+ * wrapper; the transfer it holds is what gets mapped, and the pointer the
+ * wrapped context returns is passed back unchanged. */
+static void *
+identity_context_transfer_map(struct pipe_context *_context,
+                              struct pipe_transfer *_transfer)
+{
+   struct pipe_context *wrapped = identity_context(_context)->pipe;
+   struct pipe_transfer *real = identity_transfer(_transfer)->transfer;
+
+   return wrapped->transfer_map(wrapped, real);
+}
+
+
+
+/* Forward transfer_flush_region to the wrapped context.
+ * _transfer is an identity_transfer wrapper and is replaced by the transfer
+ * it holds; box is forwarded unchanged.
+ */
+static void
+identity_context_transfer_flush_region(struct pipe_context *_context,
+                                       struct pipe_transfer *_transfer,
+                                       const struct pipe_box *box)
+{
+   struct pipe_context *wrapped = identity_context(_context)->pipe;
+   struct pipe_transfer *real = identity_transfer(_transfer)->transfer;
+
+   wrapped->transfer_flush_region(wrapped, real, box);
+}
+
+
+/* Unmap a transfer on the wrapped context.  _transfer is an
+ * identity_transfer wrapper; the transfer it holds is what gets passed to
+ * the wrapped context's transfer_unmap hook. */
+static void
+identity_context_transfer_unmap(struct pipe_context *_context,
+                                struct pipe_transfer *_transfer)
+{
+   struct pipe_context *wrapped = identity_context(_context)->pipe;
+   struct pipe_transfer *real = identity_transfer(_transfer)->transfer;
+
+   wrapped->transfer_unmap(wrapped, real);
+}
+
+
+/**
+ * Forward a transfer_inline_write request to the wrapped context.
+ *
+ * _resource is an identity_resource wrapper and is replaced by the resource
+ * it holds.  sr, usage, box, data, stride and slice_stride are forwarded
+ * unchanged.  _resource is not checked for NULL.
+ */
+static void
+identity_context_transfer_inline_write(struct pipe_context *_context,
+                                       struct pipe_resource *_resource,
+                                       struct pipe_subresource sr,
+                                       unsigned usage,
+                                       const struct pipe_box *box,
+                                       const void *data,
+                                       unsigned stride,
+                                       unsigned slice_stride)
+{
+   struct pipe_context *wrapped = identity_context(_context)->pipe;
+   struct pipe_resource *real = identity_resource(_resource)->resource;
+
+   wrapped->transfer_inline_write(wrapped, real,
+                                  sr, usage, box,
+                                  data, stride, slice_stride);
+}
+
+
+/* Allocate an identity_context around pipe and point every pipe_context hook
+ * set below at its identity_* forwarder; returns NULL on allocation failure. */
+struct pipe_context *
+identity_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
+{
+   struct identity_context *wrapper;
+   struct pipe_context *base;
+   (void)identity_screen(_screen);
+
+   wrapper = CALLOC_STRUCT(identity_context);
+   if (!wrapper)
+      return NULL;
+   base = &wrapper->base;
+   wrapper->pipe = pipe;
+   base->winsys = NULL;
+   base->screen = _screen;
+   base->priv = pipe->priv; /* expose wrapped data */
+   base->draw = NULL;
+
+   base->destroy = identity_destroy;
+   base->draw_arrays = identity_draw_arrays;
+   base->draw_elements = identity_draw_elements;
+   base->draw_range_elements = identity_draw_range_elements;
+   base->create_query = identity_create_query;
+   base->destroy_query = identity_destroy_query;
+   base->begin_query = identity_begin_query;
+   base->end_query = identity_end_query;
+   base->get_query_result = identity_get_query_result;
+   base->create_blend_state = identity_create_blend_state;
+   base->bind_blend_state = identity_bind_blend_state;
+   base->delete_blend_state = identity_delete_blend_state;
+   base->create_sampler_state = identity_create_sampler_state;
+   base->bind_fragment_sampler_states = identity_bind_fragment_sampler_states;
+   base->bind_vertex_sampler_states = identity_bind_vertex_sampler_states;
+   base->delete_sampler_state = identity_delete_sampler_state;
+   base->create_rasterizer_state = identity_create_rasterizer_state;
+   base->bind_rasterizer_state = identity_bind_rasterizer_state;
+   base->delete_rasterizer_state = identity_delete_rasterizer_state;
+   base->create_depth_stencil_alpha_state = identity_create_depth_stencil_alpha_state;
+   base->bind_depth_stencil_alpha_state = identity_bind_depth_stencil_alpha_state;
+   base->delete_depth_stencil_alpha_state = identity_delete_depth_stencil_alpha_state;
+   base->create_fs_state = identity_create_fs_state;
+   base->bind_fs_state = identity_bind_fs_state;
+   base->delete_fs_state = identity_delete_fs_state;
+   base->create_vs_state = identity_create_vs_state;
+   base->bind_vs_state = identity_bind_vs_state;
+   base->delete_vs_state = identity_delete_vs_state;
+   base->create_vertex_elements_state = identity_create_vertex_elements_state;
+   base->bind_vertex_elements_state = identity_bind_vertex_elements_state;
+   base->delete_vertex_elements_state = identity_delete_vertex_elements_state;
+   base->set_blend_color = identity_set_blend_color;
+   base->set_stencil_ref = identity_set_stencil_ref;
+   base->set_clip_state = identity_set_clip_state;
+   base->set_sample_mask = identity_set_sample_mask;
+   base->set_constant_buffer = identity_set_constant_buffer;
+   base->set_framebuffer_state = identity_set_framebuffer_state;
+   base->set_polygon_stipple = identity_set_polygon_stipple;
+   base->set_scissor_state = identity_set_scissor_state;
+   base->set_viewport_state = identity_set_viewport_state;
+   base->set_fragment_sampler_views = identity_set_fragment_sampler_views;
+   base->set_vertex_sampler_views = identity_set_vertex_sampler_views;
+   base->set_vertex_buffers = identity_set_vertex_buffers;
+   base->resource_copy_region = identity_resource_copy_region;
+   base->clear = identity_clear;
+   base->clear_render_target = identity_clear_render_target;
+   base->clear_depth_stencil = identity_clear_depth_stencil;
+   base->flush = identity_flush;
+   base->is_resource_referenced = identity_is_resource_referenced;
+   base->create_sampler_view = identity_context_create_sampler_view;
+   base->sampler_view_destroy = identity_context_sampler_view_destroy;
+   base->get_transfer = identity_context_get_transfer;
+   base->transfer_destroy = identity_context_transfer_destroy;
+   base->transfer_map = identity_context_transfer_map;
+   base->transfer_unmap = identity_context_transfer_unmap;
+   base->transfer_flush_region = identity_context_transfer_flush_region;
+   base->transfer_inline_write = identity_context_transfer_inline_write;
+   return base;
+}
diff --git a/src/gallium/drivers/identity/id_context.h b/src/gallium/drivers/identity/id_context.h
new file mode 100644
index 0000000000..6d3c1899d5
--- /dev/null
+++ b/src/gallium/drivers/identity/id_context.h
@@ -0,0 +1,52 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef ID_CONTEXT_H
+#define ID_CONTEXT_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+
+
+struct identity_context {
+   struct pipe_context base;  /**< base class */
+
+   struct pipe_context *pipe; /**< wrapped context that calls are sent to */
+};
+
+
+struct pipe_context *
+identity_context_create(struct pipe_screen *screen, struct pipe_context *pipe);
+
+
+static INLINE struct identity_context *
+identity_context(struct pipe_context *pipe)
+{
+   return (struct identity_context *)pipe; /* base is the first member */
+}
+
+#endif /* ID_CONTEXT_H */
diff --git a/src/gallium/drivers/identity/id_drm.c b/src/gallium/drivers/identity/id_drm.c
new file mode 100644
index 0000000000..15d01519f8
--- /dev/null
+++ b/src/gallium/drivers/identity/id_drm.c
@@ -0,0 +1,93 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "state_tracker/drm_api.h"
+
+#include "util/u_memory.h"
+#include "id_drm.h"
+#include "id_screen.h"
+#include "id_public.h"
+
+struct identity_drm_api
+{
+   struct drm_api base;   /* first member: identity_drm_api() casts to this */
+
+   struct drm_api *api;   /* wrapped drm_api that calls are forwarded to */
+};
+
+static INLINE struct identity_drm_api *
+identity_drm_api(struct drm_api *_api)
+{
+   return (struct identity_drm_api *)_api; /* base is the first member */
+}
+
+/* Create a screen through the wrapped drm_api and wrap it with
+ * identity_screen_create(); returns NULL if the wrapped api returned none. */
+static struct pipe_screen *
+identity_drm_create_screen(struct drm_api *_api, int fd)
+{
+   struct drm_api *api = identity_drm_api(_api)->api;
+   struct pipe_screen *screen = api->create_screen(api, fd);
+
+   /* a failed create must not be wrapped as if it were a live screen */
+   return screen ? identity_screen_create(screen) : NULL;
+}
+
+/* Destroy the wrapped drm_api first, then release the wrapper itself. */
+static void
+identity_drm_destroy(struct drm_api *_api)
+{
+   struct identity_drm_api *wrapper = identity_drm_api(_api);
+
+   wrapper->api->destroy(wrapper->api);
+   FREE(wrapper);
+}
+
+/* Wrap a drm_api in an identity layer that reuses its name and driver_name
+ * and installs its own create_screen/destroy hooks.  When api is NULL or
+ * the wrapper cannot be allocated, api itself is returned, i.e. the caller
+ * gets the unwrapped api (or NULL) back. */
+struct drm_api *
+identity_drm_create(struct drm_api *api)
+{
+   struct identity_drm_api *wrapper;
+
+   if (!api)
+      return api;
+
+   wrapper = CALLOC_STRUCT(identity_drm_api);
+   if (!wrapper)
+      return api;
+
+   wrapper->api = api;
+   wrapper->base.name = api->name;
+   wrapper->base.driver_name = api->driver_name;
+   wrapper->base.create_screen = identity_drm_create_screen;
+   wrapper->base.destroy = identity_drm_destroy;
+
+   return &wrapper->base;
+}
diff --git a/src/gallium/drivers/identity/id_drm.h b/src/gallium/drivers/identity/id_drm.h
new file mode 100644
index 0000000000..cf2ad2ce07
--- /dev/null
+++ b/src/gallium/drivers/identity/id_drm.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef ID_DRM_H
+#define ID_DRM_H
+
+struct drm_api;
+
+struct drm_api* identity_drm_create(struct drm_api *api);
+
+#endif /* ID_DRM_H */
diff --git a/src/gallium/drivers/identity/id_objects.c b/src/gallium/drivers/identity/id_objects.c
new file mode 100644
index 0000000000..ca4743f9ef
--- /dev/null
+++ b/src/gallium/drivers/identity/id_objects.c
@@ -0,0 +1,187 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+#include "id_screen.h"
+#include "id_objects.h"
+#include "id_context.h"
+
+
+
+/* Wrap a resource of the underlying screen.  The wrapper's base is a copy of
+ * *resource with its own reference count of one and the identity screen as
+ * its screen.  The reference passed in is kept by the wrapper, or released
+ * here when no wrapper can be allocated, in which case NULL is returned. */
+struct pipe_resource *
+identity_resource_create(struct identity_screen *id_screen,
+                        struct pipe_resource *resource)
+{
+   struct identity_resource *wrapper = NULL;
+
+   if (resource) {
+      assert(resource->screen == id_screen->screen);
+      wrapper = CALLOC_STRUCT(identity_resource);
+   }
+
+   if (!wrapper) {
+      pipe_resource_reference(&resource, NULL);
+      return NULL;
+   }
+
+   memcpy(&wrapper->base, resource, sizeof(wrapper->base));
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.screen = &id_screen->base;
+   wrapper->resource = resource;
+
+   return &wrapper->base;
+}
+
+void /* drops the reference on the wrapped resource, then frees the wrapper */
+identity_resource_destroy(struct identity_resource *id_resource)
+{
+   pipe_resource_reference(&id_resource->resource, NULL);
+   FREE(id_resource);
+}
+
+
+/* Wrap a surface.  The wrapper's base is a copy of *surface with its own
+ * reference count of one and its texture referencing the identity_resource.
+ * The surface reference passed in is kept by the wrapper, or released here
+ * when no wrapper can be allocated, in which case NULL is returned. */
+struct pipe_surface *
+identity_surface_create(struct identity_resource *id_resource,
+                        struct pipe_surface *surface)
+{
+   struct identity_surface *wrapper = NULL;
+
+   if (surface) {
+      assert(surface->texture == id_resource->resource);
+      wrapper = CALLOC_STRUCT(identity_surface);
+   }
+
+   if (!wrapper) {
+      pipe_surface_reference(&surface, NULL);
+      return NULL;
+   }
+
+   memcpy(&wrapper->base, surface, sizeof(wrapper->base));
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.texture = NULL;   /* do not unref the copied pointer */
+   pipe_resource_reference(&wrapper->base.texture, &id_resource->base);
+   wrapper->surface = surface;
+
+   return &wrapper->base;
+}
+
+void /* drops the texture and wrapped-surface references, frees the wrapper */
+identity_surface_destroy(struct identity_surface *id_surface)
+{
+   pipe_resource_reference(&id_surface->base.texture, NULL);
+   pipe_surface_reference(&id_surface->surface, NULL);
+   FREE(id_surface);
+}
+
+
+/* Wrap view for id_context; NULL if view is NULL or allocation fails. */
+struct pipe_sampler_view *
+identity_sampler_view_create(struct identity_context *id_context,
+                             struct identity_resource *id_resource,
+                             struct pipe_sampler_view *view)
+{
+   struct identity_sampler_view *id_view;
+
+   if (!view)
+      return NULL;
+   assert(view->texture == id_resource->resource);
+   id_view = MALLOC(sizeof(struct identity_sampler_view));
+   if (!id_view) {   /* do not leak the view that could not be wrapped */
+      id_context->pipe->sampler_view_destroy(id_context->pipe, view);
+      return NULL;
+   }
+   id_view->base = *view;
+   id_view->base.reference.count = 1;
+   id_view->base.texture = NULL;   /* do not unref the copied pointer */
+   pipe_resource_reference(&id_view->base.texture, id_resource->resource);
+   id_view->base.context = id_context->pipe;
+   id_view->sampler_view = view;   /* MALLOC leaves this field undefined */
+   return &id_view->base;
+}
+
+void /* destroys the wrapped view, then frees the wrapper */
+identity_sampler_view_destroy(struct identity_context *id_context,
+                              struct identity_sampler_view *id_view)
+{
+   struct pipe_context *pipe = id_context->pipe;
+   pipe_resource_reference(&id_view->base.texture, NULL);
+   pipe->sampler_view_destroy(pipe, id_view->sampler_view);
+   FREE(id_view);
+}
+
+
+/* Wrap a transfer obtained from the wrapped context.  The wrapper's base is
+ * a copy of *transfer whose resource references the identity_resource.
+ * Returns NULL if transfer is NULL, or if the wrapper cannot be allocated,
+ * in which case the wrapped transfer is destroyed instead of leaked. */
+struct pipe_transfer *
+identity_transfer_create(struct identity_context *id_context,
+                         struct identity_resource *id_resource,
+                         struct pipe_transfer *transfer)
+{
+   struct identity_transfer *id_transfer;
+
+   if (!transfer)
+      return NULL;   /* nothing to wrap, and nothing to destroy either */
+
+   assert(transfer->resource == id_resource->resource);
+   id_transfer = CALLOC_STRUCT(identity_transfer);
+   if (!id_transfer) {
+      id_context->pipe->transfer_destroy(id_context->pipe, transfer);
+      return NULL;
+   }
+
+   memcpy(&id_transfer->base, transfer, sizeof(struct pipe_transfer));
+   id_transfer->base.resource = NULL;   /* do not unref the copied pointer */
+   id_transfer->pipe = id_context->pipe;   /* field was otherwise left NULL */
+   id_transfer->transfer = transfer;
+   pipe_resource_reference(&id_transfer->base.resource, &id_resource->base);
+   assert(id_transfer->base.resource == &id_resource->base);
+
+   return &id_transfer->base;
+}
+
+/* Drop the resource reference, destroy the wrapped transfer, free wrapper. */
+void
+identity_transfer_destroy(struct identity_context *id_context,
+                          struct identity_transfer *id_transfer)
+{
+   pipe_resource_reference(&id_transfer->base.resource, NULL);
+   id_context->pipe->transfer_destroy(id_context->pipe, id_transfer->transfer);
+   FREE(id_transfer);
+}
+
diff --git a/src/gallium/drivers/identity/id_objects.h b/src/gallium/drivers/identity/id_objects.h
new file mode 100644
index 0000000000..5eea10b0b5
--- /dev/null
+++ b/src/gallium/drivers/identity/id_objects.h
@@ -0,0 +1,176 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef ID_OBJECTS_H
+#define ID_OBJECTS_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "id_screen.h"
+
+struct identity_context;
+
+
+/**
+ * Wrapper around a pipe_resource.
+ *
+ * The helpers in this header cast a pipe_resource pointer directly to this
+ * struct, so 'base' must remain the first member.
+ */
+struct identity_resource
+{
+   struct pipe_resource base;
+
+   /* The wrapped resource, returned by identity_resource_unwrap(). */
+   struct pipe_resource *resource;
+};
+
+
+/**
+ * Wrapper around a pipe_sampler_view.
+ *
+ * The helpers in this header cast a pipe_sampler_view pointer directly to
+ * this struct, so 'base' must remain the first member.
+ */
+struct identity_sampler_view
+{
+   struct pipe_sampler_view base;
+
+   /* The wrapped view, returned by identity_sampler_view_unwrap(). */
+   struct pipe_sampler_view *sampler_view;
+};
+
+
+/**
+ * Wrapper around a pipe_surface.
+ *
+ * The helpers in this header cast a pipe_surface pointer directly to this
+ * struct, so 'base' must remain the first member.
+ */
+struct identity_surface
+{
+   struct pipe_surface base;
+
+   /* The wrapped surface, returned by identity_surface_unwrap(). */
+   struct pipe_surface *surface;
+};
+
+
+/**
+ * Wrapper around a pipe_transfer.
+ *
+ * The helpers in this header cast a pipe_transfer pointer directly to this
+ * struct, so 'base' must remain the first member.
+ */
+struct identity_transfer
+{
+   struct pipe_transfer base;
+
+   /* Presumably the wrapped-driver context the transfer belongs to.
+    * NOTE(review): confirm that every creation path assigns this field
+    * before anything reads it.
+    */
+   struct pipe_context *pipe;
+   /* The wrapped transfer, returned by identity_transfer_unwrap(). */
+   struct pipe_transfer *transfer;
+};
+
+
+/**
+ * Downcast a pipe_resource pointer to its identity wrapper.
+ *
+ * Returns NULL when given NULL.
+ */
+static INLINE struct identity_resource *
+identity_resource(struct pipe_resource *_resource)
+{
+   struct identity_resource *id_resource = NULL;
+
+   if (_resource) {
+      /* result intentionally discarded */
+      (void)identity_screen(_resource->screen);
+      id_resource = (struct identity_resource *)_resource;
+   }
+
+   return id_resource;
+}
+
+/**
+ * Downcast a pipe_sampler_view pointer to its identity wrapper.
+ *
+ * Returns NULL when given NULL.
+ */
+static INLINE struct identity_sampler_view *
+identity_sampler_view(struct pipe_sampler_view *_sampler_view)
+{
+   return _sampler_view ? (struct identity_sampler_view *)_sampler_view
+                        : NULL;
+}
+
+/**
+ * Downcast a pipe_surface pointer to its identity wrapper.
+ *
+ * Returns NULL when given NULL.
+ */
+static INLINE struct identity_surface *
+identity_surface(struct pipe_surface *_surface)
+{
+   if (_surface) {
+      /* result intentionally discarded */
+      (void)identity_resource(_surface->texture);
+      return (struct identity_surface *)_surface;
+   }
+
+   return NULL;
+}
+
+/**
+ * Downcast a pipe_transfer pointer to its identity wrapper.
+ *
+ * Returns NULL when given NULL.
+ */
+static INLINE struct identity_transfer *
+identity_transfer(struct pipe_transfer *_transfer)
+{
+   if (_transfer) {
+      /* result intentionally discarded */
+      (void)identity_resource(_transfer->resource);
+      return (struct identity_transfer *)_transfer;
+   }
+
+   return NULL;
+}
+
+/**
+ * Return the resource wrapped by an identity resource, or NULL for NULL.
+ */
+static INLINE struct pipe_resource *
+identity_resource_unwrap(struct pipe_resource *_resource)
+{
+   struct identity_resource *id_resource = identity_resource(_resource);
+
+   return id_resource ? id_resource->resource : NULL;
+}
+
+/**
+ * Return the view wrapped by an identity sampler view, or NULL for NULL.
+ */
+static INLINE struct pipe_sampler_view *
+identity_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
+{
+   struct identity_sampler_view *id_view =
+      identity_sampler_view(_sampler_view);
+
+   return id_view ? id_view->sampler_view : NULL;
+}
+
+/**
+ * Return the surface wrapped by an identity surface, or NULL for NULL.
+ */
+static INLINE struct pipe_surface *
+identity_surface_unwrap(struct pipe_surface *_surface)
+{
+   struct identity_surface *id_surface = identity_surface(_surface);
+
+   return id_surface ? id_surface->surface : NULL;
+}
+
+/**
+ * Return the transfer wrapped by an identity transfer, or NULL for NULL.
+ */
+static INLINE struct pipe_transfer *
+identity_transfer_unwrap(struct pipe_transfer *_transfer)
+{
+   struct identity_transfer *id_transfer = identity_transfer(_transfer);
+
+   return id_transfer ? id_transfer->transfer : NULL;
+}
+
+
+/*
+ * Constructors and destructors for the wrapper objects declared above.
+ * Each constructor takes an object produced by the wrapped driver and
+ * returns the 'base' member of a newly allocated wrapper.
+ */
+
+/* Wrap 'resource'.  NOTE(review): presumably returns NULL when 'resource'
+ * is NULL or allocation fails -- confirm against the implementation.
+ */
+struct pipe_resource *
+identity_resource_create(struct identity_screen *id_screen,
+                         struct pipe_resource *resource);
+
+/* Destroy a resource wrapper. */
+void
+identity_resource_destroy(struct identity_resource *id_resource);
+
+/* Wrap 'surface', which belongs to the resource wrapped by 'id_resource'. */
+struct pipe_surface *
+identity_surface_create(struct identity_resource *id_resource,
+                        struct pipe_surface *surface);
+
+/* Destroy a surface wrapper. */
+void
+identity_surface_destroy(struct identity_surface *id_surface);
+
+/* Wrap 'view'; returns NULL on failure. */
+struct pipe_sampler_view *
+identity_sampler_view_create(struct identity_context *id_context,
+                             struct identity_resource *id_resource,
+                             struct pipe_sampler_view *view);
+
+/* Release the wrapper's texture reference, destroy the wrapped view via
+ * id_context->pipe and free the wrapper.
+ */
+void
+identity_sampler_view_destroy(struct identity_context *id_context,
+                              struct identity_sampler_view *id_sampler_view);
+
+/* Wrap 'transfer'; returns NULL on failure, in which case the wrapped
+ * transfer is handed back to the wrapped driver's transfer_destroy.
+ */
+struct pipe_transfer *
+identity_transfer_create(struct identity_context *id_context,
+                         struct identity_resource *id_resource,
+                         struct pipe_transfer *transfer);
+
+/* Release the wrapper's resource reference, destroy the wrapped transfer
+ * and free the wrapper.
+ */
+void
+identity_transfer_destroy(struct identity_context *id_context,
+                          struct identity_transfer *id_transfer);
+
+
+#endif /* ID_OBJECTS_H */
diff --git a/src/gallium/drivers/identity/id_public.h b/src/gallium/drivers/identity/id_public.h
new file mode 100644
index 0000000000..d0d5847c61
--- /dev/null
+++ b/src/gallium/drivers/identity/id_public.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef ID_PUBLIC_H
+#define ID_PUBLIC_H
+
+struct pipe_screen;
+struct pipe_context;
+
+/**
+ * Create an identity screen layered on top of \p screen.
+ *
+ * Every pipe_screen entry point of the returned screen forwards to the
+ * corresponding entry point of \p screen.  Returns NULL if the wrapper
+ * cannot be allocated.
+ */
+struct pipe_screen *
+identity_screen_create(struct pipe_screen *screen);
+
+#endif /* ID_PUBLIC_H */
diff --git a/src/gallium/drivers/identity/id_screen.c b/src/gallium/drivers/identity/id_screen.c
new file mode 100644
index 0000000000..f71585e06f
--- /dev/null
+++ b/src/gallium/drivers/identity/id_screen.c
@@ -0,0 +1,325 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+
+#include "id_public.h"
+#include "id_screen.h"
+#include "id_context.h"
+#include "id_objects.h"
+
+
+/**
+ * pipe_screen::destroy -- destroy the wrapped screen, then free the wrapper.
+ */
+static void
+identity_screen_destroy(struct pipe_screen *_screen)
+{
+   struct identity_screen *wrapper = identity_screen(_screen);
+   struct pipe_screen *wrapped = wrapper->screen;
+
+   wrapped->destroy(wrapped);
+   FREE(wrapper);
+}
+
+/**
+ * pipe_screen::get_name -- report the wrapped screen's name.
+ */
+static const char *
+identity_screen_get_name(struct pipe_screen *_screen)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->get_name(wrapped);
+}
+
+/**
+ * pipe_screen::get_vendor -- report the wrapped screen's vendor string.
+ */
+static const char *
+identity_screen_get_vendor(struct pipe_screen *_screen)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->get_vendor(wrapped);
+}
+
+/**
+ * pipe_screen::get_param -- forward an integer capability query.
+ */
+static int
+identity_screen_get_param(struct pipe_screen *_screen,
+                          enum pipe_cap param)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->get_param(wrapped, param);
+}
+
+/**
+ * pipe_screen::get_paramf -- forward a floating point capability query.
+ */
+static float
+identity_screen_get_paramf(struct pipe_screen *_screen,
+                           enum pipe_cap param)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->get_paramf(wrapped, param);
+}
+
+/**
+ * pipe_screen::is_format_supported -- forward the query unchanged to the
+ * wrapped screen.
+ */
+static boolean
+identity_screen_is_format_supported(struct pipe_screen *_screen,
+                                    enum pipe_format format,
+                                    enum pipe_texture_target target,
+                                    unsigned sample_count,
+                                    unsigned tex_usage,
+                                    unsigned geom_flags)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->is_format_supported(wrapped, format, target,
+                                       sample_count, tex_usage, geom_flags);
+}
+
+/**
+ * pipe_screen::context_create -- create a context on the wrapped screen and
+ * wrap it.  Returns NULL if the wrapped screen fails to create one.
+ */
+static struct pipe_context *
+identity_screen_context_create(struct pipe_screen *_screen,
+                               void *priv)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+   struct pipe_context *pipe = wrapped->context_create(wrapped, priv);
+
+   if (!pipe)
+      return NULL;
+
+   return identity_context_create(_screen, pipe);
+}
+
+/**
+ * pipe_screen::resource_create -- create a resource on the wrapped screen
+ * and wrap it.  Returns NULL if the wrapped screen fails to create one.
+ */
+static struct pipe_resource *
+identity_screen_resource_create(struct pipe_screen *_screen,
+                                const struct pipe_resource *templat)
+{
+   struct identity_screen *wrapper = identity_screen(_screen);
+   struct pipe_screen *wrapped = wrapper->screen;
+   struct pipe_resource *resource = wrapped->resource_create(wrapped, templat);
+
+   if (!resource)
+      return NULL;
+
+   return identity_resource_create(wrapper, resource);
+}
+
+/**
+ * pipe_screen::resource_from_handle -- import a resource on the wrapped
+ * screen and wrap it.
+ *
+ * @return the wrapper resource, or NULL if the wrapped screen could not
+ *         import the handle or the wrapper could not be created.
+ */
+static struct pipe_resource *
+identity_screen_resource_from_handle(struct pipe_screen *_screen,
+                                     const struct pipe_resource *templ,
+                                     struct winsys_handle *handle)
+{
+   struct identity_screen *id_screen = identity_screen(_screen);
+   struct pipe_screen *screen = id_screen->screen;
+   struct pipe_resource *result;
+
+   /* TODO trace call */
+
+   result = screen->resource_from_handle(screen, templ, handle);
+
+   /* Same failure handling as the other resource constructors in this
+    * file: only wrap a resource the wrapped screen actually produced.
+    */
+   if (result)
+      return identity_resource_create(id_screen, result);
+   return NULL;
+}
+
+/**
+ * pipe_screen::resource_get_handle -- forward to the wrapped screen using
+ * the wrapped resource.
+ */
+static boolean
+identity_screen_resource_get_handle(struct pipe_screen *_screen,
+                                    struct pipe_resource *_resource,
+                                    struct winsys_handle *handle)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+   struct pipe_resource *wrapped_resource =
+      identity_resource(_resource)->resource;
+
+   /* TODO trace call */
+
+   return wrapped->resource_get_handle(wrapped, wrapped_resource, handle);
+}
+
+
+
+/**
+ * pipe_screen::resource_destroy -- destroy a resource wrapper.
+ */
+static void
+identity_screen_resource_destroy(struct pipe_screen *screen,
+                                 struct pipe_resource *_resource)
+{
+   struct identity_resource *id_resource = identity_resource(_resource);
+
+   identity_resource_destroy(id_resource);
+}
+
+/**
+ * pipe_screen::get_tex_surface -- fetch a surface of the wrapped resource
+ * from the wrapped screen and wrap it.  Returns NULL if the wrapped screen
+ * returns no surface.
+ */
+static struct pipe_surface *
+identity_screen_get_tex_surface(struct pipe_screen *_screen,
+                                struct pipe_resource *_resource,
+                                unsigned face,
+                                unsigned level,
+                                unsigned zslice,
+                                unsigned usage)
+{
+   struct identity_resource *id_resource = identity_resource(_resource);
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+   struct pipe_resource *wrapped_resource = id_resource->resource;
+   struct pipe_surface *surface;
+
+   surface = wrapped->get_tex_surface(wrapped, wrapped_resource,
+                                      face, level, zslice, usage);
+   if (!surface)
+      return NULL;
+
+   return identity_surface_create(id_resource, surface);
+}
+
+/**
+ * pipe_screen::tex_surface_destroy -- destroy a surface wrapper.
+ */
+static void
+identity_screen_tex_surface_destroy(struct pipe_surface *_surface)
+{
+   struct identity_surface *id_surface = identity_surface(_surface);
+
+   identity_surface_destroy(id_surface);
+}
+
+
+
+/**
+ * pipe_screen::user_buffer_create -- create a user buffer on the wrapped
+ * screen and wrap it.  Returns NULL if the wrapped screen fails.
+ */
+static struct pipe_resource *
+identity_screen_user_buffer_create(struct pipe_screen *_screen,
+                                   void *ptr,
+                                   unsigned bytes,
+                                   unsigned usage)
+{
+   struct identity_screen *wrapper = identity_screen(_screen);
+   struct pipe_screen *wrapped = wrapper->screen;
+   struct pipe_resource *buffer;
+
+   buffer = wrapped->user_buffer_create(wrapped, ptr, bytes, usage);
+   if (!buffer)
+      return NULL;
+
+   return identity_resource_create(wrapper, buffer);
+}
+
+
+
+/**
+ * pipe_screen::flush_frontbuffer -- forward to the wrapped screen using the
+ * wrapped surface.
+ */
+static void
+identity_screen_flush_frontbuffer(struct pipe_screen *_screen,
+                                  struct pipe_surface *_surface,
+                                  void *context_private)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+   struct pipe_surface *wrapped_surface = identity_surface(_surface)->surface;
+
+   wrapped->flush_frontbuffer(wrapped, wrapped_surface, context_private);
+}
+
+/**
+ * pipe_screen::fence_reference -- forward unchanged; fence handles are
+ * passed through without wrapping.
+ */
+static void
+identity_screen_fence_reference(struct pipe_screen *_screen,
+                                struct pipe_fence_handle **ptr,
+                                struct pipe_fence_handle *fence)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   wrapped->fence_reference(wrapped, ptr, fence);
+}
+
+/**
+ * pipe_screen::fence_signalled -- forward unchanged to the wrapped screen.
+ */
+static int
+identity_screen_fence_signalled(struct pipe_screen *_screen,
+                                struct pipe_fence_handle *fence,
+                                unsigned flags)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->fence_signalled(wrapped, fence, flags);
+}
+
+/**
+ * pipe_screen::fence_finish -- forward unchanged to the wrapped screen.
+ */
+static int
+identity_screen_fence_finish(struct pipe_screen *_screen,
+                             struct pipe_fence_handle *fence,
+                             unsigned flags)
+{
+   struct pipe_screen *wrapped = identity_screen(_screen)->screen;
+
+   return wrapped->fence_finish(wrapped, fence, flags);
+}
+
+/**
+ * Create an identity screen layered on top of \p screen.
+ *
+ * Allocates the wrapper, installs the forwarding entry points defined in
+ * this file and remembers \p screen as the wrapped screen.  Returns NULL if
+ * the wrapper cannot be allocated.
+ */
+struct pipe_screen *
+identity_screen_create(struct pipe_screen *screen)
+{
+   struct identity_screen *id_screen = CALLOC_STRUCT(identity_screen);
+   struct pipe_screen *base;
+
+   if (!id_screen)
+      return NULL;
+
+   id_screen->screen = screen;
+
+   base = &id_screen->base;
+
+   base->winsys = NULL;
+
+   /* screen lifetime and queries */
+   base->destroy = identity_screen_destroy;
+   base->get_name = identity_screen_get_name;
+   base->get_vendor = identity_screen_get_vendor;
+   base->get_param = identity_screen_get_param;
+   base->get_paramf = identity_screen_get_paramf;
+   base->is_format_supported = identity_screen_is_format_supported;
+
+   /* object creation / destruction */
+   base->context_create = identity_screen_context_create;
+   base->resource_create = identity_screen_resource_create;
+   base->resource_from_handle = identity_screen_resource_from_handle;
+   base->resource_get_handle = identity_screen_resource_get_handle;
+   base->resource_destroy = identity_screen_resource_destroy;
+   base->get_tex_surface = identity_screen_get_tex_surface;
+   base->tex_surface_destroy = identity_screen_tex_surface_destroy;
+   base->user_buffer_create = identity_screen_user_buffer_create;
+
+   /* presentation and fences */
+   base->flush_frontbuffer = identity_screen_flush_frontbuffer;
+   base->fence_reference = identity_screen_fence_reference;
+   base->fence_signalled = identity_screen_fence_signalled;
+   base->fence_finish = identity_screen_fence_finish;
+
+   return base;
+}
diff --git a/src/gallium/drivers/identity/id_screen.h b/src/gallium/drivers/identity/id_screen.h
new file mode 100644
index 0000000000..2c4f129089
--- /dev/null
+++ b/src/gallium/drivers/identity/id_screen.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef ID_SCREEN_H
+#define ID_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+
+
+/**
+ * Wrapper around a pipe_screen.
+ *
+ * identity_screen() casts a pipe_screen pointer directly to this struct, so
+ * 'base' must remain the first member.
+ */
+struct identity_screen {
+   struct pipe_screen base;
+
+   /* The wrapped screen. */
+   struct pipe_screen *screen;
+};
+
+
+/**
+ * Downcast a pipe_screen pointer to its identity wrapper (plain cast, no
+ * checking; NULL maps to NULL).
+ */
+static INLINE struct identity_screen *
+identity_screen(struct pipe_screen *screen)
+{
+   struct identity_screen *id_screen = (struct identity_screen *)screen;
+
+   return id_screen;
+}
+
+#endif /* ID_SCREEN_H */
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore
new file mode 100644
index 0000000000..a1b6f56e0d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/.gitignore
@@ -0,0 +1,5 @@
+lp_tile_soa.c
+lp_test_blend
+lp_test_conv
+lp_test_format
+lp_test_printf
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
new file mode 100644
index 0000000000..ee28179c30
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -0,0 +1,72 @@
+# Makefile for the llvmpipe Gallium driver: builds the driver library and
+# the lp_test_* programs.
+
+# Root of the source tree, used to locate the active build configuration.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = llvmpipe
+
+# NOTE(review): presumably required by the LLVM headers -- confirm.
+DEFINES += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+
+# Driver sources.  lp_tile_soa.c is not checked in; it is generated by the
+# lp_tile_soa.c rule further down.
+C_SOURCES = \
+	lp_bld_alpha.c \
+	lp_bld_blend_aos.c \
+	lp_bld_blend_logicop.c \
+	lp_bld_blend_soa.c \
+	lp_bld_depth.c \
+	lp_bld_interp.c \
+	lp_clear.c \
+	lp_context.c \
+	lp_draw_arrays.c \
+	lp_fence.c \
+	lp_flush.c \
+	lp_jit.c \
+	lp_perf.c \
+	lp_query.c \
+	lp_rast.c \
+	lp_rast_tri.c \
+	lp_scene.c \
+	lp_scene_queue.c \
+	lp_screen.c \
+	lp_setup.c \
+	lp_setup_line.c \
+	lp_setup_point.c \
+	lp_setup_tri.c \
+	lp_setup_vbuf.c \
+	lp_state_blend.c \
+	lp_state_clip.c \
+	lp_state_derived.c \
+	lp_state_fs.c \
+	lp_state_gs.c \
+	lp_state_rasterizer.c \
+	lp_state_sampler.c \
+        lp_state_so.c \
+	lp_tile_image.c \
+	lp_state_surface.c \
+	lp_state_vertex.c \
+	lp_state_vs.c \
+	lp_surface.c \
+	lp_tex_sample.c \
+	lp_texture.c \
+	lp_tile_soa.c
+
+# No C++ sources.  The trailing backslash joins the following blank line, so
+# that blank line must stay directly below this assignment.
+CPP_SOURCES = \
+
+# Test programs built from this directory.
+PROGS := lp_test_format	\
+	 lp_test_blend	\
+	 lp_test_conv	\
+	 lp_test_printf \
+         lp_test_sincos
+
+# Extra header dependency for the sincos test object.
+lp_test_sincos.o : sse_mathfun.h
+
+PROGS_DEPS := ../../auxiliary/libgallium.a
+
+# Shared Gallium rules.  NOTE(review): the assignments below come after this
+# include on purpose or by accident -- confirm before reordering.
+include ../../Makefile.template
+
+# Generate the tile SoA conversion code from the format description table.
+# The helper scripts are listed as prerequisites so edits to them trigger
+# regeneration.
+lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_pack.py ../../auxiliary/util/u_format.csv
+	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
+
+# Link settings for the test programs.
+LDFLAGS += $(LLVM_LDFLAGS)
+LIBS += -L../../auxiliary/ -lgallium libllvmpipe.a $(LLVM_LIBS) $(GL_LIB_DEPS)
+# NOTE(review): linking with g++ is presumably needed because the LLVM
+# libraries are C++ -- confirm.
+LD=g++
+
+# Every test program also needs the common test driver and the driver library.
+$(PROGS): lp_test_main.o libllvmpipe.a
+
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
new file mode 100644
index 0000000000..8b5539d2c5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/README
@@ -0,0 +1,166 @@
+LLVMPIPE -- a fork of softpipe that employs LLVM for code generation.
+
+
+Status
+======
+
+Done so far is:
+
+ - the whole fragment pipeline is code generated in a single function
+ 
+   - input interpolation
+   
+   - depth testing
+ 
+   - texture sampling
+     - 1D/2D/3D/cube maps supported
+     - all texture wrap modes supported
+     - all texture filtering modes supported
+     - perhaps not all texture formats yet supported
+   
+   - fragment shader TGSI translation
+     - same level of support as the TGSI SSE2 exec machine, with the exception
+       we don't fallback to TGSI interpretation when an unsupported opcode is
+       found, but just ignore it
+     - done in SoA layout
+     - input interpolation also code generated
+ 
+   - alpha testing
+ 
+   - blend (including logic ops)
+     - both in SoA and AoS layouts, but only the former used for now
+ 
+ - code is generic
+   - intermediates can be vectors of floats, ubytes, fixed point, etc, and of
+     any width and length
+   - not all operations are implemented for these types yet though
+
+Most mesa/progs/demos/* work. 
+
+To do (probably in this order):
+
+ - code generate stipple and stencil testing
+
+ - translate TGSI control flow instructions, and all other remaining opcodes
+ 
+ - integrate with the draw module for VS code generation
+
+ - code generate the triangle setup and rasterization
+
+
+Requirements
+============
+
+ - An x86 or amd64 processor.  64bit mode is preferred.
+ 
+   Support for sse2 is strongly encouraged.  Support for ssse3, and sse4.1 will
+   yield the most efficient code.  The fewer features the CPU has, the more
+   likely it is that you will run into underperforming, buggy, or incomplete code.
+   
+   See /proc/cpuinfo to know what your CPU supports.
+ 
+ - LLVM 2.6 (or later)
+ 
+   For Linux, on a recent Debian based distribution do:
+ 
+     aptitude install llvm-dev
+
+   For Windows download pre-built MSVC 9.0 or MinGW binaries from
+   http://people.freedesktop.org/~jrfonseca/llvm/ and set the LLVM environment
+   variable to the extracted path.
+
+   For MSVC there are two sets of binaries: llvm-x.x-msvc32mt.7z and
+   llvm-x.x-msvc32mtd.7z .
+
+   You have to set the LLVM=/path/to/llvm-x.x-msvc32mtd env var when passing
+   debug=yes to scons, and LLVM=/path/to/llvm-x.x-msvc32mt when building with
+   debug=no. This is necessary as LLVM builds as static library so the chosen
+   MS CRT must match.
+
+   The version of LLVM from SVN ("2.7svn") from mid-March 2010 is pretty
+   stable and has some features not in version 2.6.
+
+ - scons (optional)
+
+ - udis86, http://udis86.sourceforge.net/ (optional). My personal repository
+   supports more opcodes which haven't been merged upstream yet:
+ 
+     git clone git://anongit.freedesktop.org/~jrfonseca/udis86
+     cd udis86
+     ./autogen.sh
+     ./configure --with-pic
+     make
+     sudo make install
+ 
+
+Building
+========
+
+To build everything on Linux invoke scons as:
+
+  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false
+
+Alternatively, you can build it with GNU make, if you prefer, by invoking it as
+
+  make linux-llvm
+
+but the rest of these instructions assume that scons is used.
+
+For Windows everything is the same except the winsys:
+
+  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=gdi dri=false
+
+Using
+=====
+
+On Linux, building will create a drop-in alternative for libGL.so. To use it
+set the environment variables:
+
+  export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
+
+or
+
+  export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
+
+For performance evaluation pass debug=no to scons, and use the corresponding
+lib directory without the "-debug" suffix.
+
+On Windows, building will create a drop-in alternative for opengl32.dll. To use
+it put it in the same directory as the application. It can also be used by
+replacing the native ICD driver, but it's quite an advanced usage, so if you
+need to ask, don't even try it.
+
+
+Unit testing
+============
+
+Building will also create several unit tests in
+build/linux-???-debug/gallium/drivers/llvmpipe:
+
+ - lp_test_blend: blending
+ - lp_test_conv: SIMD vector conversion
+ - lp_test_format: pixel unpacking/packing
+
+Some of these tests can output results and benchmarks to a tab-separated file
+for later analysis, e.g.:
+
+  build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
+
+
+Development Notes
+=================
+
+- When looking at this code for the first time, start in lp_state_fs.c, and
+  then skim through the lp_bld_* functions called from there, and the comments
+  at the top of the lp_bld_*.c files.
+
+- The driver-independent parts of the LLVM / Gallium code are found in
+  src/gallium/auxiliary/gallivm/.  The filenames and function prefixes
+  need to be renamed from "lp_bld_" to something else though.
+
+- We use LLVM-C bindings for now. They are not documented, but follow the C++
+  interfaces very closely, and appear to be complete enough for code
+  generation. See 
+  http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
+  for a stand-alone example.
+  See the llvm-c/Core.h file for reference.
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
new file mode 100644
index 0000000000..a1ef71da89
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -0,0 +1,92 @@
+# SCons build description for the llvmpipe driver library and its tests.
+Import('*')
+
+# llvmpipe cannot be built without LLVM; warn and skip this directory.
+if not env['llvm']:
+    print 'warning: LLVM disabled: not building llvmpipe'
+    Return()
+
+# Work on a private copy of the construction environment.
+env = env.Clone()
+
+env.Tool('udis86')
+
+env.Append(CPPPATH = ['.'])
+
+# lp_tile_soa.c is generated from the format description table.
+env.CodeGenerate(
+	target = 'lp_tile_soa.c',
+	script = 'lp_tile_soa.py',
+	source = ['#src/gallium/auxiliary/util/u_format.csv'],
+	command = 'python $SCRIPT $SOURCE > $TARGET'
+)
+
+# XXX: Our dependency scanner only finds depended modules in relative dirs.
+# The generator's helper modules are therefore listed explicitly.
+env.Depends('lp_tile_soa.c', [
+    '#src/gallium/auxiliary/util/u_format_parse.py', 
+    '#src/gallium/auxiliary/util/u_format_pack.py', 
+])
+
+# The driver itself, built as a convenience library and exported below.
+llvmpipe = env.ConvenienceLibrary(
+	target = 'llvmpipe',
+	source = [
+		'lp_bld_alpha.c',
+		'lp_bld_blend_aos.c',
+		'lp_bld_blend_logicop.c',
+		'lp_bld_blend_soa.c',
+		'lp_bld_depth.c',
+		'lp_bld_interp.c',
+		'lp_clear.c',
+		'lp_context.c',
+		'lp_draw_arrays.c',
+		'lp_fence.c',
+		'lp_flush.c',
+		'lp_jit.c',
+		'lp_perf.c',
+		'lp_query.c',
+		'lp_rast.c',
+		'lp_rast_tri.c',
+		'lp_scene.c',
+		'lp_scene_queue.c',
+		'lp_screen.c',
+		'lp_setup.c',
+		'lp_setup_line.c',
+		'lp_setup_point.c',
+		'lp_setup_tri.c',
+		'lp_setup_vbuf.c',
+		'lp_state_blend.c',
+		'lp_state_clip.c',
+		'lp_state_derived.c',
+		'lp_state_fs.c',
+		'lp_state_gs.c',
+		'lp_state_rasterizer.c',
+		'lp_state_sampler.c',
+                'lp_state_so.c',
+		'lp_state_surface.c',
+		'lp_state_vertex.c',
+		'lp_state_vs.c',
+		'lp_surface.c',
+		'lp_tex_sample.c',
+		'lp_texture.c',
+		'lp_tile_image.c',
+		'lp_tile_soa.c',
+	])
+
+
+# Unit test programs; not built for the 'embedded' platform.
+if env['platform'] != 'embedded':
+    env = env.Clone()
+
+    # Link the tests against the driver library and the gallium libraries.
+    env.Prepend(LIBS = [llvmpipe] + gallium)
+
+    tests = [
+        'format',
+        'blend',
+        'conv',
+	'printf',
+	'sincos',
+    ]
+
+    # Each test is lp_test_<name>.c plus the shared lp_test_main.c driver.
+    for test in tests:
+        target = env.Program(
+            target = 'lp_test_' + test,
+            source = ['lp_test_' + test + '.c', 'lp_test_main.c'],
+        )
+        env.InstallProgram(target)
+
+Export('llvmpipe')
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
new file mode 100644
index 0000000000..8514030cde
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -0,0 +1,64 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Alpha testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_state.h"
+
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_flow.h"
+#include "gallivm/lp_bld_debug.h"
+
+#include "lp_bld_alpha.h"
+
+
+/**
+ * Emit the alpha test.
+ *
+ * When the alpha test is enabled in \p state, compares \p alpha against
+ * \p ref with the configured function and folds the result into \p mask.
+ * When it is disabled, \p mask is left untouched.
+ */
+void
+lp_build_alpha_test(LLVMBuilderRef builder,
+                    const struct pipe_alpha_state *state,
+                    struct lp_type type,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef alpha,
+                    LLVMValueRef ref)
+{
+   struct lp_build_context bld;
+   LLVMValueRef pass_mask;
+
+   lp_build_context_init(&bld, builder, type);
+
+   if(!state->enabled)
+      return;
+
+   pass_mask = lp_build_cmp(&bld, state->func, alpha, ref);
+   lp_build_name(pass_mask, "alpha_mask");
+   lp_build_mask_update(mask, pass_mask);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
new file mode 100644
index 0000000000..0f99fec65e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Alpha testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_ALPHA_H
+#define LP_BLD_ALPHA_H
+
+
+#include "gallivm/lp_bld.h"
+
+struct pipe_alpha_state;
+struct lp_type;
+struct lp_build_mask_context;
+
+/** Emit the alpha test when state->enabled and update mask with the result. */
+void
+lp_build_alpha_test(LLVMBuilderRef builder,
+                    const struct pipe_alpha_state *state,
+                    struct lp_type type,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef alpha,
+                    LLVMValueRef ref);
+
+
+#endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
new file mode 100644
index 0000000000..5cecec3d7f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -0,0 +1,98 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_BLEND_H
+#define LP_BLD_BLEND_H
+
+
+#include "gallivm/lp_bld.h"
+ 
+#include "pipe/p_format.h"
+
+
+struct pipe_blend_state;
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Whether the blending function is commutative, i.e. (a OP b) == (b OP a).
+ */
+boolean
+lp_build_blend_func_commutative(unsigned func);
+
+
+/**
+ * Whether the two functions are SUBTRACT and REVERSE_SUBTRACT, in any order.
+ */
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
+
+/** Combine term1 and term2 with the PIPE_BLEND_x function func. */
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
+                    unsigned func,
+                    LLVMValueRef term1,
+                    LLVMValueRef term2);
+
+/** Blend src with dst for render target rt on packed (AoS) vectors. */
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   unsigned rt,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle);
+
+/** Blend render target rt one channel at a time (SoA), storing into res[]. */
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   unsigned rt,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef const_[4],
+                   LLVMValueRef res[4]);
+
+
+/**
+ * Apply a logic op.
+ *
+ * src/dst parameters are packed values. It should work regardless of whether
+ * the inputs are scalars or vectors.
+ */
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+                 unsigned logicop_func,
+                 LLVMValueRef src,
+                 LLVMValueRef dst);
+
+
+#endif /* !LP_BLD_BLEND_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
new file mode 100644
index 0000000000..70d08e71f6
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -0,0 +1,363 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- AoS layout.
+ *
+ * AoS blending is in general much slower than SoA, but there are some cases
+ * where it might be faster. In particular, if a pixel is rendered only once
+ * then the overhead of tiling and untiling will dominate over the speedup that
+ * SoA gives. So we might want to detect such cases and fallback to AoS in the
+ * future, but for now this function is here for historical/benchmarking
+ * purposes.
+ *
+ * Run lp_blend_test after any change to this file.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_debug.h"
+
+#include "lp_bld_blend.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ */
+struct lp_build_blend_aos_context
+{
+   struct lp_build_context base;
+
+   LLVMValueRef src;      /* the values handed to lp_build_blend_aos() */
+   LLVMValueRef dst;
+   LLVMValueRef const_;
+
+   LLVMValueRef inv_src;    /* lp_build_comp(src), NULL until first needed */
+   LLVMValueRef inv_dst;    /* lp_build_comp(dst), NULL until first needed */
+   LLVMValueRef inv_const;  /* lp_build_comp(const_), NULL until first needed */
+   LLVMValueRef saturate;   /* lp_build_min(src, inv_dst), NULL until needed */
+
+   LLVMValueRef rgb_src_factor;     /* NOTE(review): these four members are */
+   LLVMValueRef alpha_src_factor;   /* never read or written by the code in */
+   LLVMValueRef rgb_dst_factor;     /* this file                            */
+   LLVMValueRef alpha_dst_factor;
+};
+
+
+/**
+ * Pick the un-swizzled vector a blend factor is derived from.  Colour and
+ * alpha variants share one vector; lp_build_blend_swizzle() picks channels.
+ * Complements and the saturate term are built on first use and cached in bld.
+ */
+static LLVMValueRef
+lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
+                                 unsigned factor,
+                                 boolean alpha)
+{
+   struct lp_build_context *base = &bld->base;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return base->zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return base->one;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->const_;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src)
+         bld->inv_src = lp_build_comp(base, bld->src);
+      return bld->inv_src;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if(!bld->inv_dst)
+         bld->inv_dst = lp_build_comp(base, bld->dst);
+      return bld->inv_dst;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_const)
+         bld->inv_const = lp_build_comp(base, bld->const_);
+      return bld->inv_const;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if(alpha)
+         return base->one;
+      if(!bld->inv_dst)
+         bld->inv_dst = lp_build_comp(base, bld->dst);
+      if(!bld->saturate)
+         bld->saturate = lp_build_min(base, bld->src, bld->inv_dst);
+      return bld->saturate;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* TODO: the SRC1 factors are not implemented yet */
+   default:
+      assert(0);
+      return base->zero;
+   }
+}
+
+
+enum lp_build_blend_swizzle {
+   LP_BUILD_BLEND_SWIZZLE_RGBA = 0,   /* every channel is used as it is */
+   LP_BUILD_BLEND_SWIZZLE_AAAA = 1    /* the alpha channel is broadcast */
+};
+
+
+/**
+ * Which channels of the base vector does this factor read: RGBA or AAAA?
+ */
+static enum lp_build_blend_swizzle
+lp_build_blend_factor_swizzle(unsigned factor)
+{
+   switch (factor) {
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      return LP_BUILD_BLEND_SWIZZLE_AAAA;   /* alpha-derived factors */
+   case PIPE_BLENDFACTOR_ZERO:
+   case PIPE_BLENDFACTOR_ONE:
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      return LP_BUILD_BLEND_SWIZZLE_RGBA;   /* constants and colour factors */
+   default:
+      assert(0);
+      return LP_BUILD_BLEND_SWIZZLE_RGBA;
+   }
+}
+
+
+/** Build one vector from rgb (colour channels) and alpha (alpha channel). */
+static LLVMValueRef
+lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
+                       LLVMValueRef rgb,
+                       LLVMValueRef alpha,
+                       enum lp_build_blend_swizzle rgb_swizzle,
+                       unsigned alpha_swizzle)
+{
+   const boolean same_value = (rgb == alpha);
+
+   switch (rgb_swizzle) {
+   case LP_BUILD_BLEND_SWIZZLE_RGBA:
+      if(same_value)
+         return rgb;
+      else {
+         boolean cond[4] = {0, 0, 0, 0};
+         cond[alpha_swizzle] = 1;   /* only the alpha channel is flagged */
+         return lp_build_select_aos(&bld->base, alpha, rgb, cond);
+      }
+   case LP_BUILD_BLEND_SWIZZLE_AAAA:
+      if(same_value)
+         return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle);
+      else {
+         unsigned char swizzle[4];
+         swizzle[0] = swizzle[1] = swizzle[2] = swizzle[3] = alpha_swizzle;
+         swizzle[alpha_swizzle] += 4;   /* presumably selects from alpha */
+         return lp_build_swizzle2_aos(&bld->base, rgb, alpha, swizzle);
+      }
+   }
+   assert(0);
+   return bld->base.undef;
+}
+
+
+/**
+ * Multiply factor1 by the blend factor that rgb_factor / alpha_factor select,
+ * after combining the two into one vector with lp_build_blend_swizzle().
+ *
+ * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
+ */
+static LLVMValueRef
+lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
+                      LLVMValueRef factor1,
+                      unsigned rgb_factor,
+                      unsigned alpha_factor,
+                      unsigned alpha_swizzle)
+{
+   LLVMValueRef rgb_vec, alpha_vec, combined;
+
+   /* The rgb factor is resolved before the alpha one. */
+   rgb_vec = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE);
+   alpha_vec = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
+   combined = lp_build_blend_swizzle(bld, rgb_vec, alpha_vec,
+                                     lp_build_blend_factor_swizzle(rgb_factor),
+                                     alpha_swizzle);
+
+   return lp_build_mul(&bld->base, factor1, combined);
+}
+
+
+/**
+ * Does func give the same result when its two terms are swapped?
+ */
+boolean
+lp_build_blend_func_commutative(unsigned func)
+{
+   switch (func) {
+   case PIPE_BLEND_SUBTRACT:
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return FALSE;
+   case PIPE_BLEND_ADD:
+   case PIPE_BLEND_MIN:
+   case PIPE_BLEND_MAX:
+      return TRUE;
+   default:
+      assert(0);
+      return TRUE;
+   }
+}
+
+
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
+{
+   /* TRUE only for the SUBTRACT / REVERSE_SUBTRACT pair, in either order. */
+   const boolean sub_then_rsub = rgb_func == PIPE_BLEND_SUBTRACT &&
+                                 alpha_func == PIPE_BLEND_REVERSE_SUBTRACT;
+   const boolean rsub_then_sub = rgb_func == PIPE_BLEND_REVERSE_SUBTRACT &&
+                                 alpha_func == PIPE_BLEND_SUBTRACT;
+
+   return sub_then_rsub || rsub_then_sub;
+}
+
+
+/**
+ * Emit IR combining term1 and term2 with blend function func.  An unknown
+ * func asserts and yields bld->zero.
+ * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
+ */
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld, unsigned func,
+                    LLVMValueRef term1, LLVMValueRef term2)
+{
+   switch (func) {
+   case PIPE_BLEND_MIN:
+      return lp_build_min(bld, term1, term2);
+   case PIPE_BLEND_MAX:
+      return lp_build_max(bld, term1, term2);
+   case PIPE_BLEND_ADD:
+      return lp_build_add(bld, term1, term2);
+   case PIPE_BLEND_SUBTRACT:
+      return lp_build_sub(bld, term1, term2);   /* term1 - term2 */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return lp_build_sub(bld, term2, term1);   /* operands swapped */
+   default:
+      assert(0);
+      return bld->zero;
+   }
+}
+
+
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   unsigned rt,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle)
+{
+   struct lp_build_blend_aos_context bld;
+   LLVMValueRef src_term, dst_term;
+   LLVMValueRef rgb, alpha;
+   unsigned rgb_func, alpha_func;
+
+   /* FIXME: color masking not implemented yet */
+   assert(blend->rt[rt].colormask == 0xf);
+
+   /* With blending off the incoming colour is returned untouched. */
+   if(!blend->rt[rt].blend_enable)
+      return src;
+
+   /* It makes no sense to blend unless values are normalized */
+   assert(type.norm);
+
+   rgb_func = blend->rt[rt].rgb_func;
+   alpha_func = blend->rt[rt].alpha_func;
+
+   /* Zero the context so every cached value starts out as NULL. */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   bld.src = src;
+   bld.dst = dst;
+   bld.const_ = const_;
+
+   /* TODO: There are still a few optimization opportunities here. For certain
+    * combinations it is possible to reorder the operations and thereby save
+    * some instructions. */
+
+   /* term = value * factor, for the source and then the destination */
+   src_term = lp_build_blend_factor(&bld, src, blend->rt[rt].rgb_src_factor,
+                                    blend->rt[rt].alpha_src_factor, alpha_swizzle);
+   dst_term = lp_build_blend_factor(&bld, dst, blend->rt[rt].rgb_dst_factor,
+                                    blend->rt[rt].alpha_dst_factor, alpha_swizzle);
+
+   lp_build_name(src_term, "src_term");
+   lp_build_name(dst_term, "dst_term");
+
+   rgb = lp_build_blend_func(&bld.base, rgb_func, src_term, dst_term);
+   if(rgb_func == alpha_func)
+      return rgb;
+
+   /* Separate RGB / A functions: blend alpha on its own, then merge it in. */
+   alpha = lp_build_blend_func(&bld.base, alpha_func, src_term, dst_term);
+
+   return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
new file mode 100644
index 0000000000..1eac0a5c89
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
@@ -0,0 +1,109 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- logic ops.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_blend.h"
+
+
+/**
+ * Emit the bitwise operation for one PIPE_LOGICOP_x function.
+ *
+ * Each case below carries its formula, with s = src and d = dst.  CLEAR and
+ * SET build their constants with the LLVM type of src.  An unknown
+ * logicop_func asserts and falls back to returning src.
+ *
+ * @return the resulting value; COPY and NOOP return src / dst unchanged.
+ */
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+                 unsigned logicop_func,
+                 LLVMValueRef src,
+                 LLVMValueRef dst)
+{
+   LLVMTypeRef type;
+
+   type = LLVMTypeOf(src);
+
+   switch (logicop_func) {
+   /* constants */
+   case PIPE_LOGICOP_CLEAR:           /* 0 */
+      return LLVMConstNull(type);
+   case PIPE_LOGICOP_SET:             /* ~0 */
+      return LLVMConstAllOnes(type);
+
+   /* a single operand */
+   case PIPE_LOGICOP_COPY:            /* s */
+      return src;
+   case PIPE_LOGICOP_COPY_INVERTED:   /* ~s */
+      return LLVMBuildNot(builder, src, "");
+   case PIPE_LOGICOP_NOOP:            /* d */
+      return dst;
+   case PIPE_LOGICOP_INVERT:          /* ~d */
+      return LLVMBuildNot(builder, dst, "");
+
+   /* and */
+   case PIPE_LOGICOP_AND:             /* s & d */
+      return LLVMBuildAnd(builder, src, dst, "");
+   case PIPE_LOGICOP_AND_REVERSE:     /* s & ~d */
+      return LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), "");
+   case PIPE_LOGICOP_AND_INVERTED:    /* ~s & d */
+      return LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, "");
+   case PIPE_LOGICOP_NAND:            /* ~(s & d) */
+      return LLVMBuildNot(builder, LLVMBuildAnd(builder, src, dst, ""), "");
+
+   /* or */
+   case PIPE_LOGICOP_OR:              /* s | d */
+      return LLVMBuildOr(builder, src, dst, "");
+   case PIPE_LOGICOP_OR_REVERSE:      /* s | ~d */
+      return LLVMBuildOr(builder, src, LLVMBuildNot(builder, dst, ""), "");
+   case PIPE_LOGICOP_OR_INVERTED:     /* ~s | d */
+      return LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, "");
+   case PIPE_LOGICOP_NOR:             /* ~(s | d) */
+      return LLVMBuildNot(builder, LLVMBuildOr(builder, src, dst, ""), "");
+
+   /* xor */
+   case PIPE_LOGICOP_XOR:             /* s ^ d */
+      return LLVMBuildXor(builder, src, dst, "");
+   case PIPE_LOGICOP_EQUIV:           /* ~(s ^ d) */
+      return LLVMBuildNot(builder, LLVMBuildXor(builder, src, dst, ""), "");
+
+   default:
+      assert(0);
+      return src;
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
new file mode 100644
index 0000000000..b9c7a6ceed
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -0,0 +1,326 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- SoA layout.
+ *
+ * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
+ * factors/functions are used, since no channel masking/shuffling is necessary
+ * and we can achieve the full throughput of the SIMD operations. Furthermore
+ * the fragment shader output is also in SoA, so it fits nicely with the rest
+ * of the fragment pipeline.
+ *
+ * The drawback is that to be displayed the color buffer needs to be in AoS
+ * layout, so we need to tile/untile the color buffer before/after rendering.
+ * A color buffer like
+ *
+ *  R11 G11 B11 A11 R12 G12 B12 A12  R13 G13 B13 A13 R14 G14 B14 A14  ...
+ *  R21 G21 B21 A21 R22 G22 B22 A22  R23 G23 B23 A23 R24 G24 B24 A24  ...
+ *
+ *  R31 G31 B31 A31 R32 G32 B32 A32  R33 G33 B33 A33 R34 G34 B34 A34  ...
+ *  R41 G41 B41 A41 R42 G42 B42 A42  R43 G43 B43 A43 R44 G44 B44 A44  ...
+ *
+ *  ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ...  ...
+ *
+ * will actually be stored in memory as
+ *
+ *  R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
+ *  R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
+ *  ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+ *
+ * NOTE: Run lp_blend_test after any change to this file.
+ *
+ * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
+ * as:
+ *
+ *  lp_blend_test -o blend.tsv
+ *
+ * will generate a tab-separated file with the test results and performance
+ * measurements.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_arit.h"
+#include "lp_bld_blend.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ */
+struct lp_build_blend_soa_context
+{
+   struct lp_build_context base;
+
+   LLVMValueRef src[4];   /* copies of the src/dst/con arguments, per channel */
+   LLVMValueRef dst[4];
+   LLVMValueRef con[4];
+
+   LLVMValueRef inv_src[4];   /* lp_build_comp() of the above, NULL until needed */
+   LLVMValueRef inv_dst[4];
+   LLVMValueRef inv_con[4];
+
+   LLVMValueRef src_alpha_saturate;   /* min(src[3], inv_dst[3]), NULL until needed */
+
+   /**
+    * We store all factors in a table in order to eliminate redundant
+    * multiplications later.
+    * Indexes are: factor[src,dst][value,factor][r,g,b,a]
+    */
+   LLVMValueRef factor[2][2][4];
+
+   /**
+    * Table with all terms (value * factor).
+    * Indexes are: term[src,dst][r,g,b,a]
+    */
+   LLVMValueRef term[2][4];
+};
+
+
+/**
+ * Build a single SOA blend factor for a color channel.
+ *
+ * Colour factors read channel i, alpha factors always read channel 3.
+ * Complements and the saturate term are created on first use and cached in
+ * bld, so later channels reuse the same LLVM value.
+ *
+ * \param factor  one of PIPE_BLENDFACTOR_x
+ * \param i  the color channel in [0,3]
+ */
+static LLVMValueRef
+lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
+                          unsigned factor, unsigned i)
+{
+   struct lp_build_context *base = &bld->base;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return base->zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return base->one;
+
+   /* values used directly */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      return bld->src[i];
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src[3];
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      return bld->dst[i];
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst[3];
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      return bld->con[i];
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->con[3];
+
+   /* complements, built once per channel */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      if(!bld->inv_src[i])
+         bld->inv_src[i] = lp_build_comp(base, bld->src[i]);
+      return bld->inv_src[i];
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src[3])
+         bld->inv_src[3] = lp_build_comp(base, bld->src[3]);
+      return bld->inv_src[3];
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      if(!bld->inv_dst[i])
+         bld->inv_dst[i] = lp_build_comp(base, bld->dst[i]);
+      return bld->inv_dst[i];
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if(!bld->inv_dst[3])
+         bld->inv_dst[3] = lp_build_comp(base, bld->dst[3]);
+      return bld->inv_dst[3];
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      if(!bld->inv_con[i])
+         bld->inv_con[i] = lp_build_comp(base, bld->con[i]);
+      return bld->inv_con[i];
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_con[3])
+         bld->inv_con[3] = lp_build_comp(base, bld->con[3]);
+      return bld->inv_con[3];
+
+   /* one for the alpha channel, min(src alpha, 1 - dst alpha) otherwise */
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if(i == 3)
+         return base->one;
+      if(!bld->inv_dst[3])
+         bld->inv_dst[3] = lp_build_comp(base, bld->dst[3]);
+      if(!bld->src_alpha_saturate)
+         bld->src_alpha_saturate = lp_build_min(base, bld->src[3],
+                                                bld->inv_dst[3]);
+      return bld->src_alpha_saturate;
+
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* TODO: the SRC1 factors are not implemented yet */
+   default:
+      assert(0);
+      return base->zero;
+   }
+}
+
+
+/**
+ * Generate blend code in SOA mode.
+ *
+ * Channels masked out by the colormask keep dst; otherwise the logic op, the
+ * blend equation or a plain copy of src is emitted, in that order of priority.
+ *
+ * \param rt  render target index (to index the blend / colormask state)
+ * \param src  src/fragment color
+ * \param dst  dst/framebuffer color
+ * \param con  constant blend color
+ * \param res  the result/output
+ */
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   unsigned rt,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef con[4],
+                   LLVMValueRef res[4])
+{
+   struct lp_build_blend_soa_context bld;
+   unsigned i, j, k;
+
+   assert(rt < PIPE_MAX_COLOR_BUFS);
+
+   /* Setup build context (zeroed, so cached values and tables start NULL) */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   for (i = 0; i < 4; ++i) {
+      bld.src[i] = src[i];
+      bld.dst[i] = dst[i];
+      bld.con[i] = con[i];
+   }
+
+   for (i = 0; i < 4; ++i) {
+      /* only compute blending for the color channels enabled for writing */
+      if (blend->rt[rt].colormask & (1 << i)) {
+         if (blend->logicop_enable) {
+            if(!type.floating) {
+               res[i] = lp_build_logicop(builder, blend->logicop_func, src[i], dst[i]);
+            }
+            else   /* no logic op is emitted for floating point types */
+               res[i] = dst[i];
+         }
+         else if (blend->rt[rt].blend_enable) {
+            unsigned src_factor = i < 3 ? blend->rt[rt].rgb_src_factor : blend->rt[rt].alpha_src_factor;
+            unsigned dst_factor = i < 3 ? blend->rt[rt].rgb_dst_factor : blend->rt[rt].alpha_dst_factor;
+            unsigned func = i < 3 ? blend->rt[rt].rgb_func : blend->rt[rt].alpha_func;
+            boolean func_commutative = lp_build_blend_func_commutative(func);
+
+            /* It makes no sense to blend unless values are normalized */
+            assert(type.norm);
+
+            /* Compute src/dst factors: [k][0] is the value, [k][1] its factor */
+
+            bld.factor[0][0][i] = src[i];
+            bld.factor[0][1][i] = lp_build_blend_soa_factor(&bld, src_factor, i);
+            bld.factor[1][0][i] = dst[i];
+            bld.factor[1][1][i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
+
+            /* Compute src/dst terms */
+
+            for(k = 0; k < 2; ++k) {
+               /* See if this multiplication has been previously computed */
+               for(j = 0; j < i; ++j) {
+                  if((bld.factor[k][0][j] == bld.factor[k][0][i] &&
+                      bld.factor[k][1][j] == bld.factor[k][1][i]) ||
+                     (bld.factor[k][0][j] == bld.factor[k][1][i] &&
+                      bld.factor[k][1][j] == bld.factor[k][0][i]))
+                     break;
+               }
+
+               if(j < i)
+                  bld.term[k][i] = bld.term[k][j];
+               else
+                  bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i], bld.factor[k][1][i]);
+
+               if (src_factor == PIPE_BLENDFACTOR_ZERO &&
+                   (dst_factor == PIPE_BLENDFACTOR_DST_ALPHA ||
+                    dst_factor == PIPE_BLENDFACTOR_INV_DST_ALPHA)) {
+                  /* XXX special case these combos to work around an apparent
+                   * bug in LLVM.
+                   * This hack disables the check for multiplication by zero
+                   * in lp_build_mul().  When we optimize away the
+                   * multiplication, something goes wrong during code
+                   * generation and we segfault at runtime.
+                   */
+                  LLVMValueRef zeroSave = bld.base.zero;
+                  bld.base.zero = NULL;
+                  bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i],
+                                                bld.factor[k][1][i]);
+                  bld.base.zero = zeroSave;
+               }
+            }
+
+            /* Combine terms */
+
+            /* See if this function has been previously applied */
+            for(j = 0; j < i; ++j) {
+               unsigned prev_func = j < 3 ? blend->rt[rt].rgb_func : blend->rt[rt].alpha_func;
+               boolean func_same = func == prev_func;
+               boolean func_reverse = lp_build_blend_func_reverse(func, prev_func);
+               /* Reuse needs the same function on the same terms, or swapped
+                * terms with a commutative (same) or reversed function. */
+               if((func_same &&
+                   bld.term[0][j] == bld.term[0][i] &&
+                   bld.term[1][j] == bld.term[1][i]) ||
+                  (((func_same && func_commutative) || func_reverse) &&
+                   bld.term[0][j] == bld.term[1][i] &&
+                   bld.term[1][j] == bld.term[0][i]))
+                  break;
+            }
+
+            if(j < i)
+               res[i] = res[j];
+            else
+               res[i] = lp_build_blend_func(&bld.base, func, bld.term[0][i], bld.term[1][i]);
+         }
+         else {
+            res[i] = src[i];
+         }
+      }
+      else {
+         res[i] = dst[i];
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
new file mode 100644
index 0000000000..e05bbe5011
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -0,0 +1,726 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Depth/stencil testing to LLVM IR translation.
+ *
+ * To be done accurately/efficiently the depth/stencil test must be done with
+ * the same type/format of the depth/stencil buffer, which implies massaging
+ * the incoming depths to fit into place. Using a more straightforward
+ * type/format for depth/stencil values internally and only convert when
+ * flushing would avoid this, but it would most likely result in depth fighting
+ * artifacts.
+ *
+ * We are free to use a different pixel layout though. Since our basic
+ * processing unit is a quad (2x2 pixel block) we store the depth/stencil
+ * values tiled, a quad at a time. That is, a depth buffer containing
+ *
+ *  Z11 Z12 Z13 Z14 ...
+ *  Z21 Z22 Z23 Z24 ...
+ *  Z31 Z32 Z33 Z34 ...
+ *  Z41 Z42 Z43 Z44 ...
+ *  ... ... ... ... ...
+ *
+ * will actually be stored in memory as
+ *
+ *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
+ *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
+ *  ... ... ... ... ... ... ... ... ...
+ *
+ *
+ * Stencil test:
+ * Two-sided stencil test is supported but probably not as efficient as
+ * it could be.  Currently, we use if/then/else constructs to do the
+ * operations for front vs. back-facing polygons.  We could probably do
+ * both the front and back arithmetic then use a Select() instruction to
+ * choose the result depending on polygon orientation.  We'd have to
+ * measure performance both ways and see which is better.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_flow.h"
+#include "gallivm/lp_bld_intr.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_swizzle.h"
+
+#include "lp_bld_depth.h"
+
+
+/** Used to select fields from pipe_stencil_state */
+enum stencil_op {
+   S_FAIL_OP,   /* selects stencil->fail_op */
+   Z_FAIL_OP,   /* selects stencil->zfail_op */
+   Z_PASS_OP    /* selects stencil->zpass_op */
+};
+
+
+
+/**
+ * Emit the stencil comparison of the reference value against the stencil
+ * values read from the framebuffer, using one face's stencil state.
+ * Called once per face when two-sided stencil code is generated.
+ * \param stencil  stencil state of the face being tested
+ * \param stencilRef  vector holding the replicated stencil reference value
+ * \param stencilVals  vector of stencil values from the framebuffer
+ * \return vector mask of pass/fail values (~0 or 0)
+ */
+static LLVMValueRef
+lp_build_stencil_test_single(struct lp_build_context *bld,
+                             const struct pipe_stencil_state *stencil,
+                             LLVMValueRef stencilRef,
+                             LLVMValueRef stencilVals)
+{
+   const unsigned full_mask = 255; /* XXX fix */
+   const struct lp_type int_type = bld->type;
+   LLVMValueRef ref = stencilRef;
+   LLVMValueRef vals = stencilVals;
+
+   assert(int_type.sign);
+   assert(stencil->enabled);
+
+   /* the ANDs are only emitted when the value mask differs from full_mask */
+   if (stencil->valuemask != full_mask) {
+      LLVMValueRef vmask = lp_build_const_int_vec(int_type,
+                                                  stencil->valuemask);
+
+      ref = LLVMBuildAnd(bld->builder, ref, vmask, "");
+      vals = LLVMBuildAnd(bld->builder, vals, vmask, "");
+   }
+
+   return lp_build_cmp(bld, stencil->func, ref, vals);
+}
+
+
+/**
+ * Emit the stencil test for one-sided or two-sided stencil state.
+ * \sa lp_build_stencil_test_single
+ * \param face  value compared against 0.0: greater than zero selects the
+ *              front stencil state, otherwise the back state is used.
+ *              If NULL, the front state is always used.
+ */
+static LLVMValueRef
+lp_build_stencil_test(struct lp_build_context *bld,
+                      const struct pipe_stencil_state stencil[2],
+                      LLVMValueRef stencilRefs[2],
+                      LLVMValueRef stencilVals,
+                      LLVMValueRef face)
+{
+   struct lp_build_flow_context *flow;
+   struct lp_build_if_state ifthen;
+   LLVMValueRef is_front;
+   LLVMValueRef fzero;
+   LLVMValueRef pass;
+
+   assert(stencil[0].enabled);
+
+   /* single-sided: only the front state is consulted */
+   if (!stencil[1].enabled || !face) {
+      return lp_build_stencil_test_single(bld, &stencil[0],
+                                          stencilRefs[0], stencilVals);
+   }
+
+   /* two-sided: branch on the facing and test with the matching state */
+   fzero = LLVMConstReal(LLVMFloatType(), 0.0);
+   pass = bld->undef;
+
+   flow = lp_build_flow_create(bld->builder);
+   lp_build_flow_scope_begin(flow);
+
+   /* declare 'pass' to the flow scope; it is assigned in both branches */
+   lp_build_flow_scope_declare(flow, &pass);
+
+   /* is_front = face > 0.0 */
+   is_front = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, fzero, "");
+
+   lp_build_if(&ifthen, flow, bld->builder, is_front);
+   {
+      pass = lp_build_stencil_test_single(bld, &stencil[0],
+                                          stencilRefs[0], stencilVals);
+   }
+   lp_build_else(&ifthen);
+   {
+      pass = lp_build_stencil_test_single(bld, &stencil[1],
+                                          stencilRefs[1], stencilVals);
+   }
+   lp_build_endif(&ifthen);
+
+   lp_build_flow_scope_end(flow);
+   lp_build_flow_destroy(flow);
+
+   return pass;
+}
+
+
+/**
+ * Apply one stencil operator (keep/zero/replace/incr/decr/invert/...) to
+ * the given vector of stencil values.
+ * \param op  selects the fail_op, zfail_op or zpass_op field of the state
+ * \param mask  only the vector elements enabled by this mask are updated
+ * \return  new stencil values vector
+ */
+static LLVMValueRef
+lp_build_stencil_op_single(struct lp_build_context *bld,
+                           const struct pipe_stencil_state *stencil,
+                           enum stencil_op op,
+                           LLVMValueRef stencilRef,
+                           LLVMValueRef stencilVals,
+                           LLVMValueRef mask)
+{
+   const unsigned stencilMax = 255; /* XXX fix */
+   struct lp_type type = bld->type;
+   LLVMValueRef res;
+   LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+   unsigned stencil_op;
+
+   assert(type.sign);
+
+   switch (op) {
+   case S_FAIL_OP:
+      stencil_op = stencil->fail_op;
+      break;
+   case Z_FAIL_OP:
+      stencil_op = stencil->zfail_op;
+      break;
+   case Z_PASS_OP:
+      stencil_op = stencil->zpass_op;
+      break;
+   default:
+      assert(0 && "Invalid stencil_op mode");
+      stencil_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   switch (stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* nothing changes: return early, no writemask or select needed */
+      return stencilVals;
+   case PIPE_STENCIL_OP_ZERO:
+      res = bld->zero;
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      res = stencilRef;
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      res = lp_build_add(bld, stencilVals, bld->one);
+      res = lp_build_min(bld, res, max);
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      res = lp_build_sub(bld, stencilVals, bld->one);
+      res = lp_build_max(bld, res, bld->zero);
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      res = lp_build_add(bld, stencilVals, bld->one);
+      res = LLVMBuildAnd(bld->builder, res, max, "");
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      res = lp_build_sub(bld, stencilVals, bld->one);
+      res = LLVMBuildAnd(bld->builder, res, max, "");
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      res = LLVMBuildNot(bld->builder, stencilVals, "");
+      res = LLVMBuildAnd(bld->builder, res, max, "");
+      break;
+   default:
+      assert(0 && "bad stencil op mode");
+      return stencilVals;  /* treat as KEEP rather than pass NULL to LLVM */
+   }
+
+   if (stencil->writemask != stencilMax) {
+      /* compute res = (res & writemask) | (stencilVals & ~writemask) */
+      LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask);
+      LLVMValueRef cmask = LLVMBuildNot(bld->builder, writemask, "notWritemask");
+      LLVMValueRef t1 = LLVMBuildAnd(bld->builder, res, writemask, "t1");
+      LLVMValueRef t2 = LLVMBuildAnd(bld->builder, stencilVals, cmask, "t2");
+      res = LLVMBuildOr(bld->builder, t1, t2, "t1_or_t2");
+   }
+
+   /* only update the vector elements enabled by 'mask' */
+   res = lp_build_select(bld, mask, res, stencilVals);
+
+   return res;
+}
+
+
+/**
+ * Apply the stencil operator selected by \p op for one-sided or two-sided
+ * stencil state.  \sa lp_build_stencil_op_single
+ * \param face  compared against 0.0 to choose front/back; NULL means front
+ */
+static LLVMValueRef
+lp_build_stencil_op(struct lp_build_context *bld,
+                    const struct pipe_stencil_state stencil[2],
+                    enum stencil_op op,
+                    LLVMValueRef stencilRefs[2],
+                    LLVMValueRef stencilVals,
+                    LLVMValueRef mask,
+                    LLVMValueRef face)
+{
+   struct lp_build_flow_context *flow;
+   struct lp_build_if_state ifthen;
+   LLVMValueRef is_front, fzero, updated;
+
+   assert(stencil[0].enabled);
+
+   /* single-sided: always use the front state */
+   if (!stencil[1].enabled || !face) {
+      return lp_build_stencil_op_single(bld, &stencil[0], op,
+                                        stencilRefs[0], stencilVals, mask);
+   }
+
+   /* two-sided: apply the front or back operator depending on facing */
+   fzero = LLVMConstReal(LLVMFloatType(), 0.0);
+   updated = bld->undef;
+
+   flow = lp_build_flow_create(bld->builder);
+   lp_build_flow_scope_begin(flow);
+   lp_build_flow_scope_declare(flow, &updated);
+
+   /* is_front = face > 0.0 */
+   is_front = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, fzero, "");
+
+   lp_build_if(&ifthen, flow, bld->builder, is_front);
+   {
+      updated = lp_build_stencil_op_single(bld, &stencil[0], op,
+                                           stencilRefs[0], stencilVals, mask);
+   }
+   lp_build_else(&ifthen);
+   {
+      updated = lp_build_stencil_op_single(bld, &stencil[1], op,
+                                           stencilRefs[1], stencilVals, mask);
+   }
+   lp_build_endif(&ifthen);
+
+   lp_build_flow_scope_end(flow);
+   lp_build_flow_destroy(flow);
+
+   return updated;
+}
+
+
+
+/**
+ * Build the lp_type used for depth/stencil testing with the given format.
+ * \param length  vector size; must be at least the format's block.bits
+ */
+struct lp_type
+lp_depth_type(const struct util_format_description *format_desc,
+              unsigned length)
+{
+   const unsigned z_chan = format_desc->swizzle[0];
+   struct lp_type zs_type;
+
+   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(z_chan < 4);
+
+   memset(&zs_type, 0, sizeof zs_type);
+   zs_type.width = format_desc->block.bits;
+
+   switch (format_desc->channel[z_chan].type) {
+   case UTIL_FORMAT_TYPE_FLOAT:
+      zs_type.floating = TRUE;
+      assert(z_chan == 0);
+      assert(format_desc->channel[z_chan].size == format_desc->block.bits);
+      break;
+   case UTIL_FORMAT_TYPE_UNSIGNED:
+      assert(format_desc->block.bits <= 32);
+      if (format_desc->channel[z_chan].normalized)
+         zs_type.norm = TRUE;
+      break;
+   default:
+      assert(0);
+   }
+
+   assert(zs_type.width <= length);
+   zs_type.length = length / zs_type.width;
+   return zs_type;
+}
+
+
+/**
+ * Compute the bit shift to apply to incoming fragment Z values and the
+ * bitmask to apply to both fragment and Z buffer values before the Z
+ * comparison.  The Z bits stay where the Z buffer format puts them
+ * (typically 0xffffff00 or 0x00ffffff), which saves bit twiddling steps.
+ *
+ * \return FALSE if the format has no Z channel (outputs are not written),
+ *         TRUE otherwise
+ */
+static boolean
+get_z_shift_and_mask(const struct util_format_description *format_desc,
+                     unsigned *shift, unsigned *mask)
+{
+   const unsigned block_bits = format_desc->block.bits;
+   const unsigned z_chan = format_desc->swizzle[0];
+   unsigned bits_below = 0;   /* total size of the channels indexed below Z */
+   unsigned bits_above;       /* block bits remaining beyond the Z channel */
+   unsigned i;
+
+   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+
+   if (z_chan == UTIL_FORMAT_SWIZZLE_NONE)
+      return FALSE;
+
+   for (i = 0; i < z_chan; ++i)
+      bits_below += format_desc->channel[i].size;
+
+   bits_above = block_bits - (bits_below + format_desc->channel[z_chan].size);
+
+   if (bits_above == 0 && bits_below == 0) {
+      /* the Z channel fills the whole block */
+      *mask = 0xffffffff;
+   }
+   else {
+      /* ones from bit 'bits_below' up to the top bit of the Z channel */
+      unsigned long long upto_z_top = (1ULL << (block_bits - bits_above)) - 1;
+      unsigned long long below_z = (1ULL << bits_below) - 1;
+      *mask = upto_z_top ^ below_z;
+   }
+
+   *shift = bits_above;
+
+   return TRUE;
+}
+
+
+/**
+ * Compute the right-shift and the bitmask that bring the stencil bits of
+ * a framebuffer pixel value down to the least significant position
+ * (i.e. 0x000000ff).
+ * \return FALSE if the format has no stencil channel, TRUE otherwise
+ */
+static boolean
+get_s_shift_and_mask(const struct util_format_description *format_desc,
+                     unsigned *shift, unsigned *mask)
+{
+   const unsigned s_chan = format_desc->swizzle[1];
+   unsigned bits_below = 0;
+   unsigned i;
+
+   if (s_chan == UTIL_FORMAT_SWIZZLE_NONE)
+      return FALSE;
+
+   /* the shift is the total size of all lower-indexed channels */
+   for (i = 0; i < s_chan; i++)
+      bits_below += format_desc->channel[i].size;
+
+   *shift = bits_below;
+   *mask = (1U << format_desc->channel[s_chan].size) - 1U;
+
+   return TRUE;
+}
+
+
+/**
+ * Emit code that adds the number of set mask elements to a counter.
+ * Each element is ANDed with 1, the vector is viewed as 16 x i8, bytes 0, 4,
+ * 8 and 12 are gathered into one i32 and its set bits are counted; e.g. a
+ * maskvalue of {-1, -1, -1, -1} adds 4.  NOTE: the 16 x i8 view is hard-coded,
+ * so this presumably expects a 4 x 32-bit mask (bit 0 in byte 0) - confirm.
+ * \param type  element type of the mask vector
+ * \param maskvalue  the depth test mask
+ * \param counter  pointer to the uint32 counter (loaded, incremented, stored)
+ */
+static void
+lp_build_occlusion_count(LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef maskvalue,
+                         LLVMValueRef counter)
+{
+   LLVMValueRef countmask = lp_build_const_int_vec(type, 1);
+   LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
+   LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8Type(), 16);
+   LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti");
+   LLVMValueRef maskarray[4] = {
+      LLVMConstInt(LLVMInt32Type(), 0, 0),
+      LLVMConstInt(LLVMInt32Type(), 4, 0),
+      LLVMConstInt(LLVMInt32Type(), 8, 0),
+      LLVMConstInt(LLVMInt32Type(), 12, 0),
+   };
+   LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4);
+   LLVMValueRef shufflev =  LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev");
+   LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32Type(), "shuffle");
+   LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32Type(), shuffle);
+   LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig");
+   LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr");
+   LLVMBuildStore(builder, incr, counter);
+}
+
+
+
+/**
+ * Generate code for performing depth and/or stencil tests.
+ * We operate on a vector of values (typically a 2x2 quad).
+ *
+ * \param depth  the depth test state
+ * \param stencil  the front/back stencil state
+ * \param type  the data type of the fragment depth/stencil values
+ * \param format_desc  description of the depth/stencil surface
+ * \param mask  the alive/dead pixel mask for the quad (vector)
+ * \param stencil_refs  the front/back stencil ref values (scalar); replaced
+ *                      in place by vectors when the stencil test is enabled
+ * \param z_src  the incoming depth/stencil values (a 2x2 quad)
+ * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
+ * \param face  contains float value indicating front/back facing polygon
+ * \param counter  if non-NULL, pointer to the occlusion counter to update
+ */
+void
+lp_build_depth_stencil_test(LLVMBuilderRef builder,
+                            const struct pipe_depth_state *depth,
+                            const struct pipe_stencil_state stencil[2],
+                            struct lp_type type,
+                            const struct util_format_description *format_desc,
+                            struct lp_build_mask_context *mask,
+                            LLVMValueRef stencil_refs[2],
+                            LLVMValueRef z_src,
+                            LLVMValueRef zs_dst_ptr,
+                            LLVMValueRef face,
+                            LLVMValueRef counter)
+{
+   struct lp_build_context bld;
+   struct lp_build_context sbld;
+   struct lp_type s_type;
+   LLVMValueRef zs_dst, z_dst = NULL;
+   LLVMValueRef stencil_vals = NULL;
+   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
+   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
+   LLVMValueRef orig_mask = mask->value;
+
+   /* Sanity checking */
+   {
+      const unsigned z_swizzle = format_desc->swizzle[0];
+      const unsigned s_swizzle = format_desc->swizzle[1];
+
+      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
+             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+
+      assert(depth->enabled || stencil[0].enabled);
+
+      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+      assert(format_desc->block.width == 1);
+      assert(format_desc->block.height == 1);
+
+      if (stencil[0].enabled) {
+         assert(format_desc->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
+                format_desc->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM);
+      }
+
+      assert(z_swizzle < 4);
+      assert(format_desc->block.bits == type.width);
+      if (type.floating) {
+         assert(z_swizzle == 0);
+         assert(format_desc->channel[z_swizzle].type ==
+                UTIL_FORMAT_TYPE_FLOAT);
+         assert(format_desc->channel[z_swizzle].size ==
+                format_desc->block.bits);
+      }
+      else {
+         assert(format_desc->channel[z_swizzle].type ==
+                UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(format_desc->channel[z_swizzle].normalized);
+         assert(!type.fixed);
+         assert(!type.sign);
+         assert(type.norm);
+      }
+   }
+
+   /* Setup build context for Z vals */
+   lp_build_context_init(&bld, builder, type);
+
+   /* Setup build context for stencil vals */
+   s_type = lp_type_int_vec(type.width);
+   lp_build_context_init(&sbld, builder, s_type);
+
+   /* Load current z/stencil value from z/stencil buffer */
+   zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   lp_build_name(zs_dst, "zsbufval");
+
+   /* Compute and apply the Z/stencil bitmasks and shifts. */
+   {
+      unsigned z_shift, z_mask;
+      unsigned s_shift, s_mask;
+
+      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
+         if (z_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
+            z_src = LLVMBuildLShr(builder, z_src, shift, "");
+         }
+
+         if (z_mask != 0xffffffff) {
+            LLVMValueRef mask_vec = lp_build_const_int_vec(type, z_mask);
+            z_src = LLVMBuildAnd(builder, z_src, mask_vec, "");
+            z_dst = LLVMBuildAnd(builder, zs_dst, mask_vec, "");
+            z_bitmask = mask_vec;  /* used below */
+         }
+         else {
+            z_dst = zs_dst;
+         }
+
+         lp_build_name(z_dst, "zsbuf.z");
+      }
+
+      if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
+         if (s_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
+            stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
+            stencil_shift = shift;  /* used below */
+         }
+         else {
+            stencil_vals = zs_dst;
+         }
+
+         if (s_mask != 0xffffffff) {
+            LLVMValueRef mask_vec = lp_build_const_int_vec(type, s_mask);
+            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask_vec, "");
+         }
+
+         lp_build_name(stencil_vals, "stencil");
+      }
+   }
+
+   if (stencil[0].enabled) {
+      /* convert scalar stencil refs into vectors */
+      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
+      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+
+      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
+                                          stencil_refs, stencil_vals, face);
+
+      /* apply stencil-fail operator */
+      {
+         LLVMValueRef s_fail_mask = lp_build_andc(&bld, orig_mask, s_pass_mask);
+         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+                                            stencil_refs, stencil_vals,
+                                            s_fail_mask, face);
+      }
+   }
+
+   if (depth->enabled) {
+      /* compare src Z to dst Z, returning 'pass' mask */
+      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+
+      if (!stencil[0].enabled) {
+         /* We can potentially skip all remaining operations here, but only
+          * if stencil is disabled because we still need to update the stencil
+          * buffer values.  Don't need to update Z buffer values.
+          */
+         lp_build_mask_update(mask, z_pass);
+      }
+
+      if (depth->writemask) {
+         LLVMValueRef zselectmask = mask->value;
+
+         /* mask off bits that failed Z test */
+         zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
+
+         /* mask off bits that failed stencil test */
+         if (s_pass_mask) {
+            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
+         }
+
+         /* if combined Z/stencil format, mask off the stencil bits */
+         if (z_bitmask) {
+            zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
+         }
+
+         /* Mix the old and new Z buffer values.
+          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
+          */
+         z_dst = lp_build_select(&bld, zselectmask, z_src, z_dst);
+      }
+
+      if (stencil[0].enabled) {
+         /* Update stencil buffer values according to z pass/fail result.
+          * Z-fail/Z-pass only apply to live fragments that passed the stencil
+          * test; those that failed it already got the stencil-fail operator.
+          */
+         LLVMValueRef z_fail_mask, z_pass_mask;
+         LLVMValueRef s_passed = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
+
+         /* apply Z-fail operator */
+         z_fail_mask = lp_build_andc(&bld, s_passed, z_pass);
+         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+                                            stencil_refs, stencil_vals,
+                                            z_fail_mask, face);
+
+         /* apply Z-pass operator */
+         z_pass_mask = LLVMBuildAnd(bld.builder, s_passed, z_pass, "");
+         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+                                            stencil_refs, stencil_vals,
+                                            z_pass_mask, face);
+      }
+   }
+   else {
+      /* No depth test: apply Z-pass operator to stencil buffer values which
+       * passed the stencil test.
+       */
+      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
+      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+                                         stencil_refs, stencil_vals,
+                                         s_pass_mask, face);
+   }
+
+   /* The Z bits are already in the right place but we may need to shift the
+    * stencil bits before ORing Z with Stencil to make the final pixel value.
+    */
+   if (stencil_vals && stencil_shift)
+      stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+                                  stencil_shift, "");
+
+   /* Finally, merge/store the z/stencil values */
+   if ((depth->enabled && depth->writemask) ||
+       (stencil[0].enabled && stencil[0].writemask)) {
+      if (z_dst && stencil_vals)
+         zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
+      else if (z_dst)
+         zs_dst = z_dst;
+      else
+         zs_dst = stencil_vals;
+
+      LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
+   }
+
+   if (s_pass_mask)
+      lp_build_mask_update(mask, s_pass_mask);
+
+   if (depth->enabled && stencil[0].enabled)
+      lp_build_mask_update(mask, z_pass);
+
+   if (counter)
+      lp_build_occlusion_count(builder, type, mask->value, counter);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
new file mode 100644
index 0000000000..e257a5bd7d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -0,0 +1,67 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Depth/stencil testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_DEPTH_H
+#define LP_BLD_DEPTH_H
+
+
+#include "gallivm/lp_bld.h"
+
+ 
+struct pipe_depth_state;
+struct util_format_description;
+struct lp_type;
+struct lp_build_mask_context;
+
+/** Return a type appropriate for depth/stencil testing. */
+struct lp_type
+lp_depth_type(const struct util_format_description *format_desc,
+              unsigned length);
+
+/** Generate code for the depth and/or stencil tests; see lp_bld_depth.c. */
+void
+lp_build_depth_stencil_test(LLVMBuilderRef builder,
+                            const struct pipe_depth_state *depth,
+                            const struct pipe_stencil_state stencil[2],
+                            struct lp_type type,
+                            const struct util_format_description *format_desc,
+                            struct lp_build_mask_context *mask,
+                            LLVMValueRef stencil_refs[2],
+                            LLVMValueRef z_src,
+                            LLVMValueRef zs_dst_ptr,
+                            LLVMValueRef face,
+                            LLVMValueRef counter);
+
+
+#endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
new file mode 100644
index 0000000000..90d2b26f9f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -0,0 +1,388 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Position and shader input interpolation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "tgsi/tgsi_scan.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_swizzle.h"
+#include "lp_bld_interp.h"
+
+
+/*
+ * The shader JIT function operates on blocks of quads.
+ * Each block has 2x2 quads and each quad has 2x2 pixels.
+ *
+ * We iterate over the quads in order 0, 1, 2, 3:
+ *
+ * #################
+ * #   |   #   |   #
+ * #---0---#---1---#
+ * #   |   #   |   #
+ * #################
+ * #   |   #   |   #
+ * #---2---#---3---#
+ * #   |   #   |   #
+ * #################
+ *
+ * Within each quad, we have four pixels which are represented in SOA
+ * order:
+ *
+ * #########
+ * # 0 | 1 #
+ * #---+---#
+ * # 2 | 3 #
+ * #########
+ *
+ * So the green channel (for example) of the four pixels is stored in
+ * a single vector register: {g0, g1, g2, g3}.
+ */
+
+/* 2x2 x/y offsets for index 0..3; NOTE(review): unused in this file. */
+static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
+static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};
+
+/* Label val "pos.<c><suffix>" (slot 0) or "input<N>.<c><suffix>" (slot N+1). */
+static void
+attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
+{
+   if (attrib != 0)   /* slots 1..n hold shader inputs, numbered from zero */
+      lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
+   else               /* slot 0 holds the fragment position */
+      lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
+}
+
+
+/**
+ * Fill bld->a[][] and bld->dadq[][] for every enabled attribute channel from
+ * the a0/dadx/dady coefficient arrays which are passed into the JIT function.
+ */
+static void
+coeffs_init(struct lp_build_interp_soa_context *bld,
+            LLVMValueRef a0_ptr,
+            LLVMValueRef dadx_ptr,
+            LLVMValueRef dady_ptr)
+{
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   LLVMBuilderRef builder = coeff_bld->builder;
+   LLVMValueRef zero = LLVMConstNull(coeff_bld->elem_type);
+   LLVMValueRef one = LLVMConstReal(coeff_bld->elem_type, 1.0);
+   LLVMValueRef i0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef i1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+   LLVMValueRef i2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+   LLVMValueRef i3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+   LLVMValueRef oow = NULL;
+   unsigned attrib;
+   unsigned chan;
+
+   /* TODO: Use more vector operations */
+
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      const unsigned mask = bld->mask[attrib];
+      const unsigned interp = bld->interp[attrib];
+      for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if (mask & (1 << chan)) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
+            LLVMValueRef a0 = zero;
+            LLVMValueRef dadx = zero;
+            LLVMValueRef dady = zero;
+            LLVMValueRef dadxy = zero;
+            LLVMValueRef dadq;
+            LLVMValueRef dadq2;
+            LLVMValueRef a;
+
+            switch (interp) {
+            case LP_INTERP_PERSPECTIVE:
+               /* fall-through */
+
+            case LP_INTERP_LINEAR:
+               if (attrib == 0 && chan == 0) {   /* pos.x: unit slope in x */
+                  dadxy = dadx = one;
+               }
+               else if (attrib == 0 && chan == 1) {   /* pos.y: unit slope in y */
+                  dadxy = dady = one;
+               }
+               else {
+                  dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
+                  dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
+                  dadxy = LLVMBuildAdd(builder, dadx, dady, "");
+                  attrib_name(dadx, attrib, chan, ".dadx");
+                  attrib_name(dady, attrib, chan, ".dady");
+                  attrib_name(dadxy, attrib, chan, ".dadxy");
+               }
+               /* fall-through */
+
+            case LP_INTERP_CONSTANT:
+            case LP_INTERP_FACING:
+               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
+               attrib_name(a0, attrib, chan, ".a0");
+               break;
+
+            case LP_INTERP_POSITION:
+               /* Nothing to do as the position coeffs are already setup in slot 0 */
+               continue;   /* applies to the enclosing chan loop, not the switch */
+
+            default:
+               assert(0);
+               break;
+            }
+
+            /*
+             * dadq = {0, dadx, dady, dadx + dady}
+             */
+
+            dadq = coeff_bld->undef;
+            dadq = LLVMBuildInsertElement(builder, dadq, zero,  i0, "");
+            dadq = LLVMBuildInsertElement(builder, dadq, dadx,  i1, "");
+            dadq = LLVMBuildInsertElement(builder, dadq, dady,  i2, "");
+            dadq = LLVMBuildInsertElement(builder, dadq, dadxy, i3, "");
+
+            /*
+             * dadq2 = 2 * dadq
+             */
+
+            dadq2 = LLVMBuildAdd(builder, dadq, dadq, "");
+
+            /*
+             * a = a0 + x * dadx + y * dady
+             */
+
+            if (attrib == 0 && chan == 0) {
+               a = bld->x;
+            }
+            else if (attrib == 0 && chan == 1) {
+               a = bld->y;
+            }
+            else {
+               a = a0;
+               if (interp != LP_INTERP_CONSTANT &&
+                   interp != LP_INTERP_FACING) {
+                  a = LLVMBuildAdd(builder, a,
+                                   LLVMBuildMul(builder, bld->x, dadx, ""),
+                                   "");
+                  a = LLVMBuildAdd(builder, a,
+                                   LLVMBuildMul(builder, bld->y, dady, ""),
+                                   "");
+               }
+            }
+
+            /*
+             * a = {a, a, a, a}
+             */
+
+            a = lp_build_broadcast(builder, coeff_bld->vec_type, a);
+
+            /*
+             * a[i] = value at the upper-left pixel of quad i (quads are 2 pixels apart).
+             */
+
+            a = LLVMBuildAdd(builder, a, dadq2, "");
+
+            /*
+             * a    *= 1 / w
+             * dadq *= 1 / w
+             */
+
+            if (interp == LP_INTERP_PERSPECTIVE) {
+               LLVMValueRef w = bld->a[0][3];
+               assert(attrib != 0);
+               assert(bld->mask[0] & TGSI_WRITEMASK_W);
+               if (!oow) {   /* 1/w is built once and reused by later attribs */
+                  oow = lp_build_rcp(coeff_bld, w);
+                  lp_build_name(oow, "oow");
+               }
+               a = lp_build_mul(coeff_bld, a, oow);
+               dadq = lp_build_mul(coeff_bld, dadq, oow);
+            }
+
+            attrib_name(a, attrib, chan, ".a");
+            attrib_name(dadq, attrib, chan, ".dadq");
+
+            bld->a   [attrib][chan] = a;
+            bld->dadq[attrib][chan] = dadq;
+         }
+      }
+   }
+}
+
+
+/**
+ * Recompute bld->attribs[][] for the quad selected by quad_index.
+ * Used both for the initial quad and whenever the shader moves to another one.
+ */
+static void
+attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+{
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   LLVMValueRef lane = lp_build_const_int_vec(coeff_bld->type, quad_index);
+   unsigned slot;
+   unsigned c;
+
+   assert(quad_index < 4);
+
+   for (slot = 0; slot < bld->num_attribs; ++slot) {
+      const unsigned enabled = bld->mask[slot];
+      const unsigned mode = bld->interp[slot];
+
+      for (c = 0; c < NUM_CHANNELS; ++c) {
+         LLVMValueRef value;
+
+         /* Channels outside the usage mask keep their previous value. */
+         if (!(enabled & (1 << c)))
+            continue;
+
+         switch (mode) {
+         case LP_INTERP_CONSTANT:
+         case LP_INTERP_FACING:
+            /* Same value for every quad: no stepping needed. */
+            value = bld->a[slot][c];
+            break;
+
+         case LP_INTERP_POSITION:
+            /* Alias of the position attribute, already updated in slot 0. */
+            assert(slot > 0);
+            value = bld->attribs[0][c];
+            break;
+
+         default:
+            /* Splat this quad's upper-left value, then add per-pixel deltas. */
+            value = LLVMBuildShuffleVector(coeff_bld->builder, bld->a[slot][c],
+                                           coeff_bld->undef, lane, "");
+            value = lp_build_add(coeff_bld, value, bld->dadq[slot][c]);
+            attrib_name(value, slot, c, "");
+            break;
+         }
+
+         bld->attribs[slot][c] = value;
+      }
+   }
+}
+
+
+/**
+ * Convert the integer upper-left coordinates x0/y0 to the coefficient
+ * element type and store them in bld->x and bld->y.
+ */
+static void
+pos_init(struct lp_build_interp_soa_context *bld,
+         LLVMValueRef x0,
+         LLVMValueRef y0)
+{
+   LLVMBuilderRef builder = bld->coeff_bld.builder;
+   LLVMTypeRef elem_type = bld->coeff_bld.elem_type;
+
+   bld->x = LLVMBuildSIToFP(builder, x0, elem_type, "");
+   bld->y = LLVMBuildSIToFP(builder, y0, elem_type, "");
+}
+
+
+/**
+ * Set up interpolation for the position (slot 0) and num_inputs shader
+ * inputs, emitting the coefficient setup and the attribute values of quad 0.
+ * @param inputs  usage mask and interpolation mode of each shader input
+ */
+void
+lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         unsigned num_inputs,
+                         const struct lp_shader_input *inputs,
+                         LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef a0_ptr,
+                         LLVMValueRef dadx_ptr,
+                         LLVMValueRef dady_ptr,
+                         LLVMValueRef x0,
+                         LLVMValueRef y0)
+{
+   struct lp_type coeff_type;
+   unsigned attrib, chan;
+
+   memset(bld, 0, sizeof *bld);
+
+   memset(&coeff_type, 0, sizeof coeff_type);
+   coeff_type.floating = TRUE;
+   coeff_type.sign = TRUE;
+   coeff_type.width = 32;
+   coeff_type.length = QUAD_SIZE;
+
+   /* XXX: we don't support interpolating into any other types */
+   assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
+
+   lp_build_context_init(&bld->coeff_bld, builder, coeff_type);
+
+   /* For convenience */
+   bld->pos = bld->attribs[0];
+   bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
+
+   /* Position always occupies slot 0 */
+   bld->mask[0] = TGSI_WRITEMASK_XYZW;
+   bld->interp[0] = LP_INTERP_LINEAR;
+
+   /* Shader inputs use slots 1..num_inputs; guard the fixed-size arrays */
+   assert(num_inputs <= PIPE_MAX_SHADER_INPUTS);
+   for (attrib = 0; attrib < num_inputs; ++attrib) {
+      bld->mask[1 + attrib] = inputs[attrib].usage_mask;
+      bld->interp[1 + attrib] = inputs[attrib].interp;
+   }
+   bld->num_attribs = 1 + num_inputs;
+
+   /* Ensure all masked out input channels have a valid value */
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+         bld->attribs[attrib][chan] = bld->coeff_bld.undef;
+      }
+   }
+
+   /* Order matters: coeffs_init() reads bld->x/y, attribs_update() bld->a/dadq */
+   pos_init(bld, x0, y0);
+   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+   attribs_update(bld, 0);
+}
+
+
+/**
+ * Advance the position and inputs to quad quad_index (0..3) of the block.
+ */
+void
+lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+                           int quad_index)
+{
+   assert(quad_index < 4);   /* NOTE(review): negative values are not rejected */
+
+   attribs_update(bld, quad_index);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
new file mode 100644
index 0000000000..2905513301
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Position and shader input interpolation.
+ *
+ * Special attention is given to the interpolation of side-by-side quads.
+ * Multiplications are done only for the first quad. Interpolation of
+ * inputs for subsequent quads is done exclusively with additions, plus a
+ * perspective divide if necessary.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_INTERP_H
+#define LP_BLD_INTERP_H
+
+
+#include "gallivm/lp_bld.h"
+#include "gallivm/lp_bld_type.h"
+
+#include "tgsi/tgsi_exec.h"
+
+#include "lp_setup.h"
+
+
+struct lp_build_interp_soa_context
+{
+   /* QUAD_SIZE x float */
+   struct lp_build_context coeff_bld;
+   /* Slot 0 is the position; shader input i lives in slot 1 + i. */
+   unsigned num_attribs;
+   unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /**< TGSI_WRITEMASK_x */
+   enum lp_interp interp[1 + PIPE_MAX_SHADER_INPUTS];
+   /* Upper-left coordinates as floats, set by pos_init(). */
+   LLVMValueRef x;
+   LLVMValueRef y;
+   /* Per-quad start values and per-pixel deltas, set by coeffs_init(). */
+   LLVMValueRef a   [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   LLVMValueRef dadq[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   /* Values for the current quad, rebuilt by attribs_update(). */
+   LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+
+   /*
+    * Convenience pointers to attribs[0] and attribs[1..]. Callers may access them.
+    */
+   const LLVMValueRef *pos;
+   const LLVMValueRef (*inputs)[NUM_CHANNELS];
+};
+
+/* Set up interpolation state; x/y are named x0/y0 in the definition. */
+void
+lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         unsigned num_inputs,
+                         const struct lp_shader_input *inputs,
+                         LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef a0_ptr,
+                         LLVMValueRef dadx_ptr,
+                         LLVMValueRef dady_ptr,
+                         LLVMValueRef x,
+                         LLVMValueRef y);
+/* Switch bld->attribs to quad quad_index (asserted < 4). */
+void
+lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+                           int quad_index);
+
+
+#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
new file mode 100644
index 0000000000..3e8c410925
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -0,0 +1,58 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2009 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Michel Dänzer
+ */
+
+
+#include "pipe/p_defines.h"
+#include "lp_clear.h"
+#include "lp_context.h"
+#include "lp_setup.h"
+
+
+/**
+ * pipe_context::clear hook: forward a whole-buffer clear (no masking, no
+ * scissor) of the selected buffers to the setup module.
+ */
+void
+llvmpipe_clear(struct pipe_context *pipe,
+               unsigned buffers,
+               const float *rgba,
+               double depth,
+               unsigned stencil)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   /* With no_rast set the clear is skipped entirely. */
+   if (!lp->no_rast) {
+      lp_setup_clear( lp->setup, rgba, depth, stencil, buffers );
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.h b/src/gallium/drivers/llvmpipe/lp_clear.h
new file mode 100644
index 0000000000..6d4ffccdf4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_clear.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ */
+
+#ifndef LP_CLEAR_H
+#define LP_CLEAR_H
+
+#include "pipe/p_state.h"
+struct pipe_context;
+/* Implemented in lp_clear.c; installed as the context's clear() method. */
+extern void
+llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+               double depth, unsigned stencil);
+
+
+#endif /* LP_CLEAR_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
new file mode 100644
index 0000000000..3db4f12ebb
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -0,0 +1,163 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008 VMware, Inc.  All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "lp_clear.h"
+#include "lp_context.h"
+#include "lp_flush.h"
+#include "lp_perf.h"
+#include "lp_state.h"
+#include "lp_surface.h"
+#include "lp_query.h"
+#include "lp_setup.h"
+
+/* pipe_context::destroy hook; also unwinds a failed llvmpipe_create_context(). */
+static void llvmpipe_destroy( struct pipe_context *pipe )
+{
+   struct llvmpipe_context *lp = llvmpipe_context( pipe );
+   unsigned shader, slot, n;
+
+   lp_print_counters();
+
+   /* Destroying the draw module also destroys lp->setup. */
+   if (lp->draw != NULL)
+      draw_destroy( lp->draw );
+
+   /* Drop the framebuffer references: colour buffers, then depth/stencil. */
+   for (n = 0; n < PIPE_MAX_COLOR_BUFS; n++)
+      pipe_surface_reference(&lp->framebuffer.cbufs[n], NULL);
+
+   pipe_surface_reference(&lp->framebuffer.zsbuf, NULL);
+
+   /* Drop the sampler views of both shader stages. */
+   for (n = 0; n < Elements(lp->fragment_sampler_views); n++)
+      pipe_sampler_view_reference(&lp->fragment_sampler_views[n], NULL);
+
+   for (n = 0; n < Elements(lp->vertex_sampler_views); n++)
+      pipe_sampler_view_reference(&lp->vertex_sampler_views[n], NULL);
+
+   /* Drop every bound constant buffer. */
+   for (shader = 0; shader < Elements(lp->constants); shader++) {
+      for (slot = 0; slot < Elements(lp->constants[shader]); slot++) {
+         pipe_resource_reference(&lp->constants[shader][slot], NULL);
+      }
+   }
+
+   align_free( lp );
+}
+
+
+/**
+ * Create a llvmpipe rendering context for the given screen.
+ * Returns the embedded pipe_context, or NULL if the allocation or the
+ * creation of the draw or setup module fails.
+ */
+struct pipe_context *
+llvmpipe_create_context( struct pipe_screen *screen, void *priv )
+{
+   struct llvmpipe_context *llvmpipe;
+
+   llvmpipe = align_malloc(sizeof(struct llvmpipe_context), 16);
+   if (!llvmpipe)
+      return NULL;
+
+   util_init_math();
+
+   memset(llvmpipe, 0, sizeof *llvmpipe);
+
+   make_empty_list(&llvmpipe->fs_variants_list);
+
+   llvmpipe->pipe.winsys = screen->winsys;
+   llvmpipe->pipe.screen = screen;
+   llvmpipe->pipe.priv = priv;
+
+   /* Init the pipe context methods */
+   llvmpipe->pipe.destroy = llvmpipe_destroy;
+   llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state;
+   llvmpipe->pipe.clear = llvmpipe_clear;
+   llvmpipe->pipe.flush = llvmpipe_flush;
+
+   llvmpipe_init_blend_funcs(llvmpipe);
+   llvmpipe_init_clip_funcs(llvmpipe);
+   llvmpipe_init_draw_funcs(llvmpipe);
+   llvmpipe_init_sampler_funcs(llvmpipe);
+   llvmpipe_init_query_funcs( llvmpipe );
+   llvmpipe_init_vertex_funcs(llvmpipe);
+   llvmpipe_init_so_funcs(llvmpipe);
+   llvmpipe_init_fs_funcs(llvmpipe);
+   llvmpipe_init_vs_funcs(llvmpipe);
+   llvmpipe_init_gs_funcs(llvmpipe);
+   llvmpipe_init_rasterizer_funcs(llvmpipe);
+   llvmpipe_init_context_resource_funcs( &llvmpipe->pipe );
+   llvmpipe_init_surface_functions(llvmpipe);
+
+   /*
+    * Create drawing context and plug our rendering stage into it.
+    */
+   llvmpipe->draw = draw_create(&llvmpipe->pipe);
+   if (!llvmpipe->draw)
+      goto fail;
+
+   /* FIXME: devise alternative to draw_texture_samplers */
+
+   if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
+      llvmpipe->no_rast = TRUE;
+
+   llvmpipe->setup = lp_setup_create( &llvmpipe->pipe,
+                                      llvmpipe->draw );
+   if (!llvmpipe->setup)
+      goto fail;
+
+   /* plug in AA line/point and polygon stipple stages, each exactly once */
+   draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
+   draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe);
+   draw_install_pstipple_stage(llvmpipe->draw, &llvmpipe->pipe);
+
+   /* convert points and lines into triangles: */
+   draw_wide_point_threshold(llvmpipe->draw, 0.0);
+   draw_wide_line_threshold(llvmpipe->draw, 0.0);
+
+   lp_reset_counters();
+
+   return &llvmpipe->pipe;
+
+ fail:
+   llvmpipe_destroy(&llvmpipe->pipe);   /* releases whatever was created so far */
+   return NULL;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
new file mode 100644
index 0000000000..986e604ce7
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -0,0 +1,132 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_CONTEXT_H
+#define LP_CONTEXT_H
+
+#include "pipe/p_context.h"
+
+#include "draw/draw_vertex.h"
+
+#include "lp_tex_sample.h"
+#include "lp_jit.h"
+#include "lp_setup.h"
+#include "lp_state_fs.h"
+
+
+struct llvmpipe_vbuf_render;
+struct draw_context;
+struct draw_stage;
+struct lp_fragment_shader;
+struct lp_vertex_shader;
+struct lp_blend_state;
+struct lp_setup_context;
+struct lp_velems_state;
+
+struct llvmpipe_context {
+   struct pipe_context pipe;  /**< base class; llvmpipe_context() casts to it */
+
+   /** Constant state objects */
+   const struct pipe_blend_state *blend;
+   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
+   const struct pipe_depth_stencil_alpha_state *depth_stencil;
+   const struct pipe_rasterizer_state *rasterizer;
+   struct lp_fragment_shader *fs;
+   const struct lp_vertex_shader *vs;
+   const struct lp_geometry_shader *gs;
+   const struct lp_velems_state *velems;
+   const struct lp_so_state *so;
+
+   /** Other rendering state */
+   struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_clip_state clip;
+   struct pipe_resource *constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_view *vertex_sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct {
+      struct llvmpipe_resource *buffer[PIPE_MAX_SO_BUFFERS];
+      int offset[PIPE_MAX_SO_BUFFERS];
+      int so_count[PIPE_MAX_SO_BUFFERS];
+      int num_buffers;
+   } so_target;
+   /* presumably the number of valid entries in the arrays above -- confirm */
+   unsigned num_samplers;
+   unsigned num_fragment_sampler_views;
+   unsigned num_vertex_samplers;
+   unsigned num_vertex_sampler_views;
+   unsigned num_vertex_buffers;
+
+   unsigned dirty; /**< Mask of LP_NEW_x flags */
+
+   int active_query_count;
+
+   /** Mapped vertex buffers */
+   ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
+
+   /** Vertex format */
+   struct vertex_info vertex_info;
+
+   /** Fragment shader input interpolation info */
+   unsigned num_inputs;
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
+
+   /** The tiling engine, created by lp_setup_create() */
+   struct lp_setup_context *setup;
+
+   /** The primitive drawing context, created by draw_create() */
+   struct draw_context *draw;
+
+   unsigned tex_timestamp;
+   boolean no_rast;   /**< set from the LP_NO_RAST debug option */
+
+   struct lp_fs_variant_list_item fs_variants_list;   /**< emptied at creation */
+   unsigned nr_fs_variants;
+};
+
+/* Returns NULL on failure; priv is stored in pipe_context::priv. */
+struct pipe_context *
+llvmpipe_create_context( struct pipe_screen *screen, void *priv );
+
+/* Downcast: struct pipe_context is llvmpipe_context's first member. */
+static INLINE struct llvmpipe_context *
+llvmpipe_context( struct pipe_context *pipe )
+{
+   return (struct llvmpipe_context *)pipe;
+}
+
+#endif /* LP_CONTEXT_H */
+
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
new file mode 100644
index 0000000000..92fb2b3ee5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -0,0 +1,73 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef LP_DEBUG_H
+#define LP_DEBUG_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+
+extern void
+st_print_current(void);
+
+/* Debug category bits: LP_DBG() prints only when the bit passed to it is set in LP_DEBUG. */
+#define DEBUG_PIPE      0x1
+#define DEBUG_TGSI      0x2
+#define DEBUG_TEX       0x4
+#define DEBUG_SETUP     0x10
+#define DEBUG_RAST      0x20
+#define DEBUG_QUERY     0x40
+#define DEBUG_SCREEN    0x80
+#define DEBUG_SHOW_TILES    0x200
+#define DEBUG_SHOW_SUBTILES 0x400
+#define DEBUG_COUNTERS      0x800
+
+/* LP_DEBUG is an external variable when DEBUG is defined, and the constant 0 otherwise. */
+#ifdef DEBUG
+extern int LP_DEBUG;
+#else
+#define LP_DEBUG 0
+#endif
+
+void st_debug_init( void );
+/** printf-style debug message, emitted only when a bit of 'flag' is set in LP_DEBUG. */
+static INLINE void
+LP_DBG( unsigned flag, const char *fmt, ... )
+{
+   va_list ap;
+
+   if (!(LP_DEBUG & flag))
+      return;   /* none of the requested categories is enabled */
+
+   va_start( ap, fmt );
+   debug_vprintf( fmt, ap );
+   va_end( ap );
+}
+
+
+#endif /* LP_DEBUG_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
new file mode 100644
index 0000000000..98780d7631
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -0,0 +1,138 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "util/u_prim.h"
+
+#include "lp_context.h"
+#include "lp_state.h"
+
+#include "draw/draw_context.h"
+
+
+
+/**
+ * Render primitives from the bound vertex buffers, optionally indexed.
+ * The vertex (and index) data pointers are given to the 'draw' module,
+ * which does the drawing; they are cleared again and draw is flushed after.
+ */
+static void
+llvmpipe_draw_range_elements(struct pipe_context *pipe,
+                             struct pipe_resource *indexBuffer,
+                             unsigned indexSize,
+                             int indexBias,
+                             unsigned min_index,
+                             unsigned max_index,
+                             unsigned mode, unsigned start, unsigned count)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct draw_context *draw_ctx = llvmpipe->draw;
+   unsigned vb;
+
+   /* bring derived state up to date before drawing */
+   if (llvmpipe->dirty) {
+      llvmpipe_update_derived( llvmpipe );
+   }
+
+   /* pass each bound vertex buffer's data pointer to the draw module */
+   for (vb = 0; vb < llvmpipe->num_vertex_buffers; vb++) {
+      draw_set_mapped_vertex_buffer(draw_ctx, vb,
+         llvmpipe_resource_data(llvmpipe->vertex_buffer[vb].buffer));
+   }
+
+   if (!indexBuffer) {
+      /*
+       * Non-indexed draw: no element buffer, and the vertex range is
+       * just the [start, start + count - 1] span being drawn.
+       */
+      draw_set_mapped_element_buffer_range(draw_ctx, 0, 0, start,
+                                           start + count - 1, NULL);
+   }
+   else {
+      /* indexed draw: pass the index data and the caller's index bounds */
+      draw_set_mapped_element_buffer_range(draw_ctx, indexSize, indexBias,
+                                           min_index, max_index,
+                                           llvmpipe_resource_data(indexBuffer));
+   }
+
+   /* the draw module does the actual work */
+   draw_arrays(draw_ctx, mode, start, count);
+
+   /* clear the vertex/index buffer pointers again */
+   for (vb = 0; vb < llvmpipe->num_vertex_buffers; vb++) {
+      draw_set_mapped_vertex_buffer(draw_ctx, vb, NULL);
+   }
+
+   if (indexBuffer) {
+      draw_set_mapped_element_buffer(draw_ctx, 0, 0, NULL);
+   }
+
+   /*
+    * TODO: Flush only when a user vertex/index buffer is present
+    * (or even better, modify draw module to do this
+    * internally when this condition is seen?)
+    */
+   draw_flush(draw_ctx);
+}
+
+/** Draw without caller-supplied index bounds: forwards with the widest range, 0 .. 0xffffffff. */
+static void
+llvmpipe_draw_elements(struct pipe_context *pipe,
+                       struct pipe_resource *indexBuffer,
+                       unsigned indexSize,
+                       int indexBias,
+                       unsigned mode, unsigned start, unsigned count)
+{
+   const unsigned lowest_index = 0;
+   const unsigned highest_index = 0xffffffff;   /* no tighter bound is known here */
+   llvmpipe_draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                                lowest_index, highest_index, mode, start, count);
+}
+
+/** Non-indexed draw: forwards to llvmpipe_draw_elements() with no index buffer, zero index size and bias. */
+static void
+llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   llvmpipe_draw_elements(pipe, NULL, 0, 0, mode, start, count);
+}
+
+/** Install this file's three draw entry points in the context's pipe_context. */
+void
+llvmpipe_init_draw_funcs(struct llvmpipe_context *llvmpipe)
+{
+   llvmpipe->pipe.draw_arrays = llvmpipe_draw_arrays;
+   llvmpipe->pipe.draw_elements = llvmpipe_draw_elements;
+   llvmpipe->pipe.draw_range_elements = llvmpipe_draw_range_elements;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c
new file mode 100644
index 0000000000..75d8d2b825
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_fence.c
@@ -0,0 +1,152 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "lp_debug.h"
+#include "lp_fence.h"
+
+
+/**
+ * Create a new fence object.
+ *
+ * The rank will be the number of bins in the scene.  Whenever a rendering
+ * thread hits a fence command, it'll increment the fence counter.  When
+ * the counter == the rank, the fence is finished.
+ *
+ * \param rank  the expected finished value of the fence counter.
+ * \return the new fence holding one reference, or NULL if allocation failed.
+ */
+struct lp_fence *
+lp_fence_create(unsigned rank)
+{
+   struct lp_fence *fence = CALLOC_STRUCT(lp_fence);
+   if (!fence)
+      return NULL;   /* allocation failed: do not touch the members below */
+
+   pipe_reference_init(&fence->reference, 1);
+   pipe_mutex_init(fence->mutex);
+   pipe_condvar_init(fence->signalled);
+   fence->rank = rank;
+   return fence;
+}
+
+
+/** Destroy a fence (its mutex, its condition variable, then the memory).  Called when refcount hits zero. */
+static void
+lp_fence_destroy(struct lp_fence *fence)
+{
+   pipe_mutex_destroy(fence->mutex);
+   pipe_condvar_destroy(fence->signalled);
+   FREE(fence);
+}
+
+
+/**
+ * Make *ptr refer to 'fence' (which may be NULL), destroying the old fence
+ * when its last reference is dropped.  This is a Gallium API function.
+ */
+static void
+llvmpipe_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+   struct lp_fence *old = (struct lp_fence *) *ptr;
+   struct lp_fence *f = (struct lp_fence *) fence;
+
+   if (pipe_reference(&old->reference, &f->reference))
+      lp_fence_destroy(old);
+   *ptr = fence;   /* store the new handle; otherwise the caller keeps the old, possibly freed one */
+}
+
+
+/**
+ * Report whether the fence counter equals the fence rank, without waiting.
+ * This is a Gallium API function.
+ */
+static int
+llvmpipe_fence_signalled(struct pipe_screen *screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flag)
+{
+   const struct lp_fence *lpf = (const struct lp_fence *) fence;
+   const int finished = (lpf->count == lpf->rank);
+   return finished;   /* 1 when count == rank, 0 otherwise */
+}
+
+
+/**
+ * Block the calling thread until the fence counter reaches the fence rank.
+ * This is a Gallium API function; it always returns 0.
+ */
+static int
+llvmpipe_fence_finish(struct pipe_screen *screen,
+                      struct pipe_fence_handle *fence_handle,
+                      unsigned flag)
+{
+   struct lp_fence *f = (struct lp_fence *) fence_handle;
+
+   pipe_mutex_lock(f->mutex);
+   while (f->rank > f->count) {   /* re-tested after every wakeup */
+      pipe_condvar_wait(f->signalled, f->mutex);
+   }
+   pipe_mutex_unlock(f->mutex);
+
+   return 0;
+}
+
+
+/**
+ * Called by the rendering threads to increment the fence counter.
+ * When the counter == the rank, the fence is finished.
+ */
+void
+lp_fence_signal(struct lp_fence *fence)
+{
+   pipe_mutex_lock(fence->mutex);
+
+   fence->count++;
+   assert(fence->count <= fence->rank);
+
+   LP_DBG(DEBUG_RAST, "%s count=%u rank=%u\n", __FUNCTION__,
+          fence->count, fence->rank);
+
+   /* Wake every waiter: a single wakeup could leave other finishers asleep */
+   pipe_condvar_broadcast(fence->signalled);
+   pipe_mutex_unlock(fence->mutex);
+}
+
+/** Install this file's fence_reference / fence_signalled / fence_finish callbacks on the screen. */
+void
+llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen)
+{
+   screen->fence_reference = llvmpipe_fence_reference;
+   screen->fence_signalled = llvmpipe_fence_signalled;
+   screen->fence_finish = llvmpipe_fence_finish;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h
new file mode 100644
index 0000000000..d9270f5784
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@ -0,0 +1,64 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_FENCE_H
+#define LP_FENCE_H
+
+
+#include "os/os_thread.h"
+#include "pipe/p_state.h"
+
+
+struct pipe_screen;
+
+
+struct lp_fence
+{
+   struct pipe_reference reference;   /* reference count; starts at 1 in lp_fence_create() */
+
+   pipe_mutex mutex;         /* held while count is incremented or waited on */
+   pipe_condvar signalled;   /* waited on in fence_finish, signalled by lp_fence_signal() */
+
+   unsigned rank;    /* value of count at which the fence is finished */
+   unsigned count;   /* incremented once per lp_fence_signal() call */
+};
+
+
+struct lp_fence *
+lp_fence_create(unsigned rank);
+
+
+void
+lp_fence_signal(struct lp_fence *fence);
+
+
+void
+llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
+
+
+#endif /* LP_FENCE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
new file mode 100644
index 0000000000..0cd288bb73
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -0,0 +1,146 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_defines.h"
+#include "util/u_string.h"
+#include "draw/draw_context.h"
+#include "lp_flush.h"
+#include "lp_context.h"
+#include "lp_setup.h"
+
+
+/**
+ * Flush the draw module, then the setup module, optionally returning a fence.
+ * \param flags  bitmask of PIPE_FLUSH_x flags
+ * \param fence  if non-null, returns pointer to a fence which can be waited on
+ */
+void
+llvmpipe_flush( struct pipe_context *pipe,
+                unsigned flags,
+                struct pipe_fence_handle **fence )
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   draw_flush(llvmpipe->draw);
+
+   if (fence) {
+      /* A fence was requested: emit it before asking setup to flush.
+       * XXX this (and the code below) may need fine tuning... */
+      *fence = lp_setup_fence( llvmpipe->setup );
+   }
+
+   /* ask the setup module to flush */
+   lp_setup_flush(llvmpipe->setup, flags);
+
+   /* Enable to dump BMPs of the color/depth buffers each frame */
+   if (0) {
+      if (flags & PIPE_FLUSH_FRAME) {
+         static unsigned frame_no = 1;
+         char filename[256];
+         unsigned i;
+
+         for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
+            util_snprintf(filename, sizeof(filename), "cbuf%u_%u", i, frame_no);
+            /* dump color buffer i (this used to dump cbufs[0] for every i) */
+            debug_dump_surface_bmp(&llvmpipe->pipe, filename, llvmpipe->framebuffer.cbufs[i]);
+         }
+
+         if (0) {
+            util_snprintf(filename, sizeof(filename), "zsbuf_%u", frame_no);
+            debug_dump_surface_bmp(&llvmpipe->pipe, filename, llvmpipe->framebuffer.zsbuf);
+         }
+
+         ++frame_no;
+      }
+   }
+}
+
+
+/**
+ * Flush context if necessary.
+ *
+ * A flush is needed when the context references the resource for writing,
+ * or for reading while the caller's access is not read_only.
+ *
+ * \param flush_flags  PIPE_FLUSH_x bits passed on to llvmpipe_flush()
+ * \param read_only    if set, pending reads of the resource need no flush
+ * \param cpu_access   if set, also wait for the flush's fence to finish
+ * \param do_not_block if set, return instead of flushing and waiting
+ *
+ * Returns FALSE if it would have blocked but do_not_block was set, TRUE
+ * otherwise.
+ *
+ * TODO: move this logic to an auxiliary library?
+ */
+boolean
+llvmpipe_flush_resource(struct pipe_context *pipe,
+                        struct pipe_resource *resource,
+                        unsigned face,
+                        unsigned level,
+                        unsigned flush_flags,
+                        boolean read_only,
+                        boolean cpu_access,
+                        boolean do_not_block)
+{
+   const unsigned usage = pipe->is_resource_referenced(pipe, resource, face, level);
+   const boolean pending_write = (usage & PIPE_REFERENCED_FOR_WRITE) != 0;
+   const boolean pending_read = (usage & PIPE_REFERENCED_FOR_READ) != 0;
+   struct pipe_fence_handle *fence = NULL;
+
+   /* no outstanding reference conflicts with the requested access */
+   if (!pending_write && !(pending_read && !read_only))
+      return TRUE;
+
+   if (!cpu_access) {
+      /* Just flush. */
+      llvmpipe_flush(pipe, flush_flags, NULL);
+      return TRUE;
+   }
+
+   /* Flush and wait -- unless the caller asked not to block. */
+   if (do_not_block)
+      return FALSE;
+
+   /*
+    * Do the unswizzling in parallel.
+    *
+    * XXX: Don't abuse the PIPE_FLUSH_FRAME flag for this.
+    */
+   llvmpipe_flush(pipe, flush_flags | PIPE_FLUSH_FRAME, &fence);
+
+   if (fence) {
+      pipe->screen->fence_finish(pipe->screen, fence, 0);
+      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+   }
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h
new file mode 100644
index 0000000000..7b605681a9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -0,0 +1,50 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_FLUSH_H
+#define LP_FLUSH_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_context;
+struct pipe_fence_handle;
+
+void
+llvmpipe_flush(struct pipe_context *pipe, unsigned flags,
+               struct pipe_fence_handle **fence);
+
+boolean
+llvmpipe_flush_resource(struct pipe_context *pipe,
+                        struct pipe_resource *resource,
+                        unsigned face,
+                        unsigned level,
+                        unsigned flush_flags,
+                        boolean read_only,
+                        boolean cpu_access,
+                        boolean do_not_block);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
new file mode 100644
index 0000000000..23aa34ddec
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -0,0 +1,207 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * C - JIT interfaces
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "util/u_memory.h"
+#include "util/u_cpu_detect.h"
+#include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_debug.h"
+#include "lp_screen.h"
+#include "gallivm/lp_bld_intr.h"
+#include "lp_jit.h"
+
+/** Build LLVM struct types for lp_jit_texture and lp_jit_context, name them in screen->module, and set screen->context_ptr_type. */
+static void
+lp_jit_init_globals(struct llvmpipe_screen *screen)
+{
+   LLVMTypeRef texture_type;
+
+   /* struct lp_jit_texture: one LLVM element type per LP_JIT_TEXTURE_x index */
+   {
+      LLVMTypeRef elem_types[LP_JIT_TEXTURE_NUM_FIELDS];
+
+      elem_types[LP_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_DEPTH] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_ROW_STRIDE] =
+         LLVMArrayType(LLVMInt32Type(), LP_MAX_TEXTURE_LEVELS);
+      elem_types[LP_JIT_TEXTURE_IMG_STRIDE] =
+         LLVMArrayType(LLVMInt32Type(), LP_MAX_TEXTURE_LEVELS);
+      elem_types[LP_JIT_TEXTURE_DATA] =
+         LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
+                       LP_MAX_TEXTURE_LEVELS);
+
+      texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+      /* NOTE(review): the LP_CHECK_x macros presumably verify the LLVM layout against the C struct -- confirm */
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_WIDTH);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_HEIGHT);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, depth,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_DEPTH);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, last_level,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_LAST_LEVEL);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, row_stride,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_ROW_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, img_stride,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_IMG_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_DATA);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
+                           screen->target, texture_type);
+
+      LLVMAddTypeName(screen->module, "texture", texture_type);
+   }
+
+   /* struct lp_jit_context: one LLVM element type per LP_JIT_CTX_x index */
+   {
+      LLVMTypeRef elem_types[LP_JIT_CTX_COUNT];
+      LLVMTypeRef context_type;
+
+      elem_types[LP_JIT_CTX_CONSTANTS] = LLVMPointerType(LLVMFloatType(), 0);
+      elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatType();
+      elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = LLVMInt32Type();
+      elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32Type();
+      elem_types[LP_JIT_CTX_SCISSOR_XMIN] = LLVMFloatType();
+      elem_types[LP_JIT_CTX_SCISSOR_YMIN] = LLVMFloatType();
+      elem_types[LP_JIT_CTX_SCISSOR_XMAX] = LLVMFloatType();
+      elem_types[LP_JIT_CTX_SCISSOR_YMAX] = LLVMFloatType();
+      elem_types[LP_JIT_CTX_BLEND_COLOR] = LLVMPointerType(LLVMInt8Type(), 0);
+      elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type,
+                                                      PIPE_MAX_SAMPLERS);
+
+      context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+      /* NOTE(review): as above, presumably layout checks of context_type against struct lp_jit_context -- confirm */
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, constants,
+                             screen->target, context_type,
+                             LP_JIT_CTX_CONSTANTS);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value,
+                             screen->target, context_type,
+                             LP_JIT_CTX_ALPHA_REF);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_front,
+                             screen->target, context_type,
+                             LP_JIT_CTX_STENCIL_REF_FRONT);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back,
+                             screen->target, context_type,
+                             LP_JIT_CTX_STENCIL_REF_BACK);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmin,
+                             screen->target, context_type,
+                             LP_JIT_CTX_SCISSOR_XMIN);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymin,
+                             screen->target, context_type,
+                             LP_JIT_CTX_SCISSOR_YMIN);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmax,
+                             screen->target, context_type,
+                             LP_JIT_CTX_SCISSOR_XMAX);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymax,
+                             screen->target, context_type,
+                             LP_JIT_CTX_SCISSOR_YMAX);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
+                             screen->target, context_type,
+                             LP_JIT_CTX_BLEND_COLOR);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
+                             screen->target, context_type,
+                             LP_JIT_CTX_TEXTURES);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_context,
+                           screen->target, context_type);
+
+      LLVMAddTypeName(screen->module, "context", context_type);
+
+      screen->context_ptr_type = LLVMPointerType(context_type, 0);
+   }
+   /* dump the whole module when the GALLIVM_DEBUG_IR bit is set in gallivm_debug */
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      LLVMDumpModule(screen->module);
+   }
+}
+
+/** Dispose of the screen's LLVM execution engine and pass manager; each is skipped when NULL. */
+void
+lp_jit_screen_cleanup(struct llvmpipe_screen *screen)
+{
+   if(screen->engine)
+      LLVMDisposeExecutionEngine(screen->engine);
+
+   if(screen->pass)
+      LLVMDisposePassManager(screen->pass);
+}
+
+/** Set up the screen's JIT state: LLVM handles, the function pass manager, and the JIT struct types. */
+void
+lp_jit_screen_init(struct llvmpipe_screen *screen)
+{
+   lp_build_init();
+
+   screen->module = lp_build_module;
+   screen->provider = lp_build_provider;
+   screen->engine = lp_build_engine;
+   screen->target = lp_build_target;
+
+   screen->pass = LLVMCreateFunctionPassManager(screen->provider);
+   LLVMAddTargetData(screen->target, screen->pass);
+
+   if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
+      /* Optimization is disabled, but we need at least this pass to
+       * prevent the backends to fail in unexpected ways. */
+      LLVMAddPromoteMemoryToRegisterPass(screen->pass);
+   }
+   else {
+      /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+       * but there are more on SVN.  TODO: Add more passes */
+      LLVMAddCFGSimplificationPass(screen->pass);
+      LLVMAddPromoteMemoryToRegisterPass(screen->pass);
+      LLVMAddConstantPropagationPass(screen->pass);
+      if (util_cpu_caps.has_sse4_1) {
+         /* FIXME: There is a bug in this pass, whereby the combination of
+          * fptosi and sitofp (necessary for trunc/floor/ceil/round
+          * implementation) somehow becomes invalid code, so the pass is
+          * only added when SSE4.1 is available.
+          */
+         LLVMAddInstructionCombiningPass(screen->pass);
+      }
+      LLVMAddGVNPass(screen->pass);
+   }
+
+   lp_jit_init_globals(screen);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
new file mode 100644
index 0000000000..8d06e65725
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -0,0 +1,180 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * C - JIT interfaces
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_JIT_H
+#define LP_JIT_H
+
+
+#include "gallivm/lp_bld_struct.h"
+
+#include "pipe/p_state.h"
+#include "lp_texture.h"
+
+
+struct llvmpipe_screen;
+
+/** Per-texture state, embedded as lp_jit_context::textures[]; field order is mirrored by the LP_JIT_TEXTURE_* enum. */
+struct lp_jit_texture
+{
+   uint32_t width;
+   uint32_t height;
+   uint32_t depth;
+   uint32_t last_level;
+   uint32_t row_stride[LP_MAX_TEXTURE_LEVELS];  /* one entry per mipmap level */
+   uint32_t img_stride[LP_MAX_TEXTURE_LEVELS];  /* one entry per mipmap level */
+   const void *data[LP_MAX_TEXTURE_LEVELS];     /* one entry per mipmap level */
+};
+
+/** Field indices of struct lp_jit_texture, listed in the struct's declaration order. */
+enum {
+   LP_JIT_TEXTURE_WIDTH = 0,
+   LP_JIT_TEXTURE_HEIGHT,
+   LP_JIT_TEXTURE_DEPTH,
+   LP_JIT_TEXTURE_LAST_LEVEL,
+   LP_JIT_TEXTURE_ROW_STRIDE,
+   LP_JIT_TEXTURE_IMG_STRIDE,
+   LP_JIT_TEXTURE_DATA,
+   LP_JIT_TEXTURE_NUM_FIELDS  /* number of fields above */
+};
+
+
+
+/**
+ * This structure is passed directly to the generated fragment shader.
+ * It contains the derived state.
+ *
+ * Changes here must be reflected in the LP_JIT_CTX_* enum, the
+ * lp_jit_context_* macros and the LLVM type built in lp_jit.c (presumably by
+ * lp_jit_init_globals, not "lp_jit_init_types" -- confirm). Avoid reordering.
+ *
+ * Only use types with a clear size and padding here, in particular prefer the
+ * stdint.h types to the basic integer types.
+ */
+struct lp_jit_context
+{
+   const float *constants;
+
+   float alpha_ref_value;
+
+   uint32_t stencil_ref_front, stencil_ref_back;
+
+   /** floats, not ints */
+   float scissor_xmin, scissor_ymin, scissor_xmax, scissor_ymax;
+
+   /* FIXME: store (also?) in floats */
+   uint8_t *blend_color;
+
+   struct lp_jit_texture textures[PIPE_MAX_SAMPLERS];
+};
+
+
+/**
+ * These enum values must match the position of the fields in the
+ * lp_jit_context struct above (one value per scalar field, in order).
+ */
+enum {
+   LP_JIT_CTX_CONSTANTS = 0,
+   LP_JIT_CTX_ALPHA_REF,
+   LP_JIT_CTX_STENCIL_REF_FRONT,
+   LP_JIT_CTX_STENCIL_REF_BACK,
+   LP_JIT_CTX_SCISSOR_XMIN,
+   LP_JIT_CTX_SCISSOR_YMIN,
+   LP_JIT_CTX_SCISSOR_XMAX,
+   LP_JIT_CTX_SCISSOR_YMAX,
+   LP_JIT_CTX_BLEND_COLOR,
+   LP_JIT_CTX_TEXTURES,
+   LP_JIT_CTX_COUNT  /* number of fields above */
+};
+
+/* Accessors for lp_jit_context fields: all go through lp_build_struct_get() except textures, which uses lp_build_struct_get_ptr(). */
+#define lp_jit_context_constants(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_CONSTANTS, "constants")
+
+#define lp_jit_context_alpha_ref_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_ALPHA_REF, "alpha_ref_value")
+
+#define lp_jit_context_stencil_ref_front_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_STENCIL_REF_FRONT, "stencil_ref_front")
+
+#define lp_jit_context_stencil_ref_back_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back")
+
+#define lp_jit_context_scissor_xmin_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMIN, "scissor_xmin")
+
+#define lp_jit_context_scissor_ymin_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMIN, "scissor_ymin")
+
+#define lp_jit_context_scissor_xmax_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMAX, "scissor_xmax")
+
+#define lp_jit_context_scissor_ymax_value(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMAX, "scissor_ymax")
+
+#define lp_jit_context_blend_color(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color")
+
+#define lp_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CTX_TEXTURES, "textures")
+
+
+/** Function pointer type taking an lp_jit_context; presumably how the generated fragment shader is called -- confirm against the caller. */
+typedef void
+(*lp_jit_frag_func)(const struct lp_jit_context *context,
+                    uint32_t x,
+                    uint32_t y,
+                    float facing,
+                    const void *a0,
+                    const void *dadx,
+                    const void *dady,
+                    uint8_t **color,
+                    void *depth,
+                    const int32_t c1,
+                    const int32_t c2,
+                    const int32_t c3,
+                    const int32_t *step1,
+                    const int32_t *step2,
+                    const int32_t *step3,
+                    uint32_t *counter);
+
+
+void
+lp_jit_screen_cleanup(struct llvmpipe_screen *screen);
+
+
+void
+lp_jit_screen_init(struct llvmpipe_screen *screen);
+
+
+#endif /* LP_JIT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h
new file mode 100644
index 0000000000..d1c431475d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -0,0 +1,75 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Implementation limits for LLVMpipe driver.
+ */
+
+#ifndef LP_LIMITS_H
+#define LP_LIMITS_H
+
+
+/**
+ * Tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_ORDER 6
+#define TILE_SIZE (1 << TILE_ORDER)
+
+
+/**
+ * Max texture sizes
+ */
+#define LP_MAX_TEXTURE_2D_LEVELS 13  /* 4K x 4K for now */
+#define LP_MAX_TEXTURE_3D_LEVELS 10  /* 512 x 512 x 512 for now */
+
+
+/** This must be the larger of LP_MAX_TEXTURE_2D/3D_LEVELS */
+#define LP_MAX_TEXTURE_LEVELS LP_MAX_TEXTURE_2D_LEVELS
+
+
+/**
+ * Max drawing surface size is the max texture size
+ */
+#define LP_MAX_HEIGHT (1 << (LP_MAX_TEXTURE_LEVELS - 1))
+#define LP_MAX_WIDTH  (1 << (LP_MAX_TEXTURE_LEVELS - 1))
+
+/** Max number of threads; sizes per-thread arrays such as llvmpipe_query::count. */
+#define LP_MAX_THREADS 8
+
+
+/**
+ * Max bytes per scene.  This may be replaced by a runtime parameter.
+ */
+#define LP_MAX_SCENE_SIZE (512 * 1024 * 1024)
+
+/**
+ * Max number of shader variants (for all shaders combined,
+ * per context) that will be kept around.
+ */
+#define LP_MAX_SHADER_VARIANTS 1024
+
+#endif /* LP_LIMITS_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c
new file mode 100644
index 0000000000..a316597675
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_perf.c
@@ -0,0 +1,95 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_debug.h"
+#include "lp_debug.h"
+#include "lp_perf.h"
+
+
+
+struct lp_counters lp_count;
+
+/** Zero every byte of the global lp_count structure. */
+void
+lp_reset_counters(void)
+{
+   memset(&lp_count, 0, sizeof(lp_count));
+}
+
+/** Print lp_count via debug_printf() when LP_DEBUG has DEBUG_COUNTERS set; empty totals report 0, not NaN/inf. */
+void
+lp_print_counters(void)
+{
+   if (LP_DEBUG & DEBUG_COUNTERS) {
+      unsigned total_64, total_16, total_4;
+      float p1, p2, p3;
+
+      debug_printf("llvmpipe: nr_triangles:               %9u\n", lp_count.nr_tris);
+      debug_printf("llvmpipe: nr_culled_triangles:        %9u\n", lp_count.nr_culled_tris);
+
+      total_64 = (lp_count.nr_empty_64 + 
+                  lp_count.nr_fully_covered_64 +
+                  lp_count.nr_partially_covered_64);
+      /* Every ratio below is guarded: a zero total would divide by zero. */
+      p1 = total_64 ? 100.0 * (float) lp_count.nr_empty_64 / (float) total_64 : 0.0;
+      p2 = total_64 ? 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64 : 0.0;
+      p3 = total_64 ? 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64 : 0.0;
+
+      debug_printf("llvmpipe: nr_empty_64x64:             %9u (%2.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64);
+      debug_printf("llvmpipe: nr_fully_covered_64x64:     %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64);
+      debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64);
+
+      total_16 = (lp_count.nr_empty_16 + 
+                  lp_count.nr_fully_covered_16 +
+                  lp_count.nr_partially_covered_16);
+
+      p1 = total_16 ? 100.0 * (float) lp_count.nr_empty_16 / (float) total_16 : 0.0;
+      p2 = total_16 ? 100.0 * (float) lp_count.nr_fully_covered_16 / (float) total_16 : 0.0;
+      p3 = total_16 ? 100.0 * (float) lp_count.nr_partially_covered_16 / (float) total_16 : 0.0;
+
+      debug_printf("llvmpipe: nr_empty_16x16:             %9u (%2.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16);
+      debug_printf("llvmpipe: nr_fully_covered_16x16:     %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16);
+      debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16);
+
+      total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4);
+
+      p1 = total_4 ? 100.0 * (float) lp_count.nr_empty_4 / (float) total_4 : 0.0;
+      p2 = total_4 ? 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4 : 0.0;
+
+      debug_printf("llvmpipe: nr_empty_4x4:               %9u (%2.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4);
+      debug_printf("llvmpipe: nr_non_empty_4x4:           %9u (%2.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4);
+
+      debug_printf("llvmpipe: nr_color_tile_clear:        %9u\n", lp_count.nr_color_tile_clear);
+      debug_printf("llvmpipe: nr_color_tile_load:         %9u\n", lp_count.nr_color_tile_load);
+      debug_printf("llvmpipe: nr_color_tile_store:        %9u\n", lp_count.nr_color_tile_store);
+      /* llvm_compile_time is in microseconds; the average is 0 when nothing was compiled. */
+      debug_printf("llvmpipe: nr_llvm_compiles:           %u\n", lp_count.nr_llvm_compiles);
+      debug_printf("llvmpipe: total LLVM compile time:    %.2f sec\n", lp_count.llvm_compile_time / 1000000.0);
+      debug_printf("llvmpipe: average LLVM compile time:  %.2f sec\n", lp_count.nr_llvm_compiles ? lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles : 0.0);
+
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h
new file mode 100644
index 0000000000..a9629dae3c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_perf.h
@@ -0,0 +1,82 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Performance / statistic counters, etc.
+ */
+
+
+#ifndef LP_PERF_H
+#define LP_PERF_H
+
+
+/**
+ * Counters behind the global lp_count; see LP_COUNT() and lp_print_counters().
+ */
+struct lp_counters
+{
+   unsigned nr_tris;
+   unsigned nr_culled_tris;
+   unsigned nr_empty_64;              /* *_64: printed as "64x64" counters */
+   unsigned nr_fully_covered_64;
+   unsigned nr_partially_covered_64;
+   unsigned nr_empty_16;              /* *_16: printed as "16x16" counters */
+   unsigned nr_fully_covered_16;
+   unsigned nr_partially_covered_16;
+   unsigned nr_empty_4;               /* *_4: printed as "4x4" counters */
+   unsigned nr_non_empty_4;
+   unsigned nr_llvm_compiles;
+   int64_t llvm_compile_time;  /**< total, in microseconds */
+
+   unsigned nr_color_tile_clear;
+   unsigned nr_color_tile_load;
+   unsigned nr_color_tile_store;
+};
+
+
+extern struct lp_counters lp_count;
+
+
+/** Bump a named lp_count field (DEBUG builds only); otherwise incr is just evaluated, parenthesized so any expression is valid. */
+#ifdef DEBUG
+#define LP_COUNT(counter) lp_count.counter++
+#define LP_COUNT_ADD(counter, incr)  lp_count.counter += (incr)
+#else
+#define LP_COUNT(counter)
+#define LP_COUNT_ADD(counter, incr) (void) (incr)
+#endif
+
+
+extern void
+lp_reset_counters(void);
+
+
+extern void
+lp_print_counters(void);
+
+
+#endif /* LP_PERF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_public.h b/src/gallium/drivers/llvmpipe/lp_public.h
new file mode 100644
index 0000000000..ec6b660b48
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_public.h
@@ -0,0 +1,10 @@
+#ifndef LP_PUBLIC_H
+#define LP_PUBLIC_H
+
+struct pipe_screen;
+struct sw_winsys;
+
+struct pipe_screen *
+llvmpipe_create_screen(struct sw_winsys *winsys);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
new file mode 100644
index 0000000000..c902c04684
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -0,0 +1,146 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:
+ *    Keith Whitwell, Qicheng Christopher Li, Brian Paul
+ */
+
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "lp_context.h"
+#include "lp_flush.h"
+#include "lp_query.h"
+#include "lp_rast.h"
+#include "lp_rast_priv.h"
+#include "lp_state.h"
+
+/** Cast an opaque pipe_query handle to this driver's struct llvmpipe_query. */
+static struct llvmpipe_query *llvmpipe_query( struct pipe_query *p )
+{
+   return (struct llvmpipe_query *)p;
+}
+
+/** Allocate a query with CALLOC_STRUCT and set up its mutex; only occlusion counters are expected. */
+static struct pipe_query *
+llvmpipe_create_query(struct pipe_context *pipe, unsigned type)
+{
+   struct llvmpipe_query *query;
+
+   assert(type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+   query = CALLOC_STRUCT( llvmpipe_query );
+   if (!query)
+      return (struct pipe_query *) query;   /* allocation failed: null result */
+
+   pipe_mutex_init(query->mutex);
+   return (struct pipe_query *) query;
+}
+
+/** Destroy the query's mutex and free it. NOTE(review): pq->binned is not checked here -- confirm no scene can still reference the query. */
+static void
+llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct llvmpipe_query *pq = llvmpipe_query(q);
+   pipe_mutex_destroy(pq->mutex);
+   FREE(pq);
+}
+
+/** Copy pq->result into *vresult (as uint64_t) when the query is done; returns pq->done. */
+static boolean
+llvmpipe_get_query_result(struct pipe_context *pipe, 
+			  struct pipe_query *q,
+			  boolean wait,
+			  void *vresult)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
+   struct llvmpipe_query *pq = llvmpipe_query(q);
+   uint64_t *result = (uint64_t *)vresult;
+   /* Not done yet: flush setup first (presumably so the query can complete). */
+   if (!pq->done) {
+      lp_setup_flush(llvmpipe->setup, 0);
+   }
+   /* NOTE(review): `wait` is never read; *vresult is left untouched if still not done. */
+   if (pq->done) {
+      *result = pq->result;
+   }
+
+   return pq->done;
+}
+
+/** Begin query q: flush and wait on the fence first if q is still binned, then bump the active-query count and flag LP_NEW_QUERY. */
+static void
+llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
+   struct llvmpipe_query *pq = llvmpipe_query(q);
+
+   /* Check if the query is already in the scene.  If so, we need to
+    * flush the scene now.  Real apps shouldn't re-use a query in a
+    * frame of rendering.
+    */
+   if (pq->binned) {
+      struct pipe_fence_handle *fence;
+      llvmpipe_flush(pipe, 0, &fence);
+      if (fence) {
+         pipe->screen->fence_finish(pipe->screen, fence, 0);
+         pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+      }
+   }
+
+   lp_setup_begin_query(llvmpipe->setup, pq);
+
+   llvmpipe->active_query_count++;
+   llvmpipe->dirty |= LP_NEW_QUERY;
+}
+
+/** End query q: notify setup, decrement the active-query count and flag LP_NEW_QUERY. */
+static void
+llvmpipe_end_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct llvmpipe_context *lp = llvmpipe_context( pipe );
+
+   lp_setup_end_query(lp->setup, llvmpipe_query(q));
+
+   /* A begin_query must have incremented the count beforehand. */
+   assert(lp->active_query_count);
+   lp->active_query_count -= 1;
+   lp->dirty = lp->dirty | LP_NEW_QUERY;
+}
+
+/** Install this file's five query callbacks into llvmpipe->pipe. */
+void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe )
+{
+   llvmpipe->pipe.get_query_result = llvmpipe_get_query_result;
+   llvmpipe->pipe.end_query = llvmpipe_end_query;
+   llvmpipe->pipe.begin_query = llvmpipe_begin_query;
+   llvmpipe->pipe.destroy_query = llvmpipe_destroy_query;
+   llvmpipe->pipe.create_query = llvmpipe_create_query;
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_query.h b/src/gallium/drivers/llvmpipe/lp_query.h
new file mode 100644
index 0000000000..721c41cb5c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_query.h
@@ -0,0 +1,59 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:
+ *    Keith Whitwell, Qicheng Christopher Li, Brian Paul
+ */
+
+#ifndef LP_QUERY_H
+#define LP_QUERY_H
+
+#include <limits.h>
+#include "os/os_thread.h"
+#include "lp_limits.h"
+
+
+struct llvmpipe_context;
+
+/** Driver-side state of one query object (the pipe_query handle is cast to this). */
+struct llvmpipe_query {
+   uint64_t count[LP_MAX_THREADS];  /**< a counter for each thread */
+   uint64_t result;                 /**< total of all counters */
+
+   pipe_mutex mutex;                /**< initialised on create, destroyed on destroy */
+   unsigned num_tiles, tile_count;
+
+   boolean done;    /**< when set, result is returned by get_query_result */
+   boolean binned;  /**< has this query been binned in the scene? */
+};
+
+
+extern void llvmpipe_init_query_funcs(struct llvmpipe_context * );
+
+
+#endif /* LP_QUERY_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
new file mode 100644
index 0000000000..50e44dcb2b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -0,0 +1,1032 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <limits.h>
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_surface.h"
+
+#include "lp_scene_queue.h"
+#include "lp_debug.h"
+#include "lp_fence.h"
+#include "lp_perf.h"
+#include "lp_query.h"
+#include "lp_rast.h"
+#include "lp_rast_priv.h"
+#include "lp_tile_soa.h"
+#include "gallivm/lp_bld_debug.h"
+#include "lp_scene.h"
+
+
+/**
+ * Begin rasterizing a scene: map its color and z/stencil surfaces and start
+ * bin iteration.  Called once per scene by one thread.
+ */
+static void
+lp_rast_begin( struct lp_rasterizer *rast,
+               struct lp_scene *scene )
+{
+   const struct pipe_framebuffer_state *fb = &scene->fb;
+   int i;
+
+   rast->curr_scene = scene;
+
+   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+
+   rast->state.nr_cbufs = scene->fb.nr_cbufs;
+   /* Map every color buffer; the value returned by the map call is not kept. */
+   for (i = 0; i < rast->state.nr_cbufs; i++) {
+      struct pipe_surface *cbuf = scene->fb.cbufs[i];
+      llvmpipe_resource_map(cbuf->texture,
+                            cbuf->face,
+                            cbuf->level,
+                            cbuf->zslice,
+                            LP_TEX_USAGE_READ_WRITE,
+                            LP_TEX_LAYOUT_NONE);
+   }
+
+   if (fb->zsbuf) {
+      struct pipe_surface *zsbuf = scene->fb.zsbuf;
+      rast->zsbuf.stride = llvmpipe_resource_stride(zsbuf->texture, zsbuf->level);
+      rast->zsbuf.blocksize = 
+         util_format_get_blocksize(zsbuf->texture->format);
+
+      rast->zsbuf.map = llvmpipe_resource_map(zsbuf->texture,
+                                             zsbuf->face,
+                                             zsbuf->level,
+                                             zsbuf->zslice,
+                                             LP_TEX_USAGE_READ_WRITE,
+                                             LP_TEX_LAYOUT_NONE);
+      assert(rast->zsbuf.map);
+   }
+
+   lp_scene_bin_iter_begin( scene );
+}
+
+/** Finish the current scene: unmap its surfaces, reset the scene and clear rast->curr_scene. */
+static void
+lp_rast_end( struct lp_rasterizer *rast )
+{
+   struct lp_scene *scene = rast->curr_scene;
+   unsigned i;
+
+   /* Unmap color buffers */
+   for (i = 0; i < rast->state.nr_cbufs; i++) {
+      struct pipe_surface *cbuf = scene->fb.cbufs[i];
+      llvmpipe_resource_unmap(cbuf->texture,
+                             cbuf->face,
+                             cbuf->level,
+                             cbuf->zslice);
+   }
+
+   /* Unmap z/stencil buffer */
+   if (rast->zsbuf.map) {
+      struct pipe_surface *zsbuf = scene->fb.zsbuf;
+      llvmpipe_resource_unmap(zsbuf->texture,
+                             zsbuf->face,
+                             zsbuf->level,
+                             zsbuf->zslice);
+      rast->zsbuf.map = NULL;
+   }
+
+   lp_scene_reset( rast->curr_scene );
+
+   rast->curr_scene = NULL;
+   /* The dump below is compiled only for DEBUG and disabled by if (0). */
+#ifdef DEBUG
+   if (0)
+      debug_printf("Post render scene: tile unswizzle: %u tile swizzle: %u\n",
+                   lp_tile_unswizzle_count, lp_tile_swizzle_count);
+#endif
+}
+
+
+/**
+ * Begin rasterization of a tile: fetch its color and depth tile pointers.
+ * \param x  window X position of the tile, in pixels (multiple of TILE_SIZE)
+ * \param y  window Y position of the tile, in pixels (multiple of TILE_SIZE)
+ */
+static void
+lp_rast_tile_begin(struct lp_rasterizer_task *task,
+                   unsigned x, unsigned y)
+{
+   struct lp_rasterizer *rast = task->rast;
+   struct lp_scene *scene = rast->curr_scene;
+   enum lp_texture_usage usage;
+   unsigned buf;
+
+   LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y);
+
+   assert(x % TILE_SIZE == 0);
+   assert(y % TILE_SIZE == 0);
+
+   task->x = x;
+   task->y = y;
+   /* A pending color clear selects WRITE_ALL (presumably so old contents need not be loaded). */
+   if (scene->has_color_clear)
+      usage = LP_TEX_USAGE_WRITE_ALL;
+   else
+      usage = LP_TEX_USAGE_READ_WRITE;
+   /* NOTE(review): face + zslice is passed as one index; presumably only one is ever nonzero. */
+   /* get pointers to color tile(s) */
+   for (buf = 0; buf < rast->state.nr_cbufs; buf++) {
+      struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf];
+      struct llvmpipe_resource *lpt;
+      assert(cbuf);
+      lpt = llvmpipe_resource(cbuf->texture);
+      task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt,
+                                                         cbuf->face + cbuf->zslice,
+                                                         cbuf->level,
+                                                         usage,
+                                                         x, y);
+      assert(task->color_tiles[buf]);
+   }
+
+   /* get pointer to depth/stencil tile */
+   {
+      struct pipe_surface *zsbuf = rast->curr_scene->fb.zsbuf;
+      if (zsbuf) {
+         struct llvmpipe_resource *lpt = llvmpipe_resource(zsbuf->texture);
+
+         if (scene->has_depthstencil_clear)
+            usage = LP_TEX_USAGE_WRITE_ALL;
+         else
+            usage = LP_TEX_USAGE_READ_WRITE;
+
+         /* "prime" the tile: convert data from linear to tiled if necessary
+          * and update the tile's layout info.
+          */
+         (void) llvmpipe_get_texture_tile(lpt,
+                                          zsbuf->face + zsbuf->zslice,
+                                          zsbuf->level,
+                                          usage,
+                                          x, y);
+         /* Get actual pointer to the tile data.  Note that depth/stencil
+          * data is tiled differently than color data.
+          */
+         task->depth_tile = lp_rast_get_depth_block_pointer(rast, x, y);
+
+         assert(task->depth_tile);
+      }
+      else {
+         task->depth_tile = NULL;
+      }
+   }
+}
+
+
+/**
+ * Clear the rasterizer's current color tile(s) to arg.clear_color.
+ * This is a bin command called during bin processing.
+ */
+void
+lp_rast_clear_color(struct lp_rasterizer_task *task,
+                    const union lp_rast_cmd_arg arg)
+{
+   struct lp_rasterizer *rast = task->rast;
+   const uint8_t *clear_color = arg.clear_color;
+
+   unsigned i;
+
+   LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, 
+              clear_color[0],
+              clear_color[1],
+              clear_color[2],
+              clear_color[3]);
+   /* Both paths write TILE_SIZE * TILE_SIZE * 4 bytes per color tile. */
+   if (clear_color[0] == clear_color[1] &&
+       clear_color[1] == clear_color[2] &&
+       clear_color[2] == clear_color[3]) {
+      /* clear to grayscale value {x, x, x, x} */
+      for (i = 0; i < rast->state.nr_cbufs; i++) {
+         uint8_t *ptr = task->color_tiles[i];
+	 memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
+      }
+   }
+   else {
+      /* Non-gray color.
+       * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code
+       * will need to change.  It'll be pretty obvious when clearing no longer
+       * works.
+       */
+      const unsigned chunk = TILE_SIZE / 4;
+      for (i = 0; i < rast->state.nr_cbufs; i++) {
+         uint8_t *c = task->color_tiles[i];
+         unsigned j;
+         /* Each pass writes four runs of chunk bytes, one per channel value. */
+         for (j = 0; j < 4 * TILE_SIZE; j++) {
+            memset(c, clear_color[0], chunk);
+            c += chunk;
+            memset(c, clear_color[1], chunk);
+            c += chunk;
+            memset(c, clear_color[2], chunk);
+            c += chunk;
+            memset(c, clear_color[3], chunk);
+            c += chunk;
+         }
+      }
+   }
+
+   LP_COUNT(nr_color_tile_clear);
+}
+
+
+/**
+ * Clear the rasterizer's current z/stencil tile.
+ * This is a bin command called during bin processing.
+ */
+void
+lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
+                       const union lp_rast_cmd_arg arg)
+{
+   struct lp_rasterizer *rast = task->rast;
+   const struct lp_rast_clearzs *clearzs = arg.clear_zstencil;
+   unsigned clear_value = clearzs->clearzs_value;
+   unsigned clear_mask = clearzs->clearzs_mask;
+   const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT;
+   const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT;
+   const unsigned block_size = rast->zsbuf.blocksize;
+   const unsigned dst_stride = rast->zsbuf.stride * TILE_VECTOR_HEIGHT;
+   uint8_t *dst;
+   unsigned i, j;
+   /* NOTE(review): the "0x%x%x" format prints value and mask run together. */
+   LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask);
+
+   /*
+    * Clear the area of the swizzled depth/stencil buffer matching this tile,
+    * in stripes of TILE_VECTOR_HEIGHT x TILE_SIZE at a time.
+    *
+    * The swizzled depth format is such that the depths for
+    * TILE_VECTOR_HEIGHT x TILE_VECTOR_WIDTH pixels have consecutive offsets.
+    */
+
+   dst = task->depth_tile;
+
+   assert(dst == lp_rast_get_depth_block_pointer(rast, task->x, task->y));
+   /* NOTE(review): the 1-byte case clears height*width contiguous bytes and ignores dst_stride; only the 4-byte case honours clear_mask -- confirm. */
+   switch (block_size) {
+   case 1:
+      memset(dst, (uint8_t) clear_value, height * width);
+      break;
+   case 2:
+      for (i = 0; i < height; i++) {
+         uint16_t *row = (uint16_t *)dst;
+         for (j = 0; j < width; j++)
+            *row++ = (uint16_t) clear_value;
+         dst += dst_stride;
+      }
+      break;
+   case 4:
+      if (clear_mask == 0xffffffff) {
+         for (i = 0; i < height; i++) {
+            uint32_t *row = (uint32_t *)dst;
+            for (j = 0; j < width; j++)
+               *row++ = clear_value;
+            dst += dst_stride;
+         }
+      }
+      else {
+         for (i = 0; i < height; i++) {
+            uint32_t *row = (uint32_t *)dst;
+            for (j = 0; j < width; j++) {
+               uint32_t tmp = ~clear_mask & *row;
+               *row++ = (clear_value & clear_mask) | tmp;
+            }
+            dst += dst_stride;
+         }
+      }
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+/**
+ * Load tile color from the framebuffer surface (bin command).
+ * Disabled (#if 0): the body uses x, y and scene, which it never declares.
+ */
+#if 0
+void
+lp_rast_load_color(struct lp_rasterizer_task *task,
+                   const union lp_rast_cmd_arg arg)
+{
+   struct lp_rasterizer *rast = task->rast;
+   unsigned buf;
+   enum lp_texture_usage usage;
+
+   LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y);
+
+   if (scene->has_color_clear)
+      usage = LP_TEX_USAGE_WRITE_ALL;
+   else
+      usage = LP_TEX_USAGE_READ_WRITE;
+
+   /* Get pointers to color tile(s).
+    * This will convert linear data to tiled if needed.
+    */
+   for (buf = 0; buf < rast->state.nr_cbufs; buf++) {
+      struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf];
+      struct llvmpipe_texture *lpt;
+      assert(cbuf);
+      lpt = llvmpipe_texture(cbuf->texture);
+      task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt,
+                                                         cbuf->face + cbuf->zslice,
+                                                         cbuf->level,
+                                                         usage,
+                                                         task->x, task->y);
+      assert(task->color_tiles[buf]);
+   }
+}
+#endif
+
+
+/**
+ * Convert the color tile from tiled to linear layout.
+ * This is generally only done when we're flushing the scene just prior to
+ * SwapBuffers.  If we didn't do this here, we'd have to convert the entire
+ * tiled color buffer to linear layout in the llvmpipe_texture_unmap()
+ * function.  It's better to do it here to take advantage of
+ * threading/parallelism.
+ * This is a bin command which is stored in all bins.
+ */
+void
+lp_rast_store_color( struct lp_rasterizer_task *task,
+                     const union lp_rast_cmd_arg arg)
+{
+   struct lp_rasterizer *rast = task->rast;
+   unsigned i = 0;
+
+   /* Fetching each color buffer's tile in linear layout is what triggers
+    * the tiled -> linear conversion (if needed); the pointer is not used.
+    */
+   while (i < rast->state.nr_cbufs) {
+      struct pipe_surface *surf = rast->curr_scene->fb.cbufs[i++];
+      struct llvmpipe_resource *res = llvmpipe_resource(surf->texture);
+      (void) llvmpipe_get_texture_tile_linear(res, surf->face, surf->level,
+                                              LP_TEX_USAGE_READ,
+                                              task->x, task->y);
+   }
+}
+
+
+/**
+ * This is a bin command called during bin processing.
+ */
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg)
+{
+   /* Later shade commands run by this task read task->current_state;
+    * only the pointer is stored, the state object is not copied.
+    */
+   task->current_state = arg.set_state;
+
+   LP_DBG(DEBUG_RAST, "%s %p\n", __FUNCTION__, (void *) task->current_state);
+}
+
+
+/**
+ * Run the shader on all blocks in a tile.  This is used when a tile is
+ * completely contained inside a triangle.
+ * This is a bin command called during bin processing.
+ */
+void
+lp_rast_shade_tile(struct lp_rasterizer_task *task,
+                   const union lp_rast_cmd_arg arg)
+{
+   struct lp_rasterizer *rast = task->rast;
+   const struct lp_rast_state *state = task->current_state;
+   const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
+   struct lp_fragment_shader_variant *variant = state->variant;
+   unsigned dx, dy;
+
+   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
+
+   /* Walk the tile row by row, one 4x4 block at a time. */
+   for (dy = 0; dy < TILE_SIZE; dy += 4) {
+      const unsigned block_y = task->y + dy;
+
+      for (dx = 0; dx < TILE_SIZE; dx += 4) {
+         const unsigned block_x = task->x + dx;
+         uint8_t *color[PIPE_MAX_COLOR_BUFS];
+         uint32_t *depth;
+         unsigned buf;
+
+         /* per-buffer pointers to this block's color data */
+         for (buf = 0; buf < rast->state.nr_cbufs; buf++)
+            color[buf] = lp_rast_get_color_block_pointer(task, buf,
+                                                         block_x, block_y);
+
+         /* matching block in the depth/stencil buffer */
+         depth = lp_rast_get_depth_block_pointer(rast, block_x, block_y);
+
+         /* RAST_WHOLE variant: the c1..c3 arguments are INT_MIN and the
+          * three step arguments are NULL */
+         variant->jit_function[RAST_WHOLE]( &state->jit_context,
+                                            block_x, block_y,
+                                            inputs->facing,
+                                            inputs->a0, inputs->dadx, inputs->dady,
+                                            color, depth,
+                                            INT_MIN, INT_MIN, INT_MIN,
+                                            NULL, NULL, NULL, &task->vis_counter);
+      }
+   }
+}
+
+
+/**
+ * Compute shading for a 4x4 block of pixels using the RAST_EDGE_TEST variant.
+ * \param x  X position of quad in window coords
+ * \param y  Y position of quad in window coords
+ * \param c1, c2, c3  per-edge values, forwarded unchanged to the JIT function
+ */
+void lp_rast_shade_quads( struct lp_rasterizer_task *task,
+                          const struct lp_rast_shader_inputs *inputs,
+                          unsigned x, unsigned y,
+                          int32_t c1, int32_t c2, int32_t c3)
+{
+   const struct lp_rast_state *state = task->current_state;
+   struct lp_fragment_shader_variant *variant;
+   struct lp_rasterizer *rast = task->rast;
+   uint8_t *color[PIPE_MAX_COLOR_BUFS];
+   void *depth;
+   unsigned i;
+
+   /* check the state pointer before anything dereferences it */
+   assert(state);
+   variant = state->variant;
+
+   /* Sanity checks */
+   assert(x % TILE_VECTOR_WIDTH == 0);
+   assert(y % TILE_VECTOR_HEIGHT == 0);
+   assert((x % 4) == 0);
+   assert((y % 4) == 0);
+
+   /* color buffer */
+   for (i = 0; i < rast->state.nr_cbufs; i++) {
+      color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+      assert(lp_check_alignment(color[i], 16));
+   }
+
+   /* depth buffer */
+   depth = lp_rast_get_depth_block_pointer(rast, x, y);
+
+   assert(lp_check_alignment(state->jit_context.blend_color, 16));
+
+   assert(lp_check_alignment(inputs->step[0], 16));
+   assert(lp_check_alignment(inputs->step[1], 16));
+   assert(lp_check_alignment(inputs->step[2], 16));
+
+   /* run shader on 4x4 block */
+   variant->jit_function[RAST_EDGE_TEST]( &state->jit_context,
+                                        x, y,
+                                        inputs->facing,
+                                        inputs->a0,
+                                        inputs->dadx,
+                                        inputs->dady,
+                                        color,
+                                        depth,
+                                        c1, c2, c3,
+                                        inputs->step[0],
+                                        inputs->step[1],
+                                        inputs->step[2],
+                                        &task->vis_counter);
+}
+
+
+/**
+ * Set top row and left column of the tile's pixels to white.  For debugging.
+ */
+static void
+outline_tile(uint8_t *tile)
+{
+   const uint8_t white = 0xff;
+   unsigned pos, chan;
+
+   /* Each step marks one pixel of the top row and one of the left
+    * column, writing all four channels of both.
+    */
+   for (pos = 0; pos < TILE_SIZE; pos++) {
+      for (chan = 0; chan < 4; chan++) {
+         /* pixel 'pos' along the top row */
+         TILE_PIXEL(tile, pos, 0, chan) = white;
+         /* pixel 'pos' down the left column */
+         TILE_PIXEL(tile, 0, pos, chan) = white;
+      }
+   }
+}
+
+
+/**
+ * Draw grid of gray lines at 16-pixel intervals across the tile to
+ * show the sub-tile boundaries.  For debugging.
+ */
+static void
+outline_subtiles(uint8_t *tile)
+{
+   const uint8_t gray = 0x80;
+   const unsigned spacing = 16;
+   unsigned line, pos, chan;
+
+   /* Each 'line' value selects one grid line in each direction; 'pos'
+    * walks along it.
+    */
+   for (line = 0; line < TILE_SIZE; line += spacing) {
+      for (pos = 0; pos < TILE_SIZE; pos++) {
+         for (chan = 0; chan < 4; chan++) {
+            /* the pixel at (line, pos) and its transpose (pos, line) */
+            TILE_PIXEL(tile, line, pos, chan) = gray;
+            TILE_PIXEL(tile, pos, line, chan) = gray;
+         }
+      }
+   }
+
+   /* finish with the tile border, drawn on top of the gray grid */
+   outline_tile(tile);
+}
+
+
+
+/**
+ * Called when we're done writing to a color tile.
+ */
+static void
+lp_rast_tile_end(struct lp_rasterizer_task *task)
+{
+#ifdef DEBUG
+   if (LP_DEBUG & (DEBUG_SHOW_SUBTILES | DEBUG_SHOW_TILES)) {
+      struct lp_rasterizer *rast = task->rast;
+      unsigned buf;
+
+      for (buf = 0; buf < rast->state.nr_cbufs; buf++) {
+         uint8_t *color = lp_rast_get_color_block_pointer(task, buf,
+                                                          task->x, task->y);
+         /* SHOW_SUBTILES wins when both flags are set; it also draws the tile outline */
+         if (LP_DEBUG & DEBUG_SHOW_SUBTILES)
+            outline_subtiles(color);
+         else if (LP_DEBUG & DEBUG_SHOW_TILES)
+            outline_tile(color);
+      }
+   }
+#else
+   (void) outline_subtiles;   /* presumably here to avoid an unused-function warning */
+#endif
+
+   /* debug: forget the tile pointers now that this tile is finished */
+   memset(task->color_tiles, 0, sizeof(task->color_tiles));
+   task->depth_tile = NULL;
+}
+
+
+
+/**
+ * Signal on a fence.  This is called during bin execution/rasterization.
+ * Called per thread.
+ */
+void
+lp_rast_fence(struct lp_rasterizer_task *task,
+              const union lp_rast_cmd_arg arg)
+{
+   /* the task is not needed here; the fence travels in the command arg */
+   lp_fence_signal(arg.fence);
+}
+
+
+/**
+ * Begin a new occlusion query.
+ * This is a bin command put in all bins.
+ * Called per thread.
+ */
+void
+lp_rast_begin_query(struct lp_rasterizer_task *task,
+                    const union lp_rast_cmd_arg arg)
+{
+   /* Reset the per-task visible-fragment counter; arg is not used */
+   task->vis_counter = 0;
+}
+ 
+
+/**
+ * End the current occlusion query.
+ * This is a bin command put in all bins.
+ * Called per thread.
+ */
+void
+lp_rast_end_query(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg)
+{
+   struct llvmpipe_query *query = arg.query_obj;
+   const unsigned slot = task->thread_index;
+
+   /* Every update of the query object happens with its mutex held. */
+   pipe_mutex_lock(query->mutex);
+
+   /* Fold this task's visible-fragment count into its own slot. */
+   query->count[slot] += task->vis_counter;
+
+   /* One more end-query command has run; the one that brings tile_count
+    * up to num_tiles finalizes the result.
+    */
+   if (++query->tile_count == query->num_tiles) {
+      unsigned t;
+
+      /* total over all per-thread slots */
+      query->result = 0;
+      for (t = 0; t < LP_MAX_THREADS; t++)
+         query->result += query->count[t];
+
+      /* clear the slots in case the query is re-used in the scene */
+      memset(query->count, 0, sizeof(query->count));
+      query->tile_count = 0;
+
+      query->binned = FALSE;
+      query->done = TRUE;
+   }
+
+   pipe_mutex_unlock(query->mutex);
+}
+
+
+
+/**
+ * Rasterize commands for a single bin.
+ * \param x, y  position of the bin's tile in the framebuffer
+ * Must be called between lp_rast_begin() and lp_rast_end().
+ * Called per thread.
+ */
+static void
+rasterize_bin(struct lp_rasterizer_task *task,
+              const struct cmd_bin *bin,
+              int x, int y)
+{
+   struct cmd_block *blk = bin->commands.head;
+
+   /* x, y count tiles; scale them by TILE_SIZE for lp_rast_tile_begin() */
+   lp_rast_tile_begin( task, x * TILE_SIZE, y * TILE_SIZE );
+
+   /* run every command, block by block in list order */
+   while (blk) {
+      unsigned n = 0;
+      while (n < blk->count) {
+         blk->cmd[n]( task, blk->arg[n] );
+         n++;
+      }
+      blk = blk->next;
+   }
+
+   lp_rast_tile_end(task);
+   /* Free data for this bin now that its commands have run. */
+   lp_scene_bin_reset( task->rast->curr_scene, x, y);
+}
+
+
+#define RAST(x) { lp_rast_##x, #x }
+/* Table pairing each bin command function with its name, for debug_bin(). */
+static struct {
+   lp_rast_cmd cmd;
+   const char *name;
+} cmd_names[] = 
+{
+   RAST(clear_color),
+   RAST(clear_zstencil),
+   RAST(triangle),
+   RAST(shade_tile),
+   RAST(set_state),
+   RAST(store_color),
+   RAST(fence),
+   RAST(begin_query),
+   RAST(end_query),
+};
+
+/* Debug helper: print the name of each command in the bin's first block. */
+static void
+debug_bin( const struct cmd_bin *bin )
+{
+   const struct cmd_block *head = bin->commands.head;
+   unsigned i, j;   /* unsigned: compared against count and Elements() */
+
+   for (i = 0; i < head->count; i++) {
+      debug_printf("%u: ", i);
+      for (j = 0; j < Elements(cmd_names); j++) {
+         if (head->cmd[i] == cmd_names[j].cmd) {
+            debug_printf("%s\n", cmd_names[j].name);
+            break;
+         }
+      }
+      if (j == Elements(cmd_names))   /* no table entry matched */
+         debug_printf("...other\n");
+   }
+}
+
+/* An empty bin is one that just loads the contents of the tile and
+ * stores them again unchanged.  This typically happens when bins have
+ * been flushed for some reason in the middle of a frame, or when
+ * incremental updates are being made to a render target.
+ * 
+ * Try to avoid doing pointless work in this case.
+ */
+static boolean
+is_empty_bin( const struct cmd_bin *bin )
+{
+   const struct cmd_block *head = bin->commands.head;
+   unsigned i;   /* unsigned, to match head->count */
+
+   if (0)
+      debug_bin(bin);
+
+   /* Heuristic: a bin counts as empty only if it consists of a single
+    * command block holding at most 4 commands, all of them set-state
+    * (set-state commands seem to get emitted even into empty bins).
+    *
+    * Anything longer, or containing any other command, is treated as
+    * non-empty.
+    */
+   if (head->next != NULL ||
+       head->count > 4) {
+      return FALSE;
+   }
+
+   for (i = 0; i < head->count; i++)
+      if (head->cmd[i] != lp_rast_set_state) {
+         return FALSE;
+      }
+
+   return TRUE;
+}
+
+
+
+/**
+ * Rasterize/execute all bins within a scene.
+ * Called per thread.
+ */
+static void
+rasterize_scene(struct lp_rasterizer_task *task,
+                struct lp_scene *scene)
+{
+   int tile_x, tile_y;
+
+   /* Called once per rasterizer thread for a queued scene, or once with
+    * task 0 when the rasterizer runs without threads.
+    */
+
+   assert(scene);
+
+   /* Bins are pulled from the scene's iterator rather than walked with a
+    * fixed x/y loop; the iterator also reports the bin's tile position.
+    */
+   for (;;) {
+      struct cmd_bin *bin = lp_scene_bin_iter_next(scene, &tile_x, &tile_y);
+
+      if (!bin)
+         break;   /* iterator exhausted */
+
+      /* skip bins that is_empty_bin() judges to contain no real work */
+      if (is_empty_bin( bin ))
+         continue;
+
+      rasterize_bin(task, bin, tile_x, tile_y);
+   }
+}
+
+
+/**
+ * Called by setup module when it has something for us to render.
+ */
+void
+lp_rast_queue_scene( struct lp_rasterizer *rast,
+                     struct lp_scene *scene)
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   if (rast->num_threads > 0) {
+      /* Threaded: hand the scene to the queue, then wake every worker
+       * through its own work_ready semaphore.
+       */
+      unsigned t = 0;
+
+      lp_scene_enqueue( rast->full_scenes, scene );
+
+      while (t < rast->num_threads) {
+         pipe_semaphore_signal(&rast->tasks[t].work_ready);
+         t++;
+      }
+   }
+   else {
+      /* Unthreaded: rasterize right here using task 0, bracketed by
+       * lp_rast_begin() / lp_rast_end().
+       */
+      lp_rast_begin( rast, scene );
+      rasterize_scene( &rast->tasks[0], scene );
+      lp_scene_reset( scene );
+      lp_rast_end( rast );
+
+      rast->curr_scene = NULL;
+   }
+
+   LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
+}
+
+
+/**
+ * Block until every rasterizer thread has signalled work_done once.
+ * Does nothing when the rasterizer runs without threads.
+ */
+void
+lp_rast_finish( struct lp_rasterizer *rast )
+{
+   unsigned i;   /* unsigned, like rast->num_threads */
+
+   /* wait for work to complete; with num_threads == 0 the loop body
+    * never runs, so the unthreaded case needs no branch of its own */
+   for (i = 0; i < rast->num_threads; i++) {
+      pipe_semaphore_wait(&rast->tasks[i].work_done);
+   }
+}
+
+
+/**
+ * This is the thread's main entrypoint.
+ * It's a simple loop:
+ *   1. wait for work
+ *   2. do work
+ *   3. signal that we're done
+ */
+static PIPE_THREAD_ROUTINE( thread_func, init_data )
+{
+   struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data;
+   struct lp_rasterizer *rast = task->rast;
+   boolean debug = false;
+
+   while (1) {
+      /* wait for work */
+      if (debug)
+         debug_printf("thread %d waiting for work\n", task->thread_index);
+      pipe_semaphore_wait(&task->work_ready);
+      /* lp_rast_destroy() sets exit_flag before it signals work_ready */
+      if (rast->exit_flag)
+         break;
+
+      if (task->thread_index == 0) {
+         /* thread[0]:
+          *  - get next scene to rasterize
+          *  - map the framebuffer surfaces
+          */
+         lp_rast_begin( rast, 
+                        lp_scene_dequeue( rast->full_scenes, TRUE ) );
+      }
+
+      /* Wait for all threads to get here so that threads[1+] don't
+       * get a null rast->curr_scene pointer.
+       */
+      pipe_barrier_wait( &rast->barrier );
+
+      /* do work */
+      if (debug)
+         debug_printf("thread %d doing work\n", task->thread_index);
+
+      rasterize_scene(task,
+                      rast->curr_scene);
+      
+      /* wait for all threads to finish with this scene */
+      pipe_barrier_wait( &rast->barrier );
+
+      /* XXX: shouldn't be necessary:
+       * (thread 0 began the scene above, so thread 0 also ends it) */
+      if (task->thread_index == 0) {
+         lp_rast_end( rast );
+      }
+
+      /* signal done with work */
+      if (debug)
+         debug_printf("thread %d done working\n", task->thread_index);
+
+      pipe_semaphore_signal(&task->work_done);
+   }
+
+   return NULL;
+}
+
+
+/**
+ * Initialize semaphores and spawn the threads.
+ */
+static void
+create_rast_threads(struct lp_rasterizer *rast)
+{
+   unsigned t = 0;
+
+   /* a num_threads of zero leaves this loop, and thus threading, unused */
+   while (t < rast->num_threads) {
+      struct lp_rasterizer_task *task = &rast->tasks[t];
+      pipe_semaphore_init(&task->work_ready, 0);
+      pipe_semaphore_init(&task->work_done, 0);
+      rast->threads[t++] = pipe_thread_create(thread_func, (void *) task);
+   }
+}
+
+
+
+/**
+ * Create new lp_rasterizer.  If num_threads is zero, don't create any
+ * new threads, do rendering synchronously.
+ * \param num_threads  threads to create, clamped to LP_MAX_THREADS
+ * \return the new rasterizer, or NULL if an allocation failed
+ */
+struct lp_rasterizer *
+lp_rast_create( unsigned num_threads )
+{
+   struct lp_rasterizer *rast = CALLOC_STRUCT(lp_rasterizer);
+   unsigned i;
+   if (!rast)
+      return NULL;
+
+   rast->full_scenes = lp_scene_queue_create();
+   if (!rast->full_scenes) {
+      FREE(rast);
+      return NULL;
+   }
+
+   for (i = 0; i < Elements(rast->tasks); i++) {
+      rast->tasks[i].rast = rast;
+      rast->tasks[i].thread_index = i;
+   }
+
+   /* tasks[] and threads[] only have LP_MAX_THREADS entries */
+   rast->num_threads = num_threads < LP_MAX_THREADS ? num_threads : LP_MAX_THREADS;
+   /* set up the barrier before any thread that waits on it exists */
+   pipe_barrier_init( &rast->barrier, rast->num_threads );
+   create_rast_threads(rast);
+   return rast;
+}
+
+
+/* Shutdown:
+ */
+void lp_rast_destroy( struct lp_rasterizer *rast )
+{
+   const unsigned nthreads = rast->num_threads;
+   unsigned t;
+
+   /* Step 1: raise exit_flag, then wake every thread.  A woken thread
+    * sees the flag right after its work_ready wait and leaves its loop.
+    */
+   rast->exit_flag = TRUE;
+   for (t = 0; t < nthreads; t++) {
+      pipe_semaphore_signal(&rast->tasks[t].work_ready);
+   }
+
+   /* Step 2: wait for all threads to terminate before any cleanup. */
+   for (t = 0; t < nthreads; t++) {
+      pipe_thread_wait(rast->threads[t]);
+   }
+
+   /* Step 3: tear down the per-thread semaphores. */
+   for (t = 0; t < nthreads; t++) {
+      struct lp_rasterizer_task *task = &rast->tasks[t];
+      pipe_semaphore_destroy(&task->work_ready);
+      pipe_semaphore_destroy(&task->work_done);
+   }
+
+   /* Step 4: shared objects, then the rasterizer itself. */
+   pipe_barrier_destroy( &rast->barrier );
+   lp_scene_queue_destroy(rast->full_scenes);
+
+   FREE(rast);
+}
+
+
+/** Return the rasterizer's thread count (zero means rendering is synchronous) */
+unsigned
+lp_rast_get_num_threads( struct lp_rasterizer *rast )
+{
+   return rast->num_threads;
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
new file mode 100644
index 0000000000..80ca68f5a2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -0,0 +1,252 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * The rast code is concerned with rasterization of command bins.
+ * Each screen tile has a bin associated with it.  To render the
+ * scene we iterate over the tile bins and execute the commands
+ * in each bin.
+ * We'll do that with multiple threads...
+ */
+
+
+#ifndef LP_RAST_H
+#define LP_RAST_H
+
+#include "pipe/p_compiler.h"
+#include "lp_jit.h"
+
+
+struct lp_rasterizer;
+struct lp_scene;
+struct lp_fence;
+struct cmd_bin;
+
+/** For sub-pixel positioning */
+#define FIXED_ORDER 4
+#define FIXED_ONE (1<<FIXED_ORDER)
+
+
+struct lp_rasterizer_task;
+
+
+/**
+ * Rasterization state.
+ * Objects of this type are put into the shared data bin and pointed
+ * to by commands in the per-tile bins.
+ */
+struct lp_rast_state {
+   /* State for the shader.  This also contains state which feeds into
+    * the fragment shader, such as blend color and alpha ref value.
+    */
+   struct lp_jit_context jit_context;
+   
+   /* The shader variant to run; the rasterizer calls its jit_function[]
+    * entries, passing &jit_context as the first argument.
+    */
+   struct lp_fragment_shader_variant *variant;
+};
+
+
+/**
+ * Coefficients necessary to run the shader at a given location.
+ * First coefficient is position.
+ * These pointers point into the bin data buffer.
+ */
+struct lp_rast_shader_inputs {
+   float facing;     /**< Positive for front-facing, negative for back-facing */
+   /* coefficient arrays, handed to the JIT function unchanged */
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
+
+   /* edge/step info for 3 edges and 4x4 block of pixels (16-byte aligned) */
+   PIPE_ALIGN_VAR(16) int step[3][16];
+};
+
+struct lp_rast_clearzs {
+   unsigned clearzs_value;   /* value written by lp_rast_clear_zstencil() */
+   unsigned clearzs_mask;    /* bits to overwrite; only consulted for 4-byte blocks */
+};
+
+
+/**
+ * Rasterization information for a triangle known to be in this bin,
+ * plus inputs to run the shader:
+ * These fields are tile- and bin-independent.
+ * Objects of this type are put into the lp_setup_context::data buffer.
+ */
+struct lp_rast_triangle {
+#ifdef DEBUG
+   float v[3][2];   /* presumably the three vertices' x,y — TODO confirm */
+#endif
+
+   /* one-pixel sized trivial accept offsets for each plane */
+   int ei1;                   
+   int ei2;
+   int ei3;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   int eo1;                   
+   int eo2;
+   int eo3;
+
+   /* y deltas for vertex pairs (in fixed pt) */
+   int dy12;
+   int dy23;
+   int dy31;
+
+   /* x deltas for vertex pairs (in fixed pt) */
+   int dx12;
+   int dx23;
+   int dx31;
+
+   /* edge function values at minx,miny ?? */
+   int c1, c2, c3;
+
+   /* inputs for the shader (16-byte aligned) */
+   PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs;
+};
+
+
+
+struct lp_rasterizer *
+lp_rast_create( unsigned num_threads );
+
+void
+lp_rast_destroy( struct lp_rasterizer * );
+
+unsigned
+lp_rast_get_num_threads( struct lp_rasterizer * );
+
+void 
+lp_rast_queue_scene( struct lp_rasterizer *rast,
+                     struct lp_scene *scene );
+
+void
+lp_rast_finish( struct lp_rasterizer *rast );
+
+
+union lp_rast_cmd_arg {
+   const struct lp_rast_shader_inputs *shade_tile;   /* read by lp_rast_shade_tile() */
+   const struct lp_rast_triangle *triangle;   /* presumably for lp_rast_triangle() — confirm */
+   const struct lp_rast_state *set_state;   /* read by lp_rast_set_state() */
+   uint8_t clear_color[4];   /* read by lp_rast_clear_color() */
+   const struct lp_rast_clearzs *clear_zstencil;   /* read by lp_rast_clear_zstencil() */
+   struct lp_fence *fence;   /* read by lp_rast_fence() */
+   struct llvmpipe_query *query_obj;   /* read by lp_rast_end_query() */
+};
+
+
+/* Cast wrappers: each stores its parameter in the matching member of
+ * union lp_rast_cmd_arg.  Hopefully these compile to noops! */
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )
+{
+   union lp_rast_cmd_arg arg;
+   arg.shade_tile = shade_tile;
+   return arg;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_triangle( const struct lp_rast_triangle *triangle )
+{
+   union lp_rast_cmd_arg arg;
+   arg.triangle = triangle;
+   return arg;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_state( const struct lp_rast_state *state )
+{
+   union lp_rast_cmd_arg arg;
+   arg.set_state = state;
+   return arg;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_fence( struct lp_fence *fence )
+{
+   union lp_rast_cmd_arg arg;
+   arg.fence = fence;
+   return arg;
+}
+
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_clearzs( const struct lp_rast_clearzs *clearzs )
+{
+   union lp_rast_cmd_arg arg;
+   arg.clear_zstencil = clearzs;
+   return arg;
+}
+
+static INLINE union lp_rast_cmd_arg
+lp_rast_arg_null( void )
+{
+   union lp_rast_cmd_arg arg;
+   arg.set_state = NULL;   /* the "no argument" value: a NULL pointer member */
+   return arg;
+}
+
+
+/**
+ * Binnable Commands.
+ * These get put into bins by the setup code and are called when
+ * the bins are executed.
+ */
+
+void lp_rast_clear_color( struct lp_rasterizer_task *, 
+                          const union lp_rast_cmd_arg );
+
+void lp_rast_clear_zstencil( struct lp_rasterizer_task *, 
+                             const union lp_rast_cmd_arg );
+
+void lp_rast_set_state( struct lp_rasterizer_task *, 
+                        const union lp_rast_cmd_arg );
+
+void lp_rast_triangle( struct lp_rasterizer_task *, 
+                       const union lp_rast_cmd_arg );
+
+void lp_rast_shade_tile( struct lp_rasterizer_task *,
+                         const union lp_rast_cmd_arg );
+
+void lp_rast_fence( struct lp_rasterizer_task *,
+                    const union lp_rast_cmd_arg );
+
+void lp_rast_store_color( struct lp_rasterizer_task *,
+                          const union lp_rast_cmd_arg );
+
+
+void lp_rast_begin_query(struct lp_rasterizer_task *,
+                         const union lp_rast_cmd_arg );
+
+void lp_rast_end_query(struct lp_rasterizer_task *,
+                       const union lp_rast_cmd_arg );
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
new file mode 100644
index 0000000000..d33dd49f3a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -0,0 +1,226 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_RAST_PRIV_H
+#define LP_RAST_PRIV_H
+
+#include "os/os_thread.h"
+#include "util/u_format.h"
+#include "gallivm/lp_bld_debug.h"
+#include "lp_rast.h"
+#include "lp_scene.h"
+#include "lp_state.h"
+#include "lp_texture.h"
+#include "lp_tile_soa.h"
+#include "lp_limits.h"
+
+
+struct lp_rasterizer;
+
+
+/**
+ * Per-thread rasterization state
+ */
+struct lp_rasterizer_task
+{
+   unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
+
+   uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS];
+   uint8_t *depth_tile;
+
+   const struct lp_rast_state *current_state;
+
+   /** "back" pointer */
+   struct lp_rasterizer *rast;
+
+   /** "my" index */
+   unsigned thread_index;
+
+   /* occlusion counter for visible pixels */
+   uint32_t vis_counter;
+
+   pipe_semaphore work_ready;
+   pipe_semaphore work_done;
+};
+
+
+/**
+ * This is the state required while rasterizing tiles.
+ * Note that this contains per-thread information too.
+ * The tile size is TILE_SIZE x TILE_SIZE pixels.
+ */
+struct lp_rasterizer
+{
+   boolean exit_flag;   /* set by lp_rast_destroy() to make the threads exit */
+
+   /* Framebuffer stuff
+    */
+   struct {
+      uint8_t *map;          /* NULL when no depth/stencil buffer is mapped */
+      unsigned stride;
+      unsigned blocksize;    /* lp_rast_clear_zstencil() handles 1, 2 and 4 */
+   } zsbuf;
+
+   struct {
+      unsigned nr_cbufs;
+      unsigned clear_color;
+      unsigned clear_depth;
+      char clear_stencil;
+   } state;
+
+   /** The incoming queue of scenes ready to rasterize */
+   struct lp_scene_queue *full_scenes;
+
+   /**
+    * The outgoing queue of processed scenes to return to setup module
+    *
+    * XXX: while scenes are per-context but the rasterizer is
+    * (potentially) shared, these empty scenes should be returned to
+    * the context which created them rather than retained here.
+    */
+   /*   struct lp_scene_queue *empty_scenes; */
+
+   /** The scene currently being rasterized by the threads */
+   struct lp_scene *curr_scene;
+
+   /** A task object for each rasterization thread */
+   struct lp_rasterizer_task tasks[LP_MAX_THREADS];
+
+   unsigned num_threads;   /* zero means rendering is done synchronously */
+   pipe_thread threads[LP_MAX_THREADS];
+
+   /** For synchronizing the rasterization threads */
+   pipe_barrier barrier;
+};
+
+
+void lp_rast_shade_quads( struct lp_rasterizer_task *task,
+                          const struct lp_rast_shader_inputs *inputs,
+                          unsigned x, unsigned y,
+                          int32_t c1, int32_t c2, int32_t c3);
+
+
+/**
+ * Get the pointer to a 4x4 depth/stencil block.
+ * We'll map the z/stencil buffer on demand here.
+ * Note that this may be called even when there's no z/stencil buffer - return
+ * NULL in that case.
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE void *
+lp_rast_get_depth_block_pointer(const struct lp_rasterizer *rast,
+                                unsigned x, unsigned y)
+{
+   uint8_t *base = rast->zsbuf.map;
+
+   assert((x % TILE_VECTOR_WIDTH) == 0);
+   assert((y % TILE_VECTOR_HEIGHT) == 0);
+
+   /* an unmapped buffer is only legal if the scene has no zsbuf at all */
+   assert(base || !rast->curr_scene->fb.zsbuf);
+   if (base == NULL)
+      return NULL;
+
+   /* row offset first, then the block's offset within that row */
+   base += rast->zsbuf.stride * y;
+   base += rast->zsbuf.blocksize * x * TILE_VECTOR_HEIGHT;
+
+   assert(lp_check_alignment(base, 16));
+   return base;
+}
+
+
+/**
+ * Get the pointer to a 4x4 color block (within a 64x64 tile).
+ * We'll map the color buffer on demand here.
+ * Note that the tile pointer for the requested buffer must already be set:
+ * the code asserts that it is non-NULL instead of returning NULL.
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE uint8_t *
+lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
+                                unsigned buf, unsigned x, unsigned y)
+{
+   uint8_t *tile = task->color_tiles[buf];
+   uint8_t *block;
+
+   assert((x % TILE_VECTOR_WIDTH) == 0);
+   assert((y % TILE_VECTOR_HEIGHT) == 0);
+
+   /* the tile pointer for this buffer must have been set before any
+    * block inside it is looked up */
+   assert(tile);
+
+   /* x, y are window coords: reduce them to a position inside the tile,
+    * then look up the offset of channel 0 of that pixel
+    */
+   block = tile + tile_pixel_offset(x % TILE_SIZE, y % TILE_SIZE, 0);
+
+   assert(lp_check_alignment(block, 16));
+   return block;
+}
+
+
+
+/**
+ * Shade all pixels in a 4x4 block.  The fragment code omits the
+ * triangle in/out tests.
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE void
+lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
+                         const struct lp_rast_shader_inputs *inputs,
+                         unsigned x, unsigned y )
+{
+   const struct lp_rast_state *state = task->current_state;
+   uint8_t *cbuf_ptrs[PIPE_MAX_COLOR_BUFS];
+   void *zs_ptr;
+   unsigned buf = 0;
+
+   /* gather one block pointer per bound color buffer */
+   while (buf < task->rast->state.nr_cbufs) {
+      cbuf_ptrs[buf] = lp_rast_get_color_block_pointer(task, buf, x, y);
+      buf++;
+   }
+
+   /* NULL when no depth/stencil buffer is mapped */
+   zs_ptr = lp_rast_get_depth_block_pointer(task->rast, x, y);
+
+   /* Invoke the RAST_WHOLE variant: the c1..c3 arguments are INT_MIN
+    * and the three step arguments are NULL.
+    */
+   state->variant->jit_function[RAST_WHOLE]( &state->jit_context,
+                                             x, y,
+                                             inputs->facing,
+                                             inputs->a0, inputs->dadx, inputs->dady,
+                                             cbuf_ptrs, zs_ptr,
+                                             INT_MIN, INT_MIN, INT_MIN,
+                                             NULL, NULL, NULL, &task->vis_counter );
+}
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
new file mode 100644
index 0000000000..a5f0d14c95
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -0,0 +1,280 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Rasterization for binned triangles within a tile
+ */
+
+#include <limits.h>
+#include "util/u_math.h"
+#include "lp_debug.h"
+#include "lp_perf.h"
+#include "lp_rast_priv.h"
+#include "lp_tile_soa.h"
+
+
+/**
+ * Map an index in [0,15] to an x,y position, multiplied by 4.
+ * This is used to get the position of each subtile in a 4x4
+ * grid of edge step values.
+ * Reading the entries: index bit 0 adds 4 to x, bit 1 adds 4 to y,
+ * bit 2 adds 8 to x and bit 3 adds 8 to y.
+ * Note: we can use some bit twiddling to compute these values instead
+ * of using a look-up table, but there's no measurable performance
+ * difference.
+ */
+static const int pos_table4[16][2] = {
+   { 0, 0 },
+   { 4, 0 },
+   { 0, 4 },
+   { 4, 4 },
+   { 8, 0 },
+   { 12, 0 },
+   { 8, 4 },
+   { 12, 4 },
+   { 0, 8 },
+   { 4, 8 },
+   { 0, 12 },
+   { 4, 12 },
+   { 8, 8 },
+   { 12, 8 },
+   { 8, 12 },
+   { 12, 12 }
+};
+
+
+/**
+ * Same index ordering as pos_table4, with every offset multiplied by 16
+ * instead of 4.  Used below to place the 16x16 blocks visited by
+ * lp_rast_triangle().
+ */
+static const int pos_table16[16][2] = {
+   { 0, 0 },
+   { 16, 0 },
+   { 0, 16 },
+   { 16, 16 },
+   { 32, 0 },
+   { 48, 0 },
+   { 32, 16 },
+   { 48, 16 },
+   { 0, 32 },
+   { 16, 32 },
+   { 0, 48 },
+   { 16, 48 },
+   { 32, 32 },
+   { 48, 32 },
+   { 32, 48 },
+   { 48, 48 }
+};
+
+
+/**
+ * Shade all pixels in a 4x4 block.
+ * Simply forwards the triangle's shader inputs and the block position
+ * to lp_rast_shade_quads_all().
+ * \param x, y  position of the 4x4 block
+ */
+static void
+block_full_4(struct lp_rasterizer_task *task,
+             const struct lp_rast_triangle *tri,
+             int x, int y)
+{
+   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
+}
+
+
+/**
+ * Shade all pixels in a 16x16 block by passing each of its sixteen
+ * 4x4 sub-blocks to block_full_4(), one row of sub-blocks at a time.
+ * \param x, y  position of the block; both must be multiples of 16
+ */
+static void
+block_full_16(struct lp_rasterizer_task *task,
+              const struct lp_rast_triangle *tri,
+              int x, int y)
+{
+   unsigned k;
+
+   assert(x % 16 == 0);
+   assert(y % 16 == 0);
+
+   /* k enumerates the sub-blocks: low two bits = column, upper bits = row */
+   for (k = 0; k < 16; k++) {
+      const unsigned dx = (k & 3) * 4;
+      const unsigned dy = (k >> 2) * 4;
+      block_full_4(task, tri, x + dx, y + dy);
+   }
+}
+
+
+/**
+ * Pass the 4x4 pixel block to the shader function.
+ * Determination of which of the 16 pixels lies inside the triangle
+ * will be done as part of the fragment shader.
+ * \param x, y        position of the block; asserted to be non-negative
+ * \param c1, c2, c3  the three edge values for this block; note that they
+ *                    are negated before being handed to
+ *                    lp_rast_shade_quads()
+ */
+static void
+do_block_4(struct lp_rasterizer_task *task,
+           const struct lp_rast_triangle *tri,
+           int x, int y,
+           int c1, int c2, int c3)
+{
+   assert(x >= 0);
+   assert(y >= 0);
+
+   lp_rast_shade_quads(task, &tri->inputs, x, y, -c1, -c2, -c3);
+}
+
+
+/**
+ * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out
+ * of the triangle's bounds, and shade the ones that are not rejected.
+ * \param x, y        position of the block; non-negative multiples of 16
+ * \param c0, c1, c2  the three edge values at the block's origin
+ */
+static void
+do_block_16(struct lp_rasterizer_task *task,
+            const struct lp_rast_triangle *tri,
+            int x, int y,
+            int c0, int c1, int c2)
+{
+   unsigned outside = 0;
+   unsigned pending;
+   int start[3];
+   int edge, sub;
+
+   assert(x >= 0);
+   assert(y >= 0);
+   assert(x % 16 == 0);
+   assert(y % 16 == 0);
+
+   /* per-edge starting value: edge value plus four times the eo term */
+   start[0] = c0 + tri->eo1 * 4;
+   start[1] = c1 + tri->eo2 * 4;
+   start[2] = c2 + tri->eo3 * 4;
+
+   /* A bit in 'outside' is set as soon as the corresponding sub-block
+    * yields a negative value (sign bit set) for any of the edges.
+    */
+   for (edge = 0; edge < 3; edge++) {
+      const int *step = tri->inputs.step[edge];
+
+      for (sub = 0; sub < 16; sub++) {
+         const int out = start[edge] + step[sub] * 4;
+         outside |= (out >> 31) & (1 << sub);
+      }
+   }
+
+   /* Visit the remaining sub-blocks, lowest index first. */
+   for (pending = ~outside & 0xffff; pending; pending &= ~(1 << sub)) {
+      sub = ffs(pending) - 1;
+
+      /* Don't bother testing if the 4x4 block is entirely in/out of
+       * the triangle.  It's a little faster to do it in the jit code.
+       */
+      LP_COUNT(nr_non_empty_4);
+      do_block_4(task, tri,
+                 x + pos_table4[sub][0],
+                 y + pos_table4[sub][1],
+                 c0 + tri->inputs.step[0][sub] * 4,
+                 c1 + tri->inputs.step[1][sub] * 4,
+                 c2 + tri->inputs.step[2][sub] * 4);
+   }
+}
+
+
+/**
+ * Scan the tile in chunks and figure out which pixels to rasterize
+ * for this triangle.
+ *
+ * The tile at task->x, task->y is treated as a 4x4 grid of 16x16 blocks.
+ * Each block is classified against the three edges; blocks that are
+ * neither rejected nor fully covered go to do_block_16(), fully covered
+ * ones to block_full_16().
+ * \param arg  command argument; arg.triangle is the triangle to draw
+ */
+void
+lp_rast_triangle(struct lp_rasterizer_task *task,
+                 const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle;
+   const int x = task->x, y = task->y;
+   int c[3], out_start[3], in_delta[3];
+   unsigned rejected = 0;
+   unsigned full = 0xffff;
+   unsigned partial;
+   unsigned edge, blk;
+   int n;
+
+   /* edge values at the tile origin */
+   c[0] = tri->c1 + tri->dx12 * y - tri->dy12 * x;
+   c[1] = tri->c2 + tri->dx23 * y - tri->dy23 * x;
+   c[2] = tri->c3 + tri->dx31 * y - tri->dy31 * x;
+
+   /* starting value for the "out" test, per edge */
+   out_start[0] = c[0] + tri->eo1 * 16;
+   out_start[1] = c[1] + tri->eo2 * 16;
+   out_start[2] = c[2] + tri->eo3 * 16;
+
+   /* what must be added to an "out" value to get the "in" value */
+   in_delta[0] = tri->ei1 * 16 - tri->eo1 * 16;
+   in_delta[1] = tri->ei2 * 16 - tri->eo2 * 16;
+   in_delta[2] = tri->ei3 * 16 - tri->eo3 * 16;
+
+   /* 'rejected' has bits set whenever we are outside any of the edges;
+    * 'full' keeps bits set only where we are inside all of the edges.
+    */
+   for (edge = 0; edge < 3; edge++) {
+      const int *step = tri->inputs.step[edge];
+
+      for (blk = 0; blk < 16; blk++) {
+         const int out = out_start[edge] + step[blk] * 16;
+         const int in = out + in_delta[edge];
+         rejected |= (out >> 31) & (1 << blk);
+         full &= ~((in >> 31) & (1 << blk));
+      }
+   }
+
+   assert((rejected & full) == 0);
+
+   if (rejected == 0xffff)
+      return;
+
+   /* Blocks that are at least partially inside all of the edges but
+    * not fully covered:
+    */
+   partial = ~full & ~rejected & 0xffff;
+
+   /* Iterate over partials:
+    */
+   for (; partial; partial &= ~(1 << n)) {
+      n = ffs(partial) - 1;
+
+      LP_COUNT(nr_partially_covered_16);
+      do_block_16(task, tri,
+                  x + pos_table16[n][0],
+                  y + pos_table16[n][1],
+                  c[0] + tri->inputs.step[0][n] * 16,
+                  c[1] + tri->inputs.step[1][n] * 16,
+                  c[2] + tri->inputs.step[2][n] * 16);
+   }
+
+   /* Iterate over fulls:
+    */
+   for (; full; full &= ~(1 << n)) {
+      n = ffs(full) - 1;
+
+      LP_COUNT(nr_fully_covered_16);
+      block_full_16(task, tri,
+                    x + pos_table16[n][0],
+                    y + pos_table16[n][1]);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
new file mode 100644
index 0000000000..845c175cf2
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -0,0 +1,465 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_framebuffer.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_simple_list.h"
+#include "lp_scene.h"
+#include "lp_scene_queue.h"
+
+
+/**
+ * List of texture references.
+ * NOTE(review): nothing in lp_scene.c refers to this struct; the scene's
+ * reference list is built from struct resource_ref (lp_scene.h).  This
+ * looks like a leftover -- confirm and remove.
+ */
+struct texture_ref {
+   struct pipe_resource *texture;
+   struct texture_ref *prev, *next;  /**< linked list w/ u_simple_list.h */
+};
+
+
+
+/**
+ * Create a new scene object.
+ *
+ * Each bin's command list and the shared data list are given one empty
+ * block up front.  The reset, empty-check and allocation code in this
+ * file and in lp_scene.h dereferences those blocks without checking for
+ * NULL, so a scene is only returned if every block could be allocated.
+ *
+ * \param pipe   context stored in the scene
+ * \param queue  the queue to put newly rendered/emptied scenes into
+ * \return the new scene, or NULL if out of memory
+ */
+struct lp_scene *
+lp_scene_create( struct pipe_context *pipe,
+                 struct lp_scene_queue *queue )
+{
+   unsigned i, j;
+   struct lp_scene *scene = CALLOC_STRUCT(lp_scene);
+   if (!scene)
+      return NULL;
+
+   scene->pipe = pipe;
+   scene->empty_queue = queue;
+
+   for (i = 0; i < TILES_X; i++) {
+      for (j = 0; j < TILES_Y; j++) {
+         struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
+         bin->commands.head = bin->commands.tail = CALLOC_STRUCT(cmd_block);
+         if (!bin->commands.head)
+            goto fail;
+      }
+   }
+
+   scene->data.head =
+      scene->data.tail = CALLOC_STRUCT(data_block);
+   if (!scene->data.head)
+      goto fail;
+
+   make_empty_list(&scene->resources);
+
+   pipe_mutex_init(scene->mutex);
+
+   return scene;
+
+fail:
+   /* Release the command blocks allocated so far.  Bins that were never
+    * reached still hold the NULL they got from CALLOC_STRUCT and are
+    * skipped.  The data block is always NULL here and the mutex has not
+    * been initialised yet, so neither needs any cleanup.
+    */
+   for (i = 0; i < TILES_X; i++) {
+      for (j = 0; j < TILES_Y; j++) {
+         struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
+         if (bin->commands.head)
+            FREE(bin->commands.head);
+      }
+   }
+   FREE(scene);
+   return NULL;
+}
+
+
+/**
+ * Free all data associated with the given scene, and the scene itself.
+ * The scene is reset first, after which every bin is expected to own
+ * exactly one command block.
+ */
+void
+lp_scene_destroy(struct lp_scene *scene)
+{
+   unsigned tx, ty;
+
+   lp_scene_reset(scene);
+
+   /* release the single remaining command block of every bin */
+   for (tx = 0; tx < TILES_X; tx++) {
+      for (ty = 0; ty < TILES_Y; ty++) {
+         struct cmd_block_list *cmds = &lp_scene_get_bin(scene, tx, ty)->commands;
+
+         assert(cmds->head == cmds->tail);
+         FREE(cmds->head);
+         cmds->head = cmds->tail = NULL;
+      }
+   }
+
+   /* ... and the single remaining data block */
+   FREE(scene->data.head);
+   scene->data.head = NULL;
+
+   pipe_mutex_destroy(scene->mutex);
+
+   FREE(scene);
+}
+
+
+/**
+ * Check if the scene's bins are all empty, i.e. every one of the
+ * TILES_X x TILES_Y bins holds a single command block with no commands.
+ * For debugging purposes.
+ */
+boolean
+lp_scene_is_empty(struct lp_scene *scene )
+{
+   unsigned tx, ty;
+
+   for (ty = 0; ty < TILES_Y; ty++) {
+      for (tx = 0; tx < TILES_X; tx++) {
+         const struct cmd_block_list *cmds =
+            &lp_scene_get_bin(scene, tx, ty)->commands;
+
+         /* an empty bin has one block and that block has no entries */
+         if (cmds->head == cmds->tail && cmds->head->count == 0)
+            continue;
+
+         return FALSE;
+      }
+   }
+   return TRUE;
+}
+
+
+/* Free data for one particular bin: every command block except the
+ * last one is released and the last one is marked empty.  May be called
+ * from the rasterizer thread(s).
+ */
+void
+lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y)
+{
+   struct cmd_block_list *cmds = &lp_scene_get_bin(scene, x, y)->commands;
+   struct cmd_block *cur = cmds->head;
+
+   assert(x < TILES_X);
+   assert(y < TILES_Y);
+
+   while (cur != cmds->tail) {
+      struct cmd_block *following = cur->next;
+      FREE(cur);
+      cur = following;
+   }
+
+   /* the surviving tail block becomes the whole list */
+   assert(cmds->tail->next == NULL);
+   cmds->head = cmds->tail;
+   cmds->head->count = 0;
+}
+
+
+/**
+ * Free all the temporary data in a scene: the command blocks of the
+ * active bins, the shared data blocks (one block of each list is kept
+ * and emptied) and every resource reference.  May be called from the
+ * rasterizer thread(s).
+ */
+void
+lp_scene_reset(struct lp_scene *scene )
+{
+   struct data_block_list *data = &scene->data;
+   struct resource_ref *refs = &scene->resources;
+   struct data_block *blk;
+   struct resource_ref *ref;
+   unsigned tx, ty;
+
+   /* Free all but last binner command lists:
+    */
+   for (tx = 0; tx < scene->tiles_x; tx++)
+      for (ty = 0; ty < scene->tiles_y; ty++)
+         lp_scene_bin_reset(scene, tx, ty);
+
+   assert(lp_scene_is_empty(scene));
+
+   /* Free all but last binned data block:
+    */
+   blk = data->head;
+   while (blk != data->tail) {
+      struct data_block *following = blk->next;
+      FREE(blk);
+      blk = following;
+   }
+
+   assert(data->tail->next == NULL);
+   data->head = data->tail;
+   data->head->used = 0;
+
+   /* Release resource refs; the list head itself lives inside the scene
+    * and is not freed.
+    */
+   ref = refs->next;
+   while (ref != refs) {
+      struct resource_ref *following = next_elem(ref);
+      pipe_resource_reference(&ref->resource, NULL);
+      FREE(ref);
+      ref = following;
+   }
+   make_empty_list(refs);
+
+   scene->scene_size = 0;
+
+   scene->has_color_clear = FALSE;
+   scene->has_depthstencil_clear = FALSE;
+}
+
+
+
+
+
+
+/**
+ * Allocate an empty command block and append it to the given list.
+ * \return the new tail block, or NULL if the allocation failed (the list
+ *         is left untouched in that case)
+ */
+struct cmd_block *
+lp_bin_new_cmd_block( struct cmd_block_list *list )
+{
+   struct cmd_block *fresh = MALLOC_STRUCT(cmd_block);
+
+   if (!fresh)
+      return NULL;
+
+   /* MALLOC_STRUCT memory is not cleared: set up the bookkeeping fields */
+   fresh->count = 0;
+   fresh->next = NULL;
+
+   list->tail->next = fresh;
+   list->tail = fresh;
+
+   return fresh;
+}
+
+
+/**
+ * Allocate an empty data block and append it to the given list.
+ * \return the new tail block, or NULL if the allocation failed (the list
+ *         is left untouched in that case)
+ */
+struct data_block *
+lp_bin_new_data_block( struct data_block_list *list )
+{
+   struct data_block *fresh = MALLOC_STRUCT(data_block);
+
+   if (!fresh)
+      return NULL;
+
+   /* MALLOC_STRUCT memory is not cleared: set up the bookkeeping fields */
+   fresh->used = 0;
+   fresh->next = NULL;
+
+   list->tail->next = fresh;
+   list->tail = fresh;
+
+   return fresh;
+}
+
+
+/**
+ * Return number of bytes used for all bin data within a scene, i.e. the
+ * sum of the 'used' counters of all blocks in the shared data list.
+ * This does not include resources (textures) referenced by the scene.
+ */
+unsigned
+lp_scene_data_size( const struct lp_scene *scene )
+{
+   const struct data_block *blk = scene->data.head;
+   unsigned total = 0;
+
+   while (blk) {
+      total += blk->used;
+      blk = blk->next;
+   }
+   return total;
+}
+
+
+/**
+ * Return number of bytes used for a single bin: each queued command is
+ * counted as one lp_rast_cmd pointer plus one lp_rast_cmd_arg.
+ */
+unsigned
+lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y )
+{
+   /* lp_scene_get_bin() takes a non-const scene, hence the cast */
+   struct cmd_bin *bin = lp_scene_get_bin((struct lp_scene *) scene, x, y);
+   const struct cmd_block *blk = bin->commands.head;
+   unsigned total = 0;
+
+   while (blk) {
+      total += (blk->count *
+                (sizeof(lp_rast_cmd) + sizeof(union lp_rast_cmd_arg)));
+      blk = blk->next;
+   }
+   return total;
+}
+
+
+/**
+ * Add a reference to a resource by the scene.
+ * A list node holding a reference to the resource is appended to the
+ * scene's resource list; if the node cannot be allocated no reference is
+ * taken.  The resource's size is added to the scene size either way.
+ */
+void
+lp_scene_add_resource_reference(struct lp_scene *scene,
+                                struct pipe_resource *resource)
+{
+   struct resource_ref *head = &scene->resources;
+   struct resource_ref *node = CALLOC_STRUCT(resource_ref);
+
+   if (node != NULL) {
+      pipe_resource_reference(&node->resource, resource);
+      insert_at_tail(head, node);
+   }
+
+   scene->scene_size += llvmpipe_resource_size(resource);
+}
+
+
+/**
+ * Does this scene have a reference to the given resource?
+ * Walks the scene's circular resource list from the first node back
+ * around to the list head, the same traversal lp_scene_reset() uses.
+ */
+boolean
+lp_scene_is_resource_referenced(const struct lp_scene *scene,
+                                const struct pipe_resource *resource)
+{
+   const struct resource_ref *head = &scene->resources;
+   const struct resource_ref *node;
+
+   for (node = head->next; node != head; node = node->next) {
+      if (node->resource == resource)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
+/**
+ * Return last command in the bin's tail block, or NULL when that block
+ * holds no commands.
+ */
+static lp_rast_cmd
+lp_get_last_command( const struct cmd_bin *bin )
+{
+   const struct cmd_block *last = bin->commands.tail;
+   const unsigned n = last->count;
+
+   return (n > 0) ? last->cmd[n - 1] : NULL;
+}
+
+
+/**
+ * Replace the arg of the last command in the bin.
+ * The bin's tail block must contain at least one command.
+ */
+static void
+lp_replace_last_command_arg( struct cmd_bin *bin,
+                             const union lp_rast_cmd_arg arg )
+{
+   struct cmd_block *last = bin->commands.tail;
+
+   assert(last->count > 0);
+   last->arg[last->count - 1] = arg;
+}
+
+
+
+/**
+ * Put a state-change command into all active bins.
+ * If we find that the last command in a bin was also a state-change
+ * command (i.e. the same command function), we can simply replace that
+ * one's argument with the new one.
+ */
+void
+lp_scene_bin_state_command( struct lp_scene *scene,
+                            lp_rast_cmd cmd,
+                            const union lp_rast_cmd_arg arg )
+{
+   unsigned tx, ty;
+
+   for (tx = 0; tx < scene->tiles_x; tx++) {
+      for (ty = 0; ty < scene->tiles_y; ty++) {
+         struct cmd_bin *bin = lp_scene_get_bin(scene, tx, ty);
+
+         if (lp_get_last_command(bin) != cmd) {
+            /* different (or no) previous command: append a new one */
+            lp_scene_bin_command( scene, tx, ty, cmd, arg );
+            continue;
+         }
+
+         /* same command queued last: just overwrite its argument */
+         lp_replace_last_command_arg(bin, arg);
+      }
+   }
+}
+
+
+/**
+ * Advance curr_x,y to the next bin, moving to the start of the next row
+ * once curr_x runs past the last active column.
+ * \return FALSE when there are no more bins, TRUE otherwise
+ */
+static boolean
+next_bin(struct lp_scene *scene)
+{
+   if (++scene->curr_x >= scene->tiles_x) {
+      /* wrapped: continue at the start of the following row */
+      scene->curr_x = 0;
+      scene->curr_y++;
+   }
+
+   return (scene->curr_y >= scene->tiles_y) ? FALSE : TRUE;
+}
+
+
+/**
+ * Prepare the scene for a new pass over its bins.  The negative cursor
+ * is what lp_scene_bin_iter_next() recognises as "iteration not started".
+ */
+void
+lp_scene_bin_iter_begin( struct lp_scene *scene )
+{
+   scene->curr_y = -1;
+   scene->curr_x = -1;
+}
+
+
+/**
+ * Return pointer to next bin to be rendered.
+ * The lp_scene::curr_x and ::curr_y fields will be advanced.
+ * Multiple rendering threads will call this function to get a chunk
+ * of work (a bin) to work on; the cursor is updated under scene->mutex.
+ * \param bin_x, bin_y  receive the bin's position; only written when a
+ *                      bin is returned
+ * \return the next bin, or NULL when no bins are left
+ */
+struct cmd_bin *
+lp_scene_bin_iter_next( struct lp_scene *scene, int *bin_x, int *bin_y )
+{
+   struct cmd_bin *bin = NULL;
+   boolean have_bin;
+
+   pipe_mutex_lock(scene->mutex);
+
+   if (scene->curr_x < 0) {
+      /* first bin */
+      scene->curr_x = 0;
+      scene->curr_y = 0;
+      have_bin = TRUE;
+   }
+   else {
+      /* FALSE once we have run out of bins */
+      have_bin = next_bin(scene);
+   }
+
+   if (have_bin) {
+      bin = lp_scene_get_bin(scene, scene->curr_x, scene->curr_y);
+      *bin_x = scene->curr_x;
+      *bin_y = scene->curr_y;
+   }
+
+   /*printf("return bin %p at %d, %d\n", (void *) bin, *bin_x, *bin_y);*/
+   pipe_mutex_unlock(scene->mutex);
+   return bin;
+}
+
+
+/**
+ * Start binning into an (empty) scene: take a copy of the framebuffer
+ * state and derive the number of active tiles from its size, rounding
+ * partial tiles up.
+ */
+void lp_scene_begin_binning( struct lp_scene *scene,
+                             struct pipe_framebuffer_state *fb )
+{
+   assert(lp_scene_is_empty(scene));
+
+   util_copy_framebuffer_state(&scene->fb, fb);
+
+   /* tiles needed to cover the framebuffer in each direction */
+   scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
+   scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
+
+   assert(scene->tiles_x <= TILES_X);
+   assert(scene->tiles_y <= TILES_Y);
+}
+
+
+/**
+ * Render a binned scene and hand it back for reuse: the scene is queued
+ * on the rasterizer, the rasterizer is waited for, the scene's
+ * framebuffer references are dropped and the scene is put on its
+ * empty queue.
+ */
+void lp_scene_rasterize( struct lp_scene *scene,
+                         struct lp_rasterizer *rast )
+{
+   if (0) {
+      /* debugging aid, disabled: dump the size of the scene's bins */
+      unsigned tx, ty;
+
+      debug_printf("rasterize scene:\n");
+      debug_printf("  data size: %u\n", lp_scene_data_size(scene));
+      for (ty = 0; ty < scene->tiles_y; ty++) {
+         for (tx = 0; tx < scene->tiles_x; tx++) {
+            const unsigned bytes = lp_scene_bin_size(scene, tx, ty);
+            debug_printf("  bin %u, %u size: %u\n", tx, ty, bytes);
+         }
+      }
+   }
+
+   /* Enqueue the scene for rasterization, then immediately wait for
+    * it to finish.
+    */
+   lp_rast_queue_scene( rast, scene );
+
+   /* Currently just wait for the rasterizer to finish.  Some
+    * threading interactions need to be worked out, particularly once
+    * transfers become per-context:
+    */
+   lp_rast_finish( rast );
+
+   util_unreference_framebuffer_state( &scene->fb );
+
+   /* put scene into the empty list */
+   lp_scene_enqueue( scene->empty_queue, scene );
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
new file mode 100644
index 0000000000..4e55d43174
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -0,0 +1,339 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Binner data structures and bin-related functions.
+ * Note: the "setup" code is concerned with building scenes while
+ * The "rast" code is concerned with consuming/executing scenes.
+ */
+
+#ifndef LP_SCENE_H
+#define LP_SCENE_H
+
+#include "os/os_thread.h"
+#include "lp_tile_soa.h"
+#include "lp_rast.h"
+
+struct lp_scene_queue;
+
+/* We're limited to 2K by 2K for 32bit fixed point rasterization.
+ * Will need a 64-bit version for larger framebuffers.
+ * TILES_X and TILES_Y are the dimensions of the lp_scene::tile array.
+ */
+#define TILES_X (LP_MAX_WIDTH / TILE_SIZE)
+#define TILES_Y (LP_MAX_HEIGHT / TILE_SIZE)
+
+
+/** Number of command/argument slots in one struct cmd_block. */
+#define CMD_BLOCK_MAX 128
+/** Payload bytes in one struct data_block: 16 KiB less the size of an
+ * unsigned and of a pointer (the block's 'used' and 'next' fields).
+ */
+#define DATA_BLOCK_SIZE (16 * 1024 - sizeof(unsigned) - sizeof(void *))
+   
+
+
+/* A binned rasterizer command: a function that is given the rasterizer
+ * task and the argument stored alongside the command.
+ * TODO: switch to a non-pointer value for this:
+ */
+typedef void (*lp_rast_cmd)( struct lp_rasterizer_task *,
+                             const union lp_rast_cmd_arg );
+
+/**
+ * One fixed-size chunk of a bin's command list.  cmd[i] and arg[i]
+ * together form the i-th command of the block.
+ */
+struct cmd_block {
+   lp_rast_cmd cmd[CMD_BLOCK_MAX];
+   union lp_rast_cmd_arg arg[CMD_BLOCK_MAX];
+   unsigned count;            /**< number of slots in use */
+   struct cmd_block *next;    /**< next block in the list, NULL for the tail */
+};
+
+/**
+ * One fixed-size chunk of a scene's shared data storage.
+ */
+struct data_block {
+   ubyte data[DATA_BLOCK_SIZE];
+   unsigned used;             /**< bytes of data[] handed out so far */
+   struct data_block *next;   /**< next block in the list, NULL for the tail */
+};
+
+/** Singly linked list of command blocks; new blocks are added at the tail. */
+struct cmd_block_list {
+   struct cmd_block *head;
+   struct cmd_block *tail;
+};
+
+/**
+ * For each screen tile we have one of these bins.
+ */
+struct cmd_bin {
+   struct cmd_block_list commands;
+};
+   
+
+/**
+ * This stores bulk data which is shared by all bins within a scene.
+ * Examples include triangle data and state data.  The commands in
+ * the per-tile bins will point to chunks of data in this structure.
+ */
+struct data_block_list {
+   struct data_block *head;
+   struct data_block *tail;
+};
+
+
+/** List of resource references */
+struct resource_ref {
+   struct pipe_resource *resource;
+   struct resource_ref *prev, *next;  /**< linked list w/ u_simple_list.h */
+};
+
+
+/**
+ * All bins and bin data are contained here.
+ * Per-bin data goes into the 'tile' bins.
+ * Shared data goes into the 'data' buffer.
+ *
+ * When there are multiple threads, will want to double-buffer between
+ * scenes:
+ */
+struct lp_scene {
+   struct pipe_context *pipe;
+
+   /** the framebuffer to render the scene into */
+   struct pipe_framebuffer_state fb;
+
+   /** list of resources referenced by the scene commands
+    * (this member is the list head, see u_simple_list.h)
+    */
+   struct resource_ref resources;
+
+   /** Approx memory used by the scene (in bytes).  This includes the
+    * shared and per-tile bins plus any referenced resources/textures.
+    */
+   unsigned scene_size;
+
+   boolean has_color_clear;
+   boolean has_depthstencil_clear;
+
+   /**
+    * Number of active tiles in each dimension.
+    * This is basically the framebuffer size divided by tile size,
+    * rounded up (see lp_scene_begin_binning()).
+    */
+   unsigned tiles_x, tiles_y;
+
+   /** for iterating over bins; curr_x < 0 means iteration has not
+    * started yet.  Updated while holding 'mutex'.
+    */
+   int curr_x, curr_y;
+   pipe_mutex mutex;
+
+   /* Where to place this scene once it has been rasterized:
+    */
+   struct lp_scene_queue *empty_queue;
+
+   /** per-tile bins, indexed tile[x][y] (see lp_scene_get_bin()) */
+   struct cmd_bin tile[TILES_X][TILES_Y];
+   /** bulk data shared by all bins */
+   struct data_block_list data;
+};
+
+
+
+/** Create a scene; 'empty_queue' is where the scene is put after rendering. */
+struct lp_scene *lp_scene_create(struct pipe_context *pipe,
+                                 struct lp_scene_queue *empty_queue);
+
+/** Reset the scene, then free it together with its remaining blocks. */
+void lp_scene_destroy(struct lp_scene *scene);
+
+
+
+/** Debugging helper: TRUE if no bin holds any command. */
+boolean lp_scene_is_empty(struct lp_scene *scene );
+
+/** Free the scene's temporary data (commands, bin data, resource refs). */
+void lp_scene_reset(struct lp_scene *scene );
+
+
+/** Append an empty data block to 'list'; returns NULL if out of memory. */
+struct data_block *lp_bin_new_data_block( struct data_block_list *list );
+
+/** Append an empty command block to 'list'; returns NULL if out of memory. */
+struct cmd_block *lp_bin_new_cmd_block( struct cmd_block_list *list );
+
+/** Bytes in use in the scene's shared data blocks. */
+unsigned lp_scene_data_size( const struct lp_scene *scene );
+
+/** Bytes taken by the commands queued in bin (x, y). */
+unsigned lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y );
+
+/** Make the scene hold a reference to 'resource'. */
+void lp_scene_add_resource_reference(struct lp_scene *scene,
+                                     struct pipe_resource *resource);
+
+/** TRUE if 'resource' is on the scene's reference list. */
+boolean lp_scene_is_resource_referenced(const struct lp_scene *scene,
+                                        const struct pipe_resource *resource );
+
+
+/**
+ * Allocate space for a command/data in the bin's data buffer.
+ * Grow the block list if needed.
+ *
+ * \param size  number of bytes wanted
+ * \return pointer to the space, or NULL if a new block could not be
+ *         allocated or if 'size' exceeds DATA_BLOCK_SIZE (such a request
+ *         cannot be satisfied from a single block)
+ */
+static INLINE void *
+lp_scene_alloc( struct lp_scene *scene, unsigned size)
+{
+   struct data_block_list *list = &scene->data;
+   struct data_block *tail = list->tail;
+
+   if (size > DATA_BLOCK_SIZE) {
+      /* Would run past the end of even a brand new block; report it the
+       * same way as an allocation failure instead of overflowing.
+       */
+      return NULL;
+   }
+
+   if (tail->used + size > DATA_BLOCK_SIZE) {
+      tail = lp_bin_new_data_block( list );
+      if (!tail) {
+         /* out of memory */
+         return NULL;
+      }
+   }
+
+   scene->scene_size += size;
+
+   {
+      ubyte *data = tail->data + tail->used;
+      tail->used += size;
+      return data;
+   }
+}
+
+
+/**
+ * As lp_scene_alloc(), but with specific alignment.
+ *
+ * \param size       number of bytes wanted
+ * \param alignment  required alignment of the returned address in bytes;
+ *                   the mask arithmetic below assumes a power of two
+ * \return aligned pointer to the space, or NULL if a new block could not
+ *         be allocated or if size plus worst-case padding (alignment - 1)
+ *         cannot fit into a single block.  An alignment of 0 is rejected
+ *         by the same check.
+ */
+static INLINE void *
+lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
+			unsigned alignment )
+{
+   struct data_block_list *list = &scene->data;
+   struct data_block *tail = list->tail;
+   /* Build the mask at pointer width.  Computing ~(alignment - 1) in
+    * 'unsigned' and then widening it would clear the upper address bits
+    * on platforms where pointers are wider than unsigned.
+    */
+   const uintptr_t align_mask = (uintptr_t)alignment - 1;
+
+   /* Refuse requests that could overflow even an empty block. */
+   if (size > DATA_BLOCK_SIZE || align_mask > DATA_BLOCK_SIZE - size)
+      return NULL;
+
+   if (tail->used + size + alignment - 1 > DATA_BLOCK_SIZE) {
+      tail = lp_bin_new_data_block( list );
+      if (!tail)
+         return NULL;
+   }
+
+   /* note: alignment padding is not added to scene_size */
+   scene->scene_size += size;
+
+   {
+      ubyte *data = tail->data + tail->used;
+      const uintptr_t addr = (uintptr_t)data;
+      /* bytes to skip so that data + offset is suitably aligned */
+      unsigned offset = (unsigned)(((addr + align_mask) & ~align_mask) - addr);
+      tail->used += offset + size;
+      return data + offset;
+   }
+}
+
+
+/* Put back data if we decide not to use it, eg. culled triangles.
+ * The bytes are taken off the end of the current tail data block, which
+ * must have at least 'size' bytes in use.
+ */
+static INLINE void
+lp_scene_putback_data( struct lp_scene *scene, unsigned size)
+{
+   struct data_block *tail = scene->data.tail;
+
+   assert(tail->used >= size);
+   tail->used -= size;
+   scene->scene_size -= size;
+}
+
+
+/**
+ * Return pointer to a particular tile's bin.
+ * No range checking is done here; x indexes the first dimension of
+ * lp_scene::tile (size TILES_X) and y the second (size TILES_Y).
+ */
+static INLINE struct cmd_bin *
+lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y)
+{
+   return &scene->tile[x][y];
+}
+
+
+/**
+ * Remove all commands from a bin.  All of the bin's command blocks but
+ * the last are freed; the last one is kept and marked empty.
+ */
+void
+lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y);
+
+
+/* Add a command to bin[x][y].
+ * The command and its argument go into the next free slot of the bin's
+ * tail block; a new block is appended first when the tail is full.
+ */
+static INLINE void
+lp_scene_bin_command( struct lp_scene *scene,
+                unsigned x, unsigned y,
+                lp_rast_cmd cmd,
+                union lp_rast_cmd_arg arg )
+{
+   struct cmd_block_list *cmds = &lp_scene_get_bin(scene, x, y)->commands;
+   struct cmd_block *blk = cmds->tail;
+   unsigned slot;
+
+   assert(x < scene->tiles_x);
+   assert(y < scene->tiles_y);
+
+   if (blk->count == CMD_BLOCK_MAX) {
+      blk = lp_bin_new_cmd_block( cmds );
+      if (!blk) {
+         /* out of memory - simply ignore this command (for now) */
+         return;
+      }
+      assert(blk->count == 0);
+   }
+
+   slot = blk->count++;
+   blk->cmd[slot] = cmd;
+   blk->arg[slot] = arg;
+}
+
+
+/* Add a command to all active bins, i.e. to every bin within the
+ * scene's tiles_x by tiles_y area.
+ */
+static INLINE void
+lp_scene_bin_everywhere( struct lp_scene *scene,
+                         lp_rast_cmd cmd,
+                         const union lp_rast_cmd_arg arg )
+{
+   unsigned tx, ty;
+
+   for (tx = 0; tx < scene->tiles_x; tx++) {
+      for (ty = 0; ty < scene->tiles_y; ty++) {
+         lp_scene_bin_command( scene, tx, ty, cmd, arg );
+      }
+   }
+}
+
+
+/**
+ * Put a state-change command into all active bins.  In a bin whose last
+ * command is already 'cmd', only that command's argument is replaced.
+ */
+void
+lp_scene_bin_state_command( struct lp_scene *scene,
+			    lp_rast_cmd cmd,
+			    const union lp_rast_cmd_arg arg );
+
+
+/** Return the number of active bins (tiles_x times tiles_y). */
+static INLINE unsigned
+lp_scene_get_num_bins( const struct lp_scene *scene )
+{
+   return scene->tiles_x * scene->tiles_y;
+}
+
+
+/** Rewind the scene's bin cursor before a new pass over the bins. */
+void
+lp_scene_bin_iter_begin( struct lp_scene *scene );
+
+/**
+ * Return the next bin to render and store its position in bin_x/bin_y,
+ * or return NULL when no bins are left.
+ */
+struct cmd_bin *
+lp_scene_bin_iter_next( struct lp_scene *scene, int *bin_x, int *bin_y );
+
+
+/**
+ * Queue the scene on the rasterizer, wait for it to finish and put the
+ * scene on its empty queue.
+ */
+void
+lp_scene_rasterize( struct lp_scene *scene,
+                    struct lp_rasterizer *rast );
+
+/**
+ * Copy the framebuffer state into the scene and compute the number of
+ * active tiles from the framebuffer size.
+ */
+void
+lp_scene_begin_binning( struct lp_scene *scene,
+                        struct pipe_framebuffer_state *fb );
+
+
+/** Return the scene's running size estimate (lp_scene::scene_size), in bytes. */
+static INLINE unsigned
+lp_scene_get_size(const struct lp_scene *scene)
+{
+   return scene->scene_size;
+}
+
+
+#endif /* LP_SCENE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_scene_queue.c b/src/gallium/drivers/llvmpipe/lp_scene_queue.c
new file mode 100644
index 0000000000..975db43c4e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene_queue.c
@@ -0,0 +1,124 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Scene queue.  We'll use two queues.  One contains "full" scenes which
+ * are produced by the "setup" code.  The other contains "empty" scenes
+ * which are produced by the "rast" code when it finishes rendering a scene.
+ */
+
+#include "util/u_ringbuffer.h"
+#include "util/u_memory.h"
+#include "lp_scene_queue.h"
+
+
+
+/** Number of scene packets the queue's ring buffer is sized for. */
+#define MAX_SCENE_QUEUE 4
+
+/**
+ * What actually travels through the ring buffer: the ring buffer's own
+ * packet header followed by the scene pointer.
+ */
+struct scene_packet {
+   struct util_packet header;
+   struct lp_scene *scene;
+};
+
+/**
+ * A queue of scenes
+ */
+struct lp_scene_queue
+{
+   struct util_ringbuffer *ring;   /**< carries struct scene_packet entries */
+};
+
+
+
+/**
+ * Allocate a new scene queue.  Its ring buffer is sized, in 4-byte
+ * units, to hold MAX_SCENE_QUEUE scene packets.
+ * \return the queue, or NULL if either allocation fails
+ */
+struct lp_scene_queue *
+lp_scene_queue_create(void)
+{
+   const unsigned ring_dwords =
+      MAX_SCENE_QUEUE * sizeof( struct scene_packet ) / 4;
+   struct lp_scene_queue *queue = CALLOC_STRUCT(lp_scene_queue);
+
+   if (queue == NULL)
+      return NULL;
+
+   queue->ring = util_ringbuffer_create( ring_dwords );
+   if (queue->ring != NULL)
+      return queue;
+
+   /* no ring buffer: give the queue object back */
+   FREE(queue);
+   return NULL;
+}
+
+
+/**
+ * Delete a scene queue: destroys its ring buffer, then frees the queue.
+ * 'queue' is dereferenced unconditionally, so it must not be NULL.
+ */
+void
+lp_scene_queue_destroy(struct lp_scene_queue *queue)
+{
+   util_ringbuffer_destroy(queue->ring);
+   FREE(queue);
+}
+
+
+/**
+ * Remove first lp_scene from head of queue.
+ * \param wait  passed through to util_ringbuffer_dequeue()
+ * \return the scene, or NULL if the ring buffer did not return PIPE_OK
+ */
+struct lp_scene *
+lp_scene_dequeue(struct lp_scene_queue *queue, boolean wait)
+{
+   struct scene_packet packet;
+
+   packet.scene = NULL;
+
+   /* the packet size is given in 4-byte units */
+   if (util_ringbuffer_dequeue(queue->ring,
+                               &packet.header,
+                               sizeof packet / 4,
+                               wait ) != PIPE_OK)
+      return NULL;
+
+   return packet.scene;
+}
+
+
+/**
+ * Add an lp_scene to tail of queue.  The scene pointer is wrapped in a
+ * scene_packet whose header records the packet size in 4-byte units.
+ */
+void
+lp_scene_enqueue(struct lp_scene_queue *queue, struct lp_scene *scene)
+{
+   struct scene_packet pkt;
+
+   pkt.scene = scene;
+   pkt.header.data24 = 0;
+   pkt.header.dwords = sizeof pkt / 4;
+
+   util_ringbuffer_enqueue(queue->ring, &pkt.header);
+}
+
+
+
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_scene_queue.h b/src/gallium/drivers/llvmpipe/lp_scene_queue.h
new file mode 100644
index 0000000000..fd7c65a2c8
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_scene_queue.h
@@ -0,0 +1,51 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_SCENE_QUEUE
+#define LP_SCENE_QUEUE
+
+struct lp_scene_queue;
+struct lp_scene;
+
+
+/** Allocate a new scene queue; returns NULL on allocation failure. */
+struct lp_scene_queue *
+lp_scene_queue_create(void);
+
+/** Destroy the queue's ring buffer and free the queue itself. */
+void
+lp_scene_queue_destroy(struct lp_scene_queue *queue);
+
+/**
+ * Remove the scene at the head of the queue.
+ * Returns NULL if the underlying ring-buffer dequeue does not succeed.
+ */
+struct lp_scene *
+lp_scene_dequeue(struct lp_scene_queue *queue, boolean wait);
+
+/** Append a scene to the tail of the queue. */
+void
+lp_scene_enqueue(struct lp_scene_queue *queue, struct lp_scene *scene);
+
+
+
+
+#endif /* LP_SCENE_QUEUE */
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
new file mode 100644
index 0000000000..6432cea862
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -0,0 +1,367 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+
+#include "gallivm/lp_bld_limits.h"
+#include "lp_texture.h"
+#include "lp_fence.h"
+#include "lp_jit.h"
+#include "lp_screen.h"
+#include "lp_context.h"
+#include "lp_debug.h"
+#include "lp_public.h"
+#include "lp_limits.h"
+
+#include "state_tracker/sw_winsys.h"
+
+#ifdef DEBUG
+/* Bitmask of DEBUG_x flags; filled in from the LP_DEBUG option by
+ * llvmpipe_create_screen().
+ */
+int LP_DEBUG = 0;
+
+/* Names accepted in the LP_DEBUG option and the flag each one maps to. */
+static const struct debug_named_value lp_debug_flags[] = {
+   { "pipe",   DEBUG_PIPE, NULL },
+   { "tgsi",   DEBUG_TGSI, NULL },
+   { "tex",    DEBUG_TEX, NULL },
+   { "setup",  DEBUG_SETUP, NULL },
+   { "rast",   DEBUG_RAST, NULL },
+   { "query",  DEBUG_QUERY, NULL },
+   { "screen", DEBUG_SCREEN, NULL },
+   { "show_tiles",    DEBUG_SHOW_TILES, NULL },
+   { "show_subtiles", DEBUG_SHOW_SUBTILES, NULL },
+   { "counters", DEBUG_COUNTERS, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+#endif
+
+
+/** pipe_screen::get_vendor hook: constant vendor string. */
+static const char *
+llvmpipe_get_vendor(struct pipe_screen *screen)
+{
+   static const char vendor[] = "VMware, Inc.";
+
+   (void) screen;   /* the answer does not depend on the screen */
+   return vendor;
+}
+
+
+/** pipe_screen::get_name hook: constant driver name. */
+static const char *
+llvmpipe_get_name(struct pipe_screen *screen)
+{
+   static const char name[] = "llvmpipe";
+
+   (void) screen;   /* the answer does not depend on the screen */
+   return name;
+}
+
+
+/**
+ * pipe_screen::get_param hook: report the value of an integer capability.
+ *
+ * Feature caps answer 1 (supported) or 0 (not supported); limit caps
+ * answer the limit itself.  A cap that is not listed here trips the
+ * assert and then reports 0.
+ *
+ * NOTE(review): PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS reports 0 while
+ * PIPE_CAP_MAX_COMBINED_SAMPLERS still adds PIPE_MAX_VERTEX_SAMPLERS --
+ * confirm that this is intended.
+ */
+static int
+llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return PIPE_MAX_SAMPLERS;
+   case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+      return 0;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_GLSL:
+      return 1;
+   case PIPE_CAP_SM3:
+      return 1;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 0;
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return PIPE_MAX_COLOR_BUFS;
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 1;
+   case PIPE_CAP_TIMER_QUERY:
+      return 0;
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+      return 1;
+   case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+      return 1;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 1;
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return LP_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return LP_MAX_TEXTURE_3D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      /* cube maps share the 2D level limit */
+      return LP_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_TGSI_CONT_SUPPORTED:
+      return 1;
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+      return 1;
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+      return 1;
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+      return 0;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 0;
+   case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+   case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS:
+      /* There is no limit in number of instructions beyond available memory */
+      return 32768;
+   case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+   case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH:
+      return LP_MAX_TGSI_NESTING;
+   case PIPE_CAP_MAX_VS_INPUTS:
+   case PIPE_CAP_MAX_FS_INPUTS:
+      return PIPE_MAX_ATTRIBS;
+   case PIPE_CAP_MAX_FS_CONSTS:
+   case PIPE_CAP_MAX_VS_CONSTS:
+      /* There is no limit in number of constants beyond available memory */
+      return 32768;
+   case PIPE_CAP_MAX_VS_TEMPS:
+   case PIPE_CAP_MAX_FS_TEMPS:
+      return LP_MAX_TGSI_TEMPS;
+   case PIPE_CAP_MAX_VS_ADDRS:
+   case PIPE_CAP_MAX_FS_ADDRS:
+      return LP_MAX_TGSI_ADDRS;
+   case PIPE_CAP_MAX_VS_PREDS:
+   case PIPE_CAP_MAX_FS_PREDS:
+      return LP_MAX_TGSI_PREDS;
+   case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+      return 1;
+   default:
+      /* unknown cap: flag it in debug builds, report "unsupported" */
+      assert(0);
+      return 0;
+   }
+}
+
+
+/**
+ * pipe_screen::get_paramf hook: report the value of a floating-point
+ * capability.  A cap that is not listed here trips the assert and then
+ * reports 0.
+ */
+static float
+llvmpipe_get_paramf(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      return 255.0; /* arbitrary */
+   case PIPE_CAP_MAX_POINT_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      return 255.0; /* arbitrary */
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      return 16.0; /* not actually significant at this time */
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      return 16.0; /* arbitrary */
+   case PIPE_CAP_GUARD_BAND_LEFT:
+   case PIPE_CAP_GUARD_BAND_TOP:
+   case PIPE_CAP_GUARD_BAND_RIGHT:
+   case PIPE_CAP_GUARD_BAND_BOTTOM:
+      return 0.0;
+   default:
+      /* unknown cap: flag it in debug builds, report 0 */
+      assert(0);
+      return 0;
+   }
+}
+
+
+/**
+ * Query format support for creating a texture, drawing surface, etc.
+ * \param format        the format to test
+ * \param target        resource target; asserted to be a buffer or a
+ *                      1D/2D/3D/cube texture
+ * \param sample_count  any value above 1 is rejected
+ * \param bind          bitmask of PIPE_BIND_x usages, each of which the
+ *                      format must satisfy
+ * \param geom_flags    not examined by this function
+ * \return TRUE if the format passes every check for the requested usages
+ */
+static boolean
+llvmpipe_is_format_supported( struct pipe_screen *_screen,
+                              enum pipe_format format,
+                              enum pipe_texture_target target,
+                              unsigned sample_count,
+                              unsigned bind,
+                              unsigned geom_flags )
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+   struct sw_winsys *winsys = screen->winsys;
+   const struct util_format_description *format_desc;
+
+   format_desc = util_format_description(format);
+   if (!format_desc)
+      return FALSE;
+
+   assert(target == PIPE_BUFFER ||
+          target == PIPE_TEXTURE_1D ||
+          target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_3D ||
+          target == PIPE_TEXTURE_CUBE);
+
+   /* multisampled resources are rejected */
+   if (sample_count > 1)
+      return FALSE;
+
+   /* color render targets: plain, non-depth/stencil, 1x1-block formats */
+   if (bind & PIPE_BIND_RENDER_TARGET) {
+      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+
+      if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+         return FALSE;
+
+      if (format_desc->block.width != 1 ||
+          format_desc->block.height != 1)
+         return FALSE;
+   }
+
+   /* display targets: the window system has the final say */
+   if (bind & PIPE_BIND_DISPLAY_TARGET) {
+      if(!winsys->is_displaytarget_format_supported(winsys, bind, format))
+         return FALSE;
+   }
+
+   /* depth/stencil buffers: plain ZS formats, 32 bits per block only */
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
+      if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+         return FALSE;
+
+      if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+
+      /* FIXME: Temporary restriction. See lp_state_fs.c. */
+      if (format_desc->block.bits != 32)
+         return FALSE;
+   }
+
+   /* S3TC support follows the util_format_s3tc_enabled flag */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      return util_format_s3tc_enabled;
+   }
+
+   /*
+    * Everything else should be supported by u_format.
+    */
+   return TRUE;
+}
+
+
+
+
+/**
+ * pipe_screen::flush_frontbuffer hook: hand the surface's display target
+ * to the window system for presentation.  A surface without a display
+ * target is asserted against and otherwise ignored.
+ */
+static void
+llvmpipe_flush_frontbuffer(struct pipe_screen *_screen,
+                           struct pipe_surface *surface,
+                           void *context_private)
+{
+   struct sw_winsys *ws = llvmpipe_screen(_screen)->winsys;
+   struct llvmpipe_resource *lp_res = llvmpipe_resource(surface->texture);
+
+   assert(lp_res->dt);
+   if (!lp_res->dt)
+      return;
+
+   ws->displaytarget_display(ws, lp_res->dt, context_private);
+}
+
+
+/**
+ * pipe_screen::destroy hook: clean up the JIT state, destroy the winsys
+ * if it provides a destroy callback, then free the screen.
+ */
+static void
+llvmpipe_destroy_screen( struct pipe_screen *_screen )
+{
+   struct llvmpipe_screen *lp_screen = llvmpipe_screen(_screen);
+   struct sw_winsys *ws = lp_screen->winsys;
+
+   lp_jit_screen_cleanup(lp_screen);
+
+   /* the destroy callback is optional */
+   if (ws->destroy != NULL) {
+      ws->destroy(ws);
+   }
+
+   FREE(lp_screen);
+}
+
+
+
+/**
+ * Create a new pipe_screen object.
+ *
+ * The returned pipe_screen is the 'base' member of a freshly allocated
+ * struct llvmpipe_screen.
+ *
+ * \param winsys  stored in the screen; llvmpipe_destroy_screen() later
+ *                calls its destroy callback
+ * \return the new screen, or NULL if allocation fails or (x86 only) the
+ *         CPU lacks SSE2
+ */
+struct pipe_screen *
+llvmpipe_create_screen(struct sw_winsys *winsys)
+{
+   struct llvmpipe_screen *screen;
+
+#ifdef PIPE_ARCH_X86
+   /* require SSE2 due to LLVM PR6960. */
+   util_cpu_detect();
+   if (!util_cpu_caps.has_sse2)
+       return NULL;
+#endif
+
+   screen = CALLOC_STRUCT(llvmpipe_screen);
+
+#ifdef DEBUG
+   /* read even when the allocation failed; only touches the global */
+   LP_DEBUG = debug_get_flags_option("LP_DEBUG", lp_debug_flags, 0 );
+#endif
+
+   if (!screen)
+      return NULL;
+
+   screen->winsys = winsys;
+
+   screen->base.destroy = llvmpipe_destroy_screen;
+
+   screen->base.get_name = llvmpipe_get_name;
+   screen->base.get_vendor = llvmpipe_get_vendor;
+   screen->base.get_param = llvmpipe_get_param;
+   screen->base.get_paramf = llvmpipe_get_paramf;
+   screen->base.is_format_supported = llvmpipe_is_format_supported;
+
+   screen->base.context_create = llvmpipe_create_context;
+   screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer;
+
+   llvmpipe_init_screen_resource_funcs(&screen->base);
+   llvmpipe_init_screen_fence_funcs(&screen->base);
+
+   /* NOTE(review): no result is checked here -- confirm that
+    * lp_jit_screen_init() cannot fail.
+    */
+   lp_jit_screen_init(screen);
+
+#ifdef PIPE_OS_WINDOWS
+   /* Multithreading not supported on windows until conditions and barriers are
+    * properly implemented. */
+   screen->num_threads = 0;
+#else
+#ifdef PIPE_OS_EMBEDDED
+   screen->num_threads = 0;
+#else
+   /* default to the detected CPU count */
+   screen->num_threads = util_cpu_caps.nr_cpus;
+#endif
+   /* LP_NUM_THREADS overrides the default; the result is capped at
+    * LP_MAX_THREADS.
+    */
+   screen->num_threads = debug_get_num_option("LP_NUM_THREADS", screen->num_threads);
+   screen->num_threads = MIN2(screen->num_threads, LP_MAX_THREADS);
+#endif
+
+   util_format_s3tc_init();
+
+   return &screen->base;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h
new file mode 100644
index 0000000000..eb40f6823f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_screen.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_SCREEN_H
+#define LP_SCREEN_H
+
+#include "gallivm/lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+
+
+struct sw_winsys;
+
+
+/**
+ * llvmpipe's pipe_screen subclass.  'base' must stay the first member:
+ * llvmpipe_screen() converts a pipe_screen pointer with a plain cast.
+ */
+struct llvmpipe_screen
+{
+   struct pipe_screen base;
+
+   /* window-system interface handed to llvmpipe_create_screen() */
+   struct sw_winsys *winsys;
+
+   /* LLVM handles -- presumably set up by lp_jit_screen_init() and released
+    * by lp_jit_screen_cleanup(); confirm in lp_jit.c.
+    */
+   LLVMModuleRef module;
+   LLVMExecutionEngineRef engine;
+   LLVMModuleProviderRef provider;
+   LLVMTargetDataRef target;
+   LLVMPassManagerRef pass;
+
+   LLVMTypeRef context_ptr_type;
+
+   /* chosen in llvmpipe_create_screen(); lp_setup_fence() treats 0 as
+    * "no fence needed"
+    */
+   unsigned num_threads;
+
+   /* Increments whenever textures are modified.  Contexts can track this.
+    */
+   unsigned timestamp;
+};
+
+
+
+
+/**
+ * Downcast a pipe_screen to the llvmpipe_screen that contains it.
+ * This is a plain pointer cast with no checking, so 'pipe' must really be
+ * the 'base' member of a struct llvmpipe_screen.
+ */
+static INLINE struct llvmpipe_screen *
+llvmpipe_screen( struct pipe_screen *pipe )
+{
+   return (struct llvmpipe_screen *)pipe;
+}
+
+
+
+#endif /* LP_SCREEN_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
new file mode 100644
index 0000000000..e8aafee33f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -0,0 +1,953 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Tiling engine.
+ *
+ * Builds per-tile display lists and executes them on calls to
+ * lp_setup_flush().
+ */
+
+#include <limits.h>
+
+#include "pipe/p_defines.h"
+#include "util/u_framebuffer.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "lp_context.h"
+#include "lp_scene.h"
+#include "lp_scene_queue.h"
+#include "lp_texture.h"
+#include "lp_debug.h"
+#include "lp_fence.h"
+#include "lp_query.h"
+#include "lp_rast.h"
+#include "lp_setup_context.h"
+#include "lp_screen.h"
+#include "lp_state.h"
+#include "state_tracker/sw_winsys.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+
+
+static void set_scene_state( struct lp_setup_context *, enum setup_state );
+
+
+/**
+ * Return the scene currently being built.
+ *
+ * If there is none, an empty scene is taken from setup->empty_scenes
+ * (waiting for one to become available) and binning is started on it
+ * with the current framebuffer state.
+ */
+struct lp_scene *
+lp_setup_get_current_scene(struct lp_setup_context *setup)
+{
+   if (setup->scene)
+      return setup->scene;
+
+   /* no scene yet: wait for a free/empty one */
+   setup->scene = lp_scene_dequeue(setup->empty_scenes, TRUE);
+
+   assert(lp_scene_is_empty(setup->scene));
+
+   lp_scene_begin_binning(setup->scene, &setup->fb);
+
+   return setup->scene;
+}
+
+
+/**
+ * Flush/render the current scene if its size has grown beyond
+ * LP_MAX_SCENE_SIZE.  Does nothing when there is no current scene.
+ */
+static void
+setup_check_scene_size_and_flush(struct lp_setup_context *setup)
+{
+   struct lp_scene *scene;
+
+   if (!setup->scene)
+      return;
+
+   scene = lp_setup_get_current_scene(setup);
+
+   if (lp_scene_get_size(scene) > LP_MAX_SCENE_SIZE) {
+      /* too big: render what has been binned so far */
+      set_scene_state( setup, SETUP_FLUSHED );
+   }
+}
+
+
+/**
+ * Triangle entry point installed while no triangle function has been
+ * chosen yet.  Switches the setup to SETUP_ACTIVE, lets
+ * lp_setup_choose_triangle() pick the function to use, then forwards this
+ * triangle through setup->triangle.
+ */
+static void
+first_triangle( struct lp_setup_context *setup,
+                const float (*v0)[4],
+                const float (*v1)[4],
+                const float (*v2)[4])
+{
+   set_scene_state( setup, SETUP_ACTIVE );
+   lp_setup_choose_triangle( setup );
+   setup->triangle( setup, v0, v1, v2 );
+}
+
+/**
+ * Line entry point installed while no line function has been chosen yet.
+ * Switches the setup to SETUP_ACTIVE, lets lp_setup_choose_line() pick the
+ * function to use, then forwards this line through setup->line.
+ */
+static void
+first_line( struct lp_setup_context *setup,
+	    const float (*v0)[4],
+	    const float (*v1)[4])
+{
+   set_scene_state( setup, SETUP_ACTIVE );
+   lp_setup_choose_line( setup );
+   setup->line( setup, v0, v1 );
+}
+
+/**
+ * Point entry point installed while no point function has been chosen yet.
+ * Switches the setup to SETUP_ACTIVE, lets lp_setup_choose_point() pick
+ * the function to use, then forwards this point through setup->point.
+ */
+static void
+first_point( struct lp_setup_context *setup,
+	     const float (*v0)[4])
+{
+   set_scene_state( setup, SETUP_ACTIVE );
+   lp_setup_choose_point( setup );
+   setup->point( setup, v0 );
+}
+
+/**
+ * Return the setup context to its "no scene" condition: forget stored
+ * derived state, mark everything dirty, drop the current scene pointer,
+ * clear any pending clear request and reinstall the first_* primitive
+ * entry points.
+ */
+static void reset_context( struct lp_setup_context *setup )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   /* Reset derived state */
+   setup->constants.stored_size = 0;
+   setup->constants.stored_data = NULL;
+   setup->fs.stored = NULL;
+   setup->dirty = ~0;
+
+   /* no current bin */
+   setup->scene = NULL;
+
+   /* Reset some state:
+    */
+   setup->clear.flags = 0;
+   setup->clear.clearzs.clearzs_mask = 0;
+
+   /* Have an explicit "start-binning" call and get rid of this
+    * pointer twiddling?
+    */
+   setup->line = first_line;
+   setup->point = first_point;
+   setup->triangle = first_triangle;
+}
+
+
+/**
+ * Rasterize all of the current scene's bins.
+ *
+ * The scene is handed to lp_scene_rasterize() together with setup->rast,
+ * after which the setup context is reset so that the next use starts on a
+ * fresh scene.
+ */
+static void
+lp_setup_rasterize_scene( struct lp_setup_context *setup )
+{
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+
+   lp_scene_rasterize(scene, setup->rast);
+
+   reset_context( setup );
+
+   LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
+}
+
+
+
+/**
+ * Start binning into the current scene: queue any pending full-screen
+ * clears recorded in setup->clear into every bin.
+ *
+ * A depth/stencil clear that covers only one of the two components of a
+ * combined depth+stencil buffer is still binned, but the scene is not
+ * flagged as having a depth/stencil clear.
+ */
+static void
+begin_binning( struct lp_setup_context *setup )
+{
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   boolean need_zsload = FALSE;
+
+   /* Only part of a combined depth+stencil buffer is being cleared (or
+    * nothing at all), so the existing contents are still needed.
+    */
+   if (setup->fb.zsbuf &&
+       ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
+        util_format_is_depth_and_stencil(setup->fb.zsbuf->format))
+      need_zsload = TRUE;
+
+   /* Report "clear" for depth only when a depth/stencil clear is pending
+    * and the old contents are not needed; this matches the condition under
+    * which has_depthstencil_clear is set below.
+    */
+   LP_DBG(DEBUG_SETUP, "%s color: %s depth: %s\n", __FUNCTION__,
+          (setup->clear.flags & PIPE_CLEAR_COLOR) ? "clear": "load",
+          ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) && !need_zsload)
+             ? "clear": "load");
+
+   if (setup->fb.nr_cbufs) {
+      if (setup->clear.flags & PIPE_CLEAR_COLOR) {
+         lp_scene_bin_everywhere( scene,
+                                  lp_rast_clear_color,
+                                  setup->clear.color );
+         scene->has_color_clear = TRUE;
+      }
+   }
+
+   if (setup->fb.zsbuf) {
+      if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) {
+         if (!need_zsload)
+            scene->has_depthstencil_clear = TRUE;
+         lp_scene_bin_everywhere( scene,
+                                  lp_rast_clear_zstencil,
+                                  lp_rast_arg_clearzs(&setup->clear.clearzs) );
+      }
+   }
+
+   LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
+}
+
+
+/* This basically bins and then flushes any outstanding full-screen
+ * clears.
+ *
+ * It does so by starting binning (which queues the pending clears into
+ * the scene) and rasterizing the scene straight away.
+ *
+ * TODO: fast path for fullscreen clears and no triangles.
+ */
+static void
+execute_clears( struct lp_setup_context *setup )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   begin_binning( setup );
+   lp_setup_rasterize_scene( setup );
+}
+
+
+/**
+ * Move the setup context to a new state, performing the work the
+ * transition implies.
+ *
+ *  - to SETUP_ACTIVE:  start binning (queues pending clears)
+ *  - to SETUP_CLEARED: no work; entering it from SETUP_ACTIVE is asserted
+ *                      against and leaves the state unchanged
+ *  - to SETUP_FLUSHED: rasterize the scene, first binning the pending
+ *                      clears if coming from SETUP_CLEARED
+ *
+ * A request for the state already in effect is a no-op.
+ */
+static void
+set_scene_state( struct lp_setup_context *setup,
+                 enum setup_state new_state )
+{
+   unsigned old_state = setup->state;
+
+   if (old_state == new_state)
+      return;
+       
+   LP_DBG(DEBUG_SETUP, "%s old %d new %d\n", __FUNCTION__, old_state, new_state);
+
+   switch (new_state) {
+   case SETUP_ACTIVE:
+      begin_binning( setup );
+      break;
+
+   case SETUP_CLEARED:
+      if (old_state == SETUP_ACTIVE) {
+         /* callers must not request this transition */
+         assert(0);
+         return;
+      }
+      break;
+      
+   case SETUP_FLUSHED:
+      if (old_state == SETUP_CLEARED)
+         execute_clears( setup );
+      else
+         lp_setup_rasterize_scene( setup );
+      break;
+
+   default:
+      assert(0 && "invalid setup state mode");
+   }
+
+   setup->state = new_state;
+}
+
+
+/**
+ * Flush the setup context: render whatever has been binned or cleared.
+ *
+ * \param flags  bitmask of PIPE_FLUSH_x flags
+ */
+void
+lp_setup_flush( struct lp_setup_context *setup,
+                unsigned flags )
+{
+   const unsigned frame_end = PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_FRAME;
+
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   if (setup->scene) {
+      struct lp_scene *cur = lp_setup_get_current_scene(setup);
+      union lp_rast_cmd_arg no_arg = {0};
+
+      /* At the end of a frame, store colors in the linear color buffer(s)
+       * now.  Otherwise the tiled data gets converted to linear in the
+       * texture_unmap() function, which will not be a parallel/threaded
+       * operation as here.
+       */
+      if (flags & frame_end)
+         lp_scene_bin_everywhere(cur, lp_rast_store_color, no_arg);
+   }
+
+   set_scene_state( setup, SETUP_FLUSHED );
+}
+
+
+/**
+ * Bind a new framebuffer.
+ *
+ * Anything pending for the old framebuffer is flushed first; the new state
+ * is then copied into setup->fb.
+ */
+void
+lp_setup_bind_framebuffer( struct lp_setup_context *setup,
+                           const struct pipe_framebuffer_state *fb )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   /* Flush any old scene.
+    */
+   set_scene_state( setup, SETUP_FLUSHED );
+
+   /* Set new state.  This will be picked up later when we next need a
+    * scene.
+    */
+   util_copy_framebuffer_state(&setup->fb, fb);
+}
+
+
+/**
+ * Record or bin a clear of the color and/or depth-stencil buffers.
+ *
+ * \param color    four floats, converted to ubytes; read only when
+ *                 PIPE_CLEAR_COLOR is set
+ * \param depth    depth clear value
+ * \param stencil  stencil clear value
+ * \param flags    bitmask of PIPE_CLEAR_x flags
+ *
+ * If the setup is already in SETUP_ACTIVE the clear commands are binned
+ * into the current scene right away; otherwise the flags are accumulated
+ * in setup->clear.flags and the state moves to SETUP_CLEARED.
+ *
+ * NOTE(review): setup->fb.zsbuf is dereferenced in the depth-stencil paths
+ * below although only the partial-clear test checks it for NULL;
+ * presumably callers never pass depth/stencil bits without a bound
+ * zsbuf -- confirm.
+ */
+void
+lp_setup_clear( struct lp_setup_context *setup,
+                const float *color,
+                double depth,
+                unsigned stencil,
+                unsigned flags )
+{
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   unsigned i;
+   boolean full_zs_clear = TRUE;
+   uint32_t mask = 0;
+
+   LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state);
+
+
+   if (flags & PIPE_CLEAR_COLOR) {
+      for (i = 0; i < 4; ++i)
+         setup->clear.color.clear_color[i] = float_to_ubyte(color[i]);
+   }
+
+   if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
+      /* Only one of depth/stencil requested on a combined format: the
+       * other component has to survive, so clear through a mask.
+       */
+      if (setup->fb.zsbuf &&
+          ((flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
+           util_format_is_depth_and_stencil(setup->fb.zsbuf->format))
+         full_zs_clear = FALSE;
+
+      if (full_zs_clear) {
+         setup->clear.clearzs.clearzs_value =
+            util_pack_z_stencil(setup->fb.zsbuf->format,
+                                depth,
+                                stencil);
+         setup->clear.clearzs.clearzs_mask = 0xffffffff;
+      }
+      else {
+         /* hmm */
+         uint32_t tmpval;
+         if (flags & PIPE_CLEAR_DEPTH) {
+            /* depth only: mask selects the 24 depth bits */
+            tmpval = util_pack_z(setup->fb.zsbuf->format,
+                                 depth);
+            switch (setup->fb.zsbuf->format) {
+            case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+               mask = 0xffffff;
+               break;
+            case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+               mask = 0xffffff00;
+               break;
+            default:
+               assert(0);
+            }
+         }
+         else {
+            /* stencil only: mask selects the 8 stencil bits */
+            switch (setup->fb.zsbuf->format) {
+            case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+               mask = 0xff000000;
+               tmpval = stencil << 24;
+               break;
+            case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+               mask = 0xff;
+               tmpval = stencil;
+               break;
+            default:
+               assert(0);
+               tmpval = 0;
+            }
+         }
+         /* merge with any partial clear accumulated earlier */
+         setup->clear.clearzs.clearzs_mask |= mask;
+         setup->clear.clearzs.clearzs_value =
+            (setup->clear.clearzs.clearzs_value & ~mask) | (tmpval & mask);
+      }
+   }
+
+   if (setup->state == SETUP_ACTIVE) {
+      /* Add the clear to existing scene.  In the unusual case where
+       * both color and depth-stencil are being cleared when there's
+       * already been some rendering, we could discard the currently
+       * binned scene and start again, but I don't see that as being
+       * a common usage.
+       */
+      if (flags & PIPE_CLEAR_COLOR) {
+         lp_scene_bin_everywhere( scene, 
+                                  lp_rast_clear_color,
+                                  setup->clear.color );
+         scene->has_color_clear = TRUE;
+      }
+
+      if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
+         if (full_zs_clear)
+            scene->has_depthstencil_clear = TRUE;
+         else
+            /* binned immediately, so use only this call's mask */
+            setup->clear.clearzs.clearzs_mask = mask;
+         lp_scene_bin_everywhere( scene,
+                                  lp_rast_clear_zstencil,
+                                  lp_rast_arg_clearzs(&setup->clear.clearzs) );
+
+
+      }
+
+   }
+   else {
+      /* Put ourselves into the 'pre-clear' state, specifically to try
+       * and accumulate multiple clears to color and depth_stencil
+       * buffers which the app or state-tracker might issue
+       * separately.
+       */
+      set_scene_state( setup, SETUP_CLEARED );
+
+      setup->clear.flags |= flags;
+   }
+}
+
+
+/**
+ * Emit a fence.
+ *
+ * With no rasterizer threads (num_threads == 0) no fence is created and
+ * NULL is returned.  Otherwise a fence whose rank is the scene's bin count
+ * is created and a fence command is inserted into every bin.
+ */
+struct pipe_fence_handle *
+lp_setup_fence( struct lp_setup_context *setup )
+{
+   struct lp_scene *scene;
+   struct lp_fence *fence;
+   unsigned rank;
+
+   if (setup->num_threads == 0)
+      return NULL;
+
+   scene = lp_setup_get_current_scene(setup);
+   rank = lp_scene_get_num_bins( scene ); /* xxx */
+   fence = lp_fence_create(rank);
+
+   LP_DBG(DEBUG_SETUP, "%s rank %u\n", __FUNCTION__, rank);
+
+   set_scene_state( setup, SETUP_ACTIVE );
+
+   /* insert the fence into all command bins */
+   lp_scene_bin_everywhere( scene,
+                            lp_rast_fence,
+                            lp_rast_arg_fence(fence) );
+
+   return (struct pipe_fence_handle *) fence;
+}
+
+
+/**
+ * Store the triangle rasterization state.
+ *
+ * The triangle entry point is reset to first_triangle so that the
+ * triangle function is chosen again on the next triangle.
+ *
+ * \param gl_rasterization_rules  selects a pixel offset of 0.5 (TRUE) or
+ *                                0.0 (FALSE)
+ */
+void 
+lp_setup_set_triangle_state( struct lp_setup_context *setup,
+                             unsigned cull_mode,
+                             boolean ccw_is_frontface,
+                             boolean scissor,
+                             boolean gl_rasterization_rules)
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   setup->ccw_is_frontface = ccw_is_frontface;
+   setup->cullmode = cull_mode;
+   setup->triangle = first_triangle;
+   setup->scissor_test = scissor;
+   setup->pixel_offset = gl_rasterization_rules ? 0.5f : 0.0f;
+}
+
+
+
+/**
+ * Copy the fragment shader input layout into the setup context.
+ *
+ * \param input  array of 'nr' input descriptions
+ * \param nr     number of entries copied
+ *
+ * NOTE(review): 'nr' is not checked against the capacity of
+ * setup->fs.input here -- presumably the caller guarantees it fits;
+ * confirm.
+ */
+void
+lp_setup_set_fs_inputs( struct lp_setup_context *setup,
+                        const struct lp_shader_input *input,
+                        unsigned nr )
+{
+   LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr);
+
+   memcpy( setup->fs.input, input, nr * sizeof input[0] );
+   setup->fs.nr_inputs = nr;
+}
+
+/**
+ * Select the fragment shader variant to use and flag the fragment shader
+ * state as dirty.
+ *
+ * The pointer is stored as-is; no reference is taken on the variant (see
+ * the FIXME below).
+ */
+void
+lp_setup_set_fs_variant( struct lp_setup_context *setup,
+                         struct lp_fragment_shader_variant *variant)
+{
+   /* %p expects a void pointer, so cast like the other LP_DBG calls do */
+   LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__,
+          (void *) variant);
+   /* FIXME: reference count */
+
+   setup->fs.current.variant = variant;
+   setup->dirty |= LP_SETUP_NEW_FS;
+}
+
+/**
+ * Bind the fragment shader constant buffer.
+ *
+ * The buffer is stored through pipe_resource_reference() and the
+ * constants are flagged dirty.
+ */
+void
+lp_setup_set_fs_constants(struct lp_setup_context *setup,
+                          struct pipe_resource *buffer)
+{
+   LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) buffer);
+
+   pipe_resource_reference(&setup->constants.current, buffer);
+
+   setup->dirty |= LP_SETUP_NEW_CONSTANTS;
+}
+
+
+/**
+ * Update the alpha test reference value in the JIT context.
+ * The fragment shader state is flagged dirty only if the value changed.
+ */
+void
+lp_setup_set_alpha_ref_value( struct lp_setup_context *setup,
+                              float alpha_ref_value )
+{
+   LP_DBG(DEBUG_SETUP, "%s %f\n", __FUNCTION__, alpha_ref_value);
+
+   /* nothing to do when the value is unchanged */
+   if (setup->fs.current.jit_context.alpha_ref_value == alpha_ref_value)
+      return;
+
+   setup->fs.current.jit_context.alpha_ref_value = alpha_ref_value;
+   setup->dirty |= LP_SETUP_NEW_FS;
+}
+
+/**
+ * Update the front/back stencil reference values in the JIT context.
+ * The fragment shader state is flagged dirty only if either value changed.
+ *
+ * \param refs  refs[0] is the front value, refs[1] the back value
+ */
+void
+lp_setup_set_stencil_ref_values( struct lp_setup_context *setup,
+                                 const ubyte refs[2] )
+{
+   LP_DBG(DEBUG_SETUP, "%s %d %d\n", __FUNCTION__, refs[0], refs[1]);
+
+   /* nothing to do when both values are unchanged */
+   if (setup->fs.current.jit_context.stencil_ref_front == refs[0] &&
+       setup->fs.current.jit_context.stencil_ref_back == refs[1])
+      return;
+
+   setup->fs.current.jit_context.stencil_ref_front = refs[0];
+   setup->fs.current.jit_context.stencil_ref_back = refs[1];
+   setup->dirty |= LP_SETUP_NEW_FS;
+}
+
+/**
+ * Update the current blend color.
+ * The blend color is flagged dirty only if the new value differs
+ * bytewise from the stored one.
+ *
+ * \param blend_color  must not be NULL
+ */
+void
+lp_setup_set_blend_color( struct lp_setup_context *setup,
+                          const struct pipe_blend_color *blend_color )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   assert(blend_color);
+
+   /* nothing to do when the color is unchanged */
+   if (memcmp(&setup->blend_color.current, blend_color, sizeof *blend_color) == 0)
+      return;
+
+   memcpy(&setup->blend_color.current, blend_color, sizeof *blend_color);
+   setup->dirty |= LP_SETUP_NEW_BLEND_COLOR;
+}
+
+
+/**
+ * Update the current scissor rectangle.
+ * The scissor is flagged dirty only if the new value differs bytewise
+ * from the stored one.
+ *
+ * \param scissor  must not be NULL
+ */
+void
+lp_setup_set_scissor( struct lp_setup_context *setup,
+                      const struct pipe_scissor_state *scissor )
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   assert(scissor);
+
+   /* nothing to do when the rectangle is unchanged */
+   if (memcmp(&setup->scissor.current, scissor, sizeof(*scissor)) == 0)
+      return;
+
+   setup->scissor.current = *scissor; /* struct copy */
+   setup->dirty |= LP_SETUP_NEW_SCISSOR;
+}
+
+
+/**
+ * Store the flatshade_first flag in the setup context.
+ * No dirty bit is raised here.
+ */
+void 
+lp_setup_set_flatshade_first( struct lp_setup_context *setup,
+                              boolean flatshade_first )
+{
+   setup->flatshade_first = flatshade_first;
+}
+
+/** Remember the caller's vertex_info; only the pointer is kept, no copy. */
+void 
+lp_setup_set_vertex_info( struct lp_setup_context *setup,
+                          struct vertex_info *vertex_info )
+{
+   /* XXX: just silently holding onto the pointer:
+    */
+   setup->vertex_info = vertex_info;
+}
+
+
+/**
+ * Called during state validation when LP_NEW_SAMPLER_VIEW is set.
+ * Publishes each bound texture to the JIT context and references it in
+ * fs.current_tex[]; units left without a view drop their old reference.
+ */
+void
+lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   unsigned i;
+
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      struct pipe_sampler_view *view = i < num ? views[i] : NULL;
+      if(view) {
+         struct pipe_resource *tex = view->texture;
+         struct llvmpipe_resource *lp_tex = llvmpipe_resource(tex);
+         struct lp_jit_texture *jit_tex;
+         jit_tex = &setup->fs.current.jit_context.textures[i];
+         jit_tex->width = tex->width0;
+         jit_tex->height = tex->height0;
+         jit_tex->depth = tex->depth0;
+         jit_tex->last_level = tex->last_level;
+
+         /* We're referencing the texture's internal data, so save a
+          * reference to it.
+          */
+         pipe_resource_reference(&setup->fs.current_tex[i], tex);
+         if (!lp_tex->dt) {
+            /* regular texture - setup array of mipmap level pointers */
+            int j;
+            for (j = 0; j <= tex->last_level; j++) {
+               jit_tex->data[j] =
+                  llvmpipe_get_texture_image_all(lp_tex, j, LP_TEX_USAGE_READ,
+                                                 LP_TEX_LAYOUT_LINEAR);
+               jit_tex->row_stride[j] = lp_tex->row_stride[j];
+               jit_tex->img_stride[j] = lp_tex->img_stride[j];
+            }
+         }
+         else {
+            /* display target texture/surface
+             * XXX: Where should this be unmapped? */
+            struct llvmpipe_screen *screen = llvmpipe_screen(tex->screen);
+            struct sw_winsys *winsys = screen->winsys;
+            jit_tex->data[0] = winsys->displaytarget_map(winsys, lp_tex->dt,
+                                                         PIPE_TRANSFER_READ);
+            jit_tex->row_stride[0] = lp_tex->row_stride[0];
+            jit_tex->img_stride[0] = lp_tex->img_stride[0];
+            assert(jit_tex->data[0]);
+         }
+      }
+      else {
+         /* unbound unit: stop pinning whatever texture it used before */
+         pipe_resource_reference(&setup->fs.current_tex[i], NULL);
+      }
+   }
+
+   setup->dirty |= LP_SETUP_NEW_FS;
+}
+
+
+/**
+ * Report how 'texture' is currently in use by this setup context.
+ * Bound color and depth/stencil buffers count as read+write; a texture
+ * referenced by any of the setup's scenes (being built or being rendered)
+ * counts as read.  Returns PIPE_REFERENCED_x flags or PIPE_UNREFERENCED.
+ */
+unsigned
+lp_setup_is_resource_referenced( const struct lp_setup_context *setup,
+                                const struct pipe_resource *texture )
+{
+   const unsigned read_write =
+      PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+   const struct pipe_framebuffer_state *fb = &setup->fb;
+   unsigned n;
+
+   /* framebuffer attachments: color buffers first, then depth/stencil */
+   for (n = 0; n < fb->nr_cbufs; n++)
+      if (fb->cbufs[n]->texture == texture)
+         return read_write;
+   if (fb->zsbuf && fb->zsbuf->texture == texture)
+      return read_write;
+
+   /* textures referenced by any of the scenes */
+   for (n = 0; n < Elements(setup->scenes); n++)
+      if (lp_scene_is_resource_referenced(setup->scenes[n], texture))
+         return PIPE_REFERENCED_FOR_READ;
+
+   return PIPE_UNREFERENCED;
+}
+
+/**
+ * Called by vbuf code when we're about to draw something.
+ * Copies any dirty blend color, scissor, constants and fragment-shader
+ * state into the current scene and bins a set-state command if it changed.
+ */
+void
+lp_setup_update_state( struct lp_setup_context *setup )
+{
+   struct lp_scene *scene;
+
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   setup_check_scene_size_and_flush(setup);
+
+   scene = lp_setup_get_current_scene(setup);
+
+   assert(setup->fs.current.variant);
+
+   /* Some of the 'draw' pipeline stages may have changed some driver state.
+    * Make sure we've processed those state changes before anything else.
+    *
+    * XXX this is the only place where llvmpipe_context is used in the
+    * setup code.  This may get refactored/changed...
+    */
+   {
+      struct llvmpipe_context *lp = llvmpipe_context(scene->pipe);
+      if (lp->dirty) {
+         llvmpipe_update_derived(lp);
+      }
+      assert(lp->dirty == 0);
+   }
+
+   if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) {
+      uint8_t *stored;
+      unsigned i, j;
+
+      stored = lp_scene_alloc_aligned(scene, 4 * 16, 16);
+
+      if (stored) {
+         /* smear each blend color component across 16 ubyte elements */
+         for (i = 0; i < 4; ++i) {
+            uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]);
+            for (j = 0; j < 16; ++j)
+               stored[i*16 + j] = c;
+         }
+
+         setup->blend_color.stored = stored;
+
+         setup->fs.current.jit_context.blend_color = setup->blend_color.stored;
+      }
+
+      setup->dirty |= LP_SETUP_NEW_FS;
+   }
+
+   if (setup->dirty & LP_SETUP_NEW_SCISSOR) {
+      float *stored;
+
+      stored = lp_scene_alloc_aligned(scene, 4 * sizeof(float), 16);
+
+      if (stored) {
+         stored[0] = (float) setup->scissor.current.minx;
+         stored[1] = (float) setup->scissor.current.miny;
+         stored[2] = (float) setup->scissor.current.maxx;
+         stored[3] = (float) setup->scissor.current.maxy;
+
+         setup->scissor.stored = stored;
+
+         setup->fs.current.jit_context.scissor_xmin = stored[0];
+         setup->fs.current.jit_context.scissor_ymin = stored[1];
+         setup->fs.current.jit_context.scissor_xmax = stored[2];
+         setup->fs.current.jit_context.scissor_ymax = stored[3];
+      }
+
+      setup->dirty |= LP_SETUP_NEW_FS;
+   }
+
+   if(setup->dirty & LP_SETUP_NEW_CONSTANTS) {
+      struct pipe_resource *buffer = setup->constants.current;
+
+      if(buffer) {
+         unsigned current_size = buffer->width0;
+         const void *current_data = llvmpipe_resource_data(buffer);
+
+         /* TODO: copy only the actually used constants? */
+
+         if(setup->constants.stored_size != current_size ||
+            !setup->constants.stored_data ||
+            memcmp(setup->constants.stored_data,
+                   current_data,
+                   current_size) != 0) {
+            void *stored;
+
+            stored = lp_scene_alloc(scene, current_size);
+            if(stored) {
+               memcpy(stored,
+                      current_data,
+                      current_size);
+               setup->constants.stored_size = current_size;
+               setup->constants.stored_data = stored;
+            }
+         }
+      }
+      else {
+         setup->constants.stored_size = 0;
+         setup->constants.stored_data = NULL;
+      }
+
+      setup->fs.current.jit_context.constants = setup->constants.stored_data;
+      setup->dirty |= LP_SETUP_NEW_FS;
+   }
+
+   if(setup->dirty & LP_SETUP_NEW_FS) {
+      if(!setup->fs.stored ||
+         memcmp(setup->fs.stored,
+                &setup->fs.current,
+                sizeof setup->fs.current) != 0) {
+         /* The fs state that's been stored in the scene is different from
+          * the new, current state.  So allocate a new lp_rast_state object
+          * and append it to the bin's setup data buffer.
+          */
+         uint i;
+         struct lp_rast_state *stored =
+            (struct lp_rast_state *) lp_scene_alloc(scene, sizeof *stored);
+         if(stored) {
+            memcpy(stored,
+                   &setup->fs.current,
+                   sizeof setup->fs.current);
+            setup->fs.stored = stored;
+
+            /* put the state-set command into all bins */
+            lp_scene_bin_state_command( scene,
+                                        lp_rast_set_state,
+                                        lp_rast_arg_state(setup->fs.stored) );
+         }
+
+         /* The scene now references the textures in the rasterization
+          * state record.  Note that now.
+          */
+         for (i = 0; i < Elements(setup->fs.current_tex); i++) {
+            if (setup->fs.current_tex[i])
+               lp_scene_add_resource_reference(scene, setup->fs.current_tex[i]);
+         }
+      }
+   }
+
+   setup->dirty = 0;
+
+   assert(setup->fs.stored);
+}
+
+
+
+/**
+ * Tear down a setup context: drop its framebuffer, texture and constant
+ * buffer references, destroy the scenes waiting in the empty queue and the
+ * rasterizer, then free the context.  Only caller is lp_setup_vbuf_destroy().
+ */
+void
+lp_setup_destroy( struct lp_setup_context *setup )
+{
+   struct lp_scene *idle;
+   uint unit;
+
+   reset_context( setup );
+
+   /* release everything the context holds a reference on */
+   util_unreference_framebuffer_state(&setup->fb);
+   for (unit = 0; unit < Elements(setup->fs.current_tex); unit++) {
+      pipe_resource_reference(&setup->fs.current_tex[unit], NULL);
+   }
+   pipe_resource_reference(&setup->constants.current, NULL);
+
+   /* drain the 'empty' queue, destroying each scene as it comes out */
+   while ((idle = lp_scene_dequeue(setup->empty_scenes, FALSE)) != NULL) {
+      lp_scene_destroy(idle);
+   }
+
+   lp_scene_queue_destroy(setup->empty_scenes);
+
+   lp_rast_destroy( setup->rast );
+
+   FREE( setup );
+}
+
+
+/**
+ * Create a new primitive tiling engine.  Plug it into the backend of
+ * the draw module.  Currently also creates a rasterizer to use with it.
+ * Returns NULL on failure after freeing what was created; scenes are made
+ * before the vbuf stage so nothing is plugged into 'draw' until all is set.
+ */
+struct lp_setup_context *
+lp_setup_create( struct pipe_context *pipe,
+                 struct draw_context *draw )
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen);
+   struct lp_setup_context *setup = CALLOC_STRUCT(lp_setup_context);
+   unsigned i;
+
+   if (!setup)
+      return NULL;
+
+   lp_setup_init_vbuf(setup);
+
+   setup->empty_scenes = lp_scene_queue_create();
+   if (!setup->empty_scenes)
+      goto fail;
+
+   /* XXX: move this to the screen and share between contexts: */
+   setup->num_threads = screen->num_threads;
+   setup->rast = lp_rast_create(screen->num_threads);
+   if (!setup->rast)
+      goto fail;
+
+   /* create some empty scenes; they are queued once nothing can fail */
+   for (i = 0; i < MAX_SCENES; i++) {
+      setup->scenes[i] = lp_scene_create( pipe, setup->empty_scenes );
+      if (!setup->scenes[i])
+         goto fail;
+   }
+
+   setup->vbuf = draw_vbuf_stage(draw, &setup->base);
+   if (!setup->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(draw, setup->vbuf);
+   draw_set_render(draw, &setup->base);
+   for (i = 0; i < MAX_SCENES; i++)
+      lp_scene_enqueue(setup->empty_scenes, setup->scenes[i]);
+
+   setup->triangle = first_triangle;
+   setup->line     = first_line;
+   setup->point    = first_point;
+   setup->dirty = ~0;
+
+   return setup;
+
+fail:
+   /* members never reached are still NULL from CALLOC_STRUCT */
+   for (i = 0; i < MAX_SCENES; i++)
+      if (setup->scenes[i])
+         lp_scene_destroy(setup->scenes[i]);
+   if (setup->rast)
+      lp_rast_destroy( setup->rast );
+   if (setup->empty_scenes)
+      lp_scene_queue_destroy(setup->empty_scenes);
+   FREE(setup);
+   return NULL;
+}
+
+
+/**
+ * Reset query 'pq' to its initial state and bin a BeginQuery command
+ * into every bin of the current scene.
+ */
+void
+lp_setup_begin_query(struct lp_setup_context *setup,
+                     struct llvmpipe_query *pq)
+{
+   struct lp_scene *cur = lp_setup_get_current_scene(setup);
+   union lp_rast_cmd_arg arg;
+
+   /* zero the counters and progress tracking before anything is binned */
+   memset(pq->count, 0, sizeof(pq->count));
+   pq->tile_count = 0;
+   pq->done = FALSE;
+   pq->num_tiles = cur->tiles_x * cur->tiles_y;
+   assert(pq->num_tiles > 0);
+
+   arg.query_obj = pq;
+   lp_scene_bin_everywhere(cur, lp_rast_begin_query, arg);
+   pq->binned = TRUE;
+}
+
+
+/**
+ * Bin an EndQuery command for 'pq' into every bin of the current scene.
+ */
+void
+lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq)
+{
+   union lp_rast_cmd_arg arg;
+
+   arg.query_obj = pq;
+   lp_scene_bin_everywhere(lp_setup_get_current_scene(setup),
+                           lp_rast_end_query, arg);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
new file mode 100644
index 0000000000..6a0dc55129
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -0,0 +1,156 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef LP_SETUP_H
+#define LP_SETUP_H
+
+#include "pipe/p_compiler.h"
+#include "lp_jit.h"
+
+struct draw_context;
+struct vertex_info;
+
+/** Interpolation mode of one fragment shader input (lp_shader_input.interp). */
+enum lp_interp {
+   LP_INTERP_CONSTANT,
+   LP_INTERP_LINEAR,
+   LP_INTERP_PERSPECTIVE,
+   LP_INTERP_POSITION,
+   LP_INTERP_FACING
+};
+
+/**
+ * Describes how to compute the interpolation coefficients (a0, dadx, dady)
+ * from the vertices passed into our triangle/line/point functions by the
+ * draw module.  lp_setup_set_fs_inputs() takes an array of these.
+ *
+ * Vertices are treated as an array of float[4] values, indexed by
+ * src_index.
+ */
+struct lp_shader_input {
+   enum lp_interp interp;       /* how to interpolate values */
+   unsigned src_index;          /* where to find values in incoming vertices */
+   unsigned usage_mask;         /* bitmask of TGSI_WRITEMASK_x flags */
+};
+
+struct pipe_resource;
+struct pipe_query;
+struct pipe_surface;
+struct pipe_blend_color;
+struct pipe_screen;
+struct pipe_framebuffer_state;
+struct lp_fragment_shader_variant;
+struct lp_jit_context;
+struct llvmpipe_query;
+
+
+struct lp_setup_context *
+lp_setup_create( struct pipe_context *pipe,
+                 struct draw_context *draw );
+
+void
+lp_setup_clear(struct lp_setup_context *setup,
+               const float *clear_color,
+               double clear_depth,
+               unsigned clear_stencil,
+               unsigned flags);
+
+struct pipe_fence_handle *
+lp_setup_fence( struct lp_setup_context *setup );
+
+
+void
+lp_setup_flush( struct lp_setup_context *setup,
+                unsigned flags );
+
+
+void
+lp_setup_bind_framebuffer( struct lp_setup_context *setup,
+                           const struct pipe_framebuffer_state *fb );
+
+void 
+lp_setup_set_triangle_state( struct lp_setup_context *setup,
+                             unsigned cullmode,
+                             boolean front_is_ccw,
+                             boolean scissor,
+                             boolean gl_rasterization_rules );
+
+void
+lp_setup_set_fs_inputs( struct lp_setup_context *setup,
+                        const struct lp_shader_input *interp,
+                        unsigned nr );
+
+void
+lp_setup_set_fs_variant( struct lp_setup_context *setup,
+                         struct lp_fragment_shader_variant *variant );
+
+void
+lp_setup_set_fs_constants(struct lp_setup_context *setup,
+                          struct pipe_resource *buffer);
+
+
+void
+lp_setup_set_alpha_ref_value( struct lp_setup_context *setup,
+                              float alpha_ref_value );
+
+void
+lp_setup_set_stencil_ref_values( struct lp_setup_context *setup,
+                                 const ubyte refs[2] );
+
+void
+lp_setup_set_blend_color( struct lp_setup_context *setup,
+                          const struct pipe_blend_color *blend_color );
+
+void
+lp_setup_set_scissor( struct lp_setup_context *setup,
+                      const struct pipe_scissor_state *scissor );
+
+void
+lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views);
+
+unsigned
+lp_setup_is_resource_referenced( const struct lp_setup_context *setup,
+                                const struct pipe_resource *texture );
+
+void
+lp_setup_set_flatshade_first( struct lp_setup_context *setup, 
+                              boolean flatshade_first );
+
+void
+lp_setup_set_vertex_info( struct lp_setup_context *setup, 
+                          struct vertex_info *info );
+
+void
+lp_setup_begin_query(struct lp_setup_context *setup,
+                     struct llvmpipe_query *pq);
+
+void
+lp_setup_end_query(struct lp_setup_context *setup,
+                   struct llvmpipe_query *pq);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
new file mode 100644
index 0000000000..c8b8a2480b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -0,0 +1,162 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * The setup code is concerned with point/line/triangle setup and
+ * putting commands/data into the bins.
+ */
+
+
+#ifndef LP_SETUP_CONTEXT_H
+#define LP_SETUP_CONTEXT_H
+
+#include "lp_setup.h"
+#include "lp_rast.h"
+#include "lp_tile_soa.h"        /* for TILE_SIZE */
+#include "lp_scene.h"
+
+#include "draw/draw_vbuf.h"
+
+#define LP_SETUP_NEW_FS          0x01
+#define LP_SETUP_NEW_CONSTANTS   0x02
+#define LP_SETUP_NEW_BLEND_COLOR 0x04
+#define LP_SETUP_NEW_SCISSOR     0x08
+
+
+struct lp_scene_queue;
+
+
+/** Max number of scenes */
+#define MAX_SCENES 2
+
+
+
+/**
+ * Point/line/triangle setup context.
+ * Note: "stored" below indicates data which is stored in the bins,
+ * not arbitrary malloc'd memory.
+ *
+ * "current" is the latest state set; lp_setup_update_state() stores it.
+ * Subclass of vbuf_render, plugged directly into the draw module as
+ * the rendering backend.
+ */
+struct lp_setup_context
+{
+   struct vbuf_render base;
+
+   struct vertex_info *vertex_info;  /**< pointer given to lp_setup_set_vertex_info() */
+   uint prim;
+   uint vertex_size;
+   uint nr_vertices;
+   uint vertex_buffer_size;
+   void *vertex_buffer;
+
+   /* Final pipeline stage for draw module.  Draw module should
+    * create/install this itself now.
+    */
+   struct draw_stage *vbuf;
+   unsigned num_threads;                 /**< screen->num_threads at creation */
+   struct lp_rasterizer *rast;           /**< created in lp_setup_create() */
+   struct lp_scene *scenes[MAX_SCENES];  /**< all the scenes */
+   struct lp_scene *scene;               /**< current scene being built */
+   struct lp_scene_queue *empty_scenes;  /**< queue of empty scenes */
+
+   boolean flatshade_first;
+   boolean ccw_is_frontface;
+   boolean scissor_test;
+   unsigned cullmode;
+   float pixel_offset;
+
+   struct pipe_framebuffer_state fb;
+
+   struct {
+      unsigned flags;
+      union lp_rast_cmd_arg color;    /**< lp_rast_clear_color() cmd */
+      struct lp_rast_clearzs clearzs; /**< lp_rast_clear_zstencil() cmd */
+   } clear;
+
+   enum setup_state {
+      SETUP_FLUSHED,
+      SETUP_CLEARED,
+      SETUP_ACTIVE
+   } state;
+   
+   struct {
+      struct lp_shader_input input[PIPE_MAX_ATTRIBS];
+      unsigned nr_inputs;
+
+      const struct lp_rast_state *stored; /**< what's in the scene */
+      struct lp_rast_state current;  /**< currently set state */
+      struct pipe_resource *current_tex[PIPE_MAX_SAMPLERS]; /**< held refs */
+   } fs;
+
+   /** fragment shader constants */
+   struct {
+      struct pipe_resource *current;   /**< referenced constant buffer */
+      unsigned stored_size;            /**< bytes in stored_data */
+      const void *stored_data;         /**< copy allocated from the scene */
+   } constants;
+
+   struct {
+      struct pipe_blend_color current;
+      uint8_t *stored;                 /**< 4 channels x 16 ubytes */
+   } blend_color;
+
+   struct {
+      struct pipe_scissor_state current;
+      const void *stored;              /**< 4 floats: min x/y, max x/y */
+   } scissor;
+
+   unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */
+
+   void (*point)( struct lp_setup_context *,
+                  const float (*v0)[4]);
+
+   void (*line)( struct lp_setup_context *,
+                 const float (*v0)[4],
+                 const float (*v1)[4]);
+
+   void (*triangle)( struct lp_setup_context *,
+                     const float (*v0)[4],
+                     const float (*v1)[4],
+                     const float (*v2)[4]);
+};
+
+void lp_setup_choose_triangle( struct lp_setup_context *setup );
+void lp_setup_choose_line( struct lp_setup_context *setup );
+void lp_setup_choose_point( struct lp_setup_context *setup );
+
+struct lp_scene *lp_setup_get_current_scene(struct lp_setup_context *setup);
+
+void lp_setup_init_vbuf(struct lp_setup_context *setup);
+
+void lp_setup_update_state( struct lp_setup_context *setup );
+
+void lp_setup_destroy( struct lp_setup_context *setup );
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
new file mode 100644
index 0000000000..be41c44e6f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -0,0 +1,47 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Binning code for lines
+ */
+
+#include "lp_setup_context.h"
+
+/** Line handler that ignores its arguments and does nothing. */
+static void line_nop( struct lp_setup_context *setup,
+                      const float (*v0)[4],
+                      const float (*v1)[4] )
+{
+}
+
+/** Install the line handler; currently always the no-op line_nop(). */
+void 
+lp_setup_choose_line( struct lp_setup_context *setup )
+{
+   setup->line = line_nop;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
new file mode 100644
index 0000000000..9f69e6c5ce
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -0,0 +1,46 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Binning code for points
+ */
+
+#include "lp_setup_context.h"
+
+/** Point handler that ignores its arguments and does nothing. */
+static void point_nop( struct lp_setup_context *setup,
+                       const float (*v0)[4] )
+{
+}
+
+/** Install the point handler; currently always the no-op point_nop(). */
+void 
+lp_setup_choose_point( struct lp_setup_context *setup )
+{
+   setup->point = point_nop;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
new file mode 100644
index 0000000000..0557d35f8b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -0,0 +1,723 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Binning code for triangles
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
+
+#define NUM_CHANNELS 4
+
+
+/**
+ * Store a constant-valued coefficient (GL_FLAT shading) for channel i of
+ * the given slot: a0 receives the value and both gradients are zeroed.
+ * The setup argument is not used.
+ */
+static void
+constant_coef(struct lp_setup_context *setup, struct lp_rast_triangle *tri,
+              unsigned slot, const float value, unsigned i)
+{
+   tri->inputs.dadx[slot][i] = 0.0f;
+   tri->inputs.dady[slot][i] = 0.0f;
+   tri->inputs.a0[slot][i] = value;
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a triangle, and store them in tri->inputs at [slot][i].
+ *
+ * \param oneoverarea  scale factor applied to both gradients
+ * \param slot         row of tri->inputs.{a0,dadx,dady} to fill
+ * \param vert_attr    vertex attribute read from v1, v2 and v3
+ * \param i            channel within the attribute
+ */
+static void
+linear_coef(struct lp_setup_context *setup,
+            struct lp_rast_triangle *tri,
+            float oneoverarea,
+            unsigned slot,
+            const float (*v1)[4],
+            const float (*v2)[4],
+            const float (*v3)[4],
+            unsigned vert_attr,
+            unsigned i)
+{
+   const float a1 = v1[vert_attr][i];
+   const float da12 = a1 - v2[vert_attr][i];
+   const float da31 = v3[vert_attr][i] - a1;
+   const float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea;
+   const float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea;
+
+   /* a0 is the value the fragment at (0,0) would sample, found by
+    * stepping back from v1 along both gradients; pixel_offset accounts
+    * for sampling at pixel centers.
+    *
+    * Caveat: for very large dadx or dady this subtracts (and the
+    * interpolation later re-adds) a very large number, costing a0 many
+    * of its fractional bits.  Defining a0 as the sample at a pixel
+    * center somewhere near the triangle would avoid that.
+    */
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (v1[0][0] - setup->pixel_offset) +
+                               dady * (v1[0][1] - setup->pixel_offset)));
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * Each vertex value is multiplied by that vertex's 1/w before the plane
+ * coefficients are computed; dividing the interpolated value by the
+ * interpolated W at a fragment happens later, not here.
+ */
+static void
+perspective_coef(struct lp_setup_context *setup,
+                 struct lp_rast_triangle *tri,
+                 float oneoverarea,
+                 unsigned slot,
+                 const float (*v1)[4],
+                 const float (*v2)[4],
+                 const float (*v3)[4],
+                 unsigned vert_attr,
+                 unsigned i)
+{
+   /* v[0][3] is always 1/w, so these are the attribute values over w */
+   const float a1 = v1[vert_attr][i] * v1[0][3];
+   const float a2 = v2[vert_attr][i] * v2[0][3];
+   const float a3 = v3[vert_attr][i] * v3[0][3];
+   const float da12 = a1 - a2;
+   const float da31 = a3 - a1;
+   const float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea;
+   const float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea;
+
+   /* same extrapolation back to pixel (0,0) as in linear_coef() */
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (v1[0][0] - setup->pixel_offset) +
+                               dady * (v1[0][1] - setup->pixel_offset)));
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial: a0 = 0 with a unit gradient along their own axis.
+ * Z and W are linearly interpolated from vertex attribute 0 (channels 2, 3).
+ * Only the channels selected in usage_mask are written.
+ */
+static void
+setup_fragcoord_coef(struct lp_setup_context *setup,
+                     struct lp_rast_triangle *tri,
+                     float oneoverarea,
+                     unsigned slot,
+                     const float (*v1)[4],
+                     const float (*v2)[4],
+                     const float (*v3)[4],
+                     unsigned usage_mask)
+{
+   float (*const a0)[4] = tri->inputs.a0;
+   float (*const dadx)[4] = tri->inputs.dadx;
+   float (*const dady)[4] = tri->inputs.dady;
+
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      /* X: zero at the origin, unit slope in x */
+      a0[slot][0] = 0.0;
+      dadx[slot][0] = 1.0;
+      dady[slot][0] = 0.0;
+   }
+
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      /* Y: zero at the origin, unit slope in y */
+      a0[slot][1] = 0.0;
+      dadx[slot][1] = 0.0;
+      dady[slot][1] = 1.0;
+   }
+
+   /* Z and W are interpolated from the position attribute (index 0) */
+   if (usage_mask & TGSI_WRITEMASK_Z)
+      linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 2);
+   if (usage_mask & TGSI_WRITEMASK_W)
+      linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 3);
+}
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value:
+ * X is +1.0 or -1.0, the other channels selected by usage_mask are zero.
+ * \param frontface  is the triangle front facing?
+ */
+static void
+setup_facing_coef(struct lp_setup_context *setup,
+                  struct lp_rast_triangle *tri,
+                  unsigned slot,
+                  boolean frontface,
+                  unsigned usage_mask)
+{
+   unsigned chan;
+
+   /* convert TRUE to 1.0 and FALSE to -1.0 */
+   if (usage_mask & TGSI_WRITEMASK_X)
+      constant_coef( setup, tri, slot, 2.0f * frontface - 1.0f, 0 );
+
+   /* Y, Z and W carry no information (wasted), they are just zeroed */
+   for (chan = 1; chan < NUM_CHANNELS; chan++)
+      if (usage_mask & (1 << chan))
+         constant_coef( setup, tri, slot, 0.0f, chan );
+}
+
+
+/**
+ * Compute the tri->inputs a0, dadx and dady values: one slot per fragment
+ * shader input, plus the position in slot zero.
+ * Shader input N is written to slot N+1.  Flat-shaded inputs take their
+ * value from v1 when setup->flatshade_first is set, otherwise from v3.
+ */
+static void
+setup_tri_coefficients(struct lp_setup_context *setup,
+                       struct lp_rast_triangle *tri,
+                       float oneoverarea,
+                       const float (*v1)[4],
+                       const float (*v2)[4],
+                       const float (*v3)[4],
+                       boolean frontface)
+{
+   /* vertex supplying the value of flat-shaded inputs */
+   const float (*flat_vert)[4] = setup->flatshade_first ? v1 : v3;
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+
+   /* setup interpolation for all the shader inputs: */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      const unsigned vert_attr = setup->fs.input[slot].src_index;
+      const unsigned usage_mask = setup->fs.input[slot].usage_mask;
+      const unsigned dst = slot + 1;   /* slot zero holds the position */
+      unsigned i;
+
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         for (i = 0; i < NUM_CHANNELS; i++) {
+            if (usage_mask & (1 << i))
+               constant_coef(setup, tri, dst, flat_vert[vert_attr][i], i);
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+         for (i = 0; i < NUM_CHANNELS; i++) {
+            if (usage_mask & (1 << i))
+               linear_coef(setup, tri, oneoverarea, dst, v1, v2, v3, vert_attr, i);
+         }
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         for (i = 0; i < NUM_CHANNELS; i++) {
+            if (usage_mask & (1 << i))
+               perspective_coef(setup, tri, oneoverarea, dst, v1, v2, v3, vert_attr, i);
+         }
+         /* perspective inputs also request W in the position slot */
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+         break;
+
+      case LP_INTERP_POSITION:
+         /* The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so we need to ensure that its usage mask covers all
+          * usages. */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(setup, tri, dst, frontface, usage_mask);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   /* The internal position input is in slot zero: */
+   setup_fragcoord_coef(setup, tri, oneoverarea, 0, v1, v2, v3,
+                        fragcoord_usage_mask);
+}
+
+
+
+/** Scale a float by FIXED_ONE, subtract FIXED_ONE/2 and round to an int. */
+static INLINE int
+subpixel_snap(float a)
+{
+   return util_iround(FIXED_ONE * a - (FIXED_ONE / 2));
+}
+
+/**
+ * Alloc space for a new triangle plus the three coefficient arrays
+ * (inputs.a0, inputs.dadx, inputs.dady) laid out immediately after it.
+ * Each array holds (nr_inputs + 1) rows of NUM_CHANNELS floats.
+ * The memory is allocated from the per-scene pool, not per-tile.
+ * \param nr_inputs  number of fragment shader inputs
+ * \param tri_size  returns number of bytes allocated; left untouched
+ *                  when the allocation fails
+ * \return pointer to triangle space, or NULL if the allocation failed
+ */
+static INLINE struct lp_rast_triangle *
+alloc_triangle(struct lp_scene *scene, unsigned nr_inputs, unsigned *tri_size)
+{
+   const unsigned array_bytes = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+   struct lp_rast_triangle *tri;
+   const unsigned total_bytes = sizeof(*tri) + (3 * array_bytes);
+   char *base;
+
+   /* presumably keeps the arrays behind the struct 16-byte aligned */
+   assert(sizeof(*tri) % 16 == 0);
+
+   tri = lp_scene_alloc_aligned( scene, total_bytes, 16 );
+   if (!tri)
+      return NULL;
+
+   base = (char *) (tri + 1);
+   tri->inputs.a0   = (float (*)[4]) base;
+   tri->inputs.dadx = (float (*)[4]) (base + array_bytes);
+   tri->inputs.dady = (float (*)[4]) (base + 2 * array_bytes);
+   *tri_size = total_bytes;
+
+   return tri;
+}
+
+
+/**
+ * Print triangle vertex attribs (for debug): attribute rows
+ * 0 .. fs.nr_inputs-1 of v1, then of v2, then of v3.  The counters are
+ * unsigned, so they are printed with %u.
+ */
+static void
+print_triangle(struct lp_setup_context *setup,
+               const float (*v1)[4],
+               const float (*v2)[4],
+               const float (*v3)[4])
+{
+   const float (*verts[3])[4];
+   uint v, i;
+
+   verts[0] = v1;
+   verts[1] = v2;
+   verts[2] = v3;
+   debug_printf("llvmpipe triangle\n");
+   for (v = 0; v < 3; v++) {
+      for (i = 0; i < setup->fs.nr_inputs; i++) {
+         const float *a = verts[v][i];
+         debug_printf("  v%u[%u]:  %f %f %f %f\n", v + 1, i,
+                      a[0], a[1], a[2], a[3]);
+      }
+   }
+}
+
+
+/**
+ * Do basic setup for triangle rasterization and determine which
+ * framebuffer tiles are touched.  Put the triangle in the scene's
+ * bins for the tiles which we overlap.  Culled triangles are put back.
+ */
+static void
+do_triangle_ccw(struct lp_setup_context *setup,
+                const float (*v1)[4],
+                const float (*v2)[4],
+                const float (*v3)[4],
+                boolean frontfacing )
+{
+   /* x/y positions in fixed point */
+   const int x1 = subpixel_snap(v1[0][0] + 0.5 - setup->pixel_offset);
+   const int x2 = subpixel_snap(v2[0][0] + 0.5 - setup->pixel_offset);
+   const int x3 = subpixel_snap(v3[0][0] + 0.5 - setup->pixel_offset);
+   const int y1 = subpixel_snap(v1[0][1] + 0.5 - setup->pixel_offset);
+   const int y2 = subpixel_snap(v2[0][1] + 0.5 - setup->pixel_offset);
+   const int y3 = subpixel_snap(v3[0][1] + 0.5 - setup->pixel_offset);
+
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *tri;
+   int area;
+   float oneoverarea;
+   int minx, maxx, miny, maxy;
+   unsigned tri_bytes;
+
+   if (0)
+      print_triangle(setup, v1, v2, v3);
+
+   tri = alloc_triangle(scene, setup->fs.nr_inputs, &tri_bytes);
+   if (!tri)
+      return;
+
+#ifdef DEBUG
+   tri->v[0][0] = v1[0][0];
+   tri->v[1][0] = v2[0][0];
+   tri->v[2][0] = v3[0][0];
+   tri->v[0][1] = v1[0][1];
+   tri->v[1][1] = v2[0][1];
+   tri->v[2][1] = v3[0][1];
+#endif
+
+   tri->dx12 = x1 - x2;
+   tri->dx23 = x2 - x3;
+   tri->dx31 = x3 - x1;
+
+   tri->dy12 = y1 - y2;
+   tri->dy23 = y2 - y3;
+   tri->dy31 = y3 - y1;
+
+   area = (tri->dx12 * tri->dy31 - tri->dx31 * tri->dy12);
+
+   LP_COUNT(nr_tris);
+
+   /* Cull non-ccw and zero-sized triangles.
+    *
+    * XXX: subject to overflow??
+    */
+   if (area <= 0) {
+      lp_scene_putback_data( scene, tri_bytes );
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   /* Bounding rectangle (in pixels) */
+   minx = (MIN3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   maxx = (MAX3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   miny = (MIN3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   maxy = (MAX3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+
+   if (setup->scissor_test) {
+      minx = MAX2(minx, setup->scissor.current.minx);
+      maxx = MIN2(maxx, setup->scissor.current.maxx);
+      miny = MAX2(miny, setup->scissor.current.miny);
+      maxy = MIN2(maxy, setup->scissor.current.maxy);
+   }
+   /* cull empty boxes; '>=' also catches boxes inverted by the scissor */
+   if (miny >= maxy ||
+       minx >= maxx) {
+      lp_scene_putback_data( scene, tri_bytes );
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   /* Reciprocal of the fixed-point area, scaled by FIXED_ONE.
+    */
+   oneoverarea = ((float)FIXED_ONE) / (float)area;
+
+   /* Setup parameter interpolants:
+    */
+   setup_tri_coefficients( setup, tri, oneoverarea, v1, v2, v3, frontfacing );
+
+   tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
+
+   /* half-edge constants, will be iterated over the whole render target.
+    */
+   tri->c1 = tri->dy12 * x1 - tri->dx12 * y1;
+   tri->c2 = tri->dy23 * x2 - tri->dx23 * y2;
+   tri->c3 = tri->dy31 * x3 - tri->dx31 * y3;
+
+   /* correct for top-left fill convention:
+    */
+   if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) tri->c1++;
+   if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) tri->c2++;
+   if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) tri->c3++;
+
+   tri->dy12 *= FIXED_ONE;
+   tri->dy23 *= FIXED_ONE;
+   tri->dy31 *= FIXED_ONE;
+
+   tri->dx12 *= FIXED_ONE;
+   tri->dx23 *= FIXED_ONE;
+   tri->dx31 *= FIXED_ONE;
+
+   /* find trivial reject offsets for each edge for a single-pixel
+    * sized block.  These will be scaled up at each recursive level to
+    * match the active blocksize.  Scaling in this way works best if
+    * the blocks are square.
+    */
+   tri->eo1 = 0;
+   if (tri->dy12 < 0) tri->eo1 -= tri->dy12;
+   if (tri->dx12 > 0) tri->eo1 += tri->dx12;
+
+   tri->eo2 = 0;
+   if (tri->dy23 < 0) tri->eo2 -= tri->dy23;
+   if (tri->dx23 > 0) tri->eo2 += tri->dx23;
+
+   tri->eo3 = 0;
+   if (tri->dy31 < 0) tri->eo3 -= tri->dy31;
+   if (tri->dx31 > 0) tri->eo3 += tri->dx31;
+
+   /* Calculate trivial accept offsets from the above.
+    */
+   tri->ei1 = tri->dx12 - tri->dy12 - tri->eo1;
+   tri->ei2 = tri->dx23 - tri->dy23 - tri->eo2;
+   tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3;
+
+   /* Fill in the inputs.step[][] arrays.
+    * We've manually unrolled some loops here.
+    */
+   {
+      const int xstep1 = -tri->dy12;
+      const int xstep2 = -tri->dy23;
+      const int xstep3 = -tri->dy31;
+      const int ystep1 = tri->dx12;
+      const int ystep2 = tri->dx23;
+      const int ystep3 = tri->dx31;
+
+#define SETUP_STEP(i, x, y)                                \
+      do {                                                 \
+         tri->inputs.step[0][i] = x * xstep1 + y * ystep1; \
+         tri->inputs.step[1][i] = x * xstep2 + y * ystep2; \
+         tri->inputs.step[2][i] = x * xstep3 + y * ystep3; \
+      } while (0)
+
+      SETUP_STEP(0, 0, 0);
+      SETUP_STEP(1, 1, 0);
+      SETUP_STEP(2, 0, 1);
+      SETUP_STEP(3, 1, 1);
+
+      SETUP_STEP(4, 2, 0);
+      SETUP_STEP(5, 3, 0);
+      SETUP_STEP(6, 2, 1);
+      SETUP_STEP(7, 3, 1);
+
+      SETUP_STEP(8, 0, 2);
+      SETUP_STEP(9, 1, 2);
+      SETUP_STEP(10, 0, 3);
+      SETUP_STEP(11, 1, 3);
+
+      SETUP_STEP(12, 2, 2);
+      SETUP_STEP(13, 3, 2);
+      SETUP_STEP(14, 2, 3);
+      SETUP_STEP(15, 3, 3);
+#undef SETUP_STEP
+   }
+
+   /*
+    * All fields of 'tri' are now set.  The remaining code here is
+    * concerned with binning.
+    */
+
+   /* Convert to tile coordinates:
+    */
+   minx = minx / TILE_SIZE;
+   miny = miny / TILE_SIZE;
+   maxx = maxx / TILE_SIZE;
+   maxy = maxy / TILE_SIZE;
+
+   /*
+    * Clamp to framebuffer size
+    */
+   minx = MAX2(minx, 0);
+   miny = MAX2(miny, 0);
+   maxx = MIN2(maxx, scene->tiles_x - 1);
+   maxy = MIN2(maxy, scene->tiles_y - 1);
+
+   /* Determine which tile(s) intersect the triangle's bounding box
+    */
+   if (miny == maxy && minx == maxx)
+   {
+      /* Triangle is contained in a single tile:
+       */
+      lp_scene_bin_command( scene, minx, miny, lp_rast_triangle,
+                            lp_rast_arg_triangle(tri) );
+   }
+   else
+   {
+      int c1 = (tri->c1 +
+                tri->dx12 * miny * TILE_SIZE -
+                tri->dy12 * minx * TILE_SIZE);
+      int c2 = (tri->c2 +
+                tri->dx23 * miny * TILE_SIZE -
+                tri->dy23 * minx * TILE_SIZE);
+      int c3 = (tri->c3 +
+                tri->dx31 * miny * TILE_SIZE -
+                tri->dy31 * minx * TILE_SIZE);
+
+      int ei1 = tri->ei1 << TILE_ORDER;
+      int ei2 = tri->ei2 << TILE_ORDER;
+      int ei3 = tri->ei3 << TILE_ORDER;
+
+      int eo1 = tri->eo1 << TILE_ORDER;
+      int eo2 = tri->eo2 << TILE_ORDER;
+      int eo3 = tri->eo3 << TILE_ORDER;
+
+      int xstep1 = -(tri->dy12 << TILE_ORDER);
+      int xstep2 = -(tri->dy23 << TILE_ORDER);
+      int xstep3 = -(tri->dy31 << TILE_ORDER);
+
+      int ystep1 = tri->dx12 << TILE_ORDER;
+      int ystep2 = tri->dx23 << TILE_ORDER;
+      int ystep3 = tri->dx31 << TILE_ORDER;
+      int x, y;
+
+
+      /* Test tile-sized blocks against the triangle.
+       * Discard blocks fully outside the tri.  If the block is fully
+       * contained inside the tri, bin an lp_rast_shade_tile command.
+       * Else, bin a lp_rast_triangle command.
+       */
+      for (y = miny; y <= maxy; y++)
+      {
+         int cx1 = c1;
+         int cx2 = c2;
+         int cx3 = c3;
+         boolean in = FALSE;  /* are we inside the triangle? */
+
+         for (x = minx; x <= maxx; x++)
+         {
+            if (cx1 + eo1 < 0 ||
+                cx2 + eo2 < 0 ||
+                cx3 + eo3 < 0)
+            {
+               /* do nothing */
+               LP_COUNT(nr_empty_64);
+               if (in)
+                  break;  /* exiting triangle, all done with this row */
+            }
+            else if (cx1 + ei1 > 0 &&
+                     cx2 + ei2 > 0 &&
+                     cx3 + ei3 > 0)
+            {
+               /* triangle covers the whole tile- shade whole tile */
+               LP_COUNT(nr_fully_covered_64);
+               in = TRUE;
+               if (setup->fs.current.variant->opaque) {
+                  lp_scene_bin_reset( scene, x, y );
+                  lp_scene_bin_command( scene, x, y,
+                                        lp_rast_set_state,
+                                        lp_rast_arg_state(setup->fs.stored) );
+               }
+               lp_scene_bin_command( scene, x, y,
+                                     lp_rast_shade_tile,
+                                     lp_rast_arg_inputs(&tri->inputs) );
+            }
+            else
+            {
+               /* rasterizer/shade partial tile */
+               LP_COUNT(nr_partially_covered_64);
+               in = TRUE;
+               lp_scene_bin_command( scene, x, y,
+                                     lp_rast_triangle,
+                                     lp_rast_arg_triangle(tri) );
+            }
+
+            /* Iterate cx values across the region:
+             */
+            cx1 += xstep1;
+            cx2 += xstep2;
+            cx3 += xstep3;
+         }
+
+         /* Iterate c values down the region:
+          */
+         c1 += ystep1;
+         c2 += ystep2;
+         c3 += ystep3;
+      }
+   }
+}
+
+
+/**
+ * Draw triangle if it's CW, cull otherwise.  v0 and v1 are swapped in
+ * the call so that the CCW-only do_triangle_ccw() can be reused.
+ */
+static void
+triangle_cw(struct lp_setup_context *setup, const float (*v0)[4],
+            const float (*v1)[4], const float (*v2)[4])
+{
+   do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface );
+}
+
+
+/**
+ * Draw triangle if it's CCW, cull otherwise.  Vertices are forwarded in
+ * the given order; facing is taken from setup->ccw_is_frontface.
+ */
+static void
+triangle_ccw(struct lp_setup_context *setup, const float (*v0)[4],
+             const float (*v1)[4], const float (*v2)[4])
+{
+   do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface );
+}
+
+
+
+/**
+ * Draw triangle whether it's CW or CCW.  The sign of the z component of
+ * the cross product of two edge vectors selects the winding handler.
+ */
+static void
+triangle_both(struct lp_setup_context *setup, const float (*v0)[4],
+              const float (*v1)[4], const float (*v2)[4])
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0][0] - v2[0][0];
+   const float ey = v0[0][1] - v2[0][1];
+   const float fx = v1[0][0] - v2[0][0];
+   const float fy = v1[0][1] - v2[0][1];
+
+   /* det = cross(e,f).z; a zero or NaN det takes the CW path */
+   if (!(ex * fy - ey * fx < 0.0f))
+      triangle_cw( setup, v0, v1, v2 );
+   else
+      triangle_ccw( setup, v0, v1, v2 );
+}
+
+
+/** No-op triangle handler: all arguments are ignored, nothing is binned. */
+static void
+triangle_nop(struct lp_setup_context *setup, const float (*v0)[4],
+             const float (*v1)[4], const float (*v2)[4])
+{
+}
+
+
+/**
+ * Pick setup->triangle for the current cull mode and front-face winding:
+ * no culling handles both windings, back/front-face culling keeps only
+ * the winding that is not culled, and any other cull mode selects the
+ * no-op handler.
+ */
+void
+lp_setup_choose_triangle( struct lp_setup_context *setup )
+{
+   if (setup->cullmode == PIPE_FACE_NONE)
+      setup->triangle = triangle_both;
+   else if (setup->cullmode == PIPE_FACE_BACK)
+      setup->triangle = setup->ccw_is_frontface ? triangle_ccw : triangle_cw;
+   else if (setup->cullmode == PIPE_FACE_FRONT)
+      setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw;
+   else
+      setup->triangle = triangle_nop;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
new file mode 100644
index 0000000000..f6a424f25a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -0,0 +1,550 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Interface between 'draw' module's output and the llvmpipe rasterizer/setup
+ * code.  When the 'draw' module has finished filling a vertex buffer, the
+ * draw_arrays() functions below will be called.  Loop over the vertices and
+ * call the point/line/tri setup functions.
+ *
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include "lp_setup_context.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "util/u_memory.h"
+
+
+#define LP_MAX_VBUF_INDEXES 1024
+#define LP_MAX_VBUF_SIZE    4096
+
+  
+
+/** cast wrapper: converts the vbuf_render pointer to lp_setup_context */
+static struct lp_setup_context *
+lp_setup_context(struct vbuf_render *vbr)
+{
+   struct lp_setup_context *setup = (struct lp_setup_context *) vbr;
+   return setup;
+}
+
+
+/**
+ * Return setup->vertex_info.  State is brought up to date first because
+ * the vertex size/info depends on the latest state.
+ */
+static const struct vertex_info *
+lp_setup_get_vertex_info(struct vbuf_render *vbr)
+{
+   struct lp_setup_context *const ctx = lp_setup_context(vbr);
+   lp_setup_update_state(ctx);
+   return ctx->vertex_info;
+}
+
+/* Ensure the vertex buffer holds vertex_size * nr_vertices bytes; the product
+ * is formed in unsigned (ushort * ushort would use signed int).  FALSE on failure. */
+static boolean
+lp_setup_allocate_vertices(struct vbuf_render *vbr,
+                          ushort vertex_size, ushort nr_vertices)
+{
+   struct lp_setup_context *setup = lp_setup_context(vbr);
+   unsigned size = (unsigned) vertex_size * nr_vertices;
+   if (setup->vertex_buffer_size < size) {
+      align_free(setup->vertex_buffer);
+      setup->vertex_buffer = align_malloc(size, 16);
+      /* record 0 on failure so that a later, smaller request retries */
+      setup->vertex_buffer_size = setup->vertex_buffer ? size : 0;
+   }
+   setup->vertex_size = vertex_size;
+   setup->nr_vertices = nr_vertices;
+   return setup->vertex_buffer != NULL;
+}
+
+/* Nothing is released here: the old allocation is kept for next time. */
+static void
+lp_setup_release_vertices(struct vbuf_render *vbr)
+{
+}
+
+/** Return the setup context's current vertex buffer pointer. */
+static void *
+lp_setup_map_vertices(struct vbuf_render *vbr)
+{
+   return lp_setup_context(vbr)->vertex_buffer;
+}
+
+/** Does nothing except assert that vertex max_index fits in the buffer. */
+static void
+lp_setup_unmap_vertices(struct vbuf_render *vbr,
+                        ushort min_index, ushort max_index)
+{
+   struct lp_setup_context *setup = lp_setup_context(vbr);
+
+   assert( (max_index+1) * setup->vertex_size <= setup->vertex_buffer_size );
+}
+
+
+/** Store the primitive type on the setup context; always returns TRUE. */
+static boolean lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim)
+{
+   lp_setup_context(vbr)->prim = prim;
+   return TRUE;
+}
+
+typedef const float (*const_float4_ptr)[4];
+
+/** Address of vertex 'index' in a buffer whose vertices are 'stride' bytes apart. */
+static INLINE const_float4_ptr
+get_vert(const void *vertex_buffer, int index, int stride)
+{
+   return (const_float4_ptr)((const char *)vertex_buffer + index * stride);
+}
+
+/**
+ * draw elements / indexed primitives
+ */
+static void
+lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
+{
+   struct lp_setup_context *setup = lp_setup_context(vbr);
+   const unsigned stride = setup->vertex_info->size * sizeof(float);
+   const void *vertex_buffer = setup->vertex_buffer;
+   const boolean flatshade_first = setup->flatshade_first;
+   unsigned i;
+
+   lp_setup_update_state(setup);
+
+   switch (setup->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < nr; i++) {
+         setup->point( setup,
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 1; i < nr; i += 2) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[i-1], stride),
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[i-1], stride),
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[i-1], stride),
+                      get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      if (nr) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, indices[nr-1], stride),
+                      get_vert(vertex_buffer, indices[0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      for (i = 2; i < nr; i += 3) {
+         setup->triangle( setup,
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            /* emit first triangle vertex as first triangle vertex */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
+                             get_vert(vertex_buffer, indices[i-(i&1)], stride) );
+
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            /* emit last triangle vertex as last triangle vertex */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
+                             get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            /* emit first non-spoke vertex as first vertex */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[0], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            /* emit last non-spoke vertex as last vertex */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[0], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      /* GL quads don't follow provoking vertex convention */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride) );
+
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                          get_vert(vertex_buffer, indices[i-3], stride),
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      /* GL quad strips don't follow provoking vertex convention */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-2], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-3], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.
+       */
+      if (flatshade_first) { 
+         /* emit first polygon  vertex as first triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[0], stride),
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      else {
+         /* emit first polygon  vertex as last triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, indices[i-1], stride),
+                             get_vert(vertex_buffer, indices[i-0], stride),
+                             get_vert(vertex_buffer, indices[0], stride) );
+         }
+      }
+      break;
+
+   default:
+      assert(0);
+   }
+}
+
+
+/**
+ * Called when the draw module works in pass-through mode: convert 'nr'
+ * array vertices into point/line/triangle calls according to setup->prim.
+ */
+static void
+lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
+{
+   struct lp_setup_context *setup = lp_setup_context(vbr);
+   const unsigned stride = setup->vertex_info->size * sizeof(float);
+   const void *vertex_buffer =
+      (void *) get_vert(setup->vertex_buffer, start, stride);
+   const boolean flatshade_first = setup->flatshade_first;
+   unsigned i;
+   /* NOTE(review): vertex_buffer is presumably rebased to 'start' above - confirm */
+   lp_setup_update_state(setup);
+
+   switch (setup->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < nr; i++) {
+         setup->point( setup,
+                       get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 1; i < nr; i += 2) {   /* a trailing unpaired vertex is skipped */
+         setup->line( setup,
+                      get_vert(vertex_buffer, i-1, stride),
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, i-1, stride),
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      for (i = 1; i < nr; i ++) {
+         setup->line( setup,
+                      get_vert(vertex_buffer, i-1, stride),
+                      get_vert(vertex_buffer, i-0, stride) );
+      }
+      if (nr) {   /* closing segment; NOTE(review): nr == 1 emits a 0->0 line - intended? */
+         setup->line( setup,
+                      get_vert(vertex_buffer, nr-1, stride),
+                      get_vert(vertex_buffer, 0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      for (i = 2; i < nr; i += 3) {   /* leftover vertices (< 3) are skipped */
+         setup->triangle( setup,
+                          get_vert(vertex_buffer, i-2, stride),
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (flatshade_first) {
+         for (i = 2; i < nr; i++) {
+            /* vertex i-2 is always emitted first; odd i swaps the other two */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i+(i&1)-1, stride),
+                             get_vert(vertex_buffer, i-(i&1), stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i++) {
+            /* vertex i is always emitted last; odd i swaps the first two */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i+(i&1)-2, stride),
+                             get_vert(vertex_buffer, i-(i&1)-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            /* emit first non-spoke vertex as first vertex (hub vertex 0 last) */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, 0, stride)  );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            /* emit last non-spoke vertex as last vertex (hub vertex 0 first) */
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, 0, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      /* GL quads don't follow provoking vertex convention */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-2, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-1, stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      /* GL quad strips don't follow provoking vertex convention */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-2, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-3, stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-2, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-3, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.
+       */
+      if (flatshade_first) { 
+         /* emit first polygon  vertex as first triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, 0, stride),
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      else {
+         /* emit first polygon  vertex as last triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            setup->triangle( setup,
+                             get_vert(vertex_buffer, i-1, stride),
+                             get_vert(vertex_buffer, i-0, stride),
+                             get_vert(vertex_buffer, 0, stride) );
+         }
+      }
+      break;
+
+   default:
+      assert(0);   /* any other setup->prim value is not handled here */
+   }
+}
+
+/** Destroy hook for setup->base: free the vertex storage, then the setup. */
+static void
+lp_setup_vbuf_destroy(struct vbuf_render *vbr)
+{
+   struct lp_setup_context *ctx = lp_setup_context(vbr);
+
+   if (ctx->vertex_buffer != NULL) {
+      align_free(ctx->vertex_buffer);
+      ctx->vertex_buffer = NULL;
+   }
+   lp_setup_destroy(ctx);
+}
+
+
+/**
+ * Fill in setup->base: vertex-buffer limits plus the lp_setup_* callbacks.
+ */
+void
+lp_setup_init_vbuf(struct lp_setup_context *setup)
+{
+   setup->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
+   setup->base.max_indices = LP_MAX_VBUF_INDEXES;
+   /* callbacks, grouped: vertex storage, primitive submission, teardown */
+   setup->base.get_vertex_info = lp_setup_get_vertex_info;
+   setup->base.allocate_vertices = lp_setup_allocate_vertices;
+   setup->base.map_vertices = lp_setup_map_vertices;
+   setup->base.unmap_vertices = lp_setup_unmap_vertices;
+   setup->base.release_vertices = lp_setup_release_vertices;
+   setup->base.set_primitive = lp_setup_set_primitive;
+   setup->base.draw_arrays = lp_setup_draw_arrays;
+   setup->base.draw_elements = lp_setup_draw_elements;
+   setup->base.destroy = lp_setup_vbuf_destroy;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
new file mode 100644
index 0000000000..05d1b93794
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_STATE_H
+#define LP_STATE_H
+
+#include "pipe/p_state.h"
+#include "lp_jit.h"
+#include "lp_state_fs.h"
+#include "gallivm/lp_bld.h"
+
+/* Dirty-state bits: OR'ed into llvmpipe_context::dirty, cleared by llvmpipe_update_derived() */
+#define LP_NEW_VIEWPORT      0x1
+#define LP_NEW_RASTERIZER    0x2
+#define LP_NEW_FS            0x4
+#define LP_NEW_BLEND         0x8
+#define LP_NEW_CLIP          0x10
+#define LP_NEW_SCISSOR       0x20
+#define LP_NEW_STIPPLE       0x40
+#define LP_NEW_FRAMEBUFFER   0x80
+#define LP_NEW_DEPTH_STENCIL_ALPHA 0x100
+#define LP_NEW_CONSTANTS     0x200
+#define LP_NEW_SAMPLER       0x400
+#define LP_NEW_SAMPLER_VIEW  0x800
+#define LP_NEW_VERTEX        0x1000
+#define LP_NEW_VS            0x2000
+#define LP_NEW_QUERY         0x4000
+#define LP_NEW_BLEND_COLOR   0x8000
+#define LP_NEW_GS            0x10000
+#define LP_NEW_SO            0x20000
+#define LP_NEW_SO_BUFFERS    0x40000
+
+
+
+struct vertex_info;
+struct pipe_context;
+struct llvmpipe_context;
+
+
+
+/** Subclass of pipe_shader_state carrying a draw-module vertex shader */
+struct lp_vertex_shader
+{
+   struct pipe_shader_state shader;
+   struct draw_vertex_shader *draw_data;
+};
+
+/** Subclass of pipe_shader_state carrying a draw-module geometry shader */
+struct lp_geometry_shader {
+   struct pipe_shader_state shader;
+   struct draw_geometry_shader *draw_data;
+};
+
+/** Vertex element state: room for up to PIPE_MAX_ATTRIBS elements */
+struct lp_velems_state
+{
+   unsigned count;   /* presumably the number of valid velem[] entries - confirm */
+   struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
+};
+
+struct lp_so_state {   /* contains only the pipe-level stream output state */
+   struct pipe_stream_output_state base;
+};
+
+/* State entry points. The init_*_funcs() seen in this patch (blend, clip) fill llvmpipe->pipe hooks. */
+void
+llvmpipe_set_framebuffer_state(struct pipe_context *,
+                               const struct pipe_framebuffer_state *);
+
+void
+llvmpipe_update_fs(struct llvmpipe_context *lp);
+
+void
+llvmpipe_update_derived(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_blend_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_vertex_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_draw_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_clip_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_vs_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_gs_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_rasterizer_funcs(struct llvmpipe_context *llvmpipe);
+
+void
+llvmpipe_init_so_funcs(struct llvmpipe_context *llvmpipe);
+
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
new file mode 100644
index 0000000000..5b39d9d1a9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -0,0 +1,172 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_dump.h"
+#include "draw/draw_context.h"
+#include "lp_screen.h"
+#include "lp_context.h"
+#include "lp_state.h"
+
+/** Create a blend CSO: a mem_dup() copy of the caller's blend state. */
+static void *
+llvmpipe_create_blend_state(struct pipe_context *pipe,
+                            const struct pipe_blend_state *blend)
+{
+   return mem_dup(blend, sizeof *blend);
+}
+
+/** Make 'blend' the current blend CSO; does nothing if it already is. */
+static void
+llvmpipe_bind_blend_state(struct pipe_context *pipe, void *blend)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   if (lp->blend != blend) {
+      /* draw_flush() is issued before the new state is stored */
+      draw_flush(lp->draw);
+
+      lp->blend = blend;
+
+      lp->dirty |= LP_NEW_BLEND;
+   }
+}
+
+/** Release a blend CSO with FREE(). */
+static void
+llvmpipe_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend);
+}
+
+/** Update the blend color; a NULL or unchanged color is ignored. */
+static void
+llvmpipe_set_blend_color(struct pipe_context *pipe,
+                         const struct pipe_blend_color *blend_color)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   /* bail out early: no color given, or byte-identical to the current one */
+   if (!blend_color ||
+       memcmp(&lp->blend_color, blend_color, sizeof *blend_color) == 0) {
+      return;
+   }
+
+   draw_flush(lp->draw);
+
+   memcpy(&lp->blend_color, blend_color, sizeof *blend_color);
+
+   lp->dirty |= LP_NEW_BLEND_COLOR;
+}
+
+
+/** XXX move someday?  Or consolidate all these simple state setters
+ * into one file.
+ */
+
+/** Create a depth/stencil/alpha CSO: a mem_dup() copy of the input state. */
+static void *
+llvmpipe_create_depth_stencil_state(struct pipe_context *pipe,
+                                    const struct pipe_depth_stencil_alpha_state *depth_stencil)
+{
+   return mem_dup(depth_stencil, sizeof *depth_stencil);
+}
+
+/** Make 'depth_stencil' the current CSO; does nothing if it already is. */
+static void
+llvmpipe_bind_depth_stencil_state(struct pipe_context *pipe,
+                                  void *depth_stencil)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   if (lp->depth_stencil != depth_stencil) {
+      /* draw_flush() is issued before the new state is stored */
+      draw_flush(lp->draw);
+
+      lp->depth_stencil = depth_stencil;
+
+      lp->dirty |= LP_NEW_DEPTH_STENCIL_ALPHA;
+   }
+}
+
+/** Release a depth/stencil/alpha CSO with FREE(). */
+static void
+llvmpipe_delete_depth_stencil_state(struct pipe_context *pipe, void *depth)
+{
+   FREE(depth);
+}
+
+/** Update the stencil reference values; NULL or unchanged input is ignored. */
+static void
+llvmpipe_set_stencil_ref(struct pipe_context *pipe,
+                         const struct pipe_stencil_ref *stencil_ref)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   /* bail out early: nothing given, or byte-identical to the current value */
+   if (!stencil_ref ||
+       memcmp(&lp->stencil_ref, stencil_ref, sizeof *stencil_ref) == 0) {
+      return;
+   }
+
+   draw_flush(lp->draw);
+
+   memcpy(&lp->stencil_ref, stencil_ref, sizeof *stencil_ref);
+
+   /* shares the depth/stencil/alpha dirty bit; a dedicated flag is an open question */
+   lp->dirty |= LP_NEW_DEPTH_STENCIL_ALPHA;
+}
+
+static void
+llvmpipe_set_sample_mask(struct pipe_context *pipe,
+                         unsigned sample_mask)
+{  /* empty body: the sample mask is accepted and ignored */
+}
+
+void
+llvmpipe_init_blend_funcs(struct llvmpipe_context *llvmpipe)
+{
+   /* blend CSO hooks */
+   llvmpipe->pipe.delete_blend_state = llvmpipe_delete_blend_state;
+   llvmpipe->pipe.bind_blend_state   = llvmpipe_bind_blend_state;
+   llvmpipe->pipe.create_blend_state = llvmpipe_create_blend_state;
+   /* depth/stencil/alpha CSO hooks */
+   llvmpipe->pipe.delete_depth_stencil_alpha_state = llvmpipe_delete_depth_stencil_state;
+   llvmpipe->pipe.bind_depth_stencil_alpha_state   = llvmpipe_bind_depth_stencil_state;
+   llvmpipe->pipe.create_depth_stencil_alpha_state = llvmpipe_create_depth_stencil_state;
+   /* immediate (non-CSO) parameter setters */
+   llvmpipe->pipe.set_sample_mask = llvmpipe_set_sample_mask;
+   llvmpipe->pipe.set_stencil_ref = llvmpipe_set_stencil_ref;
+   llvmpipe->pipe.set_blend_color = llvmpipe_set_blend_color;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_clip.c b/src/gallium/drivers/llvmpipe/lp_state_clip.c
new file mode 100644
index 0000000000..32ae079cc1
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_clip.c
@@ -0,0 +1,94 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+#include "lp_context.h"
+#include "lp_state.h"
+#include "draw/draw_context.h"
+
+/**
+ * Forward the clip state to the draw module; nothing is stored locally.
+ */
+static void
+llvmpipe_set_clip_state(struct pipe_context *pipe,
+                        const struct pipe_clip_state *clip)
+{
+   /* no driver-side copy is made and no dirty flag is raised */
+   draw_set_clip_state(llvmpipe_context(pipe)->draw, clip);
+}
+
+/** Hand the viewport to the draw module, then record it in the context. */
+static void
+llvmpipe_set_viewport_state(struct pipe_context *pipe,
+                            const struct pipe_viewport_state *viewport)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   /* the draw module is told first, before the context copy is updated */
+   draw_set_viewport_state(lp->draw, viewport);
+
+   lp->viewport = *viewport;       /* copy by value */
+   lp->dirty |= LP_NEW_VIEWPORT;
+}
+
+/** Flush the draw module, then record the new scissor rectangle. */
+static void
+llvmpipe_set_scissor_state(struct pipe_context *pipe,
+                           const struct pipe_scissor_state *scissor)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   draw_flush(lp->draw);
+
+   lp->scissor = *scissor;         /* copy by value */
+   lp->dirty |= LP_NEW_SCISSOR;
+}
+
+/** Flush the draw module, then record the new polygon stipple pattern. */
+static void
+llvmpipe_set_polygon_stipple(struct pipe_context *pipe,
+                             const struct pipe_poly_stipple *stipple)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   draw_flush(lp->draw);
+
+   lp->poly_stipple = *stipple;    /* copy by value */
+   lp->dirty |= LP_NEW_STIPPLE;
+}
+
+
+/** Install the clip, viewport, scissor and stipple setters in llvmpipe->pipe. */
+void
+llvmpipe_init_clip_funcs(struct llvmpipe_context *llvmpipe)
+{
+   llvmpipe->pipe.set_viewport_state = llvmpipe_set_viewport_state;
+   llvmpipe->pipe.set_scissor_state = llvmpipe_set_scissor_state;
+   llvmpipe->pipe.set_polygon_stipple = llvmpipe_set_polygon_stipple;
+   llvmpipe->pipe.set_clip_state = llvmpipe_set_clip_state;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
new file mode 100644
index 0000000000..d20a5218d4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -0,0 +1,198 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_private.h"
+#include "lp_context.h"
+#include "lp_screen.h"
+#include "lp_setup.h"
+#include "lp_state.h"
+
+
+
+/**
+ * The vertex info describes how to convert the post-transformed vertices
+ * (simple float[][4]) used by the 'draw' module into vertices for
+ * rasterization.  This function rebuilds llvmpipe->vertex_info and
+ * llvmpipe->inputs from the current FS inputs and passes both on to
+ * the setup module.
+ */
+static void
+compute_vertex_info(struct llvmpipe_context *llvmpipe)
+{
+   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
+   struct vertex_info *vinfo = &llvmpipe->vertex_info;
+   struct lp_shader_input *inputs = llvmpipe->inputs;
+   unsigned vs_index;
+   uint i;
+
+   /*
+    * Match FS inputs against VS outputs, emitting the necessary attributes.
+    */
+
+   vinfo->num_attribs = 0;
+
+   vs_index = draw_find_shader_output(llvmpipe->draw,
+                                       TGSI_SEMANTIC_POSITION,
+                                       0);
+   /* Position is emitted first; the src_index = 0 used for it below relies on that. */
+   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+
+   for (i = 0; i < lpfs->info.num_inputs; i++) {
+      /*
+       * Search for each input in current vs output:
+       */
+
+      vs_index = draw_find_shader_output(llvmpipe->draw,
+                                         lpfs->info.input_semantic_name[i],
+                                         lpfs->info.input_semantic_index[i]);
+
+      /* This can be pre-computed, except for flatshade:
+       */
+      inputs[i].usage_mask = lpfs->info.input_usage_mask[i];
+
+      switch (lpfs->info.input_interpolate[i]) {
+      case TGSI_INTERPOLATE_CONSTANT:
+         inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
+      case TGSI_INTERPOLATE_LINEAR:
+         inputs[i].interp = LP_INTERP_LINEAR;
+         break;
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+         inputs[i].interp = LP_INTERP_PERSPECTIVE;
+         break;
+      default:
+         assert(0);
+         break;
+      }
+
+      switch (lpfs->info.input_semantic_name[i]) {
+      case TGSI_SEMANTIC_FACE:
+         inputs[i].interp = LP_INTERP_FACING;
+         break;   /* note: an attribute is still emitted for this input below */
+      case TGSI_SEMANTIC_POSITION:
+         /* Position was already emitted above
+          */
+         inputs[i].interp = LP_INTERP_POSITION;
+         inputs[i].src_index = 0;
+         continue;   /* skips the per-input emit at the end of the loop body */
+      case TGSI_SEMANTIC_COLOR:
+         /* Colors are linearly interpolated in the fragment shader
+          * even when flatshading is active.  This just tells the
+          * setup module to use coefficients with ddx==0 and
+          * ddy==0.
+          */
+         if (llvmpipe->rasterizer->flatshade)
+            inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
+
+      default:
+         break;
+      }
+
+      /*
+       * Emit the requested fs attribute for all but position.
+       */
+
+      inputs[i].src_index = vinfo->num_attribs;
+      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
+   }
+   llvmpipe->num_inputs = lpfs->info.num_inputs;
+
+   draw_compute_vertex_size(vinfo);
+
+   lp_setup_set_vertex_info(llvmpipe->setup, vinfo);
+
+   lp_setup_set_fs_inputs(llvmpipe->setup,
+                          inputs,
+                          lpfs->info.num_inputs);
+}
+
+
+/**
+ * Handle state changes.
+ * Called just prior to drawing anything (pipe::draw_arrays(), etc).
+ *
+ * Each step below is gated on bits of llvmpipe->dirty; all bits are
+ * cleared at the end, including those no step here looks at.
+ */
+void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
+{
+   struct llvmpipe_screen *lp_screen = llvmpipe_screen(llvmpipe->pipe.screen);
+
+   /* Check for updated textures: a changed screen timestamp marks the
+    * sampler views dirty. */
+   if (llvmpipe->tex_timestamp != lp_screen->timestamp) {
+      llvmpipe->tex_timestamp = lp_screen->timestamp;
+      llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
+   }
+   /* compute_vertex_info() reads the rasterizer's flatshade bit and the FS inputs */
+   if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
+                          LP_NEW_FS |
+                          LP_NEW_VS))
+      compute_vertex_info( llvmpipe );
+
+   if (llvmpipe->dirty & (LP_NEW_FS |
+                          LP_NEW_BLEND |
+                          LP_NEW_SCISSOR |
+                          LP_NEW_DEPTH_STENCIL_ALPHA |
+                          LP_NEW_RASTERIZER |
+                          LP_NEW_SAMPLER |
+                          LP_NEW_SAMPLER_VIEW |
+                          LP_NEW_QUERY))
+      llvmpipe_update_fs( llvmpipe );
+
+   if (llvmpipe->dirty & LP_NEW_BLEND_COLOR)
+      lp_setup_set_blend_color(llvmpipe->setup,
+                               &llvmpipe->blend_color);
+
+   if (llvmpipe->dirty & LP_NEW_SCISSOR)
+      lp_setup_set_scissor(llvmpipe->setup, &llvmpipe->scissor);
+   /* NOTE(review): depth_stencil is dereferenced unchecked - assumes a CSO is bound */
+   if (llvmpipe->dirty & LP_NEW_DEPTH_STENCIL_ALPHA) {
+      lp_setup_set_alpha_ref_value(llvmpipe->setup, 
+                                   llvmpipe->depth_stencil->alpha.ref_value);
+      lp_setup_set_stencil_ref_values(llvmpipe->setup,
+                                      llvmpipe->stencil_ref.ref_value);
+   }
+
+   if (llvmpipe->dirty & LP_NEW_CONSTANTS)
+      lp_setup_set_fs_constants(llvmpipe->setup, 
+                                llvmpipe->constants[PIPE_SHADER_FRAGMENT][0]);
+
+   if (llvmpipe->dirty & LP_NEW_SAMPLER_VIEW)
+      lp_setup_set_fragment_sampler_views(llvmpipe->setup, 
+                                          llvmpipe->num_fragment_sampler_views,
+                                          llvmpipe->fragment_sampler_views);
+   /* all dirty bits are cleared, whether or not a step above consumed them */
+   llvmpipe->dirty = 0;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
new file mode 100644
index 0000000000..65115052cd
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -0,0 +1,1322 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Code generation for the whole fragment pipeline.
+ *
+ * The fragment pipeline consists of the following stages:
+ * - triangle edge in/out testing
+ * - scissor test
+ * - stipple (TBI)
+ * - early depth test
+ * - fragment shader
+ * - alpha test
+ * - depth/stencil test
+ * - blending
+ *
+ * This file has only the glue to assemble the fragment pipeline.  The actual
+ * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
+ * lp_bld_*.[ch] files, and in a complete generic and reusable way. Here we
+ * muster the LLVM JIT execution engine to create a function that follows an
+ * established binary interface and that can be called from C directly.
+ *
+ * A big source of complexity here is that we often want to run different
+ * stages with different precisions and data types. For example,
+ * the fragment shader needs typically to be done in floats, but the
+ * depth/stencil test and blending is better done in the type that most closely
+ * matches the depth/stencil and color buffer respectively.
+ *
+ * Since the width of a SIMD vector register stays the same regardless of the
+ * element type, different types imply different number of elements, so we must
+ * code generate more instances of the stages with larger types to be able to
+ * feed/consume the stages with smaller types.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include <limits.h>
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_pointer.h"
+#include "util/u_format.h"
+#include "util/u_dump.h"
+#include "util/u_string.h"
+#include "util/u_simple_list.h"
+#include "os/os_time.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
+#include "gallivm/lp_bld_intr.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_tgsi.h"
+#include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_flow.h"
+#include "gallivm/lp_bld_debug.h"
+
+#include "lp_bld_alpha.h"
+#include "lp_bld_blend.h"
+#include "lp_bld_depth.h"
+#include "lp_bld_interp.h"
+#include "lp_context.h"
+#include "lp_debug.h"
+#include "lp_perf.h"
+#include "lp_screen.h"
+#include "lp_setup.h"
+#include "lp_state.h"
+#include "lp_tex_sample.h"
+#include "lp_flush.h"
+
+
+#include <llvm-c/Analysis.h>
+
+
+static unsigned fs_no = 0;  /* NOTE(review): presumably a running count used to number fragment shaders (cf. shader->no) -- confirm at its use site */
+
+
+/**
+ * Generate the depth/stencil test code.  Emits nothing when the key has depth and both stencil faces disabled.
+ */
+static void
+generate_depth_stencil(LLVMBuilderRef builder,
+                       const struct lp_fragment_shader_variant_key *key, /* read: depth, stencil[], zsbuf_format */
+                       struct lp_type src_type,             /* type of src; by-value copy, adjusted below */
+                       struct lp_build_mask_context *mask,  /* forwarded to lp_build_depth_stencil_test() */
+                       LLVMValueRef stencil_refs[2],        /* forwarded; generate_fs() passes front in [0], back in [1] */
+                       LLVMValueRef src,                    /* fragment Z values, of type src_type */
+                       LLVMValueRef dst_ptr,                /* depth buffer pointer; bitcast below to a dst_type vector pointer */
+                       LLVMValueRef facing,                 /* forwarded unchanged */
+                       LLVMValueRef counter)                /* forwarded unchanged */
+{
+   const struct util_format_description *format_desc;
+   struct lp_type dst_type;
+
+   if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
+      return;  /* nothing to test: no IR is emitted */
+
+   format_desc = util_format_description(key->zsbuf_format);
+   assert(format_desc);
+
+   /*
+    * Depths are expected to be between 0 and 1, even if they are stored in
+    * floats. Setting these bits here will ensure that the lp_build_conv() call
+    * below won't try to unnecessarily clamp the incoming values.
+    */
+   if(src_type.floating) {
+      src_type.sign = FALSE;
+      src_type.norm = TRUE;
+   }
+   else {
+      assert(!src_type.sign);  /* non-float input must already be described as unsigned ... */
+      assert(src_type.norm);   /* ... and normalized */
+   }
+
+   /* Pick the depth type. */
+   dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);  /* 2nd argument: width times length of the source vector */
+
+   /* FIXME: Cope with a depth test type with a different bit width. */
+   assert(dst_type.width == src_type.width);
+   assert(dst_type.length == src_type.length);
+
+   /* Convert fragment Z from float to integer; src is both the input and the output of the conversion */
+   lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
+
+   dst_ptr = LLVMBuildBitCast(builder,
+                              dst_ptr,
+                              LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
+   lp_build_depth_stencil_test(builder,
+                               &key->depth,
+                               key->stencil,
+                               dst_type,
+                               format_desc,
+                               mask,
+                               stencil_refs,
+                               src,       /* converted Z */
+                               dst_ptr,   /* re-typed depth buffer pointer */
+                               facing,
+                               counter);
+}
+
+
+/**
+ * Generate the code to do inside/outside triangle testing for the
+ * four pixels in a 2x2 quad.  This will set the four elements of the
+ * quad mask vector to 0 or ~0.
+ * \param i  which quad of the quad group to test, in [0,3]
+ */
+static void
+generate_tri_edge_mask(LLVMBuilderRef builder,
+                       unsigned i,
+                       LLVMValueRef *mask,      /* ivec4, out: receives in_out_mask */
+                       LLVMValueRef c0,         /* int32, broadcast and compared against step0[i] */
+                       LLVMValueRef c1,         /* int32, broadcast and compared against step1[i] */
+                       LLVMValueRef c2,         /* int32, broadcast and compared against step2[i] */
+                       LLVMValueRef step0_ptr,  /* ivec4 array, element i is loaded */
+                       LLVMValueRef step1_ptr,  /* ivec4 array, element i is loaded */
+                       LLVMValueRef step2_ptr)  /* ivec4 array, element i is loaded */
+{
+#define OPTIMIZE_IN_OUT_TEST 0 /* 0: the test below is emitted unconditionally; nonzero: it is wrapped in if (c0 != INT_MIN) */
+#if OPTIMIZE_IN_OUT_TEST
+   struct lp_build_if_state ifctx;
+   LLVMValueRef not_draw_all;
+#endif
+   struct lp_build_flow_context *flow;
+   struct lp_type i32_type;
+   LLVMTypeRef i32vec4_type;
+   LLVMValueRef c0_vec, c1_vec, c2_vec;
+   LLVMValueRef in_out_mask;
+
+   assert(i < 4);
+   
+   /* int32 vector type */
+   memset(&i32_type, 0, sizeof i32_type);
+   i32_type.floating = FALSE; /* values are integers */
+   i32_type.sign = TRUE;      /* values are signed */
+   i32_type.norm = FALSE;     /* values are not normalized */
+   i32_type.width = 32;       /* 32-bit int values */
+   i32_type.length = 4;       /* 4 elements per vector */
+
+   i32vec4_type = lp_build_int32_vec4_type();
+
+   /*
+    * Use a conditional here to do detailed pixel in/out testing.
+    * We only have to do this if c0 != INT_MIN (conditional emitted only when OPTIMIZE_IN_OUT_TEST is nonzero).
+    */
+   flow = lp_build_flow_create(builder);
+   lp_build_flow_scope_begin(flow);  /* with OPTIMIZE_IN_OUT_TEST == 0 no variable is declared in this scope */
+
+   {
+#if OPTIMIZE_IN_OUT_TEST
+      /* not_draw_all = (c0 != INT_MIN) */
+      not_draw_all = LLVMBuildICmp(builder,
+                                   LLVMIntNE,
+                                   c0,
+                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
+                                   "");
+
+      in_out_mask = lp_build_const_int_vec(i32_type, ~0);  /* initial value before the if; the comparison result below replaces it inside */
+
+
+      lp_build_flow_scope_declare(flow, &in_out_mask);
+
+      /* if (not_draw_all) {... */
+      lp_build_if(&ifctx, flow, builder, not_draw_all);
+#endif
+      {
+         LLVMValueRef step0_vec, step1_vec, step2_vec;
+         LLVMValueRef m0_vec, m1_vec, m2_vec;
+         LLVMValueRef index, m;
+
+         /* c0_vec = {c0, c0, c0, c0}
+          * Note that we emit this code four times but LLVM optimizes away
+          * three instances of it.
+          */
+         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
+         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
+         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
+         lp_build_name(c0_vec, "edgeconst0vec");
+         lp_build_name(c1_vec, "edgeconst1vec");
+         lp_build_name(c2_vec, "edgeconst2vec");
+
+         /* load step0vec, step1, step2 vec from memory */
+         index = LLVMConstInt(LLVMInt32Type(), i, 0);  /* constant i: selects the i-th ivec4 of each step array */
+         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
+         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
+         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
+         lp_build_name(step0_vec, "step0vec");
+         lp_build_name(step1_vec, "step1vec");
+         lp_build_name(step2_vec, "step2vec");
+
+         /* m0_vec = step0_ptr[i] > c0_vec */
+         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
+         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
+         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);
+
+         /* in_out_mask = m0_vec & m1_vec & m2_vec */
+         m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
+         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
+         lp_build_name(in_out_mask, "inoutmaskvec");
+      }
+#if OPTIMIZE_IN_OUT_TEST
+      lp_build_endif(&ifctx);
+#endif
+
+   }
+   lp_build_flow_scope_end(flow);
+   lp_build_flow_destroy(flow);
+
+   /* This is the initial alive/dead pixel mask for a quad of four pixels.
+    * It's an int[4] vector with each word set to 0 or ~0.
+    * Words will get cleared when pixels fail the Z test, etc.
+    */
+   *mask = in_out_mask;
+}
+
+
+static LLVMValueRef  /* returns the AND of the four comparisons built below, named "scissormask" */
+generate_scissor_test(LLVMBuilderRef builder,
+                      LLVMValueRef context_ptr,  /* JIT context the scissor bounds are read from */
+                      const struct lp_build_interp_soa_context *interp,  /* pos[0] and pos[1] are used as x and y */
+                      struct lp_type type)       /* type of the position vectors and of the comparisons */
+{
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1];
+   LLVMValueRef xmin, ymin, xmax, ymax;
+   LLVMValueRef m0, m1, m2, m3, m;
+
+   /* xpos, ypos contain the window coords for the four pixels in the quad */
+   assert(xpos);
+   assert(ypos);
+
+   /* get the current scissor bounds, convert to vectors */
+   xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr);
+   xmin = lp_build_broadcast(builder, vec_type, xmin);
+
+   ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr);
+   ymin = lp_build_broadcast(builder, vec_type, ymin);
+
+   xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr);
+   xmax = lp_build_broadcast(builder, vec_type, xmax);
+
+   ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr);
+   ymax = lp_build_broadcast(builder, vec_type, ymax);
+
+   /* compare the fragment's position coordinates against the scissor bounds */
+   m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin);  /* x >= xmin */
+   m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin);  /* y >= ymin */
+   m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax);    /* x <  xmax: the max bounds are exclusive */
+   m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax);    /* y <  ymax */
+
+   /* AND all the masks together */
+   m = LLVMBuildAnd(builder, m0, m1, "");
+   m = LLVMBuildAnd(builder, m, m2, "");
+   m = LLVMBuildAnd(builder, m, m3, "");
+
+   lp_build_name(m, "scissormask");
+
+   return m;
+}
+
+
+static LLVMValueRef  /* the constant produced by lp_build_const_int_vec() */
+build_int32_vec_const(int value)
+{
+   struct lp_type ivec4_desc;
+   /* Describe a vector of 4 signed, non-normalized 32-bit ints, then build a constant of that type from value. */
+   memset(&ivec4_desc, 0, sizeof ivec4_desc);
+   ivec4_desc.length = 4;         /* 4 elements per vector */
+   ivec4_desc.width = 32;         /* 32-bit int values */
+   ivec4_desc.sign = TRUE;        /* values are signed */
+   ivec4_desc.norm = FALSE;       /* values are not normalized */
+   ivec4_desc.floating = FALSE;   /* values are integers */
+   return lp_build_const_int_vec(ivec4_desc, value);
+}
+
+
+
+/**
+ * Generate the fragment shader, depth/stencil test, and alpha tests.
+ * \param i  which quad in the tile, in range [0,3]
+ * \param do_tri_test  if nonzero, do triangle edge in/out testing; otherwise the initial mask is all ones
+ */
+static void
+generate_fs(struct llvmpipe_context *lp,      /* not referenced in the body */
+            struct lp_fragment_shader *shader,  /* supplies the TGSI tokens and shader info */
+            const struct lp_fragment_shader_variant_key *key,
+            LLVMBuilderRef builder,
+            struct lp_type type,               /* vector type the shader is generated for */
+            LLVMValueRef context_ptr,          /* JIT context: stencil refs, constants, alpha ref, scissor */
+            unsigned i,
+            const struct lp_build_interp_soa_context *interp,  /* pos[] and inputs[] feed the shader; pos[2] is the initial Z */
+            struct lp_build_sampler_soa *sampler,
+            LLVMValueRef *pmask,               /* out: final value of the execution mask */
+            LLVMValueRef (*color)[4],          /* out: color[cbuf][chan] shader color outputs */
+            LLVMValueRef depth_ptr,            /* forwarded to generate_depth_stencil() */
+            LLVMValueRef facing,               /* forwarded to generate_depth_stencil() */
+            unsigned do_tri_test,
+            LLVMValueRef c0,                   /* c0..c2 and step0..2_ptr are only used when do_tri_test is set */
+            LLVMValueRef c1,
+            LLVMValueRef c2,
+            LLVMValueRef step0_ptr,
+            LLVMValueRef step1_ptr,
+            LLVMValueRef step2_ptr,
+            LLVMValueRef counter)              /* forwarded to generate_depth_stencil() */
+{
+   const struct tgsi_token *tokens = shader->base.tokens;
+   LLVMTypeRef vec_type;
+   LLVMValueRef consts_ptr;
+   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
+   LLVMValueRef z = interp->pos[2];  /* replaced below if the shader writes POSITION.z */
+   LLVMValueRef stencil_refs[2];
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask;
+   boolean early_depth_stencil_test;
+   unsigned attrib;
+   unsigned chan;
+   unsigned cbuf;
+
+   assert(i < 4);
+
+   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(builder, context_ptr);
+   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(builder, context_ptr);
+
+   vec_type = lp_build_vec_type(type);
+
+   consts_ptr = lp_jit_context_constants(builder, context_ptr);
+
+   flow = lp_build_flow_create(builder);
+
+   memset(outputs, 0, sizeof outputs);  /* entries still NULL after lp_build_tgsi_soa() are skipped in the output loop below */
+
+   lp_build_flow_scope_begin(flow);
+
+   /* Declare the color and z variables */
+   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+	 color[cbuf][chan] = LLVMGetUndef(vec_type);
+	 lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+      }
+   }
+   lp_build_flow_scope_declare(flow, &z);
+
+   /* do triangle edge testing */
+   if (do_tri_test) {
+      generate_tri_edge_mask(builder, i, pmask,
+                             c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+   }
+   else {
+      *pmask = build_int32_vec_const(~0);  /* no edge test: start with every pixel alive */
+   }
+
+   /* 'mask' will control execution based on quad's pixel alive/killed state */
+   lp_build_mask_begin(&mask, flow, type, *pmask);
+
+   if (key->scissor) {
+      LLVMValueRef smask =
+         generate_scissor_test(builder, context_ptr, interp, type);
+      lp_build_mask_update(&mask, smask);
+   }
+
+   early_depth_stencil_test =  /* run depth/stencil before the shader only when all conditions below hold */
+      (key->depth.enabled || key->stencil[0].enabled) &&
+      !key->alpha.enabled &&       /* no alpha test */
+      !shader->info.uses_kill &&   /* shader does not use kill */
+      !shader->info.writes_z;      /* shader does not write Z */
+
+   if (early_depth_stencil_test)
+      generate_depth_stencil(builder, key,
+                             type, &mask,
+                             stencil_refs, z, depth_ptr, facing, counter);
+
+   lp_build_tgsi_soa(builder, tokens, type, &mask,
+                     consts_ptr, interp->pos, interp->inputs,
+                     outputs, sampler, &shader->info);
+
+   /* loop over fragment shader outputs/results */
+   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if(outputs[attrib][chan]) {
+            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");  /* outputs[][] hold pointers; load the value */
+            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
+
+            switch (shader->info.output_semantic_name[attrib]) {
+            case TGSI_SEMANTIC_COLOR:
+               {
+                  unsigned cbuf = shader->info.output_semantic_index[attrib];  /* NOTE: shadows the function-level cbuf */
+
+                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+
+                  /* Alpha test */
+                  /* XXX: should the alpha reference value be passed separately? */
+		  /* XXX: should only test the final assignment to alpha */
+                  if(cbuf == 0 && chan == 3) {  /* only the alpha channel of color output 0 is tested */
+                     LLVMValueRef alpha = out;
+                     LLVMValueRef alpha_ref_value;
+                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+                     lp_build_alpha_test(builder, &key->alpha, type,
+                                         &mask, alpha, alpha_ref_value);
+                  }
+
+		  color[cbuf][chan] = out;
+                  break;
+               }
+
+            case TGSI_SEMANTIC_POSITION:
+               if(chan == 2)
+                  z = out;  /* shader-written Z replaces the interpolated one */
+               break;
+            }
+         }
+      }
+   }
+
+   if (!early_depth_stencil_test)  /* late test: uses the final z and the mask as updated above */
+      generate_depth_stencil(builder, key,
+                             type, &mask,
+                             stencil_refs, z, depth_ptr, facing, counter);
+
+   lp_build_mask_end(&mask);
+
+   lp_build_flow_scope_end(flow);
+
+   lp_build_flow_destroy(flow);
+
+   *pmask = mask.value;  /* hand the final mask value back to the caller */
+
+}
+
+
+/**
+ * Generate color blending and color output.
+ * \param rt  the render target index (to index blend, colormask state)
+ * \param type  the pixel color type
+ * \param context_ptr  pointer to the runtime JIT context
+ * \param mask  execution mask (active fragment/pixel mask)
+ * \param src  colors from the fragment shader
+ * \param dst_ptr  the destination color buffer pointer
+ */
+static void
+generate_blend(const struct pipe_blend_state *blend,  /* blend state; rt[rt].colormask gates the per-channel stores */
+               unsigned rt,
+               LLVMBuilderRef builder,
+               struct lp_type type,
+               LLVMValueRef context_ptr,
+               LLVMValueRef mask,
+               LLVMValueRef *src,
+               LLVMValueRef dst_ptr)
+{
+   struct lp_build_context bld;
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask_ctx;
+   LLVMTypeRef vec_type;
+   LLVMValueRef const_ptr;
+   LLVMValueRef con[4];  /* constant blend color, one vector per channel */
+   LLVMValueRef dst[4];  /* current color buffer contents, one vector per channel */
+   LLVMValueRef res[4];  /* blend results written by lp_build_blend_soa() */
+   unsigned chan;
+
+   lp_build_context_init(&bld, builder, type);
+
+   flow = lp_build_flow_create(builder);
+
+   /* we'll use this mask context to skip blending if all pixels are dead */
+   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+
+   vec_type = lp_build_vec_type(type);
+
+   const_ptr = lp_jit_context_blend_color(builder, context_ptr);
+   const_ptr = LLVMBuildBitCast(builder, const_ptr,
+                                LLVMPointerType(vec_type, 0), "");  /* so the blend color can be indexed as vec_type vectors */
+
+   /* load constant blend color and colors from the dest color buffer */
+   for(chan = 0; chan < 4; ++chan) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+      con[chan] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, const_ptr, &index, 1, ""), "");
+
+      dst[chan] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dst_ptr, &index, 1, ""), "");
+
+      lp_build_name(con[chan], "con.%c", "rgba"[chan]);
+      lp_build_name(dst[chan], "dst.%c", "rgba"[chan]);
+   }
+
+   /* do blend */
+   lp_build_blend_soa(builder, blend, type, rt, src, dst, con, res);
+
+   /* store results to color buffer */
+   for(chan = 0; chan < 4; ++chan) {
+      if(blend->rt[rt].colormask & (1 << chan)) {  /* channels masked out by colormask are not stored */
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+         lp_build_name(res[chan], "res.%c", "rgba"[chan]);
+         res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);  /* presumably result where mask is set, old dst elsewhere -- confirm lp_build_select() */
+         LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      }
+   }
+
+   lp_build_mask_end(&mask_ctx);
+   lp_build_flow_destroy(flow);
+}
+
+
+/**
+ * Generate the runtime callable function for the whole fragment pipeline.
+ * Note that the function which we generate operates on a block of 16
+ * pixels at a time.  The block contains 2x2 quads.  Each quad contains
+ * 2x2 pixels.  do_tri_test selects the "edge" (nonzero) or "whole" (zero) flavour and the slot it is stored in.
+ */
+static void
+generate_fragment(struct llvmpipe_context *lp,
+                  struct lp_fragment_shader *shader,
+                  struct lp_fragment_shader_variant *variant,
+                  unsigned do_tri_test)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+   const struct lp_fragment_shader_variant_key *key = &variant->key;
+   char func_name[256];
+   struct lp_type fs_type;
+   struct lp_type blend_type;
+   LLVMTypeRef fs_elem_type;
+   LLVMTypeRef fs_int_vec_type;
+   LLVMTypeRef blend_vec_type;
+   LLVMTypeRef arg_types[16];  /* one entry per parameter of the generated function */
+   LLVMTypeRef func_type;
+   LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type();
+   LLVMValueRef context_ptr;
+   LLVMValueRef x;
+   LLVMValueRef y;
+   LLVMValueRef a0_ptr;
+   LLVMValueRef dadx_ptr;
+   LLVMValueRef dady_ptr;
+   LLVMValueRef color_ptr_ptr;
+   LLVMValueRef depth_ptr;
+   LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr, counter = NULL;  /* counter stays NULL unless key->occlusion_count */
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   struct lp_build_sampler_soa *sampler;
+   struct lp_build_interp_soa_context interp;
+   LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];  /* per-quad masks written by generate_fs() */
+   LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];  /* indexed [cbuf][chan][quad] */
+   LLVMValueRef blend_mask;
+   LLVMValueRef function;
+   LLVMValueRef facing;
+   unsigned num_fs;
+   unsigned i;
+   unsigned chan;
+   unsigned cbuf;
+
+
+   /* TODO: actually pick these based on the fs and color buffer
+    * characteristics. */
+
+   memset(&fs_type, 0, sizeof fs_type);
+   fs_type.floating = TRUE; /* floating point values */
+   fs_type.sign = TRUE;     /* values are signed */
+   fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
+   fs_type.width = 32;      /* 32-bit float */
+   fs_type.length = 4;      /* 4 elements per vector */
+   num_fs = 4;              /* number of quads per block */
+
+   memset(&blend_type, 0, sizeof blend_type);
+   blend_type.floating = FALSE; /* values are integers */
+   blend_type.sign = FALSE;     /* values are unsigned */
+   blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
+   blend_type.width = 8;        /* 8-bit ubyte values */
+   blend_type.length = 16;      /* 16 elements per vector */
+
+   /* 
+    * Generate the function prototype. Any change here must be reflected in
+    * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
+    */
+
+   fs_elem_type = lp_build_elem_type(fs_type);
+   fs_int_vec_type = lp_build_int_vec_type(fs_type);
+
+   blend_vec_type = lp_build_vec_type(blend_type);
+
+   util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", 
+		 shader->no, variant->no, do_tri_test ? "edge" : "whole");
+
+   arg_types[0] = screen->context_ptr_type;            /* context */
+   arg_types[1] = LLVMInt32Type();                     /* x */
+   arg_types[2] = LLVMInt32Type();                     /* y */
+   arg_types[3] = LLVMFloatType();                     /* facing */
+   arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
+   arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
+   arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
+   arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
+   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[9] = LLVMInt32Type();                     /* c0 */
+   arg_types[10] = LLVMInt32Type();                    /* c1 */
+   arg_types[11] = LLVMInt32Type();                    /* c2 */
+   /* Note: the step arrays are built as int32[16] but we interpret
+    * them here as int32_vec4[4].
+    */
+   arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step0 */
+   arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step1 */
+   arg_types[14] = LLVMPointerType(int32_vec4_type, 0);/* step2 */
+   arg_types[15] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
+
+   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+
+   function = LLVMAddFunction(screen->module, func_name, func_type);
+   LLVMSetFunctionCallConv(function, LLVMCCallConv);
+
+   variant->function[do_tri_test] = function;  /* do_tri_test is used directly as the slot index */
+
+
+   /* XXX: need to propagate noalias down into color param now we are
+    * passing a pointer-to-pointer?
+    */
+   for(i = 0; i < Elements(arg_types); ++i)
+      if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
+         LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);  /* every pointer parameter is marked noalias */
+
+   context_ptr  = LLVMGetParam(function, 0);
+   x            = LLVMGetParam(function, 1);
+   y            = LLVMGetParam(function, 2);
+   facing       = LLVMGetParam(function, 3);
+   a0_ptr       = LLVMGetParam(function, 4);
+   dadx_ptr     = LLVMGetParam(function, 5);
+   dady_ptr     = LLVMGetParam(function, 6);
+   color_ptr_ptr = LLVMGetParam(function, 7);
+   depth_ptr    = LLVMGetParam(function, 8);
+   c0           = LLVMGetParam(function, 9);
+   c1           = LLVMGetParam(function, 10);
+   c2           = LLVMGetParam(function, 11);
+   step0_ptr    = LLVMGetParam(function, 12);
+   step1_ptr    = LLVMGetParam(function, 13);
+   step2_ptr    = LLVMGetParam(function, 14);
+
+   lp_build_name(context_ptr, "context");
+   lp_build_name(x, "x");
+   lp_build_name(y, "y");
+   lp_build_name(a0_ptr, "a0");
+   lp_build_name(dadx_ptr, "dadx");
+   lp_build_name(dady_ptr, "dady");
+   lp_build_name(color_ptr_ptr, "color_ptr_ptr");
+   lp_build_name(depth_ptr, "depth");
+   lp_build_name(c0, "c0");
+   lp_build_name(c1, "c1");
+   lp_build_name(c2, "c2");
+   lp_build_name(step0_ptr, "step0");
+   lp_build_name(step1_ptr, "step1");
+   lp_build_name(step2_ptr, "step2");
+
+   if (key->occlusion_count) {  /* parameter 15 is always in the prototype but only fetched here */
+      counter = LLVMGetParam(function, 15);
+      lp_build_name(counter, "counter");
+   }
+
+   /*
+    * Function body
+    */
+
+   block = LLVMAppendBasicBlock(function, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   /*
+    * The shader input interpolation info is not explicitly baked in the
+    * shader key, but everything it derives from (TGSI, and flatshade) is
+    * already included in the shader key.
+    */
+   lp_build_interp_soa_init(&interp, 
+                            lp->num_inputs,
+                            lp->inputs,
+                            builder, fs_type,
+                            a0_ptr, dadx_ptr, dady_ptr,
+                            x, y);
+
+   /* code generated texture sampling */
+   sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
+
+   /* loop over quads in the block */
+   for(i = 0; i < num_fs; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
+      LLVMValueRef depth_ptr_i;
+
+      if(i != 0)  /* presumably quad 0 uses the state left by lp_build_interp_soa_init() -- confirm */
+         lp_build_interp_soa_update(&interp, i);
+
+      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");  /* element i of the depth pointer (fs_int_vec_type vectors) */
+
+      generate_fs(lp, shader, key,
+                  builder,
+                  fs_type,
+                  context_ptr,
+                  i,
+                  &interp,
+                  sampler,
+                  &fs_mask[i], /* output */
+                  out_color,
+                  depth_ptr_i,
+                  facing,
+                  do_tri_test,
+                  c0, c1, c2,
+                  step0_ptr, step1_ptr, step2_ptr, counter);
+
+      for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++)
+	 for(chan = 0; chan < NUM_CHANNELS; ++chan)
+	    fs_out_color[cbuf][chan][i] = out_color[cbuf][chan];
+   }
+
+   sampler->destroy(sampler);
+
+   /* Loop over color outputs / color buffers to do blending.
+    */
+   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
+      LLVMValueRef color_ptr;
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), cbuf, 0);
+      LLVMValueRef blend_in_color[NUM_CHANNELS];
+      unsigned rt;
+
+      /* 
+       * Convert the fs's output color and mask to fit to the blending type. 
+       */
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+	 lp_build_conv(builder, fs_type, blend_type,
+		       fs_out_color[cbuf][chan], num_fs,
+		       &blend_in_color[chan], 1);  /* num_fs fs_type vectors in, one blend_type vector out */
+	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+      }
+
+      lp_build_conv_mask(builder, fs_type, blend_type,
+			 fs_mask, num_fs,
+			 &blend_mask, 1);
+
+      color_ptr = LLVMBuildLoad(builder, 
+				LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
+				"");
+      lp_build_name(color_ptr, "color_ptr%d", cbuf);
+
+      /* which blend/colormask state to use */
+      rt = key->blend.independent_blend_enable ? cbuf : 0;
+
+      /*
+       * Blending.
+       */
+      generate_blend(&key->blend,
+                     rt,
+		     builder,
+		     blend_type,
+		     context_ptr,
+		     blend_mask,
+		     blend_in_color,
+		     color_ptr);
+   }
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+
+
+   /* Verify the LLVM IR.  If invalid, dump and abort */
+#ifdef DEBUG
+   if(LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
+      if (1)
+         lp_debug_dump_value(function);
+      abort();
+   }
+#endif
+
+   /* Apply optimizations to LLVM IR */
+   if (1)
+      LLVMRunFunctionPassManager(screen->pass, function);
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      /* Print the LLVM IR to stderr */
+      lp_debug_dump_value(function);
+      debug_printf("\n");
+   }
+
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+   {
+      void *f = LLVMGetPointerToGlobal(screen->engine, function);
+
+      variant->jit_function[do_tri_test] = (lp_jit_frag_func)pointer_to_func(f);  /* same slot index as variant->function[] above */
+
+      if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+         lp_disassemble(f);
+      }
+   }
+}
+
+
+/**
+ * Print the contents of a fragment shader variant key with debug_printf().
+ *
+ * Depth, stencil, alpha and blend-equation fields are printed only when the
+ * corresponding enable flag in the key is set.  Only render target 0 of the
+ * blend state is dumped; its colormask is always printed.  A sampler slot is
+ * printed when its format field is non-zero.
+ *
+ * \param key  the variant key to dump
+ */
+static void
+dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
+{
+   unsigned i;
+
+   debug_printf("fs variant %p:\n", (void *) key);
+
+   if (key->depth.enabled) {
+      debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
+      debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
+      debug_printf("depth.writemask = %u\n", key->depth.writemask);
+   }
+
+   /* both entries of the stencil[2] array are checked independently */
+   for (i = 0; i < 2; ++i) {
+      if (key->stencil[i].enabled) {
+         debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
+         debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
+         debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
+         debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
+         debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
+         debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
+      }
+   }
+
+   if (key->alpha.enabled) {
+      debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
+      debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
+   }
+
+   /* logic op and blending are mutually exclusive here: logic op wins */
+   if (key->blend.logicop_enable) {
+      debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
+   }
+   else if (key->blend.rt[0].blend_enable) {
+      debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
+      debug_printf("blend.rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+      debug_printf("blend.rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+      debug_printf("blend.alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
+      debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+      debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+   }
+   /* the colormask is printed unconditionally */
+   debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
+   for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
+      /* a zero format is treated as "slot not in use" and skipped */
+      if (key->sampler[i].format) {
+         debug_printf("sampler[%u] = \n", i);
+         debug_printf("  .format = %s\n",
+                      util_format_name(key->sampler[i].format));
+         debug_printf("  .target = %s\n",
+                      util_dump_tex_target(key->sampler[i].target, TRUE));
+         debug_printf("  .pot = %u %u %u\n",
+                      key->sampler[i].pot_width,
+                      key->sampler[i].pot_height,
+                      key->sampler[i].pot_depth);
+         debug_printf("  .wrap = %s %s %s\n",
+                      util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                      util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                      util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+         debug_printf("  .min_img_filter = %s\n",
+                      util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+         debug_printf("  .min_mip_filter = %s\n",
+                      util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+         debug_printf("  .mag_img_filter = %s\n",
+                      util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+         /* compare_func is only meaningful when a compare mode is selected */
+         if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+            debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
+         debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+      }
+   }
+}
+
+
+
+/**
+ * Allocate a new variant of \p shader for the given \p key and generate
+ * its code for both rasterization modes (RAST_WHOLE and RAST_EDGE_TEST).
+ *
+ * The variant is not linked into any list here; that is left to the caller.
+ *
+ * \return the new variant, or NULL if the allocation failed
+ */
+static struct lp_fragment_shader_variant *
+generate_variant(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
+                 const struct lp_fragment_shader_variant_key *key)
+{
+   struct lp_fragment_shader_variant *v =
+      CALLOC_STRUCT(lp_fragment_shader_variant);
+
+   if (v == NULL)
+      return NULL;
+
+   /* bytewise copy: keys are later compared with memcmp() */
+   memcpy(&v->key, key, sizeof *key);
+
+   v->no = shader->variants_created++;
+   v->shader = shader;
+   v->list_item_local.base = v;
+   v->list_item_global.base = v;
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      debug_printf("llvmpipe: Creating fragment shader #%u variant #%u:\n",
+                   shader->no, v->no);
+      tgsi_dump(shader->base.tokens, 0);
+      dump_fs_variant_key(key);
+   }
+
+   /* one function per rasterization mode */
+   generate_fragment(lp, shader, v, RAST_WHOLE);
+   generate_fragment(lp, shader, v, RAST_EDGE_TEST);
+
+   /*
+    * The variant is flagged opaque only when none of the state below is
+    * in effect.
+    * TODO: most of these can be relaxed, in particular the colormask
+    */
+   if (key->blend.logicop_enable ||
+       key->blend.rt[0].blend_enable ||
+       key->blend.rt[0].colormask != 0xf ||
+       key->stencil[0].enabled ||
+       key->alpha.enabled ||
+       key->depth.enabled ||
+       key->scissor ||
+       shader->info.uses_kill) {
+      v->opaque = FALSE;
+   }
+   else {
+      v->opaque = TRUE;
+   }
+
+   return v;
+}
+
+
+/**
+ * pipe_context::create_fs_state hook.
+ *
+ * Allocates an lp_fragment_shader, scans the TGSI tokens into shader->info
+ * and keeps a private copy of the tokens.  No code is generated here.
+ *
+ * \param pipe   the context (unused)
+ * \param templ  shader template whose tokens are scanned and duplicated
+ * \return the new shader object, or NULL if an allocation failed
+ */
+static void *
+llvmpipe_create_fs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct lp_fragment_shader *shader;
+
+   shader = CALLOC_STRUCT(lp_fragment_shader);
+   if (!shader)
+      return NULL;
+
+   shader->no = fs_no++;
+   make_empty_list(&shader->variants);
+
+   /* get/save the summary info for this shader */
+   tgsi_scan_shader(templ->tokens, &shader->info);
+
+   /* we need to keep a local copy of the tokens */
+   shader->base.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!shader->base.tokens) {
+      /* out of memory: don't hand back a shader without tokens */
+      FREE(shader);
+      return NULL;
+   }
+
+   if (LP_DEBUG & DEBUG_TGSI) {
+      unsigned attrib;
+      debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
+      tgsi_dump(templ->tokens, 0);
+      debug_printf("usage masks:\n");
+      /* one line per input showing which of its components are read */
+      for (attrib = 0; attrib < shader->info.num_inputs; ++attrib) {
+         unsigned usage_mask = shader->info.input_usage_mask[attrib];
+         debug_printf("  IN[%u].%s%s%s%s\n",
+                      attrib,
+                      usage_mask & TGSI_WRITEMASK_X ? "x" : "",
+                      usage_mask & TGSI_WRITEMASK_Y ? "y" : "",
+                      usage_mask & TGSI_WRITEMASK_Z ? "z" : "",
+                      usage_mask & TGSI_WRITEMASK_W ? "w" : "");
+      }
+      debug_printf("\n");
+   }
+
+   return shader;
+}
+
+
+/**
+ * pipe_context::bind_fs_state hook.
+ *
+ * Rebinding the shader that is already current is a no-op; otherwise the
+ * draw module is flushed, the new shader is recorded and LP_NEW_FS is
+ * added to the dirty flags.
+ */
+static void
+llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+
+   if (lp->fs != fs) {
+      draw_flush(lp->draw);
+      lp->fs = fs;
+      lp->dirty |= LP_NEW_FS;
+   }
+}
+
+/**
+ * Destroy one shader variant: release its LLVM functions (and their machine
+ * code, where a JIT pointer exists), unlink it from the per-shader and the
+ * per-context lists, update both cache counters and free the variant.
+ */
+static void
+remove_shader_variant(struct llvmpipe_context *lp,
+                      struct lp_fragment_shader_variant *variant)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+   struct lp_fragment_shader *owner = variant->shader;
+   unsigned idx;
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached #%u v total cached #%u\n",
+                   owner->no, variant->no, owner->variants_created,
+                   owner->variants_cached, lp->nr_fs_variants);
+   }
+
+   for (idx = 0; idx < Elements(variant->function); idx++) {
+      LLVMValueRef func = variant->function[idx];
+
+      if (!func)
+         continue;
+
+      /* machine code only exists if the function was actually JIT-ed */
+      if (variant->jit_function[idx])
+         LLVMFreeMachineCodeForFunction(screen->engine, func);
+
+      LLVMDeleteFunction(func);
+   }
+
+   /* per-shader list */
+   remove_from_list(&variant->list_item_local);
+   owner->variants_cached--;
+
+   /* per-context list */
+   remove_from_list(&variant->list_item_global);
+   lp->nr_fs_variants--;
+
+   FREE(variant);
+}
+
+/**
+ * pipe_context::delete_fs_state hook.
+ *
+ * Flushes the context and waits on the resulting fence, then destroys every
+ * cached variant of the shader, its token copy and the shader itself.
+ * The shader must not be the currently bound one.
+ */
+static void
+llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct lp_fragment_shader *shader = fs;
+   struct pipe_fence_handle *fence = NULL;
+   struct lp_fs_variant_list_item *li, *next;
+
+   assert(fs != llvmpipe->fs);
+
+   /*
+    * XXX: we need to flush the context until we have some sort of reference
+    * counting in fragment shaders, as they may still be binned.
+    * Flushing alone might not be sufficient; we need to wait on it too.
+    */
+   llvmpipe_flush(pipe, 0, &fence);
+
+   if (fence) {
+      struct pipe_screen *screen = pipe->screen;
+
+      screen->fence_finish(screen, fence, 0);
+      screen->fence_reference(screen, &fence, NULL);
+   }
+
+   /* fetch the successor first: removing a variant frees its list node */
+   for (li = first_elem(&shader->variants);
+        !at_end(&shader->variants, li);
+        li = next) {
+      next = next_elem(li);
+      remove_shader_variant(llvmpipe, li->base);
+   }
+
+   assert(shader->variants_cached == 0);
+   FREE((void *) shader->base.tokens);
+   FREE(shader);
+}
+
+
+
+/**
+ * pipe_context::set_constant_buffer hook.
+ *
+ * Takes a reference on the new buffer for the given shader stage / slot.
+ * For the vertex and geometry stages the buffer contents are also handed
+ * to the draw module.  Binding the already-bound buffer is a no-op.
+ *
+ * \param shader     shader stage, must be < PIPE_SHADER_TYPES
+ * \param index      buffer slot, must be < PIPE_MAX_CONSTANT_BUFFERS
+ * \param constants  the new buffer, may be NULL to unbind
+ */
+static void
+llvmpipe_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             struct pipe_resource *constants)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   unsigned size = 0;
+   const void *data = NULL;
+
+   if (constants) {
+      size = constants->width0;
+      data = llvmpipe_resource_data(constants);
+   }
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index < PIPE_MAX_CONSTANT_BUFFERS);
+
+   if (lp->constants[shader][index] != constants) {
+      draw_flush(lp->draw);
+
+      /* note: reference counting */
+      pipe_resource_reference(&lp->constants[shader][index], constants);
+
+      switch (shader) {
+      case PIPE_SHADER_VERTEX:
+      case PIPE_SHADER_GEOMETRY:
+         draw_set_mapped_constant_buffer(lp->draw, shader,
+                                         index, data, size);
+         break;
+      default:
+         break;
+      }
+
+      lp->dirty |= LP_NEW_CONSTANTS;
+   }
+}
+
+
+/**
+ * Map a blend factor to the one it is equivalent to when the destination
+ * alpha is known to be one.
+ *
+ * \param factor  a PIPE_BLENDFACTOR_x value
+ * \param alpha   TRUE when \p factor applies to the alpha channel, in which
+ *                case the DST_COLOR factors are rewritten as well
+ * \return the replacement factor, or \p factor unchanged if it does not
+ *         depend on destination alpha
+ */
+static INLINE unsigned
+force_dst_alpha_one(unsigned factor, boolean alpha)
+{
+   switch (factor) {
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return PIPE_BLENDFACTOR_ONE;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      return PIPE_BLENDFACTOR_ZERO;
+
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      return alpha ? PIPE_BLENDFACTOR_ONE : factor;
+
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      return alpha ? PIPE_BLENDFACTOR_ZERO : factor;
+
+   default:
+      return factor;
+   }
+}
+
+
+/**
+ * Build the variant key for \p shader from the current context state.
+ *
+ * We need to generate several variants of the fragment pipeline to match
+ * all the combinations of the contributing state atoms.
+ *
+ * The key is zero-filled first and only the state that is actually in
+ * effect is copied in, so disabled state leaves its part of the key zero.
+ *
+ * TODO: there is actually no reason to tie this to context state -- the
+ * generated code could be cached globally in the screen.
+ *
+ * \param lp      context whose bound state is sampled
+ * \param shader  fragment shader; its sampler file mask selects which
+ *                sampler slots are recorded
+ * \param key     output key, fully overwritten
+ */
+static void
+make_variant_key(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
+                 struct lp_fragment_shader_variant_key *key)
+{
+   unsigned i;
+
+   /* clear everything, including any padding bytes */
+   memset(key, 0, sizeof *key);
+
+   /* depth/stencil state only matters when a zs buffer is bound */
+   if (lp->framebuffer.zsbuf) {
+      if (lp->depth_stencil->depth.enabled) {
+         key->zsbuf_format = lp->framebuffer.zsbuf->format;
+         memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+      }
+      /* stencil[0].enabled gates the copy of both stencil entries */
+      if (lp->depth_stencil->stencil[0].enabled) {
+         key->zsbuf_format = lp->framebuffer.zsbuf->format;
+         memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil);
+      }
+   }
+
+   key->alpha.enabled = lp->depth_stencil->alpha.enabled;
+   if(key->alpha.enabled)
+      key->alpha.func = lp->depth_stencil->alpha.func;
+   /* alpha.ref_value is passed in jit_context */
+
+   key->flatshade = lp->rasterizer->flatshade;
+   key->scissor = lp->rasterizer->scissor;
+   if (lp->active_query_count) {
+      key->occlusion_count = TRUE;
+   }
+
+   /* blend state is only recorded when there is a color buffer */
+   if (lp->framebuffer.nr_cbufs) {
+      memcpy(&key->blend, lp->blend, sizeof key->blend);
+   }
+
+   key->nr_cbufs = lp->framebuffer.nr_cbufs;
+   for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
+      struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
+      const struct util_format_description *format_desc;
+      unsigned chan;
+
+      format_desc = util_format_description(lp->framebuffer.cbufs[i]->format);
+      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+             format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
+
+      blend_rt->colormask = lp->blend->rt[i].colormask;
+
+      /* mask out color channels not present in the color buffer.
+       * Should be simple to incorporate per-cbuf writemasks:
+       */
+      for(chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+
+         /* a swizzle beyond W does not select one of the format's channels */
+         if(swizzle > UTIL_FORMAT_SWIZZLE_W)
+            blend_rt->colormask &= ~(1 << chan);
+      }
+
+      /*
+       * Our swizzled render tiles always have an alpha channel, but the linear
+       * render target format often does not, so force here the dst alpha to be
+       * one.
+       *
+       * This is not a mere optimization. Wrong results will be produced if the
+       * dst alpha is used, the dst format does not have alpha, and the previous
+       * rendering was not flushed from the swizzled to linear buffer. For
+       * example, NonPowTwo DCT.
+       *
+       * TODO: This should be generalized to all channels for better
+       * performance, but only alpha causes correctness issues.
+       */
+      if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W) {
+         blend_rt->rgb_src_factor = force_dst_alpha_one(blend_rt->rgb_src_factor, FALSE);
+         blend_rt->rgb_dst_factor = force_dst_alpha_one(blend_rt->rgb_dst_factor, FALSE);
+         blend_rt->alpha_src_factor = force_dst_alpha_one(blend_rt->alpha_src_factor, TRUE);
+         blend_rt->alpha_dst_factor = force_dst_alpha_one(blend_rt->alpha_dst_factor, TRUE);
+      }
+   }
+
+   /* record static sampler state only for samplers the shader declares */
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
+         lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]);
+}
+
+/**
+ * Update fragment state.  This is called just prior to drawing
+ * something when some fragment-related state has changed.
+ *
+ * Looks up the variant of the bound shader that matches the current state
+ * key, generating (and caching) a new one when none exists, and hands the
+ * result to the setup module.
+ */
+void 
+llvmpipe_update_fs(struct llvmpipe_context *lp)
+{
+   struct lp_fragment_shader *shader = lp->fs;
+   struct lp_fragment_shader_variant *variant = NULL;
+   struct lp_fragment_shader_variant_key key;
+   struct lp_fs_variant_list_item *li;
+
+   make_variant_key(lp, shader, &key);
+
+   /* search this shader's variants for a bytewise-identical key */
+   for (li = first_elem(&shader->variants);
+        !at_end(&shader->variants, li);
+        li = next_elem(li)) {
+      if (memcmp(&li->base->key, &key, sizeof key) == 0) {
+         variant = li->base;
+         break;
+      }
+   }
+
+   if (!variant) {
+      int64_t start, finish;
+      int64_t elapsed;
+      unsigned n;
+
+      if (lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS) {
+         struct pipe_context *pipe = &lp->pipe;
+         struct pipe_fence_handle *fence = NULL;
+
+         /*
+          * XXX: we need to flush the context until we have some sort of
+          * reference counting in fragment shaders, as they may still be
+          * binned.  Flushing alone might not be sufficient; we need to wait
+          * on it too.
+          */
+         llvmpipe_flush(pipe, 0, &fence);
+
+         if (fence) {
+            pipe->screen->fence_finish(pipe->screen, fence, 0);
+            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+         }
+
+         /* evict a quarter of the cache, taking entries from the list tail */
+         for (n = 0; n < LP_MAX_SHADER_VARIANTS / 4; n++) {
+            struct lp_fs_variant_list_item *victim =
+               last_elem(&lp->fs_variants_list);
+            remove_shader_variant(lp, victim->base);
+         }
+      }
+
+      start = os_time_get();
+      variant = generate_variant(lp, shader, &key);
+      finish = os_time_get();
+
+      elapsed = finish - start;
+      LP_COUNT_ADD(llvm_compile_time, elapsed);
+      LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
+
+      if (variant) {
+         insert_at_head(&shader->variants, &variant->list_item_local);
+         insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
+         shader->variants_cached++;
+         lp->nr_fs_variants++;
+      }
+   }
+   else {
+      /* cache hit: move the variant to the head of the context-wide list */
+      move_to_head(&lp->fs_variants_list, &variant->list_item_global);
+   }
+
+   lp_setup_set_fs_variant(lp->setup, variant);
+}
+
+
+
+/**
+ * Install the fragment shader and constant buffer hooks of this file into
+ * the context's pipe_context function table.
+ */
+void
+llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
+{
+   struct pipe_context *pipe = &llvmpipe->pipe;
+
+   pipe->create_fs_state = llvmpipe_create_fs_state;
+   pipe->bind_fs_state = llvmpipe_bind_fs_state;
+   pipe->delete_fs_state = llvmpipe_delete_fs_state;
+
+   pipe->set_constant_buffer = llvmpipe_set_constant_buffer;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
new file mode 100644
index 0000000000..593cd4de6b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_STATE_FS_H_
+#define LP_STATE_FS_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */
+#include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */
+
+
+struct tgsi_token;
+struct lp_fragment_shader;
+
+
+/** Indexes into jit_function[] array */
+#define RAST_WHOLE 0
+#define RAST_EDGE_TEST 1
+
+
+/**
+ * State that selects one compiled variant of a fragment shader.
+ *
+ * make_variant_key() zero-fills the whole struct with memset() before
+ * setting any field and llvmpipe_update_fs() compares keys with memcmp(),
+ * so every byte of the struct takes part in the comparison.
+ */
+struct lp_fragment_shader_variant_key
+{
+   /* copied from the bound state only while the respective test is enabled */
+   struct pipe_depth_state depth;
+   struct pipe_stencil_state stencil[2];
+   /* only 'enabled' and 'func' are filled in by make_variant_key() */
+   struct pipe_alpha_state alpha;
+   /* copied when at least one color buffer is bound, then adjusted per rt */
+   struct pipe_blend_state blend;
+   /* set when depth or stencil testing is enabled */
+   enum pipe_format zsbuf_format;
+   unsigned nr_cbufs:8;
+   unsigned flatshade:1;
+   unsigned scissor:1;
+   /* set while the context has active queries */
+   unsigned occlusion_count:1;
+
+   /* NOTE(review): not written by make_variant_key() -- possibly unused */
+   struct {
+      ubyte colormask;
+   } cbuf_blend[PIPE_MAX_COLOR_BUFS];
+
+   /* static sampler state, filled only for samplers the shader declares */
+   struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
+};
+
+/**
+ * Doubly-linked list node embedded in a variant.  'base' points back at the
+ * variant that owns the node.
+ */
+struct lp_fs_variant_list_item
+{
+   struct lp_fragment_shader_variant *base;
+   struct lp_fs_variant_list_item *next, *prev;
+};
+
+/**
+ * One compiled variant of a fragment shader, specialized for a given key.
+ */
+struct lp_fragment_shader_variant
+{
+   /* the state this variant was generated for */
+   struct lp_fragment_shader_variant_key key;
+
+   /* computed in generate_variant() from blend/stencil/alpha/depth/scissor/kill state */
+   boolean opaque;
+
+   /* LLVM IR functions, indexed by RAST_WHOLE / RAST_EDGE_TEST */
+   LLVMValueRef function[2];
+
+   /* JIT-compiled entry points, same indexing as function[] */
+   lp_jit_frag_func jit_function[2];
+
+   /* global: node in the context-wide list; local: node in shader->variants */
+   struct lp_fs_variant_list_item list_item_global, list_item_local;
+   /* the shader this variant belongs to */
+   struct lp_fragment_shader *shader;
+
+   /* For debugging/profiling purposes */
+   unsigned no;
+};
+
+
+/**
+ * Subclass of pipe_shader_state.
+ *
+ * Holds a private copy of the TGSI tokens (in base.tokens), the scan
+ * results for them, and the list of variants compiled so far.
+ */
+struct lp_fragment_shader
+{
+   struct pipe_shader_state base;
+
+   /* summary info produced by tgsi_scan_shader() at creation time */
+   struct tgsi_shader_info info;
+
+   /* list head; variants are linked through their list_item_local node */
+   struct lp_fs_variant_list_item variants;
+
+   /* For debugging/profiling purposes */
+   unsigned no;
+   /* total number of variants ever generated for this shader */
+   unsigned variants_created;
+   /* number of variants currently linked in 'variants' */
+   unsigned variants_cached;
+};
+
+
+#endif /* LP_STATE_FS_H_ */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_gs.c b/src/gallium/drivers/llvmpipe/lp_state_gs.c
new file mode 100644
index 0000000000..1ba6f10821
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_gs.c
@@ -0,0 +1,112 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_texture.h"
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "draw/draw_context.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/**
+ * pipe_context::create_gs_state hook.
+ *
+ * Duplicates the shader tokens and creates the draw module's geometry
+ * shader object for them.
+ *
+ * \return the new lp_geometry_shader, or NULL if any step failed
+ */
+static void *
+llvmpipe_create_gs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct lp_geometry_shader *gs = CALLOC_STRUCT(lp_geometry_shader);
+
+   if (!gs)
+      return NULL;
+
+   /* debug */
+   if (0)
+      tgsi_dump(templ->tokens, 0);
+
+   /* keep our own copy of the tokens, the ones passed in will go away */
+   gs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (gs->shader.tokens != NULL) {
+      gs->draw_data = draw_create_geometry_shader(llvmpipe->draw, templ);
+      if (gs->draw_data != NULL)
+         return gs;
+   }
+
+   /* something failed: release whatever was set up so far */
+   FREE((void *) gs->shader.tokens);
+   FREE(gs->draw_data);
+   FREE(gs);
+   return NULL;
+}
+
+
+/**
+ * pipe_context::bind_gs_state hook.
+ *
+ * Records the new geometry shader (which may be NULL), forwards the
+ * matching draw-module shader to the draw context and flags LP_NEW_GS.
+ */
+static void
+llvmpipe_bind_gs_state(struct pipe_context *pipe, void *gs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct lp_geometry_shader *shader = (struct lp_geometry_shader *) gs;
+
+   lp->gs = shader;
+
+   if (lp->gs)
+      draw_bind_geometry_shader(lp->draw, lp->gs->draw_data);
+   else
+      draw_bind_geometry_shader(lp->draw, NULL);
+
+   lp->dirty |= LP_NEW_GS;
+}
+
+
+/**
+ * pipe_context::delete_gs_state hook.
+ *
+ * Destroys the draw module's geometry shader and releases the state
+ * object together with the token copy made in llvmpipe_create_gs_state().
+ *
+ * \param gs  the geometry shader to delete; NULL is tolerated
+ */
+static void
+llvmpipe_delete_gs_state(struct pipe_context *pipe, void *gs)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   struct lp_geometry_shader *state =
+      (struct lp_geometry_shader *)gs;
+
+   draw_delete_geometry_shader(llvmpipe->draw,
+                               (state) ? state->draw_data : 0);
+
+   if (state) {
+      /* the tokens were duplicated at creation time and are owned by us */
+      FREE( (void *)state->shader.tokens );
+      FREE(state);
+   }
+}
+
+
+/**
+ * Install the geometry shader hooks of this file into the context's
+ * pipe_context function table.
+ */
+void
+llvmpipe_init_gs_funcs(struct llvmpipe_context *llvmpipe)
+{
+   struct pipe_context *pipe = &llvmpipe->pipe;
+
+   pipe->create_gs_state = llvmpipe_create_gs_state;
+   pipe->bind_gs_state = llvmpipe_bind_gs_state;
+   pipe->delete_gs_state = llvmpipe_delete_gs_state;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
new file mode 100644
index 0000000000..afd3e0b21c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -0,0 +1,97 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_setup.h"
+#include "draw/draw_context.h"
+
+
+
+/**
+ * pipe_context::create_rasterizer_state hook.
+ *
+ * Nothing special is done with rasterizer state: the CSO handle is simply
+ * a heap copy of the pipe_rasterizer_state passed in.
+ */
+static void *
+llvmpipe_create_rasterizer_state(struct pipe_context *pipe,
+                                 const struct pipe_rasterizer_state *rast)
+{
+   void *cso = mem_dup(rast, sizeof *rast);
+
+   return cso;
+}
+
+
+
+/**
+ * pipe_context::bind_rasterizer_state hook.
+ *
+ * Passes the state through to the draw module, records it in the context
+ * and, when non-NULL, pushes the triangle and flatshade-first settings to
+ * the setup module.  Rebinding the current state is a no-op.
+ */
+static void
+llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   const struct pipe_rasterizer_state *rast = handle;
+
+   if (lp->rasterizer == rast)
+      return;
+
+   /* pass-through to draw module */
+   draw_set_rasterizer_state(lp->draw, rast, handle);
+
+   lp->rasterizer = rast;
+
+   /* Note: the triangle state can be set immediately without worrying
+    * about binning because culling is handled during triangle setup,
+    * not when rasterizing the bins.
+    */
+   if (rast != NULL) {
+      lp_setup_set_triangle_state(lp->setup,
+                                  rast->cull_face,
+                                  rast->front_ccw,
+                                  rast->scissor,
+                                  rast->gl_rasterization_rules);
+      lp_setup_set_flatshade_first(lp->setup, rast->flatshade_first);
+   }
+
+   lp->dirty |= LP_NEW_RASTERIZER;
+}
+
+
+/**
+ * pipe_context::delete_rasterizer_state hook: frees the copy made by
+ * llvmpipe_create_rasterizer_state().
+ */
+static void
+llvmpipe_delete_rasterizer_state(struct pipe_context *pipe,
+                                 void *rasterizer)
+{
+   (void) pipe;
+   FREE(rasterizer);
+}
+
+
+
+/**
+ * Install the rasterizer state hooks of this file into the context's
+ * pipe_context function table.
+ */
+void
+llvmpipe_init_rasterizer_funcs(struct llvmpipe_context *llvmpipe)
+{
+   struct pipe_context *pipe = &llvmpipe->pipe;
+
+   pipe->create_rasterizer_state = llvmpipe_create_rasterizer_state;
+   pipe->bind_rasterizer_state = llvmpipe_bind_rasterizer_state;
+   pipe->delete_rasterizer_state = llvmpipe_delete_rasterizer_state;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
new file mode 100644
index 0000000000..e94065fb6a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -0,0 +1,231 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:
+ *  Brian Paul
+ */
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+#include "draw/draw_context.h"
+
+#include "lp_context.h"
+#include "lp_context.h"
+#include "lp_state.h"
+#include "draw/draw_context.h"
+
+
+
+/**
+ * pipe_context::create_sampler_state hook: the CSO handle is a heap copy
+ * of the pipe_sampler_state passed in.
+ */
+static void *
+llvmpipe_create_sampler_state(struct pipe_context *pipe,
+                              const struct pipe_sampler_state *sampler)
+{
+   void *cso = mem_dup(sampler, sizeof *sampler);
+
+   return cso;
+}
+
+
+/**
+ * Bind the fragment sampler states (installed as the
+ * bind_fragment_sampler_states hook).
+ *
+ * The first \p num slots take the given handles and the remaining slots
+ * are cleared.  Nothing happens if the bindings are unchanged.
+ *
+ * \param num      number of entries in \p sampler, at most PIPE_MAX_SAMPLERS
+ * \param sampler  array of sampler state handles
+ */
+static void
+llvmpipe_bind_sampler_states(struct pipe_context *pipe,
+                             unsigned num, void **sampler)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   unsigned slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* only act when the count or any of the handles differs */
+   if (num != lp->num_samplers ||
+       memcmp(lp->sampler, sampler, num * sizeof(void *)) != 0) {
+
+      draw_flush(lp->draw);
+
+      for (slot = 0; slot < PIPE_MAX_SAMPLERS; ++slot)
+         lp->sampler[slot] = (slot < num) ? sampler[slot] : NULL;
+
+      lp->num_samplers = num;
+
+      lp->dirty |= LP_NEW_SAMPLER;
+   }
+}
+
+
+/**
+ * pipe_context::bind_vertex_sampler_states hook.
+ *
+ * The first \p num_samplers slots take the given handles and the remaining
+ * slots are cleared.  Nothing happens if the bindings are unchanged.
+ *
+ * \param num_samplers  number of entries in \p samplers, at most
+ *                      PIPE_MAX_VERTEX_SAMPLERS
+ * \param samplers      array of sampler state handles
+ */
+static void
+llvmpipe_bind_vertex_sampler_states(struct pipe_context *pipe,
+                                    unsigned num_samplers,
+                                    void **samplers)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   unsigned slot;
+
+   assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* only act when the count or any of the handles differs */
+   if (num_samplers != lp->num_vertex_samplers ||
+       memcmp(lp->vertex_samplers, samplers,
+              num_samplers * sizeof(void *)) != 0) {
+
+      draw_flush(lp->draw);
+
+      for (slot = 0; slot < PIPE_MAX_VERTEX_SAMPLERS; ++slot)
+         lp->vertex_samplers[slot] = (slot < num_samplers) ? samplers[slot] : NULL;
+
+      lp->num_vertex_samplers = num_samplers;
+
+      lp->dirty |= LP_NEW_SAMPLER;
+   }
+}
+
+
+/**
+ * pipe_context::bind_geometry_sampler_states hook.
+ *
+ * Currently a stub: all arguments are ignored and no state is changed.
+ */
+static void
+llvmpipe_bind_geometry_sampler_states(struct pipe_context *pipe,
+                                      unsigned num, void **sampler)
+{
+   /* XXX: implementation missing */
+}
+
+/**
+ * pipe_context::set_fragment_sampler_views hook.
+ *
+ * References the first \p num views into the context and drops the
+ * references held in the remaining slots.  Nothing happens if the
+ * bindings are unchanged.
+ *
+ * \param num    number of entries in \p views, at most PIPE_MAX_SAMPLERS
+ * \param views  array of sampler views
+ */
+static void
+llvmpipe_set_fragment_sampler_views(struct pipe_context *pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* only act when the count or any of the pointers differs */
+   if (num != lp->num_fragment_sampler_views ||
+       memcmp(lp->fragment_sampler_views, views,
+              num * sizeof(struct pipe_sampler_view *)) != 0) {
+
+      draw_flush(lp->draw);
+
+      for (slot = 0; slot < PIPE_MAX_SAMPLERS; slot++) {
+         if (slot < num)
+            pipe_sampler_view_reference(&lp->fragment_sampler_views[slot],
+                                        views[slot]);
+         else
+            pipe_sampler_view_reference(&lp->fragment_sampler_views[slot],
+                                        NULL);
+      }
+
+      lp->num_fragment_sampler_views = num;
+
+      lp->dirty |= LP_NEW_SAMPLER_VIEW;
+   }
+}
+
+
+/**
+ * pipe_context::set_vertex_sampler_views hook.
+ *
+ * References the first \p num views into the context and drops the
+ * references held in the remaining slots.  Nothing happens if the
+ * bindings are unchanged.
+ *
+ * \param num    number of entries in \p views, at most
+ *               PIPE_MAX_VERTEX_SAMPLERS
+ * \param views  array of sampler views
+ */
+static void
+llvmpipe_set_vertex_sampler_views(struct pipe_context *pipe,
+                                  unsigned num,
+                                  struct pipe_sampler_view **views)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* only act when the count or any of the pointers differs */
+   if (num != lp->num_vertex_sampler_views ||
+       memcmp(lp->vertex_sampler_views, views,
+              num * sizeof(struct pipe_sampler_view *)) != 0) {
+
+      draw_flush(lp->draw);
+
+      for (slot = 0; slot < PIPE_MAX_VERTEX_SAMPLERS; slot++) {
+         if (slot < num)
+            pipe_sampler_view_reference(&lp->vertex_sampler_views[slot],
+                                        views[slot]);
+         else
+            pipe_sampler_view_reference(&lp->vertex_sampler_views[slot],
+                                        NULL);
+      }
+
+      lp->num_vertex_sampler_views = num;
+
+      lp->dirty |= LP_NEW_SAMPLER_VIEW;
+   }
+}
+
+
+/**
+ * pipe_context::set_geometry_sampler_views hook.
+ *
+ * Currently a stub: all arguments are ignored and no state is changed.
+ */
+static void
+llvmpipe_set_geometry_sampler_views(struct pipe_context *pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   /* XXX: implementation missing */
+}
+
+/**
+ * pipe_context::create_sampler_view hook.
+ *
+ * The new view starts as a copy of \p templ, with its reference count set
+ * to one, its own reference on \p texture and \p pipe as owning context.
+ *
+ * \return the new view, or NULL if the allocation failed
+ */
+static struct pipe_sampler_view *
+llvmpipe_create_sampler_view(struct pipe_context *pipe,
+                            struct pipe_resource *texture,
+                            const struct pipe_sampler_view *templ)
+{
+   struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
+
+   if (!view)
+      return NULL;
+
+   *view = *templ;
+   view->reference.count = 1;
+
+   /* clear the pointer copied from the template before taking our own
+    * reference
+    */
+   view->texture = NULL;
+   pipe_resource_reference(&view->texture, texture);
+
+   view->context = pipe;
+
+   return view;
+}
+
+
+/**
+ * pipe_context::sampler_view_destroy hook: drops the view's texture
+ * reference and frees the view.
+ */
+static void
+llvmpipe_sampler_view_destroy(struct pipe_context *pipe,
+                              struct pipe_sampler_view *view)
+{
+   (void) pipe;
+
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+
+/**
+ * pipe_context::delete_sampler_state hook: frees the copy made by
+ * llvmpipe_create_sampler_state().
+ */
+static void
+llvmpipe_delete_sampler_state(struct pipe_context *pipe,
+                              void *sampler)
+{
+   (void) pipe;
+   FREE(sampler);
+}
+
+
+/**
+ * Install the sampler state and sampler view hooks of this file into the
+ * context's pipe_context function table.
+ */
+void
+llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe)
+{
+   struct pipe_context *pipe = &llvmpipe->pipe;
+
+   /* sampler state objects */
+   pipe->create_sampler_state = llvmpipe_create_sampler_state;
+   pipe->delete_sampler_state = llvmpipe_delete_sampler_state;
+
+   /* sampler state binding, per shader stage */
+   pipe->bind_fragment_sampler_states = llvmpipe_bind_sampler_states;
+   pipe->bind_vertex_sampler_states = llvmpipe_bind_vertex_sampler_states;
+   pipe->bind_geometry_sampler_states = llvmpipe_bind_geometry_sampler_states;
+
+   /* sampler views */
+   pipe->set_fragment_sampler_views = llvmpipe_set_fragment_sampler_views;
+   pipe->set_vertex_sampler_views = llvmpipe_set_vertex_sampler_views;
+   pipe->set_geometry_sampler_views = llvmpipe_set_geometry_sampler_views;
+   pipe->create_sampler_view = llvmpipe_create_sampler_view;
+   pipe->sampler_view_destroy = llvmpipe_sampler_view_destroy;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c
new file mode 100644
index 0000000000..4c64a5b142
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_so.c
@@ -0,0 +1,138 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_context.h"
+#include "lp_state.h"
+#include "lp_texture.h"
+
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+
+/* Build a stream-output state object: allocate an lp_so_state and copy
+ * num_outputs, stride and the first num_outputs array entries from templ.
+ * Returns NULL when the allocation fails. */
+static void *
+llvmpipe_create_stream_output_state(struct pipe_context *pipe,
+                                    const struct pipe_stream_output_state *templ)
+{
+   struct lp_so_state *state = CALLOC_STRUCT(lp_so_state);
+
+   if (!state)
+      return NULL;
+
+   state->base.stride = templ->stride;
+   state->base.num_outputs = templ->num_outputs;
+   memcpy(state->base.output_buffer, templ->output_buffer,
+          sizeof(int) * templ->num_outputs);
+   memcpy(state->base.register_index, templ->register_index,
+          sizeof(int) * templ->num_outputs);
+   memcpy(state->base.register_mask, templ->register_mask,
+          sizeof(ubyte) * templ->num_outputs);
+   return state;
+}
+
+/* Make 'so' (may be NULL) the current stream-output state and flag LP_NEW_SO;
+ * the draw module is only told about non-NULL states. */
+static void
+llvmpipe_bind_stream_output_state(struct pipe_context *pipe,
+                                  void *so)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct lp_so_state *state = so;
+
+   lp->so = state;
+   lp->dirty |= LP_NEW_SO;
+   if (state != NULL)
+      draw_set_so_state(lp->draw, &state->base);
+}
+
+static void
+llvmpipe_delete_stream_output_state(struct pipe_context *pipe, void *so)
+{
+   FREE( so );   /* just frees the state object; 'pipe' is not used */
+}
+
+/* Record the stream-output target buffers and hand their CPU addresses
+ * (res->data plus the per-buffer offset) to the draw module.  A NULL
+ * resource invalidates the whole call and leaves zero buffers bound. */
+static void
+llvmpipe_set_stream_output_buffers(struct pipe_context *pipe,
+                                   struct pipe_resource **buffers,
+                                   int *offsets,
+                                   int num_buffers)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   void *addrs[PIPE_MAX_SO_BUFFERS];
+   int i;
+
+   assert(num_buffers <= PIPE_MAX_SO_BUFFERS);
+   if (num_buffers > PIPE_MAX_SO_BUFFERS)
+      num_buffers = PIPE_MAX_SO_BUFFERS;   /* in case the assert is compiled out */
+
+   lp->dirty |= LP_NEW_SO_BUFFERS;
+
+   for (i = 0; i < num_buffers; ++i) {
+      struct llvmpipe_resource *res = llvmpipe_resource(buffers[i]);
+
+      if (res == NULL) {
+         /* one missing buffer makes the entire request invalid */
+         lp->so_target.num_buffers = 0;
+         draw_set_mapped_so_buffers(lp->draw, 0, 0);
+         return;
+      }
+
+      lp->so_target.buffer[i] = res;
+      lp->so_target.offset[i] = offsets[i];
+      lp->so_target.so_count[i] = 0;
+
+      /* a negative offset requests appending, which is not implemented */
+      if (offsets[i] < 0) {
+         assert(!"appending not implemented");
+         addrs[i] = res->data;
+      } else {
+         addrs[i] = ((char *) res->data) + offsets[i];
+      }
+   }
+   lp->so_target.num_buffers = num_buffers;
+   draw_set_mapped_so_buffers(lp->draw, addrs, num_buffers);
+}
+
+/* Hook the stream-output state and buffer callbacks into the context. */
+void
+llvmpipe_init_so_funcs(struct llvmpipe_context *llvmpipe)
+{
+   llvmpipe->pipe.set_stream_output_buffers =
+      llvmpipe_set_stream_output_buffers;
+   llvmpipe->pipe.create_stream_output_state =
+      llvmpipe_create_stream_output_state;
+   llvmpipe->pipe.bind_stream_output_state =
+      llvmpipe_bind_stream_output_state;
+   llvmpipe->pipe.delete_stream_output_state =
+      llvmpipe_delete_stream_output_state;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
new file mode 100644
index 0000000000..4b135aaf8b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -0,0 +1,83 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
+#include "util/u_surface.h"
+#include "lp_context.h"
+#include "lp_scene.h"
+#include "lp_state.h"
+#include "lp_setup.h"
+
+#include "draw/draw_context.h"
+
+#include "util/u_format.h"
+
+
+/**
+ * Set the framebuffer surface info: color buffers, zbuffer, stencil buffer.
+ */
+void
+llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
+                               const struct pipe_framebuffer_state *fb)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   const boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb);
+
+   assert(fb->width <= LP_MAX_WIDTH);
+   assert(fb->height <= LP_MAX_HEIGHT);
+
+   /*
+    * Binding a state equal to the current one is a no-op: nothing is
+    * copied and no dirty bit is raised.
+    */
+   if (!changed)
+      return;
+
+   util_copy_framebuffer_state(&lp->framebuffer, fb);
+
+   /*
+    * With a Z/stencil buffer bound, tell the draw module which mrd value
+    * to use: a finer one when the format has more than 16 depth bits.
+    */
+   if (lp->framebuffer.zsbuf) {
+      const int zbits =
+         util_format_get_component_bits(lp->framebuffer.zsbuf->format,
+                                        UTIL_FORMAT_COLORSPACE_ZS,
+                                        0);
+
+      draw_set_mrd(lp->draw, (zbits > 16) ? 0.0000001 : 0.00002);
+   }
+
+   /* Pass the stored copy on to the setup module and flag the change. */
+   lp_setup_bind_framebuffer(lp->setup, &lp->framebuffer);
+   lp->dirty |= LP_NEW_FRAMEBUFFER;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
new file mode 100644
index 0000000000..113f13db01
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
@@ -0,0 +1,101 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "lp_context.h"
+#include "lp_state.h"
+
+#include "draw/draw_context.h"
+
+
+/* Build a vertex-elements state from attribs[0..count); NULL on any failure. */
+static void *
+llvmpipe_create_vertex_elements_state(struct pipe_context *pipe, unsigned count,
+                                      const struct pipe_vertex_element *attribs)
+{
+   struct lp_velems_state *velems;
+   assert(count <= PIPE_MAX_ATTRIBS);  /* may be compiled out, so re-check: */
+   velems = (count <= PIPE_MAX_ATTRIBS) ? MALLOC(sizeof(*velems)) : NULL;
+   if (velems) {
+      velems->count = count;
+      memcpy(velems->velem, attribs, sizeof(*attribs) * count);
+   }
+   return velems;
+}
+
+/* Bind a vertex-elements state (NULL allowed) and flag LP_NEW_VERTEX;
+ * only a non-NULL state is forwarded to the draw module. */
+static void
+llvmpipe_bind_vertex_elements_state(struct pipe_context *pipe,
+                                    void *velems)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct lp_velems_state *state = velems;
+
+   lp->velems = state;
+   lp->dirty |= LP_NEW_VERTEX;
+   if (state != NULL)
+      draw_set_vertex_elements(lp->draw, state->count, state->velem);
+}
+
+static void
+llvmpipe_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   FREE( velems );   /* just frees the state object; 'pipe' is not used */
+}
+
+/* Store up to PIPE_MAX_ATTRIBS vertex buffers and forward them to draw. */
+static void
+llvmpipe_set_vertex_buffers(struct pipe_context *pipe,
+                            unsigned count,
+                            const struct pipe_vertex_buffer *buffers)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   assert(count <= PIPE_MAX_ATTRIBS);
+   if (count > PIPE_MAX_ATTRIBS)   /* the assert may be compiled out */
+      count = PIPE_MAX_ATTRIBS;    /* keep the memcpy within the asserted limit */
+
+   memcpy(llvmpipe->vertex_buffer, buffers, count * sizeof(buffers[0]));
+   llvmpipe->num_vertex_buffers = count;
+   llvmpipe->dirty |= LP_NEW_VERTEX;
+   draw_set_vertex_buffers(llvmpipe->draw, count, buffers);
+}
+
+
+
+/* Install the vertex-element and vertex-buffer entry points. */
+void
+llvmpipe_init_vertex_funcs(struct llvmpipe_context *llvmpipe)
+{
+   llvmpipe->pipe.set_vertex_buffers = llvmpipe_set_vertex_buffers;
+   llvmpipe->pipe.create_vertex_elements_state = llvmpipe_create_vertex_elements_state;
+   llvmpipe->pipe.bind_vertex_elements_state = llvmpipe_bind_vertex_elements_state;
+   llvmpipe->pipe.delete_vertex_elements_state = llvmpipe_delete_vertex_elements_state;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vs.c b/src/gallium/drivers/llvmpipe/lp_state_vs.c
new file mode 100644
index 0000000000..f2d8808990
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_vs.c
@@ -0,0 +1,118 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_defines.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+#include "lp_context.h"
+#include "lp_debug.h"
+#include "lp_state.h"
+
+
+/*
+ * Create a vertex shader state object.  It holds a duplicate of the TGSI
+ * tokens from templ and the shader object made by the draw module.
+ * Returns NULL if the state, the token copy or the draw shader cannot be
+ * created.
+ */
+static void *
+llvmpipe_create_vs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct lp_vertex_shader *vs = CALLOC_STRUCT(lp_vertex_shader);
+
+   if (!vs)
+      return NULL;
+
+   /* the tokens passed in will go away, so keep a private copy */
+   vs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (vs->shader.tokens)
+      vs->draw_data = draw_create_vertex_shader(llvmpipe->draw, templ);
+
+   if (!vs->shader.tokens || !vs->draw_data) {
+      /* tear down whatever was set up before the failure */
+      FREE( (void *)vs->shader.tokens );
+      FREE( vs->draw_data );
+      FREE( vs );
+      return NULL;
+   }
+
+   if (LP_DEBUG & DEBUG_TGSI) {
+      debug_printf("llvmpipe: Create vertex shader %p:\n", (void *) vs);
+      tgsi_dump(templ->tokens, 0);
+   }
+
+   return vs;
+}
+
+
+/* Bind a vertex shader, or unbind with NULL.  Rebinding the shader that is
+ * already current does nothing. */
+static void
+llvmpipe_bind_vs_state(struct pipe_context *pipe, void *_vs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   const struct lp_vertex_shader *shader = _vs;
+
+   if (shader != lp->vs) {
+      /* the draw module is given its own shader object, or NULL */
+      draw_bind_vertex_shader(lp->draw, shader ? shader->draw_data : NULL);
+
+      lp->vs = shader;
+      lp->dirty |= LP_NEW_VS;
+   }
+}
+
+
+/* Destroy a vertex shader: the draw module's object first, then the token
+ * copy, then the state struct itself. */
+static void
+llvmpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct lp_vertex_shader *shader = vs;
+
+   draw_delete_vertex_shader(lp->draw, shader->draw_data);
+   FREE( (void *)shader->shader.tokens );
+   FREE( shader );
+}
+
+
+
+void
+llvmpipe_init_vs_funcs(struct llvmpipe_context *llvmpipe)
+{  /* install the vertex shader create/bind/delete hooks on the context */
+   llvmpipe->pipe.create_vs_state = llvmpipe_create_vs_state;
+   llvmpipe->pipe.bind_vs_state   = llvmpipe_bind_vs_state;
+   llvmpipe->pipe.delete_vs_state = llvmpipe_delete_vs_state;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c
new file mode 100644
index 0000000000..76b3fce1fa
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_rect.h"
+#include "util/u_surface.h"
+#include "lp_context.h"
+#include "lp_flush.h"
+#include "lp_limits.h"
+#include "lp_surface.h"
+#include "lp_texture.h"
+
+
+/**
+ * Adjust x, y, width, height to lie on tile bounds.
+ */
+static void
+adjust_to_tile_bounds(unsigned x, unsigned y, unsigned width, unsigned height,
+                      unsigned *x_tile, unsigned *y_tile,
+                      unsigned *w_tile, unsigned *h_tile)
+{  /* NOTE(review): the masks below assume TILE_SIZE is a power of two */
+   *x_tile = x & ~(TILE_SIZE - 1);   /* x rounded down to a tile boundary */
+   *y_tile = y & ~(TILE_SIZE - 1);   /* y rounded down likewise */
+   *w_tile = ((x + width + TILE_SIZE - 1) & ~(TILE_SIZE - 1)) - *x_tile;   /* x + width rounded up, minus *x_tile */
+   *h_tile = ((y + height + TILE_SIZE - 1) & ~(TILE_SIZE - 1)) - *y_tile;  /* y + height rounded up, minus *y_tile */
+}
+
+
+
+/*
+ * Make sure every tile of image (face, level) of 'tex' that overlaps the
+ * rectangle x, y, width, height is in linear layout, by requesting each
+ * such tile with the given usage.
+ */
+static void
+lp_linearize_rect(struct llvmpipe_resource *tex,
+                  unsigned face, unsigned level,
+                  enum lp_texture_usage usage,
+                  unsigned x, unsigned y, unsigned width, unsigned height)
+{
+   unsigned tx, ty, tw, th;
+   unsigned dx, dy;
+
+   adjust_to_tile_bounds(x, y, width, height, &tx, &ty, &tw, &th);
+
+   for (dy = 0; dy < th; dy += TILE_SIZE) {
+      for (dx = 0; dx < tw; dx += TILE_SIZE) {
+         (void) llvmpipe_get_texture_tile_linear(tex, face, level, usage,
+                                                 tx + dx, ty + dy);
+      }
+   }
+}
+
+
+/*
+ * Copy a width x height rectangle from (srcx, srcy) in image subsrc of
+ * src to (dstx, dsty) in image subdst of dst.
+ *
+ * Both resources are flushed first, then the touched tiles of both images
+ * are switched to linear layout, and finally the pixels are copied with
+ * util_copy_rect() using the source resource's format.
+ *
+ * XXX the dstz/srcz parameters are not used - zslice wasn't used either.
+ */
+static void
+lp_resource_copy(struct pipe_context *pipe,
+                 struct pipe_resource *dst, struct pipe_subresource subdst,
+                 unsigned dstx, unsigned dsty, unsigned dstz,
+                 struct pipe_resource *src, struct pipe_subresource subsrc,
+                 unsigned srcx, unsigned srcy, unsigned srcz,
+                 unsigned width, unsigned height)
+{
+   struct llvmpipe_resource *src_tex = llvmpipe_resource(src);
+   struct llvmpipe_resource *dst_tex = llvmpipe_resource(dst);
+   const enum pipe_format format = src_tex->base.format;
+   enum lp_texture_usage dst_usage;
+   const ubyte *src_linear;
+   ubyte *dst_linear;
+
+   /* the destination is flushed with read_only FALSE ... */
+   llvmpipe_flush_resource(pipe, dst, subdst.face, subdst.level,
+                           0,      /* flush_flags */
+                           FALSE,  /* read_only */
+                           FALSE,  /* cpu_access */
+                           FALSE); /* do_not_block */
+
+   /* ... and the source with read_only TRUE */
+   llvmpipe_flush_resource(pipe, src, subsrc.face, subsrc.level,
+                           0,      /* flush_flags */
+                           TRUE,   /* read_only */
+                           FALSE,  /* cpu_access */
+                           FALSE); /* do_not_block */
+
+   /* source tiles only have to be readable in linear layout */
+   lp_linearize_rect(src_tex, subsrc.face, subsrc.level, LP_TEX_USAGE_READ,
+                     srcx, srcy, width, height);
+
+   /*
+    * XXX tiles that lie completely inside the dest rectangle could use
+    * WRITE_ALL.  For now that mode is only picked when the copy is as
+    * large as the whole dest resource (width0 x height0).
+    */
+   if (width == dst_tex->base.width0 && height == dst_tex->base.height0)
+      dst_usage = LP_TEX_USAGE_WRITE_ALL;
+   else
+      dst_usage = LP_TEX_USAGE_READ_WRITE;
+
+   lp_linearize_rect(dst_tex, subdst.face, subdst.level, dst_usage,
+                     dstx, dsty, width, height);
+
+   /* both images are linear now: fetch their addresses and copy */
+   src_linear = llvmpipe_get_texture_image_address(src_tex, subsrc.face,
+                                                   subsrc.level,
+                                                   LP_TEX_LAYOUT_LINEAR);
+   dst_linear = llvmpipe_get_texture_image_address(dst_tex, subdst.face,
+                                                   subdst.level,
+                                                   LP_TEX_LAYOUT_LINEAR);
+
+   util_copy_rect(dst_linear, format,
+                  llvmpipe_resource_stride(&dst_tex->base, subdst.level),
+                  dstx, dsty,
+                  width, height,
+                  src_linear,
+                  llvmpipe_resource_stride(&src_tex->base, subsrc.level),
+                  srcx, srcy);
+}
+
+
+void
+llvmpipe_init_surface_functions(struct llvmpipe_context *lp)
+{  /* copies use lp_resource_copy; both clears use the util_ helpers */
+   lp->pipe.resource_copy_region = lp_resource_copy;
+   lp->pipe.clear_render_target = util_clear_render_target;
+   lp->pipe.clear_depth_stencil = util_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.h b/src/gallium/drivers/llvmpipe/lp_surface.h
new file mode 100644
index 0000000000..b1b896ebd9
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_surface.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LP_SURFACE_H
+#define LP_SURFACE_H
+
+
+struct llvmpipe_context;
+
+
+extern void
+llvmpipe_init_surface_functions(struct llvmpipe_context *lp);
+
+
+#endif /* LP_SURFACE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
new file mode 100644
index 0000000000..90422e4258
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -0,0 +1,144 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Shared testing code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_TEST_H
+#define LP_TEST_H
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <float.h>
+
+#include "gallivm/lp_bld.h"
+#include <llvm-c/Analysis.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/BitWriter.h>
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_dump.h"
+
+#include "gallivm/lp_bld_type.h"
+
+
+#define LP_TEST_NUM_SAMPLES 32
+
+
+void
+write_tsv_header(FILE *fp);
+
+
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n);
+
+boolean
+test_single(unsigned verbose, FILE *fp);
+
+boolean
+test_all(unsigned verbose, FILE *fp);
+
+
+#if defined(PIPE_CC_MSVC)
+/* MSVC: map rdtsc() onto the __rdtsc compiler intrinsic. */
+unsigned __int64 __rdtsc();
+#pragma intrinsic(__rdtsc)
+#define rdtsc() __rdtsc()
+
+#elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
+/* GCC on x86/x86-64: execute the rdtsc instruction via inline assembly. */
+static INLINE uint64_t
+rdtsc(void)
+{
+   uint32_t hi, lo;
+   __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));  /* eax -> lo, edx -> hi */
+   return ((uint64_t)lo) | (((uint64_t)hi) << 32);  /* combine into one 64-bit value */
+}
+
+#else
+/* Any other compiler/architecture: no counter, rdtsc() is the constant 0. */
+#define rdtsc() 0
+
+#endif
+
+
+
+float
+random_float(void);
+
+
+void
+dump_type(FILE *fp, struct lp_type type);
+
+
+double
+read_elem(struct lp_type type, const void *src, unsigned index);
+
+
+void
+write_elem(struct lp_type type, void *dst, unsigned index, double src);
+
+
+void
+random_elem(struct lp_type type, void *dst, unsigned index);
+
+
+void
+read_vec(struct lp_type type, const void *src, double *dst);
+
+
+void
+write_vec(struct lp_type type, void *dst, const double *src);
+
+
+void
+random_vec(struct lp_type type, void *dst);
+
+
+boolean
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps);
+
+
+boolean
+compare_vec(struct lp_type type, const void *res, const void *ref);
+
+
+void
+dump_vec(FILE *fp, struct lp_type type, const void *src);
+
+
+#endif /* !LP_TEST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
new file mode 100644
index 0000000000..0c95555655
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -0,0 +1,905 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Unit tests for blend LLVM IR generation
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ *
+ * Blend computation code derived from code written by
+ * @author Brian Paul <brian@vmware.com>
+ */
+
+
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_debug.h"
+#include "lp_bld_blend.h"
+#include "lp_test.h"
+
+
+/**
+ * Vector layout exercised by a blend test.
+ *
+ * The numeric values matter: test_all() loops over the values 0 and 1 and
+ * test_some() picks a mode with "rand() & 1".
+ */
+enum vector_mode
+{
+   AoS = 0, /* one vector per operand; blended with lp_build_blend_aos() */
+   SoA = 1  /* four vectors per operand (one per channel); lp_build_blend_soa() */
+};
+
+
+/** Signature of the generated blend test function: reads src, dst and con, writes res. */
+typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void *con, void *res);
+
+/**
+ * Reinterpret a generic object pointer as a blend_test_ptr_t.
+ *
+ * The conversion goes through a union instead of a direct cast; presumably
+ * this is to avoid compiler diagnostics about converting between object and
+ * function pointers.
+ */
+static blend_test_ptr_t
+voidptr_to_blend_test_ptr_t(void *p)
+{
+   union {
+      void *object;
+      blend_test_ptr_t function;
+   } pun;
+
+   pun.object = p;
+
+   return pun.function;
+}
+
+
+
+/**
+ * Emit the column header line of the blend test's tab-separated results.
+ *
+ * The column order matches the rows produced by write_tsv_row().
+ */
+void
+write_tsv_header(FILE *fp)
+{
+   static const char columns[] =
+           "result\t"
+           "cycles_per_channel\t"
+           "mode\t"
+           "type\t"
+           "sep_func\t"
+           "sep_src_factor\t"
+           "sep_dst_factor\t"
+           "rgb_func\t"
+           "rgb_src_factor\t"
+           "rgb_dst_factor\t"
+           "alpha_func\t"
+           "alpha_src_factor\t"
+           "alpha_dst_factor\n";
+
+   /* The header contains no conversion specifiers, so it is written verbatim. */
+   fputs(columns, fp);
+   fflush(fp);
+}
+
+
+/**
+ * Append one result row to the tab-separated output.
+ *
+ * @param fp       output stream
+ * @param blend    blend state that was tested (only rt[0] is reported)
+ * @param mode     vector layout that was tested
+ * @param type     vector type that was tested
+ * @param cycles   measured cycles per call; reported per channel
+ * @param success  whether the generated code matched the reference
+ */
+static void
+write_tsv_row(FILE *fp,
+              const struct pipe_blend_state *blend,
+              enum vector_mode mode,
+              struct lp_type type,
+              double cycles,
+              boolean success)
+{
+   const char *result = success ? "pass" : "fail";
+   const char *norm = type.norm ? "n" : "";
+   const char *kind;
+   const char *sep_func;
+   const char *sep_src_factor;
+   const char *sep_dst_factor;
+
+   /* One-letter type class: float, fixed, signed or unsigned. */
+   if (type.floating)
+      kind = "f";
+   else if (type.fixed)
+      kind = "h";
+   else if (type.sign)
+      kind = "s";
+   else
+      kind = "u";
+
+   /* "true" when the RGB and alpha halves of the state differ. */
+   sep_func = blend->rt[0].rgb_func != blend->rt[0].alpha_func ? "true" : "false";
+   sep_src_factor = blend->rt[0].rgb_src_factor != blend->rt[0].alpha_src_factor ? "true" : "false";
+   sep_dst_factor = blend->rt[0].rgb_dst_factor != blend->rt[0].alpha_dst_factor ? "true" : "false";
+
+   fprintf(fp, "%s\t", result);
+
+   switch (mode) {
+   case AoS:
+      /* one call blends type.length channels */
+      fprintf(fp, "%.1f\t", cycles / type.length);
+      fputs("aos\t", fp);
+      break;
+   case SoA:
+      /* one call blends four vectors of type.length channels each */
+      fprintf(fp, "%.1f\t", cycles / (4 * type.length));
+      fputs("soa\t", fp);
+      break;
+   }
+
+   fprintf(fp, "%s%u%sx%u\t",
+           kind,
+           type.width,
+           norm,
+           type.length);
+
+   fprintf(fp,
+           "%s\t%s\t%s\t",
+           sep_func,
+           sep_src_factor,
+           sep_dst_factor);
+
+   fprintf(fp,
+           "%s\t%s\t%s\t%s\t%s\t%s\n",
+           util_dump_blend_func(blend->rt[0].rgb_func, TRUE),
+           util_dump_blend_factor(blend->rt[0].rgb_src_factor, TRUE),
+           util_dump_blend_factor(blend->rt[0].rgb_dst_factor, TRUE),
+           util_dump_blend_func(blend->rt[0].alpha_func, TRUE),
+           util_dump_blend_factor(blend->rt[0].alpha_src_factor, TRUE),
+           util_dump_blend_factor(blend->rt[0].alpha_dst_factor, TRUE));
+
+   fflush(fp);
+}
+
+
+/**
+ * Print a one-line, human readable description of a test case
+ * (layout, vector type and the rt[0] blend functions/factors) to fp.
+ */
+static void
+dump_blend_type(FILE *fp,
+                const struct pipe_blend_state *blend,
+                enum vector_mode mode,
+                struct lp_type type)
+{
+   const char *layout = mode ? "soa" : "aos";
+   const char *norm = type.norm ? "n" : "";
+   const char *kind;
+
+   /* One-letter type class: float, fixed, signed or unsigned. */
+   if (type.floating)
+      kind = "f";
+   else if (type.fixed)
+      kind = "h";
+   else if (type.sign)
+      kind = "s";
+   else
+      kind = "u";
+
+   fputs(layout, fp);
+
+   fprintf(fp, " type=%s%u%sx%u",
+           kind,
+           type.width,
+           norm,
+           type.length);
+
+   fprintf(fp,
+           " %s=%s %s=%s %s=%s %s=%s %s=%s %s=%s",
+           "rgb_func",         util_dump_blend_func(blend->rt[0].rgb_func, TRUE),
+           "rgb_src_factor",   util_dump_blend_factor(blend->rt[0].rgb_src_factor, TRUE),
+           "rgb_dst_factor",   util_dump_blend_factor(blend->rt[0].rgb_dst_factor, TRUE),
+           "alpha_func",       util_dump_blend_func(blend->rt[0].alpha_func, TRUE),
+           "alpha_src_factor", util_dump_blend_factor(blend->rt[0].alpha_src_factor, TRUE),
+           "alpha_dst_factor", util_dump_blend_factor(blend->rt[0].alpha_dst_factor, TRUE));
+
+   fputs(" ...\n", fp);
+   fflush(fp);
+}
+
+
+/**
+ * Add a function named "test" to the module that blends its inputs.
+ *
+ * The generated function takes four pointers to vectors of the given type
+ * (source, destination, constant color, result), returns void and uses the C
+ * calling convention.  In AoS mode each pointer addresses one vector; in SoA
+ * mode each pointer addresses four consecutive vectors, one per channel.
+ *
+ * @param module  module that receives the function
+ * @param blend   blend state to generate code for
+ * @param mode    vector layout (AoS or SoA)
+ * @param type    vector type of the operands
+ * @return the newly added function
+ */
+static LLVMValueRef
+add_blend_test(LLVMModuleRef module,
+               const struct pipe_blend_state *blend,
+               enum vector_mode mode,
+               struct lp_type type)
+{
+   LLVMTypeRef vec_type;
+   LLVMTypeRef args[4];
+   LLVMValueRef func;
+   LLVMValueRef src_ptr;
+   LLVMValueRef dst_ptr;
+   LLVMValueRef const_ptr;
+   LLVMValueRef res_ptr;
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   /* only render target 0 is exercised */
+   const unsigned rt = 0;
+
+   vec_type = lp_build_vec_type(type);
+
+   /* void test(vec *src, vec *dst, vec *con, vec *res) */
+   args[3] = args[2] = args[1] = args[0] = LLVMPointerType(vec_type, 0);
+   func = LLVMAddFunction(module, "test", LLVMFunctionType(LLVMVoidType(), args, 4, 0));
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   src_ptr = LLVMGetParam(func, 0);
+   dst_ptr = LLVMGetParam(func, 1);
+   const_ptr = LLVMGetParam(func, 2);
+   res_ptr = LLVMGetParam(func, 3);
+
+   block = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   if (mode == AoS) {
+      LLVMValueRef src;
+      LLVMValueRef dst;
+      LLVMValueRef con;
+      LLVMValueRef res;
+
+      src = LLVMBuildLoad(builder, src_ptr, "src");
+      dst = LLVMBuildLoad(builder, dst_ptr, "dst");
+      con = LLVMBuildLoad(builder, const_ptr, "const");
+
+      /* NOTE(review): the trailing 3 is presumably the position of the
+       * alpha channel within each pixel -- confirm against lp_bld_blend.h */
+      res = lp_build_blend_aos(builder, blend, type, rt, src, dst, con, 3);
+
+      lp_build_name(res, "res");
+
+      LLVMBuildStore(builder, res, res_ptr);
+   }
+
+   if (mode == SoA) {
+      LLVMValueRef src[4];
+      LLVMValueRef dst[4];
+      LLVMValueRef con[4];
+      LLVMValueRef res[4];
+      unsigned i;
+
+      /* load the i-th vector (channel) behind each input pointer */
+      for(i = 0; i < 4; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         src[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, src_ptr, &index, 1, ""), "");
+         dst[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dst_ptr, &index, 1, ""), "");
+         con[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, const_ptr, &index, 1, ""), "");
+         lp_build_name(src[i], "src.%c", "rgba"[i]);
+         lp_build_name(con[i], "con.%c", "rgba"[i]);
+         lp_build_name(dst[i], "dst.%c", "rgba"[i]);
+      }
+
+      lp_build_blend_soa(builder, blend, type, rt, src, dst, con, res);
+
+      /* store each result channel to the i-th vector behind res_ptr */
+      for(i = 0; i < 4; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         lp_build_name(res[i], "res.%c", "rgba"[i]);
+         LLVMBuildStore(builder, res[i], LLVMBuildGEP(builder, res_ptr, &index, 1, ""));
+      }
+   }
+
+   LLVMBuildRetVoid(builder);;
+
+   /* the builder is only needed while emitting; the function lives in the module */
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+/**
+ * Saturating add: store A + B in R, clamping the result to at most 1.0.
+ * R must be an lvalue; it is evaluated more than once.
+ */
+#define ADD_SAT(R, A, B) \
+do { \
+   R = (A) + (B); \
+   if (R > 1.0f) \
+      R = 1.0f; \
+} while (0)
+
+/**
+ * Saturating subtract: store A - B in R, clamping the result to at least 0.0.
+ * R must be an lvalue; it is evaluated more than once.
+ */
+#define SUB_SAT(R, A, B) \
+do { \
+   R = (A) - (B); \
+   if (R < 0.0f) \
+      R = 0.0f; \
+} while (0)
+
+
+/**
+ * Compute one term (source or destination) of the reference blend equation.
+ *
+ * Each output channel is the corresponding channel of @p factor scaled by a
+ * weight; the weight for R, G and B is selected by @p rgb_factor and the
+ * weight for A by @p alpha_factor.
+ *
+ * @param rgb_factor    PIPE_BLENDFACTOR_x applied to the R, G and B channels
+ * @param alpha_factor  PIPE_BLENDFACTOR_x applied to the A channel
+ * @param factor        RGBA color being weighted (compute_blend_ref() passes
+ *                      the source color for the source term and the
+ *                      destination color for the destination term)
+ * @param src           source RGBA color
+ * @param dst           destination RGBA color
+ * @param con           constant RGBA blend color
+ * @param term          receives the four weighted channels
+ *
+ * The SRC1 (dual source) factors have no reference implementation yet and
+ * trigger an assertion.
+ */
+static void
+compute_blend_ref_term(unsigned rgb_factor,
+                       unsigned alpha_factor,
+                       const double *factor,
+                       const double *src,
+                       const double *dst,
+                       const double *con,
+                       double *term)
+{
+   double temp;
+
+   /*
+    * Compute the R, G and B channels of the term
+    */
+   switch (rgb_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      term[0] = factor[0]; /* R */
+      term[1] = factor[1]; /* G */
+      term[2] = factor[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      term[0] = factor[0] * src[0]; /* R */
+      term[1] = factor[1] * src[1]; /* G */
+      term[2] = factor[2] * src[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      term[0] = factor[0] * src[3]; /* R */
+      term[1] = factor[1] * src[3]; /* G */
+      term[2] = factor[2] * src[3]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      term[0] = factor[0] * dst[0]; /* R */
+      term[1] = factor[1] * dst[1]; /* G */
+      term[2] = factor[2] * dst[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      term[0] = factor[0] * dst[3]; /* R */
+      term[1] = factor[1] * dst[3]; /* G */
+      term[2] = factor[2] * dst[3]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* weight is min(As, 1 - Ad) for the color channels */
+      temp = MIN2(src[3], 1.0f - dst[3]);
+      term[0] = factor[0] * temp; /* R */
+      term[1] = factor[1] * temp; /* G */
+      term[2] = factor[2] * temp; /* B */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      term[0] = factor[0] * con[0]; /* R */
+      term[1] = factor[1] * con[1]; /* G */
+      term[2] = factor[2] * con[2]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      term[0] = factor[0] * con[3]; /* R */
+      term[1] = factor[1] * con[3]; /* G */
+      term[2] = factor[2] * con[3]; /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      term[0] = 0.0f; /* R */
+      term[1] = 0.0f; /* G */
+      term[2] = 0.0f; /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      term[0] = factor[0] * (1.0f - src[0]); /* R */
+      term[1] = factor[1] * (1.0f - src[1]); /* G */
+      term[2] = factor[2] * (1.0f - src[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      term[0] = factor[0] * (1.0f - src[3]); /* R */
+      term[1] = factor[1] * (1.0f - src[3]); /* G */
+      term[2] = factor[2] * (1.0f - src[3]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      term[0] = factor[0] * (1.0f - dst[3]); /* R */
+      term[1] = factor[1] * (1.0f - dst[3]); /* G */
+      term[2] = factor[2] * (1.0f - dst[3]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      term[0] = factor[0] * (1.0f - dst[0]); /* R */
+      term[1] = factor[1] * (1.0f - dst[1]); /* G */
+      term[2] = factor[2] * (1.0f - dst[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      term[0] = factor[0] * (1.0f - con[0]); /* R */
+      term[1] = factor[1] * (1.0f - con[1]); /* G */
+      term[2] = factor[2] * (1.0f - con[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      term[0] = factor[0] * (1.0f - con[3]); /* R */
+      term[1] = factor[1] * (1.0f - con[3]); /* G */
+      term[2] = factor[2] * (1.0f - con[3]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute the A channel of the term.  For alpha the *_COLOR factors are
+    * equivalent to the matching *_ALPHA factors.
+    */
+   switch (alpha_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      term[3] = factor[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      term[3] = factor[3] * src[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      term[3] = factor[3] * dst[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /*
+       * The alpha weight of SRC_ALPHA_SATURATE is 1, so the term is the
+       * weighted color's own alpha.  This used to be hard-coded to src[3],
+       * which is only right when computing the source term.
+       */
+      term[3] = factor[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      term[3] = factor[3] * con[3]; /* A */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      term[3] = 0.0f; /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      term[3] = factor[3] * (1.0f - src[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      term[3] = factor[3] * (1.0f - dst[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      term[3] = factor[3] * (1.0f - con[3]); /* A */
+      break;
+   default:
+      /* includes the SRC1 factors, which are not implemented */
+      assert(0);
+   }
+}
+
+
+/**
+ * Reference implementation of blending for a single RGBA pixel.
+ *
+ * The source and destination terms are computed from the rt[0] blend
+ * factors and then combined with the rt[0] blend functions, separately for
+ * the color channels and for alpha.
+ *
+ * @param blend  blend state (only rt[0] is used)
+ * @param src    source RGBA color
+ * @param dst    destination RGBA color
+ * @param con    constant RGBA blend color
+ * @param res    receives the blended RGBA color
+ */
+static void
+compute_blend_ref(const struct pipe_blend_state *blend,
+                  const double *src,
+                  const double *dst,
+                  const double *con,
+                  double *res)
+{
+   double src_term[4];
+   double dst_term[4];
+   unsigned chan;
+
+   compute_blend_ref_term(blend->rt[0].rgb_src_factor, blend->rt[0].alpha_src_factor,
+                          src, src, dst, con, src_term);
+   compute_blend_ref_term(blend->rt[0].rgb_dst_factor, blend->rt[0].alpha_dst_factor,
+                          dst, src, dst, con, dst_term);
+
+   /*
+    * Combine the R, G and B terms (channels 0..2)
+    */
+   switch (blend->rt[0].rgb_func) {
+   case PIPE_BLEND_ADD:
+      for (chan = 0; chan < 3; ++chan) {
+         ADD_SAT(res[chan], src_term[chan], dst_term[chan]);
+      }
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      for (chan = 0; chan < 3; ++chan) {
+         SUB_SAT(res[chan], src_term[chan], dst_term[chan]);
+      }
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      for (chan = 0; chan < 3; ++chan) {
+         SUB_SAT(res[chan], dst_term[chan], src_term[chan]);
+      }
+      break;
+   case PIPE_BLEND_MIN:
+      for (chan = 0; chan < 3; ++chan) {
+         res[chan] = MIN2(src_term[chan], dst_term[chan]);
+      }
+      break;
+   case PIPE_BLEND_MAX:
+      for (chan = 0; chan < 3; ++chan) {
+         res[chan] = MAX2(src_term[chan], dst_term[chan]);
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine the A terms (channel 3)
+    */
+   switch (blend->rt[0].alpha_func) {
+   case PIPE_BLEND_ADD:
+      ADD_SAT(res[3], src_term[3], dst_term[3]);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      SUB_SAT(res[3], src_term[3], dst_term[3]);
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      SUB_SAT(res[3], dst_term[3], src_term[3]);
+      break;
+   case PIPE_BLEND_MIN:
+      res[3] = MIN2(src_term[3], dst_term[3]);
+      break;
+   case PIPE_BLEND_MAX:
+      res[3] = MAX2(src_term[3], dst_term[3]);
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+/**
+ * Average the first @p n cycle counts, ignoring outliers.
+ *
+ * Samples further than four standard deviations away from the plain mean are
+ * discarded before the final average is taken.  All arithmetic is done in
+ * double precision so that squaring a large cycle count cannot overflow.
+ *
+ * @param cycles  measured cycle counts
+ * @param n       number of valid entries in @p cycles
+ * @return the filtered average, or 0.0 when there are no samples
+ */
+static double
+average_cycles(const int64_t *cycles, unsigned n)
+{
+   double sum = 0.0, sum2 = 0.0;
+   double avg, var, std;
+   unsigned i, m;
+
+   if (n == 0)
+      return 0.0;
+
+   for(i = 0; i < n; ++i) {
+      const double c = (double)cycles[i];
+      sum += c;
+      sum2 += c*c;
+   }
+
+   avg = sum/n;
+
+   /* rounding can make the variance marginally negative; treat that as zero */
+   var = (sum2 - n*avg*avg)/n;
+   std = var > 0.0 ? sqrt(var) : 0.0;
+
+   m = 0;
+   sum = 0.0;
+   for(i = 0; i < n; ++i) {
+      if(fabs(cycles[i] - avg) <= 4.0*std) {
+         sum += cycles[i];
+         ++m;
+      }
+   }
+
+   /* fall back to the plain mean if every sample was rejected */
+   return m ? sum/m : avg;
+}
+
+
+/**
+ * Generate, JIT-compile and run one blend test case.
+ *
+ * The generated code is run on LP_TEST_NUM_SAMPLES random inputs and each
+ * result is compared against compute_blend_ref().  Sampling stops at the
+ * first mismatch.
+ *
+ * @param verbose  >= 1 prints the test case, >= 2 also dumps the module and
+ *                 the disassembly
+ * @param fp       if non-NULL, receives one TSV row for this test case
+ * @param blend    blend state to test
+ * @param mode     vector layout (AoS or SoA)
+ * @param type     vector type
+ * @return TRUE when all samples matched.  On a mismatch the module is
+ *         written to "blend.bc" and the process aborts, so FALSE is never
+ *         actually returned.
+ */
+PIPE_ALIGN_STACK
+static boolean
+test_one(unsigned verbose,
+         FILE *fp,
+         const struct pipe_blend_state *blend,
+         enum vector_mode mode,
+         struct lp_type type)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef func = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   blend_test_ptr_t blend_test_ptr;
+   boolean success;
+   const unsigned n = LP_TEST_NUM_SAMPLES;
+   int64_t cycles[LP_TEST_NUM_SAMPLES];
+   double cycles_avg = 0.0;
+   unsigned num_samples;
+   unsigned i, j;
+   void *code;
+
+   if(verbose >= 1)
+      dump_blend_type(stdout, blend, mode, type);
+
+   module = LLVMModuleCreateWithName("test");
+
+   func = add_blend_test(module, blend, mode, type);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      if(verbose < 1)
+         dump_blend_type(stderr, blend, mode, type);
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   if(verbose >= 2)
+      LLVMDumpModule(module);
+
+   code = LLVMGetPointerToGlobal(engine, func);
+   blend_test_ptr = voidptr_to_blend_test_ptr_t(code);
+
+   if(verbose >= 2)
+      lp_disassemble(code);
+
+   success = TRUE;
+   for(i = 0; i < n && success; ++i) {
+      if(mode == AoS) {
+         PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+         int64_t start_counter = 0;
+         int64_t end_counter = 0;
+
+         random_vec(type, src);
+         random_vec(type, dst);
+         random_vec(type, con);
+
+         /* compute the reference result, one RGBA pixel (4 elements) at a time */
+         {
+            double fsrc[LP_MAX_VECTOR_LENGTH];
+            double fdst[LP_MAX_VECTOR_LENGTH];
+            double fcon[LP_MAX_VECTOR_LENGTH];
+            double fref[LP_MAX_VECTOR_LENGTH];
+
+            read_vec(type, src, fsrc);
+            read_vec(type, dst, fdst);
+            read_vec(type, con, fcon);
+
+            for(j = 0; j < type.length; j += 4)
+               compute_blend_ref(blend, fsrc + j, fdst + j, fcon + j, fref + j);
+
+            write_vec(type, ref, fref);
+         }
+
+         start_counter = rdtsc();
+         blend_test_ptr(src, dst, con, res);
+         end_counter = rdtsc();
+
+         cycles[i] = end_counter - start_counter;
+
+         if(!compare_vec(type, res, ref)) {
+            success = FALSE;
+
+            if(verbose < 1)
+               dump_blend_type(stderr, blend, mode, type);
+            fprintf(stderr, "MISMATCH\n");
+
+            fprintf(stderr, "  Src: ");
+            dump_vec(stderr, type, src);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Dst: ");
+            dump_vec(stderr, type, dst);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Con: ");
+            dump_vec(stderr, type, con);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Res: ");
+            dump_vec(stderr, type, res);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Ref: ");
+            dump_vec(stderr, type, ref);
+            fprintf(stderr, "\n");
+         }
+      }
+
+      if(mode == SoA) {
+         /* size in bytes of one vector, i.e. the distance between channels */
+         const unsigned stride = type.length*type.width/8;
+         PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+         int64_t start_counter = 0;
+         int64_t end_counter = 0;
+         boolean mismatch;
+
+         for(j = 0; j < 4; ++j) {
+            random_vec(type, src + j*stride);
+            random_vec(type, dst + j*stride);
+            random_vec(type, con + j*stride);
+         }
+
+         /* compute the reference result: gather element k of every channel
+          * into one RGBA pixel, blend it, and scatter the result back */
+         {
+            double fsrc[4];
+            double fdst[4];
+            double fcon[4];
+            double fref[4];
+            unsigned k;
+
+            for(k = 0; k < type.length; ++k) {
+               for(j = 0; j < 4; ++j) {
+                  fsrc[j] = read_elem(type, src + j*stride, k);
+                  fdst[j] = read_elem(type, dst + j*stride, k);
+                  fcon[j] = read_elem(type, con + j*stride, k);
+               }
+
+               compute_blend_ref(blend, fsrc, fdst, fcon, fref);
+
+               for(j = 0; j < 4; ++j)
+                  write_elem(type, ref + j*stride, k, fref[j]);
+            }
+         }
+
+         start_counter = rdtsc();
+         blend_test_ptr(src, dst, con, res);
+         end_counter = rdtsc();
+
+         cycles[i] = end_counter - start_counter;
+
+         mismatch = FALSE;
+         for (j = 0; j < 4; ++j)
+            if(!compare_vec(type, res + j*stride, ref + j*stride))
+               mismatch = TRUE;
+
+         if (mismatch) {
+            success = FALSE;
+
+            if(verbose < 1)
+               dump_blend_type(stderr, blend, mode, type);
+            fprintf(stderr, "MISMATCH\n");
+            for(j = 0; j < 4; ++j) {
+               char channel = "RGBA"[j];
+               fprintf(stderr, "  Src%c: ", channel);
+               dump_vec(stderr, type, src + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Dst%c: ", channel);
+               dump_vec(stderr, type, dst + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Con%c: ", channel);
+               dump_vec(stderr, type, con + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Res%c: ", channel);
+               dump_vec(stderr, type, res + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Ref%c: ", channel);
+               dump_vec(stderr, type, ref + j*stride);
+               fprintf(stderr, "\n");
+            }
+         }
+      }
+   }
+
+   /*
+    * The sampling loop stops at the first mismatch, so only the first i
+    * entries of cycles[] have been written; the rest must not be read.
+    */
+   num_samples = i;
+
+   /*
+    * Unfortunately the raw output of the cycle counter is not very reliable:
+    * sometimes we get outliers (due to IRQs perhaps?), which are better
+    * removed to avoid random or biased data.
+    */
+   cycles_avg = average_cycles(cycles, num_samples);
+
+   if(fp)
+      write_tsv_row(fp, blend, mode, type, cycles_avg, success);
+
+   if (!success) {
+      if(verbose < 2)
+         LLVMDumpModule(module);
+      LLVMWriteBitcodeToFile(module, "blend.bc");
+      fprintf(stderr, "blend.bc written\n");
+      fprintf(stderr, "Invoke as \"llc -o - blend.bc\"\n");
+      abort();
+   }
+
+   LLVMFreeMachineCodeForFunction(engine, func);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+
+/*
+ * Blend factors exercised by the tests.
+ *
+ * test_all() walks this table by position (the destination factor only
+ * ranges up to the current source factor), so the order of the entries
+ * determines which source/destination pairs get tested.
+ */
+const unsigned
+blend_factors[] = {
+   PIPE_BLENDFACTOR_ZERO,
+   PIPE_BLENDFACTOR_ONE,
+   PIPE_BLENDFACTOR_SRC_COLOR,
+   PIPE_BLENDFACTOR_SRC_ALPHA,
+   PIPE_BLENDFACTOR_DST_COLOR,
+   PIPE_BLENDFACTOR_DST_ALPHA,
+   PIPE_BLENDFACTOR_CONST_COLOR,
+   PIPE_BLENDFACTOR_CONST_ALPHA,
+   /* disabled: compute_blend_ref_term() has no reference code for SRC1 factors */
+#if 0
+   PIPE_BLENDFACTOR_SRC1_COLOR,
+   PIPE_BLENDFACTOR_SRC1_ALPHA,
+#endif
+   /* test_all() and test_some() never use this one as a destination factor */
+   PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE,
+   PIPE_BLENDFACTOR_INV_SRC_COLOR,
+   PIPE_BLENDFACTOR_INV_SRC_ALPHA,
+   PIPE_BLENDFACTOR_INV_DST_COLOR,
+   PIPE_BLENDFACTOR_INV_DST_ALPHA,
+   PIPE_BLENDFACTOR_INV_CONST_COLOR,
+   PIPE_BLENDFACTOR_INV_CONST_ALPHA,
+   /* disabled: compute_blend_ref_term() has no reference code for SRC1 factors */
+#if 0
+   PIPE_BLENDFACTOR_INV_SRC1_COLOR,
+   PIPE_BLENDFACTOR_INV_SRC1_ALPHA,
+#endif
+};
+
+
+/* Blend functions exercised by the tests, for both the RGB and alpha halves. */
+const unsigned
+blend_funcs[] = {
+   PIPE_BLEND_ADD,
+   PIPE_BLEND_SUBTRACT,
+   PIPE_BLEND_REVERSE_SUBTRACT,
+   PIPE_BLEND_MIN,
+   PIPE_BLEND_MAX
+};
+
+
+/*
+ * Vector types exercised by the tests.  The initializers are positional;
+ * the column comment below names the lp_type field each value is meant for.
+ */
+const struct lp_type blend_types[] = {
+   /* float, fixed,  sign,  norm, width, len */
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 }, /* f32 x 4 */
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 }, /* u8n x 16 */
+};
+
+
+/* Number of entries in each of the tables above. */
+const unsigned num_funcs = sizeof(blend_funcs)/sizeof(blend_funcs[0]);
+const unsigned num_factors = sizeof(blend_factors)/sizeof(blend_factors[0]);
+const unsigned num_types = sizeof(blend_types)/sizeof(blend_types[0]);
+
+
+/**
+ * Run the blend test over every supported combination.
+ *
+ * All pairs of RGB/alpha blend functions are combined with the source and
+ * destination factors, both vector layouts and every entry of blend_types.
+ * Destination factors only range up to the position of the current source
+ * factor in blend_factors, and SRC_ALPHA_SATURATE is never used as a
+ * destination factor.
+ *
+ * @param verbose  verbosity level forwarded to test_one()
+ * @param fp       optional TSV output stream forwarded to test_one()
+ * @return TRUE if every test case passed
+ */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   unsigned rgb_func, alpha_func;
+   unsigned rgb_src, rgb_dst;
+   unsigned alpha_src, alpha_dst;
+   unsigned type;
+   enum vector_mode mode;
+   struct pipe_blend_state blend;
+   boolean success = TRUE;
+
+   for (rgb_func = 0; rgb_func < num_funcs; ++rgb_func) {
+      for (alpha_func = 0; alpha_func < num_funcs; ++alpha_func) {
+         for (rgb_src = 0; rgb_src < num_factors; ++rgb_src) {
+            for (rgb_dst = 0; rgb_dst <= rgb_src; ++rgb_dst) {
+               for (alpha_src = 0; alpha_src < num_factors; ++alpha_src) {
+                  for (alpha_dst = 0; alpha_dst <= alpha_src; ++alpha_dst) {
+                     for (mode = AoS; mode <= SoA; ++mode) {
+                        for (type = 0; type < num_types; ++type) {
+
+                           /* saturate is not tested as a destination factor */
+                           if (blend_factors[rgb_dst] == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+                               blend_factors[alpha_dst] == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
+                              continue;
+
+                           memset(&blend, 0, sizeof blend);
+                           blend.rt[0].blend_enable      = 1;
+                           blend.rt[0].rgb_func          = blend_funcs[rgb_func];
+                           blend.rt[0].rgb_src_factor    = blend_factors[rgb_src];
+                           blend.rt[0].rgb_dst_factor    = blend_factors[rgb_dst];
+                           blend.rt[0].alpha_func        = blend_funcs[alpha_func];
+                           blend.rt[0].alpha_src_factor  = blend_factors[alpha_src];
+                           blend.rt[0].alpha_dst_factor  = blend_factors[alpha_dst];
+                           blend.rt[0].colormask         = PIPE_MASK_RGBA;
+
+                           if (!test_one(verbose, fp, &blend, mode, blend_types[type]))
+                              success = FALSE;
+
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+   }
+
+   return success;
+}
+
+
+/**
+ * Run the blend test on n randomly chosen combinations.
+ *
+ * Functions, factors, layout and type are drawn with rand();
+ * SRC_ALPHA_SATURATE is re-drawn when it comes up as a destination factor.
+ *
+ * @param verbose  verbosity level forwarded to test_one()
+ * @param fp       optional TSV output stream forwarded to test_one()
+ * @param n        number of random test cases to run
+ * @return TRUE if every test case passed
+ */
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   unsigned rgb_func, alpha_func;
+   unsigned rgb_src_factor, rgb_dst_factor;
+   unsigned alpha_src_factor, alpha_dst_factor;
+   unsigned type_index;
+   enum vector_mode mode;
+   struct pipe_blend_state blend;
+   unsigned long count;
+   boolean success = TRUE;
+
+   for (count = 0; count < n; ++count) {
+      /* the order of the rand() calls below fixes the sequence of test
+       * cases produced for a given seed, so it must not be changed */
+      rgb_func = blend_funcs[rand() % num_funcs];
+      alpha_func = blend_funcs[rand() % num_funcs];
+      rgb_src_factor = blend_factors[rand() % num_factors];
+      alpha_src_factor = blend_factors[rand() % num_factors];
+
+      do {
+         rgb_dst_factor = blend_factors[rand() % num_factors];
+      } while (rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
+
+      do {
+         alpha_dst_factor = blend_factors[rand() % num_factors];
+      } while (alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
+
+      mode = rand() & 1;
+
+      type_index = rand() % num_types;
+
+      memset(&blend, 0, sizeof blend);
+      blend.rt[0].blend_enable      = 1;
+      blend.rt[0].rgb_func          = rgb_func;
+      blend.rt[0].rgb_src_factor    = rgb_src_factor;
+      blend.rt[0].rgb_dst_factor    = rgb_dst_factor;
+      blend.rt[0].alpha_func        = alpha_func;
+      blend.rt[0].alpha_src_factor  = alpha_src_factor;
+      blend.rt[0].alpha_dst_factor  = alpha_dst_factor;
+      blend.rt[0].colormask         = PIPE_MASK_RGBA;
+
+      if (!test_one(verbose, fp, &blend, mode, blend_types[type_index]))
+         success = FALSE;
+   }
+
+   return success;
+}
+
+
+/**
+ * Placeholder: the blend test has no single-case test.
+ *
+ * Prints a notice to stdout and reports success; both parameters are unused.
+ */
+boolean
+test_single(unsigned verbose, FILE *fp)
+{
+   (void)verbose;
+   (void)fp;
+
+   fputs("no test_single()", stdout);
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
new file mode 100644
index 0000000000..9b02f436c5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -0,0 +1,453 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Unit tests for type conversion.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_pointer.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
+#include "gallivm/lp_bld_debug.h"
+#include "lp_test.h"
+
+
+/*
+ * Signature of the JIT-compiled conversion routine: test_one() casts the
+ * compiled "test" function to this type and calls it with the source and
+ * destination buffers. Note that dst is declared const here even though the
+ * generated code stores the converted vectors through it.
+ */
+typedef void (*conv_test_ptr_t)(const void *src, const void *dst);
+
+
+/**
+ * Emit the column names of the tab-separated results file and flush it.
+ */
+void
+write_tsv_header(FILE *fp)
+{
+   static const char header[] =
+      "result\t"
+      "cycles_per_channel\t"
+      "src_type\t"
+      "dst_type\n";
+
+   fputs(header, fp);
+   fflush(fp);
+}
+
+
+/**
+ * Append one result row to the tab-separated results file.
+ *
+ * Columns: pass/fail, cycles divided by the larger of the two vector
+ * lengths, the source type and the destination type.
+ */
+static void
+write_tsv_row(FILE *fp,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              double cycles,
+              boolean success)
+{
+   const char *verdict = success ? "pass" : "fail";
+   const unsigned channels = MAX2(src_type.length, dst_type.length);
+
+   fprintf(fp, "%s\t", verdict);
+   fprintf(fp, "%.1f\t", cycles / channels);
+
+   dump_type(fp, src_type);
+   fputs("\t", fp);
+
+   dump_type(fp, dst_type);
+   fputs("\n", fp);
+
+   fflush(fp);
+}
+
+
+/**
+ * Print a one-line "src_type=... dst_type=... ..." banner describing the
+ * conversion under test, then flush the stream.
+ */
+static void
+dump_conv_types(FILE *fp,
+               struct lp_type src_type,
+               struct lp_type dst_type)
+{
+   fputs("src_type=", fp);
+   dump_type(fp, src_type);
+
+   fputs(" dst_type=", fp);
+   dump_type(fp, dst_type);
+
+   fputs(" ...\n", fp);
+   fflush(fp);
+}
+
+
+/**
+ * Add a function named "test" to @p module that loads @p num_srcs source
+ * vectors, converts them with lp_build_conv() and stores the resulting
+ * @p num_dsts destination vectors.
+ *
+ * The generated function takes two pointer arguments (source vectors,
+ * destination vectors) and returns void.
+ *
+ * @return the LLVM value of the newly added function.
+ */
+static LLVMValueRef
+add_conv_test(LLVMModuleRef module,
+              struct lp_type src_type, unsigned num_srcs,
+              struct lp_type dst_type, unsigned num_dsts)
+{
+   LLVMValueRef src_vecs[LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef dst_vecs[LP_MAX_VECTOR_LENGTH];
+   LLVMTypeRef param_types[2];
+   LLVMTypeRef func_type;
+   LLVMValueRef func;
+   LLVMValueRef src_base;
+   LLVMValueRef dst_base;
+   LLVMBasicBlockRef entry;
+   LLVMBuilderRef builder;
+   unsigned k;
+
+   param_types[0] = LLVMPointerType(lp_build_vec_type(src_type), 0);
+   param_types[1] = LLVMPointerType(lp_build_vec_type(dst_type), 0);
+
+   func_type = LLVMFunctionType(LLVMVoidType(), param_types, 2, 0);
+   func = LLVMAddFunction(module, "test", func_type);
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   src_base = LLVMGetParam(func, 0);
+   dst_base = LLVMGetParam(func, 1);
+
+   entry = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, entry);
+
+   /* Load every source vector. */
+   for (k = 0; k < num_srcs; ++k) {
+      LLVMValueRef offset = LLVMConstInt(LLVMInt32Type(), k, 0);
+      LLVMValueRef addr = LLVMBuildGEP(builder, src_base, &offset, 1, "");
+      src_vecs[k] = LLVMBuildLoad(builder, addr, "");
+   }
+
+   lp_build_conv(builder, src_type, dst_type,
+                 src_vecs, num_srcs, dst_vecs, num_dsts);
+
+   /* Store every converted vector. */
+   for (k = 0; k < num_dsts; ++k) {
+      LLVMValueRef offset = LLVMConstInt(LLVMInt32Type(), k, 0);
+      LLVMValueRef addr = LLVMBuildGEP(builder, dst_base, &offset, 1, "");
+      LLVMBuildStore(builder, dst_vecs[k], addr);
+   }
+
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+/**
+ * JIT-compile and run one src_type -> dst_type conversion test.
+ *
+ * Builds a module containing the conversion function, compiles it, then runs
+ * it on up to LP_TEST_NUM_SAMPLES random inputs, comparing each result with
+ * a reference computed through read_vec()/write_vec(). Sampling stops at the
+ * first mismatch. The cycle count of each run is recorded and an
+ * outlier-filtered average is written to @p fp (when non-NULL) as a TSV row.
+ *
+ * @param verbose  >= 1 prints the types, >= 2 also dumps the module and the
+ *                 disassembly, >= 3 also dumps every sample.
+ * @param fp       optional TSV output stream; may be NULL.
+ * @return TRUE when every sample that was run matched the reference.
+ */
+PIPE_ALIGN_STACK
+static boolean
+test_one(unsigned verbose,
+         FILE *fp,
+         struct lp_type src_type,
+         struct lp_type dst_type)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef func = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   conv_test_ptr_t conv_test_ptr;
+   boolean success;
+   const unsigned n = LP_TEST_NUM_SAMPLES;
+   int64_t cycles[LP_TEST_NUM_SAMPLES];
+   double cycles_avg = 0.0;
+   unsigned num_samples;
+   unsigned num_srcs;
+   unsigned num_dsts;
+   double eps;
+   unsigned i, j;
+   void *code;
+
+   if(verbose >= 1)
+      dump_conv_types(stdout, src_type, dst_type);
+
+   if(src_type.length > dst_type.length) {
+      num_srcs = 1;
+      num_dsts = src_type.length/dst_type.length;
+   }
+   else  {
+      num_dsts = 1;
+      num_srcs = dst_type.length/src_type.length;
+   }
+
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));
+
+   module = LLVMModuleCreateWithName("test");
+
+   func = add_conv_test(module, src_type, num_srcs, dst_type, num_dsts);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      if(verbose < 1)
+         dump_conv_types(stderr, src_type, dst_type);
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   if(verbose >= 2)
+      LLVMDumpModule(module);
+
+   code = LLVMGetPointerToGlobal(engine, func);
+   conv_test_ptr = (conv_test_ptr_t)pointer_to_func(code);
+
+   if(verbose >= 2)
+      lp_disassemble(code);
+
+   success = TRUE;
+   for(i = 0; i < n && success; ++i) {
+      unsigned src_stride = src_type.length*src_type.width/8;
+      unsigned dst_stride = dst_type.length*dst_type.width/8;
+      PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      int64_t start_counter = 0;
+      int64_t end_counter = 0;
+
+      /* Random inputs, also read back as doubles for the reference. */
+      for(j = 0; j < num_srcs; ++j) {
+         random_vec(src_type, src + j*src_stride);
+         read_vec(src_type, src + j*src_stride, fref + j*src_type.length);
+      }
+
+      /* Reference result: the doubles written out in the destination type. */
+      for(j = 0; j < num_dsts; ++j) {
+         write_vec(dst_type, ref + j*dst_stride, fref + j*dst_type.length);
+      }
+
+      start_counter = rdtsc();
+      conv_test_ptr(src, dst);
+      end_counter = rdtsc();
+
+      cycles[i] = end_counter - start_counter;
+
+      for(j = 0; j < num_dsts; ++j) {
+         if(!compare_vec_with_eps(dst_type, dst + j*dst_stride, ref + j*dst_stride, eps))
+            success = FALSE;
+      }
+
+      if (!success || verbose >= 3) {
+         if(verbose < 1)
+            dump_conv_types(stderr, src_type, dst_type);
+         if (success) {
+            fprintf(stderr, "PASS\n");
+         }
+         else {
+            fprintf(stderr, "MISMATCH\n");
+         }
+
+         for(j = 0; j < num_srcs; ++j) {
+            fprintf(stderr, "  Src%u: ", j);
+            dump_vec(stderr, src_type, src + j*src_stride);
+            fprintf(stderr, "\n");
+         }
+
+#if 1
+         fprintf(stderr, "  Ref: ");
+         for(j = 0; j < src_type.length*num_srcs; ++j)
+            fprintf(stderr, " %f", fref[j]);
+         fprintf(stderr, "\n");
+#endif
+
+         for(j = 0; j < num_dsts; ++j) {
+            fprintf(stderr, "  Dst%u: ", j);
+            dump_vec(stderr, dst_type, dst + j*dst_stride);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Ref%u: ", j);
+            dump_vec(stderr, dst_type, ref + j*dst_stride);
+            fprintf(stderr, "\n");
+         }
+      }
+   }
+
+   /*
+    * The loop above stops at the first mismatch, so only the first i entries
+    * of cycles[] have been written. The statistics below must not look at
+    * the remaining (uninitialized) ones.
+    */
+   num_samples = i;
+
+   /*
+    * Unfortunately the output of the cycle counter is not very reliable --
+    * sometimes we get outliers (due to IRQs perhaps?) which are better
+    * removed to avoid random or biased data.
+    */
+   {
+      double sum = 0.0, sum2 = 0.0;
+      double avg = 0.0, var = 0.0, std;
+      unsigned m;
+
+      for(i = 0; i < num_samples; ++i) {
+         /* Square in double precision to avoid int64 overflow. */
+         double c = (double)cycles[i];
+         sum += c;
+         sum2 += c*c;
+      }
+
+      if(num_samples) {
+         avg = sum/num_samples;
+         var = (sum2 - num_samples*avg*avg)/num_samples;
+      }
+
+      /* Rounding can push the variance slightly below zero. */
+      if(var < 0.0)
+         var = 0.0;
+      std = sqrt(var);
+
+      /* Average again, keeping only samples within 4 standard deviations. */
+      m = 0;
+      sum = 0.0;
+      for(i = 0; i < num_samples; ++i) {
+         if(fabs(cycles[i] - avg) <= 4.0*std) {
+            sum += cycles[i];
+            ++m;
+         }
+      }
+
+      /* Fall back to the plain average if everything was filtered out. */
+      cycles_avg = m ? sum/m : avg;
+
+   }
+
+   if(fp)
+      write_tsv_row(fp, src_type, dst_type, cycles_avg, success);
+
+   if (!success) {
+      static boolean firsttime = TRUE;
+      if(firsttime) {
+         if(verbose < 2)
+            LLVMDumpModule(module);
+         LLVMWriteBitcodeToFile(module, "conv.bc");
+         fprintf(stderr, "conv.bc written\n");
+         fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
+         firsttime = FALSE;
+         /* abort(); */
+      }
+   }
+
+   LLVMFreeMachineCodeForFunction(engine, func);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+
+/*
+ * Vector types exercised by the conversion tests.
+ *
+ * In every entry width * len is 128, which satisfies the equal-total-size
+ * assertion test_one() makes for each source/destination pair.
+ *
+ * NOTE(review): the 16-bit x 8 integer group appears twice below, so those
+ * types are picked more often by test_some() and test_all() also converts
+ * each of them to its own duplicate. Confirm whether that is intentional.
+ */
+const struct lp_type conv_types[] = {
+   /* float, fixed,  sign,  norm, width, len */
+
+   {   TRUE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   4 },
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 },
+   {   TRUE, FALSE, FALSE, FALSE,    32,   4 },
+
+   /* TODO: test fixed formats too */
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {  FALSE, FALSE,  TRUE, FALSE,    32,   4 },
+   {  FALSE, FALSE, FALSE,  TRUE,    32,   4 },
+   {  FALSE, FALSE, FALSE, FALSE,    32,   4 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,     8,  16 },
+   {  FALSE, FALSE,  TRUE, FALSE,     8,  16 },
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 },
+   {  FALSE, FALSE, FALSE, FALSE,     8,  16 },
+};
+
+
+/* Number of entries in conv_types[]. */
+const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
+
+
+/**
+ * Test every ordered pair of distinct conv_types[] entries that agree on
+ * the norm flag.
+ *
+ * @return TRUE only if every tested pair passed.
+ */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   boolean all_passed = TRUE;
+   unsigned s, d;
+
+   for (s = 0; s < num_types; ++s) {
+      for (d = 0; d < num_types; ++d) {
+         /* Skip an entry paired with itself, and mixed norm/non-norm pairs. */
+         if (s == d || conv_types[s].norm != conv_types[d].norm)
+            continue;
+
+         if (!test_one(verbose, fp, conv_types[s], conv_types[d]))
+            all_passed = FALSE;
+      }
+   }
+
+   return all_passed;
+}
+
+
+/**
+ * Test @p n randomly chosen conversions.
+ *
+ * The source type is drawn at random; the destination is redrawn until it
+ * is a different conv_types[] entry with the same norm flag.
+ *
+ * @return TRUE only if every conversion passed.
+ */
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   boolean all_passed = TRUE;
+   unsigned long iter;
+
+   for (iter = 0; iter < n; ++iter) {
+      const unsigned s = rand() % num_types;
+      unsigned d;
+
+      do {
+         d = rand() % num_types;
+      } while (d == s || conv_types[s].norm != conv_types[d].norm);
+
+      if (!test_one(verbose, fp, conv_types[s], conv_types[d]))
+         all_passed = FALSE;
+   }
+
+   return all_passed;
+}
+
+
+/**
+ * Run a single fixed conversion: signed normalized 32-bit float x 4 to
+ * unsigned normalized 8-bit integer x 16.
+ *
+ * @return the result of test_one() for that pair.
+ */
+boolean
+test_single(unsigned verbose, FILE *fp)
+{
+   /*                           float, fixed,  sign,  norm, width, len */
+   const struct lp_type src = {  TRUE, FALSE,  TRUE,  TRUE,    32,   4 };
+   const struct lp_type dst = { FALSE, FALSE, FALSE,  TRUE,     8,  16 };
+
+   return test_one(verbose, fp, src, dst);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
new file mode 100644
index 0000000000..8b6dc1c7f5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -0,0 +1,279 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <float.h>
+
+#include "gallivm/lp_bld.h"
+#include "gallivm/lp_bld_init.h"
+#include <llvm-c/Analysis.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "util/u_memory.h"
+#include "util/u_pointer.h"
+#include "util/u_format.h"
+#include "util/u_format_tests.h"
+#include "util/u_format_s3tc.h"
+
+#include "gallivm/lp_bld_format.h"
+#include "lp_test.h"
+
+
+/**
+ * Emit the column names of the tab-separated results file and flush it.
+ */
+void
+write_tsv_header(FILE *fp)
+{
+   static const char header[] =
+      "result\t"
+      "format\n";
+
+   fputs(header, fp);
+   fflush(fp);
+}
+
+
+/**
+ * Append one "pass|fail <TAB> format name" row to the results file and
+ * flush it.
+ */
+static void
+write_tsv_row(FILE *fp,
+              const struct util_format_description *desc,
+              boolean success)
+{
+   const char *verdict = success ? "pass" : "fail";
+
+   fprintf(fp, "%s\t%s\n", verdict, desc->name);
+   fflush(fp);
+}
+
+
+/*
+ * Signature of the JIT-compiled fetch routine. test_format() calls it with
+ * the output float[4], the packed test data and the two pixel coordinates
+ * within the block (passed as j, i).
+ */
+typedef void
+(*fetch_ptr_t)(float *, const void *packed,
+               unsigned i, unsigned j);
+
+
+/**
+ * Add a function named "fetch" to @p lp_build_module that fetches one RGBA
+ * value of format @p desc with lp_build_fetch_rgba_aos() and stores it
+ * through its first argument.
+ *
+ * The generated function takes (pointer to <4 x float>, pointer to i8,
+ * i32, i32) and returns void.
+ *
+ * @return the LLVM value of the newly added function.
+ */
+static LLVMValueRef
+add_fetch_rgba_test(LLVMModuleRef lp_build_module,
+                    const struct util_format_description *desc)
+{
+   LLVMTypeRef param_types[4];
+   LLVMTypeRef func_type;
+   LLVMValueRef func;
+   LLVMValueRef out_ptr, in_ptr;
+   LLVMValueRef coord0, coord1;
+   LLVMValueRef texel;
+   LLVMBasicBlockRef entry;
+   LLVMBuilderRef builder;
+
+   param_types[0] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
+   param_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
+   param_types[2] = LLVMInt32Type();
+   param_types[3] = param_types[2];
+
+   func_type = LLVMFunctionType(LLVMVoidType(), param_types,
+                                Elements(param_types), 0);
+   func = LLVMAddFunction(lp_build_module, "fetch", func_type);
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+
+   out_ptr = LLVMGetParam(func, 0);
+   in_ptr = LLVMGetParam(func, 1);
+   coord0 = LLVMGetParam(func, 2);
+   coord1 = LLVMGetParam(func, 3);
+
+   entry = LLVMAppendBasicBlock(func, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, entry);
+
+   texel = lp_build_fetch_rgba_aos(builder, desc, in_ptr, coord0, coord1);
+   LLVMBuildStore(builder, texel, out_ptr);
+   LLVMBuildRetVoid(builder);
+
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+
+/**
+ * JIT-compile a fetch function for format @p desc and check it against one
+ * test case.
+ *
+ * Every pixel of the format's block is fetched and compared, channel by
+ * channel, with the expected unpacked value from @p test. Mismatching
+ * pixels are reported on stdout. The overall result is written to @p fp as
+ * a TSV row when @p fp is non-NULL.
+ *
+ * @return TRUE only if every pixel of the block matched.
+ */
+PIPE_ALIGN_STACK
+static boolean
+test_format(unsigned verbose, FILE *fp,
+            const struct util_format_description *desc,
+            const struct util_format_test_case *test)
+{
+   LLVMValueRef fetch = NULL;
+   LLVMPassManagerRef pass = NULL;
+   fetch_ptr_t fetch_ptr;
+   PIPE_ALIGN_VAR(16) float unpacked[4];
+   /* Accumulated over all pixels; a later match must not hide a failure. */
+   boolean success = TRUE;
+   unsigned i, j, k;
+
+   fetch = add_fetch_rgba_test(lp_build_module, desc);
+
+   if (LLVMVerifyFunction(fetch, LLVMPrintMessageAction)) {
+      LLVMDumpValue(fetch);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(lp_build_engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, lp_build_module);
+#else
+   (void)pass;
+#endif
+
+   fetch_ptr = (fetch_ptr_t)pointer_to_func(LLVMGetPointerToGlobal(lp_build_engine, fetch));
+
+   for (i = 0; i < desc->block.height; ++i) {
+      for (j = 0; j < desc->block.width; ++j) {
+         boolean match = TRUE;
+
+         memset(unpacked, 0, sizeof unpacked);
+
+         fetch_ptr(unpacked, test->packed, j, i);
+
+         for(k = 0; k < 4; ++k)
+            if (fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON)
+               match = FALSE;
+
+         if (!match) {
+            success = FALSE;
+
+            printf("FAILED\n");
+            printf("  Packed: %02x %02x %02x %02x\n",
+                   test->packed[0], test->packed[1], test->packed[2], test->packed[3]);
+            printf("  Unpacked (%u,%u): %f %f %f %f obtained\n",
+                   j, i,
+                   unpacked[0], unpacked[1], unpacked[2], unpacked[3]);
+            printf("                  %f %f %f %f expected\n",
+                   test->unpacked[i][j][0],
+                   test->unpacked[i][j][1],
+                   test->unpacked[i][j][2],
+                   test->unpacked[i][j][3]);
+         }
+      }
+   }
+
+   if (!success)
+      LLVMDumpValue(fetch);
+
+   LLVMFreeMachineCodeForFunction(lp_build_engine, fetch);
+   LLVMDeleteFunction(fetch);
+
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   if(fp)
+      write_tsv_row(fp, desc, success);
+
+   return success;
+}
+
+
+
+/**
+ * Run every entry of util_format_test_cases[] that belongs to
+ * @p format_desc.
+ *
+ * A "Testing <name> ..." banner is printed once, before the first matching
+ * test case.
+ *
+ * @return TRUE only if every matching test case passed; also TRUE when no
+ *         test case matches.
+ */
+static boolean
+test_one(unsigned verbose, FILE *fp,
+         const struct util_format_description *format_desc)
+{
+   boolean banner_pending = TRUE;
+   boolean all_passed = TRUE;
+   unsigned idx;
+
+   for (idx = 0; idx < util_format_nr_test_cases; ++idx) {
+      const struct util_format_test_case *tc = &util_format_test_cases[idx];
+
+      if (tc->format != format_desc->format)
+         continue;
+
+      if (banner_pending) {
+         printf("Testing %s ...\n",
+                format_desc->name);
+         banner_pending = FALSE;
+      }
+
+      if (!test_format(verbose, fp, format_desc, tc))
+         all_passed = FALSE;
+   }
+
+   return all_passed;
+}
+
+
+/**
+ * Test every pipe format from 1 up to PIPE_FORMAT_COUNT - 1 that has a
+ * description.
+ *
+ * Depth/stencil formats are skipped, and so are S3TC formats when
+ * util_format_s3tc_enabled is false.
+ *
+ * @return TRUE only if every tested format passed.
+ */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   boolean all_passed = TRUE;
+   enum pipe_format format;
+
+   util_format_s3tc_init();
+
+   for (format = 1; format < PIPE_FORMAT_COUNT; ++format) {
+      const struct util_format_description *desc =
+         util_format_description(format);
+
+      if (!desc)
+         continue;
+
+      /*
+       * TODO: test more
+       */
+
+      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ||
+          (desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
+           !util_format_s3tc_enabled))
+         continue;
+
+      if (!test_one(verbose, fp, desc))
+         all_passed = FALSE;
+   }
+
+   return all_passed;
+}
+
+
+/**
+ * The format tests have no random subset: @p n is ignored and the full
+ * test_all() run is performed instead.
+ */
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   (void)n;
+
+   return test_all(verbose, fp);
+}
+
+
+/**
+ * Placeholder: this test program has no single-case test.
+ *
+ * Prints a notice on stdout and reports success; both arguments are ignored.
+ */
+boolean
+test_single(unsigned verbose, FILE *fp)
+{
+   (void)verbose;
+   (void)fp;
+
+   fputs("no test_single()", stdout);
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
new file mode 100644
index 0000000000..7bbbc61d4c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -0,0 +1,408 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Shared testing code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_cpu_detect.h"
+
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_init.h"
+#include "lp_test.h"
+
+
+#ifdef PIPE_CC_MSVC
+/*
+ * Fallback round() for MSVC builds: rounds to the nearest integer, with
+ * halfway cases going away from zero.
+ */
+static INLINE double
+round(double x)
+{
+   return (x >= 0.0) ? floor(x + 0.5) : ceil(x - 0.5);
+}
+#endif
+
+
+/**
+ * Print a compact name for @p type, e.g. "f32nx4" or "ui8x16".
+ *
+ * Layout: sign prefix ("u" for unsigned, "s" for signed plain integers,
+ * nothing for signed float/fixed), kind letter ("f" float, "h" fixed,
+ * "i" integer), bit width, "n" when normalized, then "x" and the length.
+ */
+void
+dump_type(FILE *fp,
+          struct lp_type type)
+{
+   const char *sign_prefix;
+   const char *kind;
+   const char *norm_suffix = type.norm ? "n" : "";
+
+   if (!type.sign)
+      sign_prefix = "u";
+   else if (type.floating || type.fixed)
+      sign_prefix = "";
+   else
+      sign_prefix = "s";
+
+   if (type.floating)
+      kind = "f";
+   else if (type.fixed)
+      kind = "h";
+   else
+      kind = "i";
+
+   fprintf(fp, "%s%s%u%sx%u",
+           sign_prefix, kind, type.width, norm_suffix, type.length);
+}
+
+
+/**
+ * Read element @p index of the vector at @p src as a double.
+ *
+ * The raw element is interpreted according to the type's floating/sign
+ * flags and bit width, then divided by lp_const_scale(type).
+ *
+ * @return the scaled value; 0.0 (after a failed assert) for an unsupported
+ *         width.
+ */
+double
+read_elem(struct lp_type type, const void *src, unsigned index)
+{
+   const double scale = lp_const_scale(type);
+   double raw;
+
+   assert(index < type.length);
+
+   if (type.floating) {
+      if (type.width == 32)
+         raw = ((const float *)src)[index];
+      else if (type.width == 64)
+         raw = ((const double *)src)[index];
+      else {
+         assert(0);
+         return 0.0;
+      }
+   }
+   else if (type.sign) {
+      if (type.width == 8)
+         raw = ((const int8_t *)src)[index];
+      else if (type.width == 16)
+         raw = ((const int16_t *)src)[index];
+      else if (type.width == 32)
+         raw = ((const int32_t *)src)[index];
+      else if (type.width == 64)
+         raw = ((const int64_t *)src)[index];
+      else {
+         assert(0);
+         return 0.0;
+      }
+   }
+   else {
+      if (type.width == 8)
+         raw = ((const uint8_t *)src)[index];
+      else if (type.width == 16)
+         raw = ((const uint16_t *)src)[index];
+      else if (type.width == 32)
+         raw = ((const uint32_t *)src)[index];
+      else if (type.width == 64)
+         raw = ((const uint64_t *)src)[index];
+      else {
+         assert(0);
+         return 0.0;
+      }
+   }
+
+   return raw/scale;
+}
+
+
+/**
+ * Store @p value into element @p index of the vector at @p dst.
+ *
+ * The value is first clamped: negative values become 0 for unsigned types,
+ * and normalized types are limited to [-1, 1]. Floating-point types store
+ * the value directly. Integer types multiply by lp_const_scale(type), round
+ * to nearest, and saturate to the representable range of the element width
+ * (both ends for signed types).
+ */
+void
+write_elem(struct lp_type type, void *dst, unsigned index, double value)
+{
+   assert(index < type.length);
+   if(!type.sign && value < 0.0)
+      value = 0.0;
+   if(type.norm && value < -1.0)
+      value = -1.0;
+   if(type.norm && value > 1.0)
+      value = 1.0;
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         *((float *)dst + index) = (float)(value);
+         break;
+      case 64:
+          *((double *)dst + index) = value;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   else {
+      double scale = lp_const_scale(type);
+      /* All-ones value of the element width; shifting by 64 is undefined. */
+      const unsigned long long umax = type.width < 64
+         ? ((unsigned long long)1 << type.width) - 1
+         : ~(unsigned long long)0;
+
+      value = round(value*scale);
+      if(type.sign) {
+         const long long smax = (long long)(umax >> 1);
+         const long long smin = -smax - 1;
+         long long lvalue;
+
+         /* Saturate before converting so the cast cannot overflow or wrap. */
+         if(value >= (double)smax)
+            lvalue = smax;
+         else if(value <= (double)smin)
+            lvalue = smin;
+         else
+            lvalue = (long long)value;
+
+         switch(type.width) {
+         case 8:
+            *((int8_t *)dst + index) = (int8_t)lvalue;
+            break;
+         case 16:
+            *((int16_t *)dst + index) = (int16_t)lvalue;
+            break;
+         case 32:
+            *((int32_t *)dst + index) = (int32_t)lvalue;
+            break;
+         case 64:
+            *((int64_t *)dst + index) = (int64_t)lvalue;
+            break;
+         default:
+            assert(0);
+         }
+      }
+      else {
+         unsigned long long lvalue;
+
+         /* value is already >= 0 here because of the clamp above. */
+         if(value >= (double)umax)
+            lvalue = umax;
+         else
+            lvalue = (unsigned long long)value;
+
+         switch(type.width) {
+         case 8:
+            *((uint8_t *)dst + index) = (uint8_t)lvalue;
+            break;
+         case 16:
+            *((uint16_t *)dst + index) = (uint16_t)lvalue;
+            break;
+         case 32:
+            *((uint32_t *)dst + index) = (uint32_t)lvalue;
+            break;
+         case 64:
+            *((uint64_t *)dst + index) = (uint64_t)lvalue;
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
+}
+
+
+/**
+ * Store a random value into element @p index of the vector at @p dst.
+ *
+ * The value starts as a random fraction in [0, 1]. For non-normalized types
+ * a random integer part, masked to the magnitude range of the type, is
+ * added. Signed types get a random sign; unsigned types stay non-negative.
+ * The result is stored with write_elem().
+ */
+void
+random_elem(struct lp_type type, void *dst, unsigned index)
+{
+   double value;
+   assert(index < type.length);
+   value = (double)rand()/(double)RAND_MAX;
+   if(!type.norm) {
+      unsigned long long mask;
+      if (type.floating)
+         mask = ~(unsigned long long)0;
+      else if (type.fixed)
+         mask = ((unsigned long long)1 << (type.width / 2)) - 1;
+      else if (type.sign)
+         mask = ((unsigned long long)1 << (type.width - 1)) - 1;
+      else if (type.width < 64)
+         mask = ((unsigned long long)1 << type.width) - 1;
+      else
+         mask = ~(unsigned long long)0;
+      value += (double)(mask & rand());
+   }
+   /*
+    * Only signed types can hold negative values. Negating for unsigned
+    * types would just be clamped to zero by write_elem().
+    */
+   if(type.sign)
+      if(rand() & 1)
+         value = -value;
+   write_elem(type, dst, index, value);
+}
+
+
+/**
+ * Read all type.length elements of the vector at @p src into the double
+ * array @p dst, using read_elem() for each one.
+ */
+void
+read_vec(struct lp_type type, const void *src, double *dst)
+{
+   unsigned elem = 0;
+
+   while (elem < type.length) {
+      dst[elem] = read_elem(type, src, elem);
+      ++elem;
+   }
+}
+
+
+/**
+ * Write the first type.length doubles of @p src into the vector at @p dst,
+ * using write_elem() for each one.
+ */
+void
+write_vec(struct lp_type type, void *dst, const double *src)
+{
+   unsigned elem = 0;
+
+   while (elem < type.length) {
+      write_elem(type, dst, elem, src[elem]);
+      ++elem;
+   }
+}
+
+
+/**
+ * Return rand() divided by RAND_MAX (computed in double precision) as a
+ * float.
+ */
+float
+random_float(void)
+{
+   const double unit = (double)rand()/(double)RAND_MAX;
+
+   return (float)unit;
+}
+
+
+/**
+ * Fill all type.length elements of the vector at @p dst with random values,
+ * using random_elem() for each one.
+ */
+void
+random_vec(struct lp_type type, void *dst)
+{
+   unsigned elem = 0;
+
+   while (elem < type.length) {
+      random_elem(type, dst, elem);
+      ++elem;
+   }
+}
+
+
+/**
+ * Compare two vectors of the same type element by element.
+ *
+ * @return FALSE as soon as an element pair differs by 2*eps or more (after
+ *         read_elem() scaling); TRUE otherwise.
+ */
+boolean
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
+{
+   const double tolerance = 2.0*eps;
+   unsigned elem;
+
+   for (elem = 0; elem < type.length; ++elem) {
+      const double got = read_elem(type, res, elem);
+      const double want = read_elem(type, ref, elem);
+
+      if (fabs(got - want) >= tolerance)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Compare two vectors using the type's own epsilon, lp_const_eps(type).
+ */
+boolean
+compare_vec(struct lp_type type, const void *res, const void *ref)
+{
+   return compare_vec_with_eps(type, res, ref, lp_const_eps(type));
+}
+
+
+/**
+ * Print all elements of the vector at @p src, separated by spaces.
+ *
+ * Floating-point elements are printed with "%f". Signed non-normalized
+ * integers are printed in decimal. Everything else (unsigned, or any
+ * normalized integer) is read as unsigned and printed in hex, except
+ * non-normalized 8-bit values, which are printed in decimal.
+ */
+void
+dump_vec(FILE *fp, struct lp_type type, const void *src)
+{
+   unsigned i;
+   for (i = 0; i < type.length; ++i) {
+      if(i)
+         fprintf(fp, " ");
+      if (type.floating) {
+         double value;
+         switch(type.width) {
+         case 32:
+            value = *((const float *)src + i);
+            break;
+         case 64:
+            value = *((const double *)src + i);
+            break;
+         default:
+            assert(0);
+            value = 0.0;
+         }
+         fprintf(fp, "%f", value);
+      }
+      else {
+         if(type.sign && !type.norm) {
+            long long value;
+            const char *format;
+            switch(type.width) {
+            case 8:
+               value = *((const int8_t *)src + i);
+               format = "%3lli";
+               break;
+            case 16:
+               value = *((const int16_t *)src + i);
+               format = "%5lli";
+               break;
+            case 32:
+               value = *((const int32_t *)src + i);
+               format = "%10lli";
+               break;
+            case 64:
+               value = *((const int64_t *)src + i);
+               format = "%20lli";
+               break;
+            default:
+               assert(0);
+               value = 0;
+               format = "?";
+            }
+            fprintf(fp, format, value);
+         }
+         else {
+            /*
+             * value is always passed as unsigned long long, so every
+             * conversion needs the "ll" length modifier: a plain "%x"
+             * expects an unsigned int and would truncate 64-bit values.
+             */
+            unsigned long long value;
+            const char *format;
+            switch(type.width) {
+            case 8:
+               value = *((const uint8_t *)src + i);
+               format = type.norm ? "%2llx" : "%4llu";
+               break;
+            case 16:
+               value = *((const uint16_t *)src + i);
+               format = type.norm ? "%4llx" : "%6llx";
+               break;
+            case 32:
+               value = *((const uint32_t *)src + i);
+               format = type.norm ? "%8llx" : "%11llx";
+               break;
+            case 64:
+               value = *((const uint64_t *)src + i);
+               format = type.norm ? "%16llx" : "%21llx";
+               break;
+            default:
+               assert(0);
+               value = 0;
+               format = "?";
+            }
+            fprintf(fp, format, value);
+         }
+      }
+   }
+}
+
+
+/**
+ * Entry point shared by the llvmpipe unit test programs.
+ *
+ * Command line:
+ *   -v        increase verbosity (may be repeated)
+ *   -s        run test_single() only
+ *   -o FILE   write results as tab-separated values to FILE
+ *   N         number of random tests for test_some() (default 1000);
+ *             0 selects test_all()
+ *
+ * @return 0 if all tests passed, 1 otherwise (including a missing -o
+ *         argument).
+ */
+int main(int argc, char **argv)
+{
+   unsigned verbose = 0;
+   FILE *fp = NULL;
+   unsigned long n = 1000;
+   int i;
+   boolean success;
+   boolean single = FALSE;
+
+   for(i = 1; i < argc; ++i) {
+      if(strcmp(argv[i], "-v") == 0)
+         ++verbose;
+      else if(strcmp(argv[i], "-s") == 0)
+         single = TRUE;
+      else if(strcmp(argv[i], "-o") == 0) {
+         /* Without this check argv[argc] (NULL) would reach fopen(). */
+         if(i + 1 >= argc) {
+            fprintf(stderr, "error: -o requires a file name\n");
+            if(fp)
+               fclose(fp);
+            return 1;
+         }
+         /* A repeated -o replaces the earlier file; do not leak it. */
+         if(fp)
+            fclose(fp);
+         fp = fopen(argv[++i], "wt");
+         if(!fp)
+            fprintf(stderr, "warning: could not open %s for writing\n", argv[i]);
+      }
+      else
+         n = atoi(argv[i]);
+   }
+
+   lp_build_init();
+
+   util_cpu_detect();
+
+   if(fp) {
+      /* Warm up the caches */
+      test_some(0, NULL, 100);
+
+      write_tsv_header(fp);
+   }
+
+   if (single)
+      success = test_single(verbose, fp);
+   else if (n)
+      success = test_some(verbose, fp, n);
+   else
+      success = test_all(verbose, fp);
+
+   if(fp)
+      fclose(fp);
+
+   return success ? 0 : 1;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_printf.c b/src/gallium/drivers/llvmpipe/lp_test_printf.c
new file mode 100644
index 0000000000..21df83f9d8
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_printf.c
@@ -0,0 +1,175 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "util/u_pointer.h"
+#include "gallivm/lp_bld.h"
+#include "gallivm/lp_bld_printf.h"
+
+#include <llvm-c/Analysis.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "lp_test.h"
+
+
+struct printf_test_case {
+   int foo;   /* placeholder: never read by the code in this file */
+};
+
+/* Emit the column-name row of the TSV report and push it out to the file. */
+void
+write_tsv_header(FILE *fp)
+{
+   static const char header[] = "result\t" "format\n";
+
+   fputs(header, fp);
+   fflush(fp);
+}
+
+
+
+typedef void (*test_printf_t)(int i);
+
+
+/* Build "void test_printf(i32)": two lp_build_printf calls, then return. */
+static LLVMValueRef
+add_printf_test(LLVMModuleRef module)
+{
+   LLVMTypeRef param_types[1] = { LLVMIntType(32) };
+   LLVMTypeRef fn_type = LLVMFunctionType(LLVMVoidType(), param_types, 1, 0);
+   LLVMValueRef fn = LLVMAddFunction(module, "test_printf", fn_type);
+   LLVMValueRef five = LLVMConstInt(LLVMInt32Type(), 5, 0);
+   LLVMValueRef six = LLVMConstInt(LLVMInt32Type(), 6, 0);
+   LLVMBuilderRef b = LLVMCreateBuilder();
+   LLVMSetFunctionCallConv(fn, LLVMCCallConv);
+   LLVMPositionBuilderAtEnd(b, LLVMAppendBasicBlock(fn, "entry"));
+   lp_build_printf(b, "hello, world\n");
+   lp_build_printf(b, "print 5 6: %d %d\n", five, six);
+   LLVMBuildRetVoid(b);
+   LLVMDisposeBuilder(b);
+   return fn;
+}
+
+
+/**
+ * JIT-compile the function built by add_printf_test() and call it once.
+ * The verbose, fp and testcase arguments are accepted but not used here.
+ * Aborts if the module fails verification or the JIT cannot be created.
+ */
+PIPE_ALIGN_STACK
+static boolean
+test_printf(unsigned verbose, FILE *fp, const struct printf_test_case *testcase)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef test = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   test_printf_t jit_func;   /* distinct name: do not shadow test_printf() */
+   boolean success = TRUE;
+   void *code;
+
+   module = LLVMModuleCreateWithName("test");
+
+   test = add_printf_test(module);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   code = LLVMGetPointerToGlobal(engine, test);
+   jit_func = (test_printf_t)pointer_to_func(code);
+
+   // LLVMDumpModule(module);
+
+   /* run the generated code */
+   jit_func(0);
+
+   LLVMFreeMachineCodeForFunction(engine, test);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+
+/* Run every test in this file, reporting failure if the printf test fails. */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   boolean success = TRUE;
+   if (!test_printf(verbose, fp, NULL))
+      success = FALSE;
+   return success;
+}
+
+
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   return test_all(verbose, fp);   /* n is ignored: always runs test_all() */
+}
+
+
+boolean
+test_single(unsigned verbose, FILE *fp)
+{
+   printf("no test_single()");   /* placeholder: only prints a notice, no newline */
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
new file mode 100644
index 0000000000..c7a903a025
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
@@ -0,0 +1,207 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "gallivm/lp_bld.h"
+#include "gallivm/lp_bld_printf.h"
+#include "gallivm/lp_bld_arit.h"
+
+#include <llvm-c/Analysis.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm-c/Target.h>
+#include <llvm-c/Transforms/Scalar.h>
+
+#include "lp_test.h"
+
+
+/* Write the TSV column-name row to fp, then flush it. */
+void
+write_tsv_header(FILE *fp)
+{
+   static const char columns[] = "result\t" "format\n";
+
+   fputs(columns, fp);
+   fflush(fp);
+}
+
+
+#ifdef PIPE_ARCH_SSE
+
+#define USE_SSE2
+#include "sse_mathfun.h"
+
+typedef __m128 (*test_sincos_t)(__m128);
+
+/* Add "v4sf sincos(v4sf)" to module, computing sin or cos of its argument. */
+static LLVMValueRef
+add_sincos_test(LLVMModuleRef module, boolean sin)
+{
+   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
+   LLVMTypeRef args[1] = { v4sf };
+   LLVMValueRef func = LLVMAddFunction(module, "sincos", LLVMFunctionType(v4sf, args, 1, 0));
+   LLVMValueRef arg1 = LLVMGetParam(func, 0);
+   LLVMBuilderRef builder = LLVMCreateBuilder();
+   LLVMBasicBlockRef block = LLVMAppendBasicBlock(func, "entry");
+   LLVMValueRef ret;
+   struct lp_build_context bld;
+
+   memset(&bld, 0, sizeof bld);   /* fields not set below must not be garbage */
+   bld.builder = builder;
+   bld.type.floating = 1;
+   bld.type.width = 32;
+   bld.type.length = 4;
+   LLVMSetFunctionCallConv(func, LLVMCCallConv);
+   LLVMPositionBuilderAtEnd(builder, block);
+   ret = sin ? lp_build_sin(&bld, arg1) : lp_build_cos(&bld, arg1);
+   LLVMBuildRet(builder, ret);
+   LLVMDisposeBuilder(builder);
+   return func;
+}
+
+/* Print the four lanes of value, each as a float and as its raw 32-bit hex. */
+static void
+printv(char* string, v4sf value)
+{
+   const float *flt = (const float *)&value;
+   const uint32_t *raw = (const uint32_t *)&value;
+   printf("%s: %f(%x) %f(%x) %f(%x) %f(%x)\n", string,
+          flt[0], raw[0], flt[1], raw[1], flt[2], raw[2], flt[3], raw[3]);
+}
+
+/**
+ * JIT-compile LLVM sin and cos functions and print their results next to
+ * the sin_ps()/cos_ps() reference values for a fixed four-lane input.
+ * Results are only printed, not compared; this always returns TRUE.
+ */
+PIPE_ALIGN_STACK
+static boolean
+test_sincos(unsigned verbose, FILE *fp)
+{
+   LLVMModuleRef module = NULL;
+   LLVMValueRef test_sin = NULL, test_cos = NULL;
+   LLVMExecutionEngineRef engine = NULL;
+   LLVMModuleProviderRef provider = NULL;
+   LLVMPassManagerRef pass = NULL;
+   char *error = NULL;
+   test_sincos_t sin_func;
+   test_sincos_t cos_func;
+   boolean success = TRUE;
+
+   module = LLVMModuleCreateWithName("test");
+
+   test_sin = add_sincos_test(module, TRUE);
+   test_cos = add_sincos_test(module, FALSE);
+
+   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
+      printf("LLVMVerifyModule: %s\n", error);
+      LLVMDumpModule(module);
+      abort();
+   }
+   LLVMDisposeMessage(error);
+
+   provider = LLVMCreateModuleProviderForExistingModule(module);
+   if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
+      fprintf(stderr, "%s\n", error);
+      LLVMDisposeMessage(error);
+      abort();
+   }
+
+#if 0
+   pass = LLVMCreatePassManager();
+   LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass);
+   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+    * but there are more on SVN. */
+   LLVMAddConstantPropagationPass(pass);
+   LLVMAddInstructionCombiningPass(pass);
+   LLVMAddPromoteMemoryToRegisterPass(pass);
+   LLVMAddGVNPass(pass);
+   LLVMAddCFGSimplificationPass(pass);
+   LLVMRunPassManager(pass, module);
+#else
+   (void)pass;
+#endif
+
+   sin_func = (test_sincos_t)LLVMGetPointerToGlobal(engine, test_sin);
+   cos_func = (test_sincos_t)LLVMGetPointerToGlobal(engine, test_cos);
+
+   // LLVMDumpModule(module);
+   /* print the reference and the JIT-compiled results for the same input */
+   {
+      v4sf src = {3.14159/4.0, -3.14159/4.0, 1.0, -1.0};
+      printv("ref ",sin_ps(src));
+      printv("llvm", sin_func(src));
+      printv("ref ",cos_ps(src));
+      printv("llvm",cos_func(src));
+   }
+
+   LLVMFreeMachineCodeForFunction(engine, test_sin);
+   LLVMFreeMachineCodeForFunction(engine, test_cos);
+
+   LLVMDisposeExecutionEngine(engine);
+   if(pass)
+      LLVMDisposePassManager(pass);
+
+   return success;
+}
+
+#else /* !PIPE_ARCH_SSE */
+
+static boolean
+test_sincos(unsigned verbose, FILE *fp)
+{
+   return TRUE;   /* stub used when PIPE_ARCH_SSE is not defined: nothing is run */
+}
+
+#endif /* !PIPE_ARCH_SSE */
+
+
+/* Run every test in this file, reporting failure if the sincos test fails. */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   boolean success = TRUE;
+   if (!test_sincos(verbose, fp))
+      success = FALSE;
+   return success;
+}
+
+
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   return test_all(verbose, fp);   /* n is ignored: always runs test_all() */
+}
+
+boolean
+test_single(unsigned verbose, FILE *fp)
+{
+   printf("no test_single()");   /* placeholder: only prints a notice, no newline */
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
new file mode 100644
index 0000000000..65208dd5d5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -0,0 +1,219 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ *
+ * This file is nothing more than ugly glue between three largely independent
+ * entities:
+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa)
+ * - texture sampling code generation (i.e., lp_build_sample_soa)
+ * - LLVM pipe driver
+ *
+ * All interesting code is in the functions mentioned above. There is really
+ * nothing to see here.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_sample.h"
+#include "gallivm/lp_bld_tgsi.h"
+#include "lp_jit.h"
+#include "lp_tex_sample.h"
+
+
+/**
+ * This provides the bridge between the sampler state stored in
+ * lp_jit_context and lp_jit_texture and the sampler code
+ * generator. It provides the texture layout information required by
+ * the texture sampler code generator in terms of the state stored in
+ * lp_jit_context and lp_jit_texture at runtime.
+ */
+struct llvmpipe_sampler_dynamic_state
+{
+   struct lp_sampler_dynamic_state base;
+   /* base must stay first: accessors cast the base pointer to this struct. */
+   const struct lp_sampler_static_state *static_state;
+   /* static_state is indexed by unit; context_ptr is the GEP base pointer. */
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct lp_llvm_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+   /* base must stay first: the base pointer is cast back to this struct. */
+   struct llvmpipe_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Address, and optionally load, one field of a texture unit's entry in
+ * the JIT context, i.e. context[0].textures[unit].<member_index>.
+ *
+ * \param base          really a struct llvmpipe_sampler_dynamic_state
+ * \param unit          texture unit; must be below PIPE_MAX_SAMPLERS
+ * \param member_index  index of the field within the texture struct
+ * \param member_name   used only to name the resulting LLVM value
+ * \param emit_load     TRUE to emit a load and return the field's value,
+ *                      FALSE to return the GEP pointer to the field.
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+lp_llvm_texture_member(const struct lp_sampler_dynamic_state *base,
+                       LLVMBuilderRef builder,
+                       unsigned unit,
+                       unsigned member_index,
+                       const char *member_name,
+                       boolean emit_load)
+{
+   struct llvmpipe_sampler_dynamic_state *dyn =
+      (struct llvmpipe_sampler_dynamic_state *)base;
+   LLVMTypeRef i32;
+   LLVMValueRef gep_path[4];
+   LLVMValueRef field;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   /* GEP path: context[0] -> .textures -> [unit] -> .member */
+   i32 = LLVMInt32Type();
+   gep_path[0] = LLVMConstInt(i32, 0, 0);
+   gep_path[1] = LLVMConstInt(i32, LP_JIT_CTX_TEXTURES, 0);
+   gep_path[2] = LLVMConstInt(i32, unit, 0);
+   gep_path[3] = LLVMConstInt(i32, member_index, 0);
+
+   field = LLVMBuildGEP(builder, dyn->context_ptr,
+                        gep_path, Elements(gep_path), "");
+   if (emit_load)
+      field = LLVMBuildLoad(builder, field, "");
+
+   lp_build_name(field, "context.texture%u.%s", unit, member_name);
+   return field;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to
+ * fetch the members of lp_jit_texture for the sampler code generator.
+ * Each expands to lp_llvm_texture_<_name>(), a thin wrapper over
+ * lp_llvm_texture_member(); _emit_load FALSE yields the member's address.
+ * This complexity is the price we have to pay to keep the texture
+ * sampler code generator a reusable module without dependencies on
+ * llvmpipe internals.
+ */
+#define LP_LLVM_TEXTURE_MEMBER(_name, _index, _emit_load)  \
+   static LLVMValueRef \
+   lp_llvm_texture_##_name( const struct lp_sampler_dynamic_state *base, \
+                            LLVMBuilderRef builder, \
+                            unsigned unit) \
+   { \
+      return lp_llvm_texture_member(base, builder, unit, _index, #_name, _emit_load ); \
+   }
+
+
+LP_LLVM_TEXTURE_MEMBER(width,      LP_JIT_TEXTURE_WIDTH, TRUE)
+LP_LLVM_TEXTURE_MEMBER(height,     LP_JIT_TEXTURE_HEIGHT, TRUE)
+LP_LLVM_TEXTURE_MEMBER(depth,      LP_JIT_TEXTURE_DEPTH, TRUE)
+LP_LLVM_TEXTURE_MEMBER(last_level, LP_JIT_TEXTURE_LAST_LEVEL, TRUE)
+LP_LLVM_TEXTURE_MEMBER(row_stride, LP_JIT_TEXTURE_ROW_STRIDE, FALSE)
+LP_LLVM_TEXTURE_MEMBER(img_stride, LP_JIT_TEXTURE_IMG_STRIDE, FALSE)
+LP_LLVM_TEXTURE_MEMBER(data_ptr,   LP_JIT_TEXTURE_DATA, FALSE)
+
+
+static void
+lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);   /* frees the enclosing lp_llvm_sampler_soa (base is its first member) */
+}
+
+
+/**
+ * Fetch filtered values from texture.
+ * Forwards the request to lp_build_sample_soa(), passing this unit's
+ * static sampler state together with the dynamic-state accessors.
+ * lod_bias and explicit_lod are optional.
+ * The 'texel' parameter returns four vectors corresponding to R, G, B, A.
+ */
+static void
+lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
+                                     LLVMBuilderRef builder,
+                                     struct lp_type type,
+                                     unsigned unit,
+                                     unsigned num_coords,
+                                     const LLVMValueRef *coords,
+                                     const LLVMValueRef *ddx,
+                                     const LLVMValueRef *ddy,
+                                     LLVMValueRef lod_bias, /* optional */
+                                     LLVMValueRef explicit_lod, /* optional */
+                                     LLVMValueRef *texel)
+{
+   struct lp_llvm_sampler_soa *self = (struct lp_llvm_sampler_soa *)base;
+   struct llvmpipe_sampler_dynamic_state *dyn = &self->dynamic_state;
+   const struct lp_sampler_static_state *unit_state;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+   unit_state = &dyn->static_state[unit];
+
+   lp_build_sample_soa(builder, unit_state, &dyn->base, type, unit,
+                       num_coords, coords, ddx, ddy,
+                       lod_bias, explicit_lod, texel);
+}
+
+
+/* Allocate a sampler bridge wired to the texture-member accessors; NULL on failure. */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                           LLVMValueRef context_ptr)
+{
+   struct lp_llvm_sampler_soa *self = CALLOC_STRUCT(lp_llvm_sampler_soa);
+   struct lp_sampler_dynamic_state *accessors;
+
+   if (self == NULL)
+      return NULL;
+   self->dynamic_state.static_state = static_state;
+   self->dynamic_state.context_ptr = context_ptr;
+   accessors = &self->dynamic_state.base;
+   accessors->width = lp_llvm_texture_width;
+   accessors->height = lp_llvm_texture_height;
+   accessors->depth = lp_llvm_texture_depth;
+   accessors->last_level = lp_llvm_texture_last_level;
+   accessors->row_stride = lp_llvm_texture_row_stride;
+   accessors->img_stride = lp_llvm_texture_img_stride;
+   accessors->data_ptr = lp_llvm_texture_data_ptr;
+   self->base.destroy = lp_llvm_sampler_soa_destroy;
+   self->base.emit_fetch_texel = lp_llvm_sampler_soa_emit_fetch_texel;
+   return &self->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
new file mode 100644
index 0000000000..1228a831f3
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TEX_SAMPLE_H
+#define LP_TEX_SAMPLE_H
+
+
+#include "gallivm/lp_bld.h"
+
+
+struct lp_sampler_static_state;
+
+
+/**
+ * Pure-LLVM texture sampling code generator.
+ * @param static_state array of static sampler state, indexed by sampler unit.
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                           LLVMValueRef context_ptr);
+
+
+#endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
new file mode 100644
index 0000000000..0d526ead89
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -0,0 +1,1288 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  */
+
+#include <stdio.h>
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_transfer.h"
+
+#include "lp_context.h"
+#include "lp_flush.h"
+#include "lp_screen.h"
+#include "lp_tile_image.h"
+#include "lp_texture.h"
+#include "lp_setup.h"
+
+#include "state_tracker/sw_winsys.h"
+
+
+#ifdef DEBUG
+static struct llvmpipe_resource resource_list;   /* head of the list of live resources */
+#endif
+
+
+/* TRUE for 1D/2D/3D/cube targets, FALSE for buffers; other targets assert. */
+static INLINE boolean
+resource_is_texture(const struct pipe_resource *resource)
+{
+   const unsigned target = resource->target;
+
+   if (target == PIPE_BUFFER)
+      return FALSE;
+
+   if (target == PIPE_TEXTURE_1D || target == PIPE_TEXTURE_2D ||
+       target == PIPE_TEXTURE_3D || target == PIPE_TEXTURE_CUBE)
+      return TRUE;
+
+   assert(0);   /* unknown target */
+   return FALSE;
+}
+
+
+
+/**
+ * Allocate zeroed storage for the layout flag array of one mipmap level.
+ * The number of elements is num_slices * width_in_tiles * height_in_tiles.
+ */
+static enum lp_texture_layout *
+alloc_layout_array(unsigned num_slices, unsigned width, unsigned height)
+{
+   const unsigned tiles_x = align(width, TILE_SIZE) / TILE_SIZE;
+   const unsigned tiles_y = align(height, TILE_SIZE) / TILE_SIZE;
+   const unsigned count = num_slices * tiles_x * tiles_y;
+
+   assert(count > 0);
+   assert(LP_TEX_LAYOUT_NONE == 0); /* calloc'ing LP_TEX_LAYOUT_NONE here */
+   return (enum lp_texture_layout *)
+      CALLOC(count, sizeof(enum lp_texture_layout));
+}
+
+
+
+/**
+ * Conventional allocation path for non-display textures:
+ * Just compute row strides here.  Storage is allocated on demand later.
+ * Returns FALSE, with the layout arrays of earlier levels released, when
+ * a per-level layout array cannot be allocated.
+ */
+static boolean
+llvmpipe_texture_layout(struct llvmpipe_screen *screen,
+                        struct llvmpipe_resource *lpr)
+{
+   struct pipe_resource *pt = &lpr->base;
+   unsigned level;
+   unsigned width = pt->width0;
+   unsigned height = pt->height0;
+   unsigned depth = pt->depth0;
+
+   assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
+   assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
+
+   for (level = 0; level <= pt->last_level; level++) {
+      /* Row stride and image stride (for linear layout) */
+      {
+         unsigned alignment, nblocksx, nblocksy, block_size;
+         /* For non-compressed formats we need to align the texture size
+          * to the tile size to facilitate render-to-texture.
+          */
+         alignment = util_format_is_compressed(pt->format) ? 1 : TILE_SIZE;
+         nblocksx = util_format_get_nblocksx(pt->format,
+                                             align(width, alignment));
+         nblocksy = util_format_get_nblocksy(pt->format,
+                                             align(height, alignment));
+         block_size = util_format_get_blocksize(pt->format);
+         lpr->row_stride[level] = align(nblocksx * block_size, 16);
+         lpr->img_stride[level] = lpr->row_stride[level] * nblocksy;
+      }
+
+      /* Size of the image in tiles (for tiled layout) */
+      {
+         const unsigned width_t = align(width, TILE_SIZE) / TILE_SIZE;
+         const unsigned height_t = align(height, TILE_SIZE) / TILE_SIZE;
+         lpr->tiles_per_row[level] = width_t;
+         lpr->tiles_per_image[level] = width_t * height_t;
+      }
+
+      /* Number of 3D image slices or cube faces */
+      {
+         unsigned num_slices;
+         if (lpr->base.target == PIPE_TEXTURE_CUBE)
+            num_slices = 6;
+         else if (lpr->base.target == PIPE_TEXTURE_3D)
+            num_slices = depth;
+         else
+            num_slices = 1;
+         lpr->num_slices_faces[level] = num_slices;
+         lpr->layout[level] = alloc_layout_array(num_slices, width, height);
+         if (!lpr->layout[level])
+            goto fail;
+      }
+
+      /* Compute size of next mipmap level */
+      width = u_minify(width, 1);
+      height = u_minify(height, 1);
+      depth = u_minify(depth, 1);
+   }
+   return TRUE;
+
+ fail:
+   /* Undo the levels already set up so the caller has nothing to free. */
+   while (level-- > 0) {
+      FREE(lpr->layout[level]);
+      lpr->layout[level] = NULL;
+   }
+   return FALSE;
+}
+
+
+
+/* Set up level 0 of a display-target resource; FALSE if allocation fails. */
+static boolean
+llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
+                              struct llvmpipe_resource *lpr)
+{
+   struct sw_winsys *winsys = screen->winsys;
+
+   /* Round up the surface size to a multiple of the tile size to
+    * avoid tile clipping. */
+   const unsigned width = align(lpr->base.width0, TILE_SIZE);
+   const unsigned height = align(lpr->base.height0, TILE_SIZE);
+   const unsigned width_t = width / TILE_SIZE;   /* already tile-aligned */
+   const unsigned height_t = height / TILE_SIZE;
+
+   lpr->tiles_per_row[0] = width_t;
+   lpr->tiles_per_image[0] = width_t * height_t;
+   lpr->num_slices_faces[0] = 1;
+   lpr->img_stride[0] = 0;
+   lpr->layout[0] = alloc_layout_array(1, width, height);
+   if (!lpr->layout[0])
+      return FALSE;
+   lpr->dt = winsys->displaytarget_create(winsys, lpr->base.bind,
+                                          lpr->base.format, width, height,
+                                          16, &lpr->row_stride[0]);
+   if (!lpr->dt) {
+      /* Release the flag array so a failed creation does not leak it. */
+      FREE(lpr->layout[0]);
+      lpr->layout[0] = NULL;
+   }
+   return lpr->dt != NULL;
+}
+
+
+/* Create a display target, texture or buffer resource; NULL on failure. */
+static struct pipe_resource *
+llvmpipe_resource_create(struct pipe_screen *_screen,
+                         const struct pipe_resource *templat)
+{
+   static unsigned id_counter = 0;
+   struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
+   struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource);
+   unsigned level;
+   if (!lpr)
+      return NULL;
+
+   lpr->base = *templat;
+   pipe_reference_init(&lpr->base.reference, 1);
+   lpr->base.screen = &screen->base;
+
+   if (resource_is_texture(&lpr->base)) {
+      if (lpr->base.bind & PIPE_BIND_DISPLAY_TARGET) {   /* displayable surface */
+         if (!llvmpipe_displaytarget_layout(screen, lpr))
+            goto fail;
+      }
+      else {                                             /* texture map */
+         if (!llvmpipe_texture_layout(screen, lpr))
+            goto fail;
+      }
+      if (!lpr->layout[0])   /* check the pointer before dereferencing it */
+         goto fail;
+      assert(lpr->layout[0][0] == LP_TEX_LAYOUT_NONE);
+   }
+   else {
+      /* other data (vertex buffer, const buffer, etc); size in format blocks */
+      const enum pipe_format format = templat->format;
+      const uint w = templat->width0 / util_format_get_blockwidth(format);
+      const uint h = templat->height0 / util_format_get_blockheight(format);
+      const uint d = templat->depth0;
+      const uint bpp = util_format_get_blocksize(format);
+      const uint bytes = w * h * d * bpp;
+      lpr->data = align_malloc(bytes, 16);
+      if (!lpr->data)
+         goto fail;
+   }
+
+   lpr->id = id_counter++;
+#ifdef DEBUG
+   insert_at_tail(&resource_list, lpr);
+#endif
+   return &lpr->base;
+
+ fail:
+   if (lpr->dt)
+      screen->winsys->displaytarget_destroy(screen->winsys, lpr->dt);
+   for (level = 0; level < Elements(lpr->layout); level++)
+      FREE(lpr->layout[level]);   /* entries never allocated are NULL */
+   FREE(lpr);
+   return NULL;
+}
+
+
+/* Free a resource together with the storage it owns: the display target,
+ * or the texture images and layout flags, or non-user buffer data. */
+static void
+llvmpipe_resource_destroy(struct pipe_screen *pscreen,
+                          struct pipe_resource *pt)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(pscreen);
+   struct llvmpipe_resource *lpr = llvmpipe_resource(pt);
+   uint i;
+
+   if (lpr->dt) {
+      /* display target */
+      struct sw_winsys *ws = screen->winsys;
+
+      ws->displaytarget_destroy(ws, lpr->dt);
+      if (lpr->tiled[0].data) {
+         align_free(lpr->tiled[0].data);
+         lpr->tiled[0].data = NULL;
+      }
+      FREE(lpr->layout[0]);
+   }
+   else if (resource_is_texture(pt)) {
+      /* regular texture: linear image data first */
+      for (i = 0; i < Elements(lpr->linear); i++) {
+         if (!lpr->linear[i].data)
+            continue;
+         align_free(lpr->linear[i].data);
+         lpr->linear[i].data = NULL;
+      }
+
+      /* then tiled image data */
+      for (i = 0; i < Elements(lpr->tiled); i++) {
+         if (!lpr->tiled[i].data)
+            continue;
+         align_free(lpr->tiled[i].data);
+         lpr->tiled[i].data = NULL;
+      }
+
+      /* and finally the per-level layout flag arrays */
+      for (i = 0; i < Elements(lpr->tiled); i++) {
+         FREE(lpr->layout[i]);
+         lpr->layout[i] = NULL;
+      }
+   }
+   else if (!lpr->userBuffer) {
+      /* buffer storage that is not a user buffer */
+      assert(lpr->data);
+      align_free(lpr->data);
+   }
+
+#ifdef DEBUG
+   if (lpr->next)
+      remove_from_list(lpr);
+#endif
+
+   FREE(lpr);
+}
+
+
+/**
+ * Map a resource for read/write.
+ * Display targets are mapped through the winsys and then passed through
+ * llvmpipe_get_texture_image(); textures go straight to
+ * llvmpipe_get_texture_image(); any other resource returns lpr->data.
+ *
+ * \param face, level, zslice  select the image within the resource
+ * \param tex_usage  LP_TEX_USAGE_READ, _READ_WRITE or _WRITE_ALL
+ * \param layout     LP_TEX_LAYOUT_NONE, _TILED or _LINEAR
+ *
+ * \return pointer to the image or buffer data
+ */
+void *
+llvmpipe_resource_map(struct pipe_resource *resource,
+                      unsigned face,
+                      unsigned level,
+                      unsigned zslice,
+                      enum lp_texture_usage tex_usage,
+                      enum lp_texture_layout layout)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+   uint8_t *image;
+
+   assert(face < 6);
+   assert(level < LP_MAX_TEXTURE_LEVELS);
+
+   assert(tex_usage == LP_TEX_USAGE_READ ||
+          tex_usage == LP_TEX_USAGE_READ_WRITE ||
+          tex_usage == LP_TEX_USAGE_WRITE_ALL);
+
+   assert(layout == LP_TEX_LAYOUT_NONE ||
+          layout == LP_TEX_LAYOUT_TILED ||
+          layout == LP_TEX_LAYOUT_LINEAR);
+
+   if (lpr->dt) {
+      /* display target */
+      struct llvmpipe_screen *screen = llvmpipe_screen(resource->screen);
+      struct sw_winsys *ws = screen->winsys;
+      const unsigned dt_usage = (tex_usage == LP_TEX_USAGE_READ)
+         ? PIPE_TRANSFER_READ : PIPE_TRANSFER_READ_WRITE;
+      uint8_t *dt_map;
+
+      assert(face == 0);
+      assert(level == 0);
+      assert(zslice == 0);
+
+      /* FIXME: keep map count? */
+      dt_map = ws->displaytarget_map(ws, lpr->dt, dt_usage);
+
+      /* install this linear image in texture data structure */
+      lpr->linear[level].data = dt_map;
+
+      /* make sure tiled data gets converted to linear data */
+      image = llvmpipe_get_texture_image(lpr, 0, 0, tex_usage, layout);
+      if (layout == LP_TEX_LAYOUT_LINEAR)
+         assert(dt_map == image);
+
+      return image;
+   }
+
+   /* buffers and other non-texture resources */
+   if (!resource_is_texture(resource))
+      return lpr->data;
+
+   /* regular texture */
+   /* only cube maps have faces and only 3D textures have slices */
+   assert(face == 0 || resource->target == PIPE_TEXTURE_CUBE);
+   assert(zslice == 0 || resource->target == PIPE_TEXTURE_3D);
+
+   image = llvmpipe_get_texture_image(lpr, face + zslice, level,
+                                      tex_usage, layout);
+   assert(image);
+   return image;
+}
+
+
+/**
+ * Undo llvmpipe_resource_map().  Only display targets need any work: their
+ * linear image is brought up to date before the winsys mapping is released.
+ */
+void
+llvmpipe_resource_unmap(struct pipe_resource *resource,
+                       unsigned face, unsigned level, unsigned zslice)
+{
+   struct llvmpipe_resource *tex = llvmpipe_resource(resource);
+   struct sw_winsys *ws;
+
+   if (!tex->dt) {
+      /* nothing is done for resources without a display target */
+      return;
+   }
+
+   /* display targets only have a single face/level/slice */
+   assert(face == 0);
+   assert(level == 0);
+   assert(zslice == 0);
+
+   ws = llvmpipe_screen(resource->screen)->winsys;
+
+   /* refresh the linear copy; the returned pointer is not needed */
+   (void) llvmpipe_get_texture_image(tex, face + zslice, level,
+                                     LP_TEX_USAGE_READ, LP_TEX_LAYOUT_LINEAR);
+   ws->displaytarget_unmap(ws, tex->dt);
+}
+
+
+/** Return the data pointer of a non-texture resource (no mapping done). */
+void *
+llvmpipe_resource_data(struct pipe_resource *resource)
+{
+   /* textures keep their pixels in per-level images, not in ->data */
+   assert(!resource_is_texture(resource));
+
+   return llvmpipe_resource(resource)->data;
+}
+
+
+/**
+ * Wrap a winsys handle in a new llvmpipe resource: a copy of \p template
+ * backed by the display target the winsys creates for \p whandle.
+ * Returns NULL if allocation or the winsys import fails.
+ */
+static struct pipe_resource *
+llvmpipe_resource_from_handle(struct pipe_screen *screen,
+                              const struct pipe_resource *template,
+                              struct winsys_handle *whandle)
+{
+   struct sw_winsys *ws = llvmpipe_screen(screen)->winsys;
+   struct llvmpipe_resource *res = CALLOC_STRUCT(llvmpipe_resource);
+   if (res == NULL)
+      return NULL;
+
+   res->base = *template;
+   pipe_reference_init(&res->base.reference, 1);
+   res->base.screen = screen;
+
+   res->dt = ws->displaytarget_from_handle(ws, template, whandle,
+                                           &res->row_stride[0]);
+   if (res->dt != NULL)
+      return &res->base;
+
+   FREE(res);
+   return NULL;
+}
+
+
+/** Export the display target behind \p pt as a winsys handle. */
+static boolean
+llvmpipe_resource_get_handle(struct pipe_screen *screen,
+                            struct pipe_resource *pt,
+                            struct winsys_handle *whandle)
+{
+   struct llvmpipe_resource *res = llvmpipe_resource(pt);
+   struct sw_winsys *ws = llvmpipe_screen(screen)->winsys;
+
+   /* only display targets can be exported; fail softly in release builds */
+   assert(res->dt);
+   return res->dt ? ws->displaytarget_get_handle(ws, res->dt, whandle)
+                  : FALSE;
+}
+
+
+/** Create a surface (holding a texture reference) for one image of \p pt. */
+static struct pipe_surface *
+llvmpipe_get_tex_surface(struct pipe_screen *screen,
+                         struct pipe_resource *pt,
+                         unsigned face, unsigned level, unsigned zslice,
+                         unsigned usage)
+{
+   struct pipe_surface *surf;
+
+   assert(level <= pt->last_level);
+   surf = CALLOC_STRUCT(pipe_surface);
+   if (!surf)
+      return NULL;
+
+   pipe_reference_init(&surf->reference, 1);
+   pipe_resource_reference(&surf->texture, pt);
+   surf->usage = usage;
+   surf->format = pt->format;
+   surf->face = face;
+   surf->level = level;
+   surf->zslice = zslice;
+   surf->width = u_minify(pt->width0, level);
+   surf->height = u_minify(pt->height0, level);
+   return surf;
+}
+
+
+static void 
+llvmpipe_tex_surface_destroy(struct pipe_surface *surf)
+{
+   /* Destroy a surface: drop its reference on the texture, then free it.
+    * If texture images needed post-processing to put them into hardware
+    * layout, this is where it would happen.  For llvmpipe, nothing to do.
+    */
+   assert(surf->texture);
+   pipe_resource_reference(&surf->texture, NULL);
+   FREE(surf);
+}
+
+
+/**
+ * Create a transfer object for one level of \p resource, flushing the
+ * context first unless PIPE_TRANSFER_UNSYNCHRONIZED is set.  Returns NULL
+ * on allocation failure or if the flush would block with DONTBLOCK set.
+ */
+static struct pipe_transfer *
+llvmpipe_get_transfer(struct pipe_context *pipe,
+                      struct pipe_resource *resource,
+                      struct pipe_subresource sr,
+                      unsigned usage,
+                      const struct pipe_box *box)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+   struct llvmpipe_transfer *xfer;
+
+   assert(resource);
+   assert(sr.level <= resource->last_level);
+
+   /*
+    * Transfers are ordered with other pipe operations: flush if needed.
+    */
+   if ((usage & PIPE_TRANSFER_UNSYNCHRONIZED) == 0) {
+      const boolean writes = (usage & PIPE_TRANSFER_WRITE) != 0;
+      const boolean may_block = (usage & PIPE_TRANSFER_DONTBLOCK) == 0;
+      if (!llvmpipe_flush_resource(pipe, resource, sr.face, sr.level,
+                                   0,             /* flush_flags */
+                                   !writes,       /* read_only */
+                                   TRUE,          /* cpu_access */
+                                   !may_block)) { /* do_not_block */
+         /* It would have blocked, but the state tracker asked us not to. */
+         assert(!may_block);
+         return NULL;
+      }
+   }
+
+   xfer = CALLOC_STRUCT(llvmpipe_transfer);
+   if (!xfer)
+      return NULL;
+
+   pipe_resource_reference(&xfer->base.resource, resource);
+   xfer->base.box = *box;
+   xfer->base.sr = sr;
+   xfer->base.usage = usage;
+   xfer->base.stride = lpr->row_stride[sr.level];
+   xfer->base.slice_stride = lpr->img_stride[sr.level];
+
+   return &xfer->base;
+}
+
+
+static void 
+llvmpipe_transfer_destroy(struct pipe_context *pipe,
+                              struct pipe_transfer *transfer)
+{
+   /* Destroy a transfer: drop its reference on the resource, then free it.
+    * If texture images needed post-processing to put them into hardware
+    * layout, this is where it would happen.  For llvmpipe, nothing to do.
+    */
+   assert (transfer->resource);
+   pipe_resource_reference(&transfer->resource, NULL);
+   FREE(transfer);
+}
+
+
+/**
+ * Map a transfer and return a pointer to the first byte of its box.
+ *
+ * The resource is always mapped in linear layout.  A transfer whose usage
+ * is exactly PIPE_TRANSFER_READ is mapped read-only, anything else is
+ * mapped read/write.
+ * Mapping for write also bumps screen->timestamp.
+ *
+ * \return pointer to the texel/block at (box.x, box.y) of the mapped
+ *         image, or NULL if the resource could not be mapped
+ */
+static void *
+llvmpipe_transfer_map( struct pipe_context *pipe,
+                       struct pipe_transfer *transfer )
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen);
+   ubyte *map;
+   struct llvmpipe_resource *lpr;
+   enum pipe_format format;
+   enum lp_texture_usage tex_usage;
+
+   assert(transfer->resource);
+   assert(transfer->sr.face < 6);
+   assert(transfer->sr.level < LP_MAX_TEXTURE_LEVELS);
+
+   if (transfer->usage == PIPE_TRANSFER_READ) {
+      tex_usage = LP_TEX_USAGE_READ;
+   }
+   else {
+      tex_usage = LP_TEX_USAGE_READ_WRITE;
+   }
+
+   lpr = llvmpipe_resource(transfer->resource);
+   format = lpr->base.format;
+
+   map = llvmpipe_resource_map(transfer->resource,
+                               transfer->sr.face,
+                               transfer->sr.level,
+                               transfer->box.z,
+                               tex_usage, LP_TEX_LAYOUT_LINEAR);
+   if (!map) {
+      /* Mapping failed.  Don't add the box offset below: that would turn
+       * the NULL into a bogus non-NULL pointer the caller can't detect.
+       */
+      return NULL;
+   }
+
+   /* May want to do different things here depending on read/write nature
+    * of the map:
+    */
+   if (transfer->usage & PIPE_TRANSFER_WRITE) {
+      /* Do something to notify sharing contexts of a texture change.
+       */
+      screen->timestamp++;
+   }
+
+   /* advance to the box origin, measured in format blocks */
+   map +=
+      transfer->box.y / util_format_get_blockheight(format) * transfer->stride +
+      transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+
+   return map;
+}
+
+
+/** Unmap a transfer previously mapped with llvmpipe_transfer_map(). */
+static void
+llvmpipe_transfer_unmap(struct pipe_context *pipe,
+                        struct pipe_transfer *transfer)
+{
+   const struct pipe_subresource sr = transfer->sr;
+
+   assert(transfer->resource);
+   llvmpipe_resource_unmap(transfer->resource, sr.face, sr.level,
+                           transfer->box.z);
+}
+
+/** Report whether the setup module still references \p presource. */
+static unsigned int
+llvmpipe_is_resource_referenced( struct pipe_context *pipe,
+                                 struct pipe_resource *presource,
+                                 unsigned face, unsigned level)
+{
+   /* buffers are always reported as unreferenced */
+   return (presource->target == PIPE_BUFFER)
+      ? PIPE_UNREFERENCED
+      : lp_setup_is_resource_referenced(llvmpipe_context(pipe)->setup,
+                                        presource);
+}
+
+
+
+/**
+ * Create a buffer resource wrapping caller-owned memory (nothing is copied).
+ */
+static struct pipe_resource *
+llvmpipe_user_buffer_create(struct pipe_screen *screen,
+                            void *ptr,
+                            unsigned bytes,
+                            unsigned bind_flags)
+{
+   struct llvmpipe_resource *ubuf = CALLOC_STRUCT(llvmpipe_resource);
+   struct pipe_resource *base;
+
+   if (ubuf == NULL)
+      return NULL;
+
+   ubuf->userBuffer = TRUE;
+   ubuf->data = ptr;
+   base = &ubuf->base;
+   pipe_reference_init(&base->reference, 1);
+   base->screen = screen;
+   base->format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   base->bind = bind_flags;
+   base->usage = PIPE_USAGE_IMMUTABLE;
+   base->flags = 0;
+   base->width0 = bytes;
+   base->height0 = 1;
+   base->depth0 = 1;
+   return base;
+}
+
+
+/**
+ * Size in bytes of a single 2D image (one cube face or one 3D slice) of
+ * the given mipmap level, in either tiled or linear layout.
+ *
+ * \param layout  LP_TEX_LAYOUT_TILED or LP_TEX_LAYOUT_LINEAR
+ */
+static unsigned
+tex_image_face_size(const struct llvmpipe_resource *lpr, unsigned level,
+                    enum lp_texture_layout layout)
+{
+   /* tiled images always use a 32bpp format, whatever the resource format */
+   const enum pipe_format tiled_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+   unsigned padded_w, padded_h;
+
+   assert(layout == LP_TEX_LAYOUT_TILED ||
+          layout == LP_TEX_LAYOUT_LINEAR);
+
+   if (layout != LP_TEX_LAYOUT_TILED) {
+      /* linear: we already computed this */
+      return lpr->img_stride[level];
+   }
+
+   /* tiled: round the level's dimensions up to whole tiles */
+   padded_w = align(u_minify(lpr->base.width0, level), TILE_SIZE);
+   padded_h = align(u_minify(lpr->base.height0, level), TILE_SIZE);
+
+   return util_format_get_blocksize(tiled_format)
+      * util_format_get_nblocksy(tiled_format, padded_h)
+      * util_format_get_nblocksx(tiled_format, padded_w);
+}
+
+
+/**
+ * Size in bytes of a whole mipmap level: the size of one face/slice
+ * multiplied by the number of cube faces or 3D slices in that level.
+ */
+static unsigned
+tex_image_size(const struct llvmpipe_resource *lpr, unsigned level,
+               enum lp_texture_layout layout)
+{
+   const unsigned num_images = lpr->num_slices_faces[level];
+   return tex_image_face_size(lpr, level, layout) * num_images;
+}
+
+
+/**
+ * Decide what has to happen to a tile before it can be accessed in the
+ * requested layout: whether its data must be converted from the other
+ * layout, and which layout(s) the tile will be valid in afterwards.
+ * Nothing is converted here; the caller acts on the returned values.
+ *
+ * \param cur_layout  the current tile layout
+ * \param target_layout  the desired tile layout (linear or tiled)
+ * \param usage  how the tile will be accessed (R/W vs. read-only, etc)
+ * \param new_layout_return  returns the new layout mode
+ * \param convert  returns TRUE if image conversion is needed
+ */
+static void
+layout_logic(enum lp_texture_layout cur_layout,
+             enum lp_texture_layout target_layout,
+             enum lp_texture_usage usage,
+             enum lp_texture_layout *new_layout_return,
+             boolean *convert)
+{
+   const boolean read_only = (usage == LP_TEX_USAGE_READ);
+   const enum lp_texture_layout opposite =
+      (target_layout == LP_TEX_LAYOUT_LINEAR) ? LP_TEX_LAYOUT_TILED
+                                              : LP_TEX_LAYOUT_LINEAR;
+
+   /* unless stated otherwise the tile ends up valid in the target layout */
+   enum lp_texture_layout result = target_layout;
+   boolean need_convert = FALSE;
+
+   assert(target_layout == LP_TEX_LAYOUT_LINEAR ||
+          target_layout == LP_TEX_LAYOUT_TILED);
+
+   if (cur_layout == LP_TEX_LAYOUT_BOTH) {
+      /* both copies are valid; a read keeps them that way */
+      if (read_only)
+         result = LP_TEX_LAYOUT_BOTH;
+   }
+   else if (cur_layout == opposite) {
+      /* Only the other layout holds valid data.  It must be converted
+       * unless the caller is going to overwrite everything anyway.
+       */
+      need_convert = (usage != LP_TEX_USAGE_WRITE_ALL);
+
+      /* after converting for a read, both copies are valid */
+      if (read_only)
+         result = LP_TEX_LAYOUT_BOTH;
+   }
+   else {
+      assert(cur_layout == LP_TEX_LAYOUT_NONE ||
+             cur_layout == target_layout);
+   }
+
+   assert(result == LP_TEX_LAYOUT_BOTH ||
+          result == target_layout);
+
+   *convert = need_convert;
+   *new_layout_return = result;
+}
+
+
+/**
+ * Compute the address of one 2D image (face or slice) inside the storage
+ * of a mipmap level, for the given layout.
+ * The data is returned as is: no tiled/linear conversion is done and
+ * nothing is allocated.
+ */
+ubyte *
+llvmpipe_get_texture_image_address(struct llvmpipe_resource *lpr,
+                                   unsigned face_slice, unsigned level,
+                                   enum lp_texture_layout layout)
+{
+   ubyte *level_base;
+
+   assert(layout == LP_TEX_LAYOUT_LINEAR ||
+          layout == LP_TEX_LAYOUT_TILED);
+
+   /* anything that isn't linear is treated as tiled */
+   level_base = (ubyte *) ((layout == LP_TEX_LAYOUT_LINEAR)
+                           ? lpr->linear[level].data
+                           : lpr->tiled[level].data);
+
+   /* face/slice 0 starts at the beginning of the level */
+   if (face_slice == 0)
+      return level_base;
+
+   return level_base + face_slice * tex_image_face_size(lpr, level, layout);
+}
+
+
+/** Look up the layout flag of tile (x, y), in tile units, of one image. */
+static INLINE enum lp_texture_layout
+llvmpipe_get_texture_tile_layout(const struct llvmpipe_resource *lpr,
+                                 unsigned face_slice, unsigned level,
+                                 unsigned x, unsigned y)
+{
+   const uint row_tiles = lpr->tiles_per_row[level];
+   const uint first = face_slice * lpr->tiles_per_image[level];
+   assert(resource_is_texture(&lpr->base));
+   assert(x < row_tiles);
+   return lpr->layout[level][first + y * row_tiles + x];
+}
+
+
+/** Record the layout flag of tile (x, y), in tile units, of one image. */
+static INLINE void
+llvmpipe_set_texture_tile_layout(struct llvmpipe_resource *lpr,
+                                 unsigned face_slice, unsigned level,
+                                 unsigned x, unsigned y,
+                                 enum lp_texture_layout layout)
+{
+   const uint row_tiles = lpr->tiles_per_row[level];
+   const uint first = face_slice * lpr->tiles_per_image[level];
+   assert(resource_is_texture(&lpr->base));
+   assert(x < row_tiles);
+   lpr->layout[level][first + y * row_tiles + x] = layout;
+}
+
+
+/**
+ * Stamp every tile of one image (face/slice) with the same layout mode.
+ * \param width_t, height_t  image size measured in tiles
+ */
+static INLINE void
+llvmpipe_set_texture_image_layout(struct llvmpipe_resource *lpr,
+                                  unsigned face_slice, unsigned level,
+                                  unsigned width_t, unsigned height_t,
+                                  enum lp_texture_layout layout)
+{
+   unsigned idx = face_slice * lpr->tiles_per_image[level];
+   unsigned remaining = width_t * height_t;
+
+   for (; remaining > 0; remaining--, idx++)
+      lpr->layout[level][idx] = layout;
+}
+
+
+/**
+ * Provide storage for one mipmap level (all cube faces / 3D slices) in
+ * the given layout.  The resulting pointer is stored in lpr, unchecked.
+ */
+static void
+alloc_image_data(struct llvmpipe_resource *lpr, unsigned level,
+                 enum lp_texture_layout layout)
+{
+   struct sw_winsys *ws;
+
+   /* display targets only have a single level */
+   assert(!lpr->dt || level == 0);
+
+   if (layout == LP_TEX_LAYOUT_TILED) {
+      /* tiled data always lives in regular memory */
+      lpr->tiled[level].data =
+         align_malloc(tex_image_size(lpr, level, layout), 16);
+      return;
+   }
+
+   assert(layout == LP_TEX_LAYOUT_LINEAR);
+
+   if (!lpr->dt) {
+      /* not a display target - allocate regular memory */
+      lpr->linear[level].data =
+         align_malloc(tex_image_size(lpr, level, LP_TEX_LAYOUT_LINEAR), 16);
+      return;
+   }
+
+   /* display target: we get the linear memory from the winsys */
+   ws = llvmpipe_screen(lpr->base.screen)->winsys;
+   lpr->linear[0].data =
+      ws->displaytarget_map(ws, lpr->dt, PIPE_TRANSFER_READ_WRITE);
+}
+
+
+
+/**
+ * Return pointer to texture image data (either linear or tiled layout)
+ * for a particular cube face or 3D texture slice.  Storage for the
+ * requested layout is allocated on first use and, where needed, tiles
+ * are converted from the other layout according to \p usage.
+ *
+ * \param face_slice  the cube face or 3D slice of interest
+ * \param usage  one of LP_TEX_USAGE_READ/WRITE_ALL/READ_WRITE
+ * \param layout  either LP_TEX_LAYOUT_LINEAR or _TILED or _NONE
+ *                (_NONE: just allocate tiled storage, convert nothing)
+ * \return pointer to the image, or NULL if its storage can't be allocated
+ */
+void *
+llvmpipe_get_texture_image(struct llvmpipe_resource *lpr,
+                           unsigned face_slice, unsigned level,
+                           enum lp_texture_usage usage,
+                           enum lp_texture_layout layout)
+{
+   /*
+    * 'target' refers to the image which we're retrieving (either in
+    * tiled or linear layout).
+    * 'other' refers to the same image but in the other layout (it may
+    *  or may not exist).
+    */
+   struct llvmpipe_texture_image *target_img;
+   struct llvmpipe_texture_image *other_img;
+   void *target_data;
+   void *other_data;
+   const unsigned width = u_minify(lpr->base.width0, level);
+   const unsigned height = u_minify(lpr->base.height0, level);
+   const unsigned width_t = align(width, TILE_SIZE) / TILE_SIZE;
+   const unsigned height_t = align(height, TILE_SIZE) / TILE_SIZE;
+   enum lp_texture_layout other_layout;
+   boolean only_allocate;
+
+   assert(layout == LP_TEX_LAYOUT_NONE ||
+          layout == LP_TEX_LAYOUT_TILED ||
+          layout == LP_TEX_LAYOUT_LINEAR);
+
+   assert(usage == LP_TEX_USAGE_READ ||
+          usage == LP_TEX_USAGE_READ_WRITE ||
+          usage == LP_TEX_USAGE_WRITE_ALL);
+
+   /* check for the special case of layout == LP_TEX_LAYOUT_NONE */
+   only_allocate = (layout == LP_TEX_LAYOUT_NONE);
+   if (only_allocate)
+      layout = LP_TEX_LAYOUT_TILED;
+
+   /* a display target must already have its linear image installed */
+   assert(!lpr->dt || lpr->linear[level].data);
+
+   /* which is target?  which is other? */
+   if (layout == LP_TEX_LAYOUT_LINEAR) {
+      target_img = &lpr->linear[level];
+      other_img = &lpr->tiled[level];
+      other_layout = LP_TEX_LAYOUT_TILED;
+   }
+   else {
+      target_img = &lpr->tiled[level];
+      other_img = &lpr->linear[level];
+      other_layout = LP_TEX_LAYOUT_LINEAR;
+   }
+
+   target_data = target_img->data;
+   other_data = other_img->data;
+
+   if (!target_data) {
+      /* allocate memory for the target image now */
+      alloc_image_data(lpr, level, layout);
+      target_data = target_img->data;
+      if (!target_data) {
+         /* allocation (or the winsys map) failed: bail out before the
+          * conversion code below writes through a NULL pointer */
+         return NULL;
+      }
+   }
+
+   if (face_slice > 0) {
+      unsigned target_offset, other_offset;
+
+      target_offset = face_slice * tex_image_face_size(lpr, level, layout);
+      other_offset = face_slice * tex_image_face_size(lpr, level, other_layout);
+      target_data = (uint8_t *) target_data + target_offset;
+      if (other_data) {
+         other_data = (uint8_t *) other_data + other_offset;
+      }
+   }
+
+   if (only_allocate) {
+      /* Just allocating tiled memory.  Don't initialize it from the
+       * linear data if it exists.
+       */
+      return target_data;
+   }
+
+   if (other_data) {
+      /* may need to convert other data to the requested layout */
+      enum lp_texture_layout new_layout;
+      unsigned x, y;
+
+      /* loop over all image tiles, doing layout conversion where needed */
+      for (y = 0; y < height_t; y++) {
+         for (x = 0; x < width_t; x++) {
+            enum lp_texture_layout cur_layout =
+               llvmpipe_get_texture_tile_layout(lpr, face_slice, level, x, y);
+            boolean convert;
+
+            layout_logic(cur_layout, layout, usage, &new_layout, &convert);
+
+            if (convert) {
+               if (layout == LP_TEX_LAYOUT_TILED) {
+                  lp_linear_to_tiled(other_data, target_data,
+                                     x * TILE_SIZE, y * TILE_SIZE,
+                                     TILE_SIZE, TILE_SIZE,
+                                     lpr->base.format,
+                                     lpr->row_stride[level],
+                                     lpr->tiles_per_row[level]);
+               }
+               else {
+                  assert(layout == LP_TEX_LAYOUT_LINEAR);
+                  lp_tiled_to_linear(other_data, target_data,
+                                     x * TILE_SIZE, y * TILE_SIZE,
+                                     TILE_SIZE, TILE_SIZE,
+                                     lpr->base.format,
+                                     lpr->row_stride[level],
+                                     lpr->tiles_per_row[level]);
+               }
+            }
+
+            if (new_layout != cur_layout)
+               llvmpipe_set_texture_tile_layout(lpr, face_slice, level, x, y,
+                                                new_layout);
+         }
+      }
+   }
+   else {
+      /* no other data */
+      llvmpipe_set_texture_image_layout(lpr, face_slice, level,
+                                        width_t, height_t, layout);
+   }
+
+   return target_data;
+}
+
+
+/**
+ * Bring every cube face / 3D slice of a mipmap level into the requested
+ * layout and return a pointer to the start of the level (1D, 2D, 3D, CUBE).
+ * This is typically used when we're about to sample from a texture.
+ */
+void *
+llvmpipe_get_texture_image_all(struct llvmpipe_resource *lpr,
+                               unsigned level,
+                               enum lp_texture_usage usage,
+                               enum lp_texture_layout layout)
+{
+   int remaining = lpr->num_slices_faces[level];
+   void *first_image = NULL;
+
+   assert(remaining > 0);
+
+   /* walk from the last slice down to slice 0, whose address is returned */
+   while (remaining-- > 0) {
+      first_image = llvmpipe_get_texture_image(lpr, remaining, level,
+                                               usage, layout);
+   }
+
+   return first_image;
+}
+
+
+/**
+ * Make sure the tile containing pixel (x,y) is available in linear layout,
+ * converting it from tiled layout if necessary, and return the linear image.
+ * \param x, y  pixel position of the tile; must be multiples of TILE_SIZE
+ * \return pointer to start of image/face (not the tile)
+ */
+ubyte *
+llvmpipe_get_texture_tile_linear(struct llvmpipe_resource *lpr,
+                                 unsigned face_slice, unsigned level,
+                                 enum lp_texture_usage usage,
+                                 unsigned x, unsigned y)
+{
+   const unsigned tile_x = x / TILE_SIZE;
+   const unsigned tile_y = y / TILE_SIZE;
+   enum lp_texture_layout old_layout, new_layout;
+   boolean need_convert;
+   uint8_t *src_tiled, *dst_linear;
+
+   assert(resource_is_texture(&lpr->base));
+   assert(x % TILE_SIZE == 0);
+   assert(y % TILE_SIZE == 0);
+
+   /* allocate memory for the linear image on first use */
+   if (lpr->linear[level].data == NULL)
+      alloc_image_data(lpr, level, LP_TEX_LAYOUT_LINEAR);
+
+   /* locate the face/slice holding the tile, in both layouts */
+   src_tiled = llvmpipe_get_texture_image_address(lpr, face_slice, level,
+                                                  LP_TEX_LAYOUT_TILED);
+   dst_linear = llvmpipe_get_texture_image_address(lpr, face_slice, level,
+                                                   LP_TEX_LAYOUT_LINEAR);
+
+   /* look at the tile's current layout to see whether it needs converting */
+   old_layout = llvmpipe_get_texture_tile_layout(lpr, face_slice, level,
+                                                 tile_x, tile_y);
+   layout_logic(old_layout, LP_TEX_LAYOUT_LINEAR, usage,
+                &new_layout, &need_convert);
+
+   if (need_convert) {
+      lp_tiled_to_linear(src_tiled, dst_linear,
+                         x, y, TILE_SIZE, TILE_SIZE, lpr->base.format,
+                         lpr->row_stride[level], lpr->tiles_per_row[level]);
+   }
+
+   if (old_layout != new_layout) {
+      llvmpipe_set_texture_tile_layout(lpr, face_slice, level,
+                                       tile_x, tile_y, new_layout);
+   }
+
+   return dst_linear;
+}
+
+
+/**
+ * Get pointer to tiled data for rendering, converting the tile from
+ * linear layout first if necessary.
+ * \param x, y  pixel position of the tile; must be multiples of TILE_SIZE
+ * \return pointer to the tiled data at the given tile position
+ */
+ubyte *
+llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr,
+                          unsigned face_slice, unsigned level,
+                          enum lp_texture_usage usage,
+                          unsigned x, unsigned y)
+{
+   /* each tile is TILE_SIZE x TILE_SIZE pixels at 4 bytes per pixel */
+   const unsigned bytes_per_tile = TILE_SIZE * TILE_SIZE * 4;
+   const unsigned tile_x = x / TILE_SIZE;
+   const unsigned tile_y = y / TILE_SIZE;
+   enum lp_texture_layout old_layout, new_layout;
+   boolean need_convert;
+   uint8_t *dst_tiled, *src_linear;
+
+   assert(x % TILE_SIZE == 0);
+   assert(y % TILE_SIZE == 0);
+
+   if (lpr->tiled[level].data == NULL)
+      alloc_image_data(lpr, level, LP_TEX_LAYOUT_TILED);
+
+   dst_tiled = llvmpipe_get_texture_image_address(lpr, face_slice, level,
+                                                  LP_TEX_LAYOUT_TILED);
+   src_linear = llvmpipe_get_texture_image_address(lpr, face_slice, level,
+                                                   LP_TEX_LAYOUT_LINEAR);
+
+   /* look at the tile's current layout to see whether it needs converting */
+   old_layout = llvmpipe_get_texture_tile_layout(lpr, face_slice, level,
+                                                 tile_x, tile_y);
+   layout_logic(old_layout, LP_TEX_LAYOUT_TILED, usage,
+                &new_layout, &need_convert);
+
+   if (need_convert) {
+      lp_linear_to_tiled(src_linear, dst_tiled,
+                         x, y, TILE_SIZE, TILE_SIZE, lpr->base.format,
+                         lpr->row_stride[level], lpr->tiles_per_row[level]);
+   }
+
+   if (old_layout != new_layout) {
+      llvmpipe_set_texture_tile_layout(lpr, face_slice, level,
+                                       tile_x, tile_y, new_layout);
+   }
+
+   return dst_tiled
+      + (tile_y * lpr->tiles_per_row[level] + tile_x) * bytes_per_tile;
+}
+
+
+/**
+ * Return the number of bytes of image storage currently held by a
+ * resource: every mipmap level whose linear and/or tiled image exists.
+ */
+unsigned
+llvmpipe_resource_size(const struct pipe_resource *resource)
+{
+   const struct llvmpipe_resource *res = llvmpipe_resource_const(resource);
+   unsigned level, total = 0;
+
+   for (level = 0; level <= res->base.last_level; level++) {
+      if (res->linear[level].data != NULL)
+         total += tex_image_size(res, level, LP_TEX_LAYOUT_LINEAR);
+      if (res->tiled[level].data != NULL)
+         total += tex_image_size(res, level, LP_TEX_LAYOUT_TILED);
+   }
+
+   return total;
+}
+
+
+#ifdef DEBUG
+/** Debug aid: list every tracked resource followed by a grand total. */
+void
+llvmpipe_print_resources(void)
+{
+   struct llvmpipe_resource *res;
+   unsigned count = 0, total = 0;
+
+   debug_printf("LLVMPIPE: current resources:\n");
+   foreach(res, &resource_list) {
+      const unsigned bytes = llvmpipe_resource_size(&res->base);
+      count++;
+      total += bytes;
+      debug_printf("resource %u at %p, size %ux%ux%u: %u bytes, refcount %u\n",
+                   res->id, (void *) res, res->base.width0, res->base.height0,
+                   res->base.depth0, bytes, res->base.reference.count);
+   }
+   debug_printf("LLVMPIPE: total size of %u resources: %u\n", count, total);
+}
+#endif
+
+
+/** Plug the resource and surface entry points into a pipe_screen. */
+void
+llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
+{
+#ifdef DEBUG
+   /* the resource tracking list is set up once, on the first call */
+   static boolean list_ready = FALSE;
+
+   if (!list_ready) {
+      memset(&resource_list, 0, sizeof(resource_list));
+      make_empty_list(&resource_list);
+      list_ready = TRUE;
+   }
+#endif
+
+   screen->get_tex_surface = llvmpipe_get_tex_surface;
+   screen->tex_surface_destroy = llvmpipe_tex_surface_destroy;
+
+   screen->resource_create = llvmpipe_resource_create;
+   screen->resource_destroy = llvmpipe_resource_destroy;
+   screen->resource_from_handle = llvmpipe_resource_from_handle;
+   screen->resource_get_handle = llvmpipe_resource_get_handle;
+   screen->user_buffer_create = llvmpipe_user_buffer_create;
+}
+
+
+/** Plug the transfer and resource-query entry points into a pipe_context. */
+void
+llvmpipe_init_context_resource_funcs(struct pipe_context *pipe)
+{
+   pipe->transfer_inline_write = u_default_transfer_inline_write;
+   pipe->transfer_flush_region = u_default_transfer_flush_region;
+   pipe->is_resource_referenced = llvmpipe_is_resource_referenced;
+   pipe->get_transfer = llvmpipe_get_transfer;
+   pipe->transfer_map = llvmpipe_transfer_map;
+   pipe->transfer_unmap = llvmpipe_transfer_unmap;
+   pipe->transfer_destroy = llvmpipe_transfer_destroy;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
new file mode 100644
index 0000000000..503b6a19a8
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -0,0 +1,237 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TEXTURE_H
+#define LP_TEXTURE_H
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "lp_limits.h"
+
+/** Access mode passed as the usage argument of the texture getters below */
+enum lp_texture_usage
+{
+   LP_TEX_USAGE_READ = 100,
+   LP_TEX_USAGE_READ_WRITE,
+   LP_TEX_USAGE_WRITE_ALL
+};
+
+
+/** Per-tile layout mode: which layout(s) currently hold a tile's data */
+enum lp_texture_layout
+{
+   LP_TEX_LAYOUT_NONE = 0,  /**< no layout for the tile data yet */
+   LP_TEX_LAYOUT_TILED,     /**< the tile data is in tiled layout */
+   LP_TEX_LAYOUT_LINEAR,    /**< the tile data is in linear layout */
+   LP_TEX_LAYOUT_BOTH       /**< the tile data is in both layouts */
+};
+
+
+struct pipe_context;
+struct pipe_screen;
+struct llvmpipe_context;
+
+struct sw_displaytarget;
+
+
+/**
+ * We keep one or two copies of the texture image data:  one in a simple
+ * linear layout (for texture sampling) and another in a tiled layout (for
+ * render targets).  We keep track of whether each image tile is linear
+ * or tiled on a per-tile basis.
+ */
+
+
+/** A 1D/2D/3D image, one mipmap level */
+struct llvmpipe_texture_image
+{
+   void *data;   /**< storage for this level (used for tiled and linear copies) */
+};
+
+
+/**
+ * llvmpipe subclass of pipe_resource.  A texture, drawing surface,
+ * vertex buffer, const buffer, etc.
+ * Textures are stored differently than other types of objects such as
+ * vertex buffers and const buffers.
+ * The former are tiled and have per-tile layout flags.
+ * The latter are simple malloc'd blocks of memory.
+ */
+struct llvmpipe_resource
+{
+   struct pipe_resource base;   /* kept first: llvmpipe_resource() casts to it */
+
+   /** Row stride in bytes */
+   unsigned row_stride[LP_MAX_TEXTURE_LEVELS];
+   /** Image stride (for cube maps or 3D textures) in bytes */
+   unsigned img_stride[LP_MAX_TEXTURE_LEVELS];
+   unsigned tiles_per_row[LP_MAX_TEXTURE_LEVELS];     /**< per mipmap level */
+   unsigned tiles_per_image[LP_MAX_TEXTURE_LEVELS];   /**< per mipmap level */
+   /** Number of 3D slices or cube faces per level */
+   unsigned num_slices_faces[LP_MAX_TEXTURE_LEVELS];
+
+   /**
+    * Display target, for textures with the PIPE_BIND_DISPLAY_TARGET
+    * usage.
+    */
+   struct sw_displaytarget *dt;
+
+   /**
+    * Malloc'ed data for regular textures, or a mapping to dt above.
+    */
+   struct llvmpipe_texture_image tiled[LP_MAX_TEXTURE_LEVELS];
+   struct llvmpipe_texture_image linear[LP_MAX_TEXTURE_LEVELS];
+
+   /**
+    * Data for non-texture resources.
+    */
+   void *data;
+
+   /** array [level][face or slice][tile_y][tile_x] of layout values */
+   enum lp_texture_layout *layout[LP_MAX_TEXTURE_LEVELS];
+
+   boolean userBuffer;  /**< Is this a user-space buffer? */
+   unsigned timestamp;  /* NOTE(review): purpose not visible here -- confirm */
+
+   unsigned id;  /**< temporary, for debugging */
+
+#ifdef DEBUG
+   /** links for the DEBUG-only linked list of resources */
+   struct llvmpipe_resource *prev, *next;
+#endif
+};
+
+/** llvmpipe subclass of pipe_transfer (base is the first member). */
+struct llvmpipe_transfer
+{
+   struct pipe_transfer base;
+   /* NOTE(review): presumably a byte offset into the resource -- confirm */
+   unsigned long offset;
+};
+
+
+/** Cast wrappers: downcast gallium base objects to the llvmpipe subclasses. */
+static INLINE struct llvmpipe_resource *
+llvmpipe_resource(struct pipe_resource *pt)
+{
+   return (struct llvmpipe_resource *) pt;
+}
+
+/** Const-preserving variant of llvmpipe_resource(). */
+static INLINE const struct llvmpipe_resource *
+llvmpipe_resource_const(const struct pipe_resource *pt)
+{
+   return (const struct llvmpipe_resource *) pt;
+}
+
+/** Downcast a pipe_transfer to the llvmpipe_transfer that embeds it. */
+static INLINE struct llvmpipe_transfer *
+llvmpipe_transfer(struct pipe_transfer *pt)
+{
+   return (struct llvmpipe_transfer *) pt;
+}
+
+
+void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen);
+void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe);
+
+/** Return the row stride, in bytes, of mipmap \p level of \p resource. */
+static INLINE unsigned
+llvmpipe_resource_stride(struct pipe_resource *resource, unsigned level)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+   assert(level < LP_MAX_TEXTURE_LEVELS);   /* the row_stride[] array bound */
+   return lpr->row_stride[level];
+}
+
+
+void *
+llvmpipe_resource_map(struct pipe_resource *resource,
+		      unsigned face_slice,
+		      unsigned level,
+		      unsigned zslice,
+                      enum lp_texture_usage tex_usage,
+                      enum lp_texture_layout layout);
+
+void
+llvmpipe_resource_unmap(struct pipe_resource *resource,
+                       unsigned face_slice,
+                       unsigned level,
+                       unsigned zslice);
+
+
+void *
+llvmpipe_resource_data(struct pipe_resource *resource);
+
+
+unsigned
+llvmpipe_resource_size(const struct pipe_resource *resource);
+
+
+ubyte *
+llvmpipe_get_texture_image_address(struct llvmpipe_resource *lpr,
+                                    unsigned face_slice, unsigned level,
+                                    enum lp_texture_layout layout);
+
+void *
+llvmpipe_get_texture_image(struct llvmpipe_resource *resource,
+                            unsigned face_slice, unsigned level,
+                            enum lp_texture_usage usage,
+                            enum lp_texture_layout layout);
+
+void *
+llvmpipe_get_texture_image_all(struct llvmpipe_resource *lpr,
+                               unsigned level,
+                               enum lp_texture_usage usage,
+                               enum lp_texture_layout layout);
+
+ubyte *
+llvmpipe_get_texture_tile_linear(struct llvmpipe_resource *lpr,
+                                  unsigned face_slice, unsigned level,
+                                  enum lp_texture_usage usage,
+                                  unsigned x, unsigned y);
+
+ubyte *
+llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr,
+                           unsigned face_slice, unsigned level,
+                           enum lp_texture_usage usage,
+                           unsigned x, unsigned y);
+
+
+
+extern void
+llvmpipe_print_resources(void);
+
+/* NOTE(review): possibly stale names for the *_resource_funcs above -- confirm */
+extern void
+llvmpipe_init_screen_texture_funcs(struct pipe_screen *screen);
+
+extern void
+llvmpipe_init_context_texture_funcs(struct pipe_context *pipe);
+
+#endif /* LP_TEXTURE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_image.c b/src/gallium/drivers/llvmpipe/lp_tile_image.c
new file mode 100644
index 0000000000..2b63992dd7
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_image.c
@@ -0,0 +1,328 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Code to convert images from tiled to linear and back.
+ * XXX there are quite a few assumptions about color and z/stencil being
+ * 32bpp.
+ */
+
+
+#include "util/u_format.h"
+#include "lp_tile_soa.h"
+#include "lp_tile_image.h"
+
+
+#define BYTES_PER_TILE (TILE_SIZE * TILE_SIZE * 4)
+
+
+/**
+ * Untile a 4x4 block of 32-bit words (all contiguous) to linear layout
+ * at dst, with dst_stride words between rows.  The 16 source words are
+ * four 2x2 quads: top-left, top-right, bottom-left, bottom-right.
+ */
+static void
+untile_4_4_uint32(const uint32_t *src, uint32_t *dst, unsigned dst_stride)
+{
+   unsigned q;
+
+   for (q = 0; q < 4; q++) {
+      uint32_t *d = dst + (q >> 1) * 2 * dst_stride + (q & 1) * 2;
+      d[0] = src[0];            d[1] = src[1];
+      d[dst_stride] = src[2];   d[dst_stride + 1] = src[3];
+      src += 4;
+   }
+}
+
+
+
+/**
+ * Untile a 4x4 block of 16-bit words (all contiguous) to linear layout
+ * at dst, with dst_stride words between rows.  The 16 source words are
+ * four 2x2 quads: top-left, top-right, bottom-left, bottom-right.
+ */
+static void
+untile_4_4_uint16(const uint16_t *src, uint16_t *dst, unsigned dst_stride)
+{
+   unsigned q;
+
+   for (q = 0; q < 4; q++) {
+      uint16_t *d = dst + (q >> 1) * 2 * dst_stride + (q & 1) * 2;
+      d[0] = src[0];            d[1] = src[1];
+      d[dst_stride] = src[2];   d[dst_stride + 1] = src[3];
+      src += 4;
+   }
+}
+
+
+
+/**
+ * Convert a 4x4 rect of 32-bit words from a linear layout into tiled
+ * layout (in which all 16 words are contiguous).  The output is written
+ * as four 2x2 quads: top-left, top-right, bottom-left, bottom-right.
+ */
+static void
+tile_4_4_uint32(const uint32_t *src, uint32_t *dst, unsigned src_stride)
+{
+   unsigned q;
+
+   for (q = 0; q < 4; q++) {
+      const uint32_t *s = src + (q >> 1) * 2 * src_stride + (q & 1) * 2;
+      dst[0] = s[0];            dst[1] = s[1];
+      dst[2] = s[src_stride];   dst[3] = s[src_stride + 1];
+      dst += 4;
+   }
+}
+
+
+
+/**
+ * Convert a 4x4 rect of 16-bit words from a linear layout into tiled
+ * layout (in which all 16 words are contiguous).  The output is written
+ * as four 2x2 quads: top-left, top-right, bottom-left, bottom-right.
+ */
+static void
+tile_4_4_uint16(const uint16_t *src, uint16_t *dst, unsigned src_stride)
+{
+   unsigned q;
+
+   for (q = 0; q < 4; q++) {
+      const uint16_t *s = src + (q >> 1) * 2 * src_stride + (q & 1) * 2;
+      dst[0] = s[0];            dst[1] = s[1];
+      dst[2] = s[src_stride];   dst[3] = s[src_stride + 1];
+      dst += 4;
+   }
+}
+
+
+
+/**
+ * Convert a region of a tiled image into linear layout.
+ *
+ * \param src, dst  start of the tiled source and linear destination images
+ * \param x, y  origin of the region in pixels; asserted TILE_SIZE aligned
+ * \param width, height  region size in pixels, walked one tile at a time
+ * \param format  pixel format; selects the depth/stencil or color path
+ * \param dst_stride  dest row stride in bytes
+ * \param tiles_per_row  tiles per row of \p src; used for color only, the
+ *                       depth/stencil path derives its own from dst_stride
+ */
+void
+lp_tiled_to_linear(const void *src, void *dst,
+                   unsigned x, unsigned y, unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned dst_stride, unsigned tiles_per_row)
+{
+   assert(x % TILE_SIZE == 0);
+   assert(y % TILE_SIZE == 0);
+   /* no alignment is asserted for width/height */
+
+   /* Note that Z/stencil surfaces use a different tiling size than
+    * color surfaces.
+    */
+   if (util_format_is_depth_or_stencil(format)) {
+      const uint bpp = util_format_get_blocksize(format);
+      const uint src_stride = dst_stride * TILE_VECTOR_WIDTH;
+      const uint tile_w = TILE_VECTOR_WIDTH, tile_h = TILE_VECTOR_HEIGHT;
+      /* distinct name: must not shadow the tiles_per_row parameter */
+      const uint z_tiles_per_row = src_stride / (tile_w * tile_h * bpp);
+
+      dst_stride /= bpp;   /* convert from bytes to words */
+
+      if (bpp == 4) {
+         const uint32_t *src32 = (const uint32_t *) src;
+         uint32_t *dst32 = (uint32_t *) dst;
+         uint i, j;
+
+         for (j = 0; j < height; j += tile_h) {
+            for (i = 0; i < width; i += tile_w) {
+               /* compute offsets in 32-bit words */
+               uint ii = i + x, jj = j + y;
+               uint src_offset = (jj / tile_h * z_tiles_per_row + ii / tile_w)
+                  * (tile_w * tile_h);
+               uint dst_offset = jj * dst_stride + ii;
+               untile_4_4_uint32(src32 + src_offset, dst32 + dst_offset,
+                                 dst_stride);
+            }
+         }
+      }
+      else {
+         const uint16_t *src16 = (const uint16_t *) src;
+         uint16_t *dst16 = (uint16_t *) dst;
+         uint i, j;
+
+         assert(bpp == 2);
+
+         for (j = 0; j < height; j += tile_h) {
+            for (i = 0; i < width; i += tile_w) {
+               /* compute offsets in 16-bit words */
+               uint ii = i + x, jj = j + y;
+               uint src_offset = (jj / tile_h * z_tiles_per_row + ii / tile_w)
+                  * (tile_w * tile_h);
+               uint dst_offset = jj * dst_stride + ii;
+               untile_4_4_uint16(src16 + src_offset, dst16 + dst_offset,
+                                 dst_stride);
+            }
+         }
+      }
+   }
+   else {
+      /* color image */
+      const uint bpp = 4;
+      const uint tile_w = TILE_SIZE, tile_h = TILE_SIZE;
+      const uint bytes_per_tile = tile_w * tile_h * bpp;
+      uint i, j;
+
+      for (j = 0; j < height; j += tile_h) {
+         for (i = 0; i < width; i += tile_w) {
+            uint ii = i + x, jj = j + y;
+            uint tile_offset = ((jj / tile_h) * tiles_per_row + ii / tile_w);
+            uint byte_offset = tile_offset * bytes_per_tile;
+            const uint8_t *src_tile = (const uint8_t *) src + byte_offset;
+            lp_tile_unswizzle_4ub(format, src_tile, dst, dst_stride,
+                                  ii, jj, tile_w, tile_h);
+         }
+      }
+   }
+}
+
+
+/**
+ * Convert a region of a linear image into tiled layout.
+ *
+ * \param src, dst  start of the linear source and tiled destination images
+ * \param x, y  origin of the region in pixels; asserted TILE_SIZE aligned
+ * \param width, height  region size in pixels, walked one tile at a time
+ * \param format  pixel format; selects the depth/stencil or color path
+ * \param src_stride  source row stride in bytes
+ * \param tiles_per_row  tiles per row of \p dst; used for color only, the
+ *                       depth/stencil path derives its own from src_stride
+ */
+void
+lp_linear_to_tiled(const void *src, void *dst,
+                   unsigned x, unsigned y, unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned src_stride, unsigned tiles_per_row)
+{
+   assert(x % TILE_SIZE == 0);
+   assert(y % TILE_SIZE == 0);
+   /* no alignment is asserted for width/height */
+
+   if (util_format_is_depth_or_stencil(format)) {
+      const uint bpp = util_format_get_blocksize(format);
+      const uint dst_stride = src_stride * TILE_VECTOR_WIDTH;
+      const uint tile_w = TILE_VECTOR_WIDTH, tile_h = TILE_VECTOR_HEIGHT;
+      /* distinct name: must not shadow the tiles_per_row parameter */
+      const uint z_tiles_per_row = dst_stride / (tile_w * tile_h * bpp);
+
+      src_stride /= bpp;   /* convert from bytes to words */
+
+      if (bpp == 4) {
+         const uint32_t *src32 = (const uint32_t *) src;
+         uint32_t *dst32 = (uint32_t *) dst;
+         uint i, j;
+
+         for (j = 0; j < height; j += tile_h) {
+            for (i = 0; i < width; i += tile_w) {
+               /* compute offsets in 32-bit words */
+               uint ii = i + x, jj = j + y;
+               uint src_offset = jj * src_stride + ii;
+               uint dst_offset = (jj / tile_h * z_tiles_per_row + ii / tile_w)
+                  * (tile_w * tile_h);
+               tile_4_4_uint32(src32 + src_offset, dst32 + dst_offset,
+                               src_stride);
+            }
+         }
+      }
+      else {
+         const uint16_t *src16 = (const uint16_t *) src;
+         uint16_t *dst16 = (uint16_t *) dst;
+         uint i, j;
+
+         assert(bpp == 2);
+
+         for (j = 0; j < height; j += tile_h) {
+            for (i = 0; i < width; i += tile_w) {
+               /* compute offsets in 16-bit words */
+               uint ii = i + x, jj = j + y;
+               uint src_offset = jj * src_stride + ii;
+               uint dst_offset = (jj / tile_h * z_tiles_per_row + ii / tile_w)
+                  * (tile_w * tile_h);
+               tile_4_4_uint16(src16 + src_offset, dst16 + dst_offset,
+                               src_stride);
+            }
+         }
+      }
+   }
+   else {
+      /* color image; step by tile_w/tile_h like the offset math below */
+      const uint bpp = 4;
+      const uint tile_w = TILE_SIZE, tile_h = TILE_SIZE;
+      const uint bytes_per_tile = tile_w * tile_h * bpp;
+      uint i, j;
+
+      for (j = 0; j < height; j += tile_h) {
+         for (i = 0; i < width; i += tile_w) {
+            uint ii = i + x, jj = j + y;
+            uint tile_offset = ((jj / tile_h) * tiles_per_row + ii / tile_w);
+            uint byte_offset = tile_offset * bytes_per_tile;
+            uint8_t *dst_tile = (uint8_t *) dst + byte_offset;
+
+            lp_tile_swizzle_4ub(format, dst_tile, src, src_stride,
+                                ii, jj, tile_w, tile_h);
+         }
+      }
+   }
+}
+
+
+/**
+ * For testing only: convert \p data from linear to tiled layout and back
+ * again in place, going through a temporary tiled buffer.
+ */
+void
+test_tiled_linear_conversion(void *data,
+                             enum pipe_format format,
+                             unsigned width, unsigned height,
+                             unsigned stride)
+{
+   /* size in tiles */
+   unsigned wt = (width + TILE_SIZE - 1) / TILE_SIZE;
+   unsigned ht = (height + TILE_SIZE - 1) / TILE_SIZE;
+
+   uint8_t *tiled = malloc(wt * ht * TILE_SIZE * TILE_SIZE * 4);
+   if (!tiled)
+      return;   /* allocation failed: skip the round trip, don't crash */
+
+   lp_linear_to_tiled(data, tiled, 0, 0, width, height, format,
+                      stride, wt);
+
+   lp_tiled_to_linear(tiled, data, 0, 0, width, height, format,
+                      stride, wt);
+   free(tiled);
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_image.h b/src/gallium/drivers/llvmpipe/lp_tile_image.h
new file mode 100644
index 0000000000..8de8efc6c1
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_image.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef LP_TILE_IMAGE_H
+#define LP_TILE_IMAGE_H
+
+#include "pipe/p_format.h"   /* enum pipe_format, used in the prototypes */
+
+/** Convert a region of a tiled image to linear layout; stride in bytes. */
+void
+lp_tiled_to_linear(const void *src, void *dst,
+                   unsigned x, unsigned y,
+                   unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned dst_stride, unsigned tiles_per_row);
+
+/** Convert a region of a linear image to tiled layout; stride in bytes. */
+void
+lp_linear_to_tiled(const void *src, void *dst,
+                   unsigned x, unsigned y,
+                   unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned src_stride, unsigned tiles_per_row);
+
+/** For testing only: linear -> tiled -> linear round trip on \p data. */
+void
+test_tiled_linear_conversion(void *data,
+                             enum pipe_format format,
+                             unsigned width, unsigned height,
+                             unsigned stride);
+
+
+#endif /* LP_TILE_IMAGE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
new file mode 100644
index 0000000000..07f71b8411
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
@@ -0,0 +1,97 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef LP_TILE_SOA_H
+#define LP_TILE_SOA_H
+
+#include "pipe/p_compiler.h"
+#include "tgsi/tgsi_exec.h" /* for NUM_CHANNELS */
+#include "lp_limits.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct pipe_transfer;
+
+
+#define TILE_VECTOR_HEIGHT 4
+#define TILE_VECTOR_WIDTH 4
+
+extern const unsigned char
+tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH];
+
+#define TILE_C_STRIDE (TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH) /* 16 */
+#define TILE_X_STRIDE (NUM_CHANNELS * TILE_C_STRIDE) /* 64 when NUM_CHANNELS == 4 */
+#define TILE_Y_STRIDE (TILE_VECTOR_HEIGHT * TILE_SIZE * NUM_CHANNELS) /* 1024 when TILE_SIZE == 64 and NUM_CHANNELS == 4 */
+
+
+#ifdef DEBUG
+extern unsigned lp_tile_unswizzle_count;
+extern unsigned lp_tile_swizzle_count;
+#endif
+
+
+/**
+ * Byte offset, from the start of a tile, of channel \p c of pixel (x, y):
+ * vector-block row and column, then channel plane, then slot in the block.
+ */
+static INLINE unsigned
+tile_pixel_offset(unsigned x, unsigned y, unsigned c)
+{
+   const unsigned bx = x / TILE_VECTOR_WIDTH, by = y / TILE_VECTOR_HEIGHT;
+   const unsigned in_block =
+      tile_offset[y % TILE_VECTOR_HEIGHT][x % TILE_VECTOR_WIDTH];
+   return by * TILE_Y_STRIDE + bx * TILE_X_STRIDE +
+      c * TILE_C_STRIDE + in_block;
+}
+
+
+#define TILE_PIXEL(_p, _x, _y, _c)   ((_p)[tile_pixel_offset(_x, _y, _c)])
+
+
+void
+lp_tile_swizzle_4ub(enum pipe_format format,
+                 uint8_t *dst,
+                 const void *src, unsigned src_stride,
+                 unsigned x, unsigned y, unsigned w, unsigned h);
+
+
+void
+lp_tile_unswizzle_4ub(enum pipe_format format,
+                  const uint8_t *src,
+                  void *dst, unsigned dst_stride,
+                  unsigned x, unsigned y, unsigned w, unsigned h);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
new file mode 100644
index 0000000000..5ab63cbac6
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python
+
+'''
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Pixel format accessor functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+'''
+
+
+import sys
+import os.path
+
+sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '../../auxiliary/util'))
+
+from u_format_pack import *
+
+
+def is_format_supported(format):
+    '''Return True if this script has the plumbing needed to generate the
+    code that reads/writes pixels of the given format.'''
+
+    # FIXME: Ideally we would support any format combination here.
+
+    # Layout is tested first; channels are only inspected for PLAIN
+    # formats.
+    if format.layout != PLAIN:
+        return False
+
+    for channel in (format.channels[i] for i in range(4)):
+        is_float = channel.type == FLOAT
+        if not is_float and channel.type not in (VOID, UNSIGNED, SIGNED):
+            return False
+        if is_float and channel.size not in (16, 32, 64):
+            return False
+
+    # Only the color colorspaces are handled.
+    return format.colorspace in ('rgb', 'srgb')
+
+
+def generate_format_read(format, dst_channel, dst_native_type, dst_suffix):
+    '''Print the C function that swizzles pixels of this format from a linear image into a tile'''
+
+    name = format.short_name()
+
+    src_native_type = native_type(format)
+
+    print 'static void'
+    print 'lp_tile_%s_swizzle_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, dst_suffix, dst_native_type)
+    print '{'
+    print '   unsigned x, y;'
+    print '   const uint8_t *src_row = src + y0*src_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
+    # names[n]: C variable name for stored channel n, built from the rgba letters that read it
+    names = ['']*4
+    if format.colorspace in ('rgb', 'srgb'):
+        for i in range(4):
+            swizzle = format.swizzles[i]
+            if swizzle < 4:
+                names[swizzle] += 'rgba'[i]
+    elif format.colorspace == 'zs':
+        swizzle = format.swizzles[0]
+        if swizzle < 4:
+            names[swizzle] = 'z'
+        else:
+            assert False
+    else:
+        assert False
+    # Emit the per-pixel unpacking: shift/mask one packed word, or one load per array channel
+    if format.layout == PLAIN:
+        if not format.is_array():
+            print '         %s pixel = *src_pixel++;' % src_native_type
+            shift = 0;
+            for i in range(4):
+                src_channel = format.channels[i]
+                width = src_channel.size
+                if names[i]:
+                    value = 'pixel'
+                    mask = (1 << width) - 1
+                    if shift:
+                        value = '(%s >> %u)' % (value, shift)
+                    if shift + width < format.block_size():
+                        value = '(%s & 0x%x)' % (value, mask)
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
+                shift += width
+        else:
+            for i in range(4):
+                if names[i]:
+                    print '         %s %s;' % (dst_native_type, names[i])
+            for i in range(4):
+                src_channel = format.channels[i]
+                if names[i]:
+                    value = '(*src_pixel++)'
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    print '         %s = %s;' % (names[i], value)
+                elif src_channel.size:
+                    print '         ++src_pixel;'
+    else:
+        assert False
+    # Emit the four TILE_PIXEL stores; SWIZZLE_0 / SWIZZLE_1 become the constants 0 / one
+    for i in range(4):
+        if format.colorspace in ('rgb', 'srgb'):
+            swizzle = format.swizzles[i]
+            if swizzle < 4:
+                value = names[swizzle]
+            elif swizzle == SWIZZLE_0:
+                value = '0'
+            elif swizzle == SWIZZLE_1:
+                value = get_one(dst_channel)
+            else:
+                assert False
+        elif format.colorspace == 'zs':
+            if i < 3:
+                value = 'z'
+            else:
+                value = get_one(dst_channel)
+        else:
+            assert False
+        print '         TILE_PIXEL(dst, x, y, %u) = %s; /* %s */' % (i, value, 'rgba'[i])
+    # Close the x loop, advance to the next source row, close the y loop and the function
+    print '      }'
+    print '      src_row += src_stride;'
+    print '   }'
+    print '}'
+    print
+    
+
+def pack_rgba(format, src_channel, r, g, b, a):
+    """Return a C expression that packs the r, g, b, a value expressions
+    into one pixel of the given format, as '|'-joined shifted terms of the
+    form '((value) << shift)'.  Each value is first run through
+    conversion_expr() for its destination channel; shifts accumulate the
+    channel sizes in channel order.  Returns None if no channel is packed.
+    """
+    assert format.colorspace in ('rgb', 'srgb')
+
+    # inverse-swizzle index -> value expression supplied by the caller
+    components = {0: r, 1: g, 2: b, 3: a}
+    inv_swizzle = format.inv_swizzles()
+
+    terms = []
+    bit_pos = 0
+    for i in range(4):
+        channel = format.channels[i]
+
+        # any index other than 0..3 selects nothing for this channel
+        value = components.get(inv_swizzle[i])
+        if value:
+            packed_type = native_type(format)
+            value = conversion_expr(src_channel, channel, packed_type,
+                                    value, clamp=False)
+            terms.append("((%s) << %d)" % (value, bit_pos))
+
+        # the channel occupies its bits whether or not it was packed
+        bit_pos = bit_pos + channel.size
+
+    # with no packed channel the result is None rather than an empty string
+    if not terms:
+        return None
+
+    return " | ".join(terms)
+
+
+def emit_unrolled_unswizzle_code(format, src_channel):
+    '''Emit the unswizzle body as loops over vector blocks, packing two pixels
+    per step.  This is considerably faster than the TILE_PIXEL-based code below.
+    '''
+    dst_native_type = 'uint%u_t' % format.block_size()  # C type name built from the format's block size
+    print '   const unsigned dstpix_stride = dst_stride / %d;' % format.stride()
+    print '   %s *dstpix = (%s *) dst;' % (dst_native_type, dst_native_type)
+    print '   unsigned int qx, qy, i;'
+    print
+    print '   for (qy = 0; qy < h; qy += TILE_VECTOR_HEIGHT) {'
+    print '      const unsigned py = y0 + qy;'
+    print '      for (qx = 0; qx < w; qx += TILE_VECTOR_WIDTH) {'
+    print '         const unsigned px = x0 + qx;'
+    print '         const uint8_t *r = src + 0 * TILE_C_STRIDE;'
+    print '         const uint8_t *g = src + 1 * TILE_C_STRIDE;'
+    print '         const uint8_t *b = src + 2 * TILE_C_STRIDE;'
+    print '         const uint8_t *a = src + 3 * TILE_C_STRIDE;'
+    print '         (void) r; (void) g; (void) b; (void) a; /* silence warnings */'
+    print '         for (i = 0; i < TILE_C_STRIDE; i += 2) {'
+    print '            const uint32_t pixel0 = %s;' % pack_rgba(format, src_channel, "r[i+0]", "g[i+0]", "b[i+0]", "a[i+0]")  # NOTE(review): always uint32_t, even if dst_native_type is narrower
+    print '            const uint32_t pixel1 = %s;' % pack_rgba(format, src_channel, "r[i+1]", "g[i+1]", "b[i+1]", "a[i+1]")
+    print '            const unsigned offset = (py + tile_y_offset[i]) * dstpix_stride + (px + tile_x_offset[i]);'
+    print '            dstpix[offset + 0] = pixel0;'
+    print '            dstpix[offset + 1] = pixel1;'
+    print '         }'
+    print '         src += TILE_X_STRIDE;'
+    print '      }'
+    print '   }'
+
+
+def emit_tile_pixel_unswizzle_code(format, src_channel):
+    '''Print the C loops that write a block one pixel at a time, reading the source through the TILE_PIXEL macro.'''
+    dst_native_type = native_type(format)
+
+    inv_swizzle = format.inv_swizzles()
+
+    print '   unsigned x, y;'
+    print '   uint8_t *dst_row = dst + y0*dst_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
+
+    if format.layout == PLAIN:
+        if not format.is_array():  # non-array format: OR the shifted channels into one pixel word
+            print '         %s pixel = 0;' % dst_native_type
+            shift = 0;
+            for i in range(4):
+                dst_channel = format.channels[i]
+                width = dst_channel.size
+                if inv_swizzle[i] is not None:
+                    value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    if shift:
+                        value = '(%s << %u)' % (value, shift)
+                    print '         pixel |= %s;' % value
+                shift += width  # advances even for channels that are not written
+            print '         *dst_pixel++ = pixel;'
+        else:  # array format: each channel is stored as its own element
+            for i in range(4):
+                dst_channel = format.channels[i]
+                if inv_swizzle[i] is not None:
+                    value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    print '         *dst_pixel++ = %s;' % value
+                elif dst_channel.size:  # sized channel with no source: step over it without writing
+                    print '         ++dst_pixel;'
+    else:
+        assert False  # only PLAIN layouts are handled here
+
+    print '      }'
+    print '      dst_row += dst_stride;'
+    print '   }'
+
+
+def generate_format_write(format, src_channel, src_native_type, src_suffix):
+    '''Print the static C function lp_tile_<name>_unswizzle_<suffix> that writes pixels to one format.'''
+
+    name = format.short_name()
+
+    print 'static void'
+    print 'lp_tile_%s_unswizzle_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
+    print '{'
+    if format.layout == PLAIN \
+        and format.colorspace == 'rgb' \
+        and format.block_size() <= 32 \
+        and format.is_pot() \
+        and not format.is_mixed() \
+        and (format.channels[0].type == UNSIGNED \
+             or format.channels[1].type == UNSIGNED):
+        emit_unrolled_unswizzle_code(format, src_channel)  # unrolled path for formats passing all checks above
+    else:
+        emit_tile_pixel_unswizzle_code(format, src_channel)  # generic TILE_PIXEL path for everything else
+    print '}'
+    print
+    
+
+def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
+    '''Print a read function for every supported format, then the lp_tile_swizzle_<suffix> dispatcher that picks one by pipe_format.'''
+
+    for format in formats:
+        if is_format_supported(format):
+            generate_format_read(format, dst_channel, dst_native_type, dst_suffix)
+
+    print 'void'
+    print 'lp_tile_swizzle_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type)
+    print '{'
+    print '   void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % dst_native_type
+    print '#ifdef DEBUG'
+    print '   lp_tile_swizzle_count += 1;'
+    print '#endif'
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):  # one case per format that also got a read function above
+            print '   case %s:' % format.name
+            print '      func = &lp_tile_%s_swizzle_%s;' % (format.short_name(), dst_suffix)
+            print '      break;'
+    print '   default:'
+    print '      debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));'
+    print '      return;'
+    print '   }'
+    print '   func(dst, (const uint8_t *)src, src_stride, x, y, w, h);'
+    print '}'
+    print
+
+
+def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
+    '''Print a write function for every supported format, then the lp_tile_unswizzle_<suffix> dispatcher that picks one by pipe_format.'''
+
+    for format in formats:
+        if is_format_supported(format):
+            generate_format_write(format, src_channel, src_native_type, src_suffix)
+
+    print 'void'
+    print 'lp_tile_unswizzle_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type)
+    
+    print '{'
+    print '   void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % src_native_type
+    print '#ifdef DEBUG'
+    print '   lp_tile_unswizzle_count += 1;'
+    print '#endif'
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):  # one case per format that also got a write function above
+            print '   case %s:' % format.name
+            print '      func = &lp_tile_%s_unswizzle_%s;' % (format.short_name(), src_suffix)
+            print '      break;'
+    print '   default:'
+    print '      debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));'
+    print '      return;'
+    print '   }'
+    print '   func(src, (uint8_t *)dst, dst_stride, x, y, w, h);'
+    print '}'
+    print
+
+
+def main():  # parse the format description files named on the command line and print the generated C source
+    formats = []
+    for arg in sys.argv[1:]:
+        formats.extend(parse(arg))
+
+    print '/* This file is autogenerated by lp_tile_soa.py from u_format.csv. Do not edit directly. */'
+    print
+    # Emit this module's own docstring (the copyright message at the top of this file) into the output
+    print __doc__.strip()
+    print
+    print '#include "pipe/p_compiler.h"'
+    print '#include "util/u_format.h"'
+    print '#include "util/u_math.h"'
+    print '#include "util/u_half.h"'
+    print '#include "lp_tile_soa.h"'
+    print
+    print '#ifdef DEBUG'
+    print 'unsigned lp_tile_unswizzle_count = 0;'
+    print 'unsigned lp_tile_swizzle_count = 0;'
+    print '#endif'
+    print
+    print 'const unsigned char'
+    print 'tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {'
+    print '   {  0,  1,  4,  5},'
+    print '   {  2,  3,  6,  7},'
+    print '   {  8,  9, 12, 13},'
+    print '   { 10, 11, 14, 15}'
+    print '};'
+    print
+    print '/* Note: these lookup tables could be replaced with some'
+    print ' * bit-twiddling code, but this is a little faster.'
+    print ' */'
+    print 'static unsigned tile_x_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'
+    print '   0, 1, 0, 1, 2, 3, 2, 3,'
+    print '   0, 1, 0, 1, 2, 3, 2, 3'
+    print '};'
+    print
+    print 'static unsigned tile_y_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {'
+    print '   0, 0, 1, 1, 0, 0, 1, 1,'
+    print '   2, 2, 3, 3, 2, 2, 3, 3'
+    print '};'
+    print
+
+    channel = Channel(UNSIGNED, True, 8)  # tile-side channel handed to both generators below
+    native_type = 'uint8_t'  # local name only; shadows the module-level native_type() inside main
+    suffix = '4ub'
+
+    generate_swizzle(formats, channel, native_type, suffix)
+    generate_unswizzle(formats, channel, native_type, suffix)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/llvmpipe/sse_mathfun.h b/src/gallium/drivers/llvmpipe/sse_mathfun.h
new file mode 100644
index 0000000000..8ac2064b7b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/sse_mathfun.h
@@ -0,0 +1,773 @@
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2, the
+   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+
+/* Helpers that declare 16-byte-aligned 4-element arrays with the same value replicated in every lane */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* bit pattern of the smallest normalized (non-denormal) positive float */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000); /* NOTE: despite the name, these are the IEEE-754 single exponent bits */
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); /* sign and mantissa bits, i.e. everything but the exponent */
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f); /* used by log_ps/exp_ps as the exponent bias */
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+v4sf log_ps(v4sf x);
+v4sf exp_ps(v4sf x);
+v4sf sin_ps(v4sf x);
+v4sf cos_ps(v4sf x);
+void sincos_ps(v4sf x, v4sf *s, v4sf *c);
+
+#if defined (__MINGW32__)
+
+/* The ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
+   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
+   broken on my mingw gcc 3.4.5 ...
+
+   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level.
+*/
+
+inline __m128 my_movehl_ps(__m128 a, const __m128 b) { /* inline-asm replacement for _mm_movehl_ps */
+	asm (
+			"movhlps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;                                 }
+#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
+#define _mm_movehl_ps my_movehl_ps
+
+inline __m128 my_cmplt_ps(__m128 a, const __m128 b) { /* inline-asm replacement for _mm_cmplt_ps */
+	asm (
+			"cmpltps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+                  }
+inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) { /* NOTE(review): cmpnleps is "not <=", also true for NaN lanes -- confirm this is acceptable */
+	asm (
+			"cmpnleps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+}
+inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) { /* inline-asm replacement for _mm_cmpeq_ps */
+	asm (
+			"cmpeqps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+}
+#warning "redefined _mm_cmpxx_ps functions..."
+#define _mm_cmplt_ps my_cmplt_ps
+#define _mm_cmpgt_ps my_cmpgt_ps
+#define _mm_cmpeq_ps my_cmpeq_ps
+#endif
+
+#ifndef USE_SSE2
+typedef union xmm_mm_union { /* views one __m128 value as two __m64 halves */
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif /* !USE_SSE2 */
+
+/* Natural logarithm of 4 packed floats.
+   Lanes with x <= 0 return NaN (invalid_mask is OR-ed into the result).
+*/
+v4sf log_ps(v4sf x) {
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+  v4sf e, mask, tmp, z, y;
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* clamp to the smallest normalized float: cuts off denormals */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23);
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+#endif
+  /* keep only the fractional part: clear the exponent bits, then OR in the bits of 0.5 */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+#ifndef USE_SSE2
+  /* subtract the 0x7f bias so that e = mm0:mm1 holds the actual base-2 exponent */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+  e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+
+  mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
+  tmp = _mm_and_ps(x, mask);
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+  x = _mm_add_ps(x, tmp);
+
+
+  z = _mm_mul_ps(x,x);
+
+  y = *(v4sf*)_ps_cephes_log_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z);
+  
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
+  y = _mm_add_ps(y, tmp);
+
+
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp);
+  x = _mm_or_ps(x, invalid_mask); // lanes that had x <= 0 become NaN (all bits set)
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f);
+_PS_CONST(exp_lo,	-88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS_CONST(cephes_exp_C1, 0.693359375);
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+v4sf exp_ps(v4sf x) { /* e^x for 4 packed floats; x is first clamped to [exp_lo, exp_hi] */
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+  v4sf mask, z, y, pow2n; 
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+  /* floorf(fx) with SSE: truncate to int, convert back, then subtract 1 where that rounded up */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* if greater, subtract 1 */
+  mask = _mm_cmpgt_ps(tmp, fx);    
+  mask = _mm_and_ps(mask, one);
+  fx = _mm_sub_ps(tmp, mask);
+
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z);
+
+  z = _mm_mul_ps(x,x);
+  
+  y = *(v4sf*)_ps_cephes_exp_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one);
+
+  /* build 2^n: add the 0x7f bias to n and shift it into the float exponent field (bit 23) */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23); 
+  mm1 = _mm_slli_pi32(mm1, 23);
+  
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty();
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  pow2n = _mm_castsi128_ps(emm0);
+#endif
+  y = _mm_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
+   it also runs on old Athlon XPs and the Pentium III of your grand
+   mother.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macos vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
+   too bad for an SSE1 function (with no special tuning) !
+   However the latter libraries probably have a much better handling of NaN,
+   Inf, denormalized and other special arguments..
+
+   On my core 1 duo, the execution of this function takes approximately 95 cycles.
+
+   From what I have observed on the experiments with Intel AMath lib, switching to an
+   SSE2 version would improve the perf by only 10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+*/
+v4sf sin_ps(v4sf x) { // sine of 4 packed floats, any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  v4sf swap_sign_bit, poly_mask, z, tmp, y2;
+
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+  //printf("plop:"); print4(y); 
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* get the swap sign flag: bit 2 of j shifted up into the float sign bit */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynomial selection mask:
+     there is one polynomial for 0 <= x <= Pi/4
+     and another one for Pi/4 < x <= Pi/2.
+
+     Both polynomials are always evaluated; the mask picks one per lane.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  swap_sign_bit = _mm_castsi128_ps(emm0);
+  poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag in mm0:mm1 */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynomial selection mask in mm2:mm3 */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; (the constants are stored negated, hence the adds) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynomial (coscof coefficients); kept for lanes where poly_mask is clear */
+  y = *(v4sf*)_ps_coscof_p0;
+  z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial (sincof coefficients); kept for lanes where poly_mask is set */
+
+  y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* cosine of 4 packed floats; almost the same code as sin_ps */
+v4sf cos_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  v4sf sign_bit, poly_mask, z, tmp, y2;
+
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+  
+  /* get the swap sign flag: (~(j-2) & 4) shifted up into the float sign bit */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynomial selection mask */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  sign_bit = _mm_castsi128_ps(emm0);
+  poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the 
+     polynomial selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; (the constants are stored negated, hence the adds) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynomial (coscof coefficients); kept for lanes where poly_mask is clear */
+  y = *(v4sf*)_ps_coscof_p0;
+  z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial (sincof coefficients); kept for lanes where poly_mask is set */
+
+  y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* Computes the sine and cosine of 4 packed floats in one pass, storing them in *s and *c.
+   sin_ps and cos_ps are almost identical, so this could replace both; it is almost as fast. */
+void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  v4sf swap_sign_bit_sin, poly_mask, z, tmp, y2, ysin1, ysin2;
+  v4sf sign_bit_cos;
+
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2;
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynomial selection mask for the sine */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2;
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynomial selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; (the constants are stored negated, hence the adds) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+#ifdef USE_SSE2
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine from the saved copy of j in mm4:mm5 */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  
+  /* Evaluate the first polynomial (coscof coefficients) */
+  z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial (sincof coefficients) */
+
+  y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* per lane, poly_mask routes one polynomial to the sine result and the other one to the cosine result */  
+  xmm3 = poly_mask;
+  ysin2 = _mm_and_ps(xmm3, y2);
+  ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+
diff --git a/src/gallium/drivers/nouveau/Makefile b/src/gallium/drivers/nouveau/Makefile
new file mode 100644
index 0000000000..db591b756c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/Makefile
@@ -0,0 +1,11 @@
+# Settings for the nouveau driver directory; they are consumed by the shared
+# ../../Makefile.template included at the bottom (template not shown here).
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nouveau
+# Extra -I flag pointing at this driver's include/ directory.
+LIBRARY_INCLUDES = -I$(TOP)/src/gallium/drivers/nouveau/include
+# No trailing blanks after the file name: they would become part of the value.
+C_SOURCES = nouveau_screen.c
+include ../../Makefile.template
diff --git a/src/gallium/drivers/nouveau/SConscript b/src/gallium/drivers/nouveau/SConscript
new file mode 100644
index 0000000000..fe7af4d2ae
--- /dev/null
+++ b/src/gallium/drivers/nouveau/SConscript
@@ -0,0 +1,11 @@
+# SCons build description for the nouveau driver directory.
+Import('*')
+
+# Work on a private copy so changes here do not leak into the caller's env.
+env = env.Clone()
+
+sources = ['nouveau_screen.c']
+nouveau = env.ConvenienceLibrary(target = 'nouveau', source = sources)
+
+# Publish the library node so other SConscript files can Import() it.
+Export('nouveau')
diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h
new file mode 100644
index 0000000000..adfdd37b1b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_class.h
@@ -0,0 +1,9025 @@
+/*************************************************************************
+
+   Autogenerated file, do not edit !
+
+   This file was generated by renouveau-gen from renouveau.xml, the
+   XML database of nvidia objects and methods. renouveau-gen and
+   renouveau.xml can be found in CVS module renouveau of sourceforge.net
+   project nouveau:
+
+cvs -z3 -d:pserver:anonymous@nouveau.cvs.sourceforge.net:/cvsroot/nouveau co -P renouveau
+
+**************************************************************************
+
+   Copyright (C) 2006-2008 :
+   Dmitry Baryshkov,
+   Laurent Carlier,
+   Matthieu Castet,
+   Dawid Gajownik,
+   Jeremy Kolb,
+   Stephane Loeuillet,
+   Patrice Mandin,
+   Stephane Marchesin,
+   Serge Martin,
+   Sylvain Munaut,
+   Simon Raffeiner,
+   Ben Skeggs,
+   Erik Waling,
+   koala_br,
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*************************************************************************/
+
+
+#ifndef NOUVEAU_REG_H
+#define NOUVEAU_REG_H 1
+
+
+#define NV01_ROOT									0x00000001
+
+
+
+#define NV01_CONTEXT_DMA								0x00000002
+
+
+
+#define NV01_DEVICE									0x00000003
+
+
+
+#define NV01_TIMER									0x00000004
+
+#define  NV01_TIMER_SYNCHRONIZE								0x00000100
+#define  NV01_TIMER_STOP_ALARM								0x00000104
+#define  NV01_TIMER_DMA_NOTIFY								0x00000180
+#define  NV01_TIMER_TIME(x)								(0x00000300+((x)*4))
+#define  NV01_TIMER_TIME__SIZE								0x00000002
+#define  NV01_TIMER_ALARM_NOTIFY							0x00000308
+
+
+#define NV01_CONTEXT_BETA1								0x00000012
+
+#define  NV01_CONTEXT_BETA1_NOP								0x00000100
+#define  NV01_CONTEXT_BETA1_NOTIFY							0x00000104
+#define  NV01_CONTEXT_BETA1_DMA_NOTIFY							0x00000180
+#define  NV01_CONTEXT_BETA1_BETA_1D31							0x00000300
+
+
+#define NV01_CONTEXT_COLOR_KEY								0x00000017
+
+#define  NV01_CONTEXT_COLOR_KEY_NOP							0x00000100
+#define  NV01_CONTEXT_COLOR_KEY_NOTIFY							0x00000104
+#define  NV01_CONTEXT_COLOR_KEY_DMA_NOTIFY						0x00000180
+#define  NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT						0x00000300
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16A8Y8					0x00000001
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X24Y8					0x00000002
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16A1R5G5B5				0x00000003
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X17R5G5B5					0x00000004
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_A8R8G8B8					0x00000005
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X8R8G8B8					0x00000006
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_A16Y16					0x00000007
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16Y16					0x00000008
+#define  NV01_CONTEXT_COLOR_KEY_COLOR							0x00000304
+
+
+#define NV04_CONTEXT_COLOR_KEY								0x00000057
+
+
+
+#define NV01_CONTEXT_PATTERN								0x00000018
+
+#define  NV01_CONTEXT_PATTERN_NOP							0x00000100
+#define  NV01_CONTEXT_PATTERN_NOTIFY							0x00000104
+#define  NV01_CONTEXT_PATTERN_DMA_NOTIFY						0x00000180
+#define  NV01_CONTEXT_PATTERN_COLOR_FORMAT						0x00000300
+#define  NV01_CONTEXT_PATTERN_MONOCHROME_FORMAT						0x00000304
+#define  NV01_CONTEXT_PATTERN_SHAPE							0x00000308
+#define  NV01_CONTEXT_PATTERN_COLOR(x)							(0x00000310+((x)*4))
+#define  NV01_CONTEXT_PATTERN_COLOR__SIZE						0x00000002
+#define  NV01_CONTEXT_PATTERN_PATTERN(x)						(0x00000318+((x)*4))
+#define  NV01_CONTEXT_PATTERN_PATTERN__SIZE						0x00000002
+
+
+#define NV01_CONTEXT_CLIP_RECTANGLE							0x00000019
+
+#define  NV01_CONTEXT_CLIP_RECTANGLE_NOP						0x00000100
+#define  NV01_CONTEXT_CLIP_RECTANGLE_NOTIFY						0x00000104
+#define  NV01_CONTEXT_CLIP_RECTANGLE_DMA_NOTIFY						0x00000180
+#define  NV01_CONTEXT_CLIP_RECTANGLE_POINT						0x00000300
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_X_SHIFT					0
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_X_MASK					0x0000ffff
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_Y_SHIFT					16
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_Y_MASK					0xffff0000
+#define  NV01_CONTEXT_CLIP_RECTANGLE_SIZE						0x00000304
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_W_SHIFT					0
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_W_MASK					0x0000ffff
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_H_SHIFT					16
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_H_MASK					0xffff0000
+
+
+#define NV01_RENDER_SOLID_LINE								0x0000001c
+
+#define  NV01_RENDER_SOLID_LINE_NOP							0x00000100
+#define  NV01_RENDER_SOLID_LINE_NOTIFY							0x00000104
+#define  NV01_RENDER_SOLID_LINE_PATCH							0x0000010c
+#define  NV01_RENDER_SOLID_LINE_DMA_NOTIFY						0x00000180
+#define  NV01_RENDER_SOLID_LINE_CLIP_RECTANGLE						0x00000184
+#define  NV01_RENDER_SOLID_LINE_PATTERN							0x00000188
+#define  NV01_RENDER_SOLID_LINE_ROP							0x0000018c
+#define  NV01_RENDER_SOLID_LINE_BETA1							0x00000190
+#define  NV01_RENDER_SOLID_LINE_SURFACE							0x00000194
+#define  NV01_RENDER_SOLID_LINE_OPERATION						0x000002fc
+#define   NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY_AND					0x00000000
+#define   NV01_RENDER_SOLID_LINE_OPERATION_ROP_AND					0x00000001
+#define   NV01_RENDER_SOLID_LINE_OPERATION_BLEND_AND					0x00000002
+#define   NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY					0x00000003
+#define   NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV01_RENDER_SOLID_LINE_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV01_RENDER_SOLID_LINE_COLOR_FORMAT						0x00000300
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16A8Y8					0x00000001
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X24Y8					0x00000002
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16A1R5G5B5				0x00000003
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X17R5G5B5					0x00000004
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_A8R8G8B8					0x00000005
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X8R8G8B8					0x00000006
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_A16Y16					0x00000007
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16Y16					0x00000008
+#define  NV01_RENDER_SOLID_LINE_COLOR							0x00000304
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT0(x)						(0x00000400+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT0__SIZE					0x00000010
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_X_SHIFT					0
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT1(x)						(0x00000404+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT1__SIZE					0x00000010
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_X_SHIFT					0
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_X(x)					(0x00000480+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_X__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_Y(x)					(0x00000484+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_Y__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_X(x)					(0x00000488+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_X__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_Y(x)					(0x0000048c+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_Y__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_POLYLINE(x)						(0x00000500+((x)*4))
+#define  NV01_RENDER_SOLID_LINE_POLYLINE__SIZE						0x00000020
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_X_SHIFT					0
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_X(x)					(0x00000580+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_X__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_Y(x)					(0x00000584+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_Y__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_COLOR(x)					(0x00000600+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_COLOR__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT(x)					(0x00000604+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT__SIZE					0x00000010
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_X_SHIFT				0
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_Y_MASK					0xffff0000
+
+
+#define NV04_RENDER_SOLID_LINE								0x0000005c
+
+#define  NV04_RENDER_SOLID_LINE_BETA4							0x00000194
+#define  NV04_RENDER_SOLID_LINE_SURFACE							0x00000198
+
+
+#define NV01_RENDER_SOLID_TRIANGLE							0x0000001d
+
+#define  NV01_RENDER_SOLID_TRIANGLE_NOP							0x00000100
+#define  NV01_RENDER_SOLID_TRIANGLE_NOTIFY						0x00000104
+#define  NV01_RENDER_SOLID_TRIANGLE_PATCH						0x0000010c
+#define  NV01_RENDER_SOLID_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV01_RENDER_SOLID_TRIANGLE_CLIP_RECTANGLE					0x00000184
+#define  NV01_RENDER_SOLID_TRIANGLE_PATTERN						0x00000188
+#define  NV01_RENDER_SOLID_TRIANGLE_ROP							0x0000018c
+#define  NV01_RENDER_SOLID_TRIANGLE_BETA1						0x00000190
+#define  NV01_RENDER_SOLID_TRIANGLE_SURFACE						0x00000194
+#define  NV01_RENDER_SOLID_TRIANGLE_OPERATION						0x000002fc
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_ROP_AND					0x00000001
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_BLEND_AND				0x00000002
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY					0x00000003
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV01_RENDER_SOLID_TRIANGLE_COLOR_FORMAT					0x00000300
+#define  NV01_RENDER_SOLID_TRIANGLE_COLOR						0x00000304
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0					0x00000310
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1					0x00000314
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2					0x00000318
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT0_X					0x00000320
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT0_Y					0x00000324
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT1_X					0x00000328
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT1_Y					0x0000032c
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT2_X					0x00000330
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT2_Y					0x00000334
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH(x)						(0x00000400+((x)*4))
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH__SIZE					0x00000020
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_X_SHIFT					0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_X(x)				(0x00000480+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_X__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_Y(x)				(0x00000484+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_Y__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_COLOR(x)					(0x00000500+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_COLOR__SIZE				0x00000008
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0(x)					(0x00000504+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0__SIZE				0x00000008
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1(x)					(0x00000508+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1__SIZE				0x00000008
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2(x)					(0x0000050c+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2__SIZE				0x00000008
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_COLOR(x)					(0x00000580+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_COLOR__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT(x)					(0x00000584+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT__SIZE				0x00000010
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_Y_MASK				0xffff0000
+
+
+#define NV04_RENDER_SOLID_TRIANGLE							0x0000005d
+
+#define  NV04_RENDER_SOLID_TRIANGLE_BETA4						0x00000194
+#define  NV04_RENDER_SOLID_TRIANGLE_SURFACE						0x00000198
+
+
+#define NV01_RENDER_SOLID_RECTANGLE							0x0000001e
+
+#define  NV01_RENDER_SOLID_RECTANGLE_NOP						0x00000100
+#define  NV01_RENDER_SOLID_RECTANGLE_NOTIFY						0x00000104
+#define  NV01_RENDER_SOLID_RECTANGLE_PATCH						0x0000010c
+#define  NV01_RENDER_SOLID_RECTANGLE_DMA_NOTIFY						0x00000180
+#define  NV01_RENDER_SOLID_RECTANGLE_CLIP_RECTANGLE					0x00000184
+#define  NV01_RENDER_SOLID_RECTANGLE_PATTERN						0x00000188
+#define  NV01_RENDER_SOLID_RECTANGLE_ROP						0x0000018c
+#define  NV01_RENDER_SOLID_RECTANGLE_BETA1						0x00000190
+#define  NV01_RENDER_SOLID_RECTANGLE_SURFACE						0x00000194
+#define  NV01_RENDER_SOLID_RECTANGLE_OPERATION						0x000002fc
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_ROP_AND					0x00000001
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_BLEND_AND				0x00000002
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY					0x00000003
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV01_RENDER_SOLID_RECTANGLE_COLOR_FORMAT					0x00000300
+#define  NV01_RENDER_SOLID_RECTANGLE_COLOR						0x00000304
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT(x)					(0x00000400+((x)*8))
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT__SIZE				0x00000010
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_X_SHIFT				0
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE(x)					(0x00000404+((x)*8))
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE__SIZE				0x00000010
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_W_SHIFT				0
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_W_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_H_SHIFT				16
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_H_MASK				0xffff0000
+
+
+#define NV04_RENDER_SOLID_RECTANGLE							0x0000005e
+
+#define  NV04_RENDER_SOLID_RECTANGLE_BETA4						0x00000194
+#define  NV04_RENDER_SOLID_RECTANGLE_SURFACE						0x00000198
+
+
+#define NV01_IMAGE_BLIT									0x0000001f
+
+#define  NV01_IMAGE_BLIT_NOP								0x00000100
+#define  NV01_IMAGE_BLIT_NOTIFY								0x00000104
+#define  NV01_IMAGE_BLIT_PATCH								0x0000010c
+#define  NV01_IMAGE_BLIT_DMA_NOTIFY							0x00000180
+#define  NV01_IMAGE_BLIT_COLOR_KEY							0x00000184
+#define  NV01_IMAGE_BLIT_CLIP_RECTANGLE							0x00000188
+#define  NV01_IMAGE_BLIT_PATTERN							0x0000018c
+#define  NV01_IMAGE_BLIT_ROP								0x00000190
+#define  NV01_IMAGE_BLIT_BETA1								0x00000194
+#define  NV01_IMAGE_BLIT_SURFACE							0x0000019c
+#define  NV01_IMAGE_BLIT_OPERATION							0x000002fc
+#define   NV01_IMAGE_BLIT_OPERATION_SRCCOPY_AND						0x00000000
+#define   NV01_IMAGE_BLIT_OPERATION_ROP_AND						0x00000001
+#define   NV01_IMAGE_BLIT_OPERATION_BLEND_AND						0x00000002
+#define   NV01_IMAGE_BLIT_OPERATION_SRCCOPY						0x00000003
+#define   NV01_IMAGE_BLIT_OPERATION_SRCCOPY_PREMULT					0x00000004
+#define   NV01_IMAGE_BLIT_OPERATION_BLEND_PREMULT					0x00000005
+#define  NV01_IMAGE_BLIT_IMAGE_INPUT							0x00000204
+#define  NV01_IMAGE_BLIT_POINT_IN							0x00000300
+#define   NV01_IMAGE_BLIT_POINT_IN_X_SHIFT						0
+#define   NV01_IMAGE_BLIT_POINT_IN_X_MASK						0x0000ffff
+#define   NV01_IMAGE_BLIT_POINT_IN_Y_SHIFT						16
+#define   NV01_IMAGE_BLIT_POINT_IN_Y_MASK						0xffff0000
+#define  NV01_IMAGE_BLIT_POINT_OUT							0x00000304
+#define   NV01_IMAGE_BLIT_POINT_OUT_X_SHIFT						0
+#define   NV01_IMAGE_BLIT_POINT_OUT_X_MASK						0x0000ffff
+#define   NV01_IMAGE_BLIT_POINT_OUT_Y_SHIFT						16
+#define   NV01_IMAGE_BLIT_POINT_OUT_Y_MASK						0xffff0000
+#define  NV01_IMAGE_BLIT_SIZE								0x00000308
+#define   NV01_IMAGE_BLIT_SIZE_W_SHIFT							0
+#define   NV01_IMAGE_BLIT_SIZE_W_MASK							0x0000ffff
+#define   NV01_IMAGE_BLIT_SIZE_H_SHIFT							16
+#define   NV01_IMAGE_BLIT_SIZE_H_MASK							0xffff0000
+
+
+#define NV04_IMAGE_BLIT									0x0000005f
+
+#define  NV04_IMAGE_BLIT_ROP								0x00000190
+#define  NV04_IMAGE_BLIT_BETA4								0x00000198
+#define  NV04_IMAGE_BLIT_SURFACE							0x0000019c
+
+
+#define NV12_IMAGE_BLIT									0x0000009f
+
+#define  NV12_IMAGE_BLIT_WAIT_FOR_IDLE							0x00000108
+
+
+#define NV01_IMAGE_FROM_CPU								0x00000021
+
+#define  NV01_IMAGE_FROM_CPU_NOP							0x00000100
+#define  NV01_IMAGE_FROM_CPU_NOTIFY							0x00000104
+#define  NV01_IMAGE_FROM_CPU_PATCH							0x0000010c
+#define  NV01_IMAGE_FROM_CPU_DMA_NOTIFY							0x00000180
+#define  NV01_IMAGE_FROM_CPU_COLOR_KEY							0x00000184
+#define  NV01_IMAGE_FROM_CPU_CLIP_RECTANGLE						0x00000188
+#define  NV01_IMAGE_FROM_CPU_PATTERN							0x0000018c
+#define  NV01_IMAGE_FROM_CPU_ROP							0x00000190
+#define  NV01_IMAGE_FROM_CPU_BETA1							0x00000194
+#define  NV01_IMAGE_FROM_CPU_SURFACE							0x00000198
+#define  NV01_IMAGE_FROM_CPU_OPERATION							0x000002fc
+#define   NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY_AND					0x00000000
+#define   NV01_IMAGE_FROM_CPU_OPERATION_ROP_AND						0x00000001
+#define   NV01_IMAGE_FROM_CPU_OPERATION_BLEND_AND					0x00000002
+#define   NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY						0x00000003
+#define   NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY_PREMULT					0x00000004
+#define   NV01_IMAGE_FROM_CPU_OPERATION_BLEND_PREMULT					0x00000005
+#define  NV01_IMAGE_FROM_CPU_COLOR_FORMAT						0x00000300
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_Y8						0x00000001
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_A1R5G5B5					0x00000002
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_X1R5G5B5					0x00000003
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_A8R8G8B8					0x00000004
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_X8R8G8B8					0x00000005
+#define  NV01_IMAGE_FROM_CPU_POINT							0x00000304
+#define   NV01_IMAGE_FROM_CPU_POINT_X_SHIFT						0
+#define   NV01_IMAGE_FROM_CPU_POINT_X_MASK						0x0000ffff
+#define   NV01_IMAGE_FROM_CPU_POINT_Y_SHIFT						16
+#define   NV01_IMAGE_FROM_CPU_POINT_Y_MASK						0xffff0000
+#define  NV01_IMAGE_FROM_CPU_SIZE_OUT							0x00000308
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_W_SHIFT						0
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_W_MASK						0x0000ffff
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_H_SHIFT						16
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_H_MASK						0xffff0000
+#define  NV01_IMAGE_FROM_CPU_SIZE_IN							0x0000030c
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_W_SHIFT						0
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_W_MASK						0x0000ffff
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_H_SHIFT						16
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_H_MASK						0xffff0000
+#define  NV01_IMAGE_FROM_CPU_COLOR(x)							(0x00000400+((x)*4))
+#define  NV01_IMAGE_FROM_CPU_COLOR__SIZE						0x00000020
+
+
+#define NV04_IMAGE_FROM_CPU								0x00000061
+
+#define  NV04_IMAGE_FROM_CPU_BETA4							0x00000198
+#define  NV04_IMAGE_FROM_CPU_SURFACE							0x0000019c
+
+
+#define NV05_IMAGE_FROM_CPU								0x00000065
+
+#define  NV05_IMAGE_FROM_CPU_COLOR_CONVERSION						0x000002f8
+
+
+#define NV10_IMAGE_FROM_CPU								0x0000008a
+
+#define  NV10_IMAGE_FROM_CPU_WAIT_FOR_IDLE						0x00000108
+
+
+#define NV30_IMAGE_FROM_CPU								0x0000038a
+
+
+
+#define NV40_IMAGE_FROM_CPU								0x0000308a
+
+
+
+#define NV01_NULL									0x00000030
+
+
+
+#define NV03_STRETCHED_IMAGE_FROM_CPU							0x00000036
+
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_NOP						0x00000100
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_NOTIFY						0x00000104
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_PATCH						0x0000010c
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_DMA_NOTIFY					0x00000180
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR_KEY					0x00000184
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_PATTERN						0x00000188
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_ROP						0x0000018c
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_BETA1						0x00000190
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_SURFACE						0x00000194
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_OPERATION					0x000002fc
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR_FORMAT					0x00000300
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN						0x00000304
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_W_SHIFT					0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_W_MASK					0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_H_SHIFT					16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_H_MASK					0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_DX_DU						0x00000308
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_DY_DV						0x0000030c
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT					0x00000310
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_X_SHIFT				0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_X_MASK				0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_Y_SHIFT				16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_Y_MASK				0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE					0x00000314
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_W_SHIFT				0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_W_MASK				0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_H_SHIFT				16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_H_MASK				0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4					0x00000318
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_X_SHIFT				0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_X_MASK				0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_Y_SHIFT				16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_Y_MASK				0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR(x)						(0x00000400+((x)*4))
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR__SIZE					0x00000020
+
+
+#define NV04_STRETCHED_IMAGE_FROM_CPU							0x00000076
+
+#define  NV04_STRETCHED_IMAGE_FROM_CPU_BETA4						0x00000194
+#define  NV04_STRETCHED_IMAGE_FROM_CPU_SURFACE						0x00000198
+
+
+#define NV05_STRETCHED_IMAGE_FROM_CPU							0x00000066
+
+#define  NV05_STRETCHED_IMAGE_FROM_CPU_COLOR_CONVERSION					0x000002f8
+
+
+#define NV30_STRETCHED_IMAGE_FROM_CPU							0x00000366
+
+
+
+#define NV40_STRETCHED_IMAGE_FROM_CPU							0x00003066
+
+
+
+#define NV03_SCALED_IMAGE_FROM_MEMORY							0x00000037
+
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_NOP						0x00000100
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_NOTIFY						0x00000104
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DMA_NOTIFY					0x00000180
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE					0x00000184
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_PATTERN						0x00000188
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_ROP						0x0000018c
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_BETA1						0x00000190
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_SURFACE						0x00000194
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT					0x00000300
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5				0x00000001
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X1R5G5B5				0x00000002
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8				0x00000003
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8				0x00000004
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_V8YB8U8YA8				0x00000005
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_YB8V8YA8U8				0x00000006
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5				0x00000007
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8					0x00000008
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_AY8				0x00000009
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION					0x00000304
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_ROP_AND				0x00000001
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_AND				0x00000002
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY				0x00000003
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_PREMULT			0x00000004
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT					0x00000308
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE					0x0000030c
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT					0x00000310
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_X_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_X_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE						0x00000314
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_W_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_W_MASK					0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_MASK					0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DU_DX						0x00000318
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DV_DY						0x0000031c
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_SIZE						0x00000400
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_W_SHIFT					0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_W_MASK					0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT					16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_MASK					0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT						0x00000404
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_PITCH_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_PITCH_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_MASK				0x00ff0000
+#define    NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER				0x00010000
+#define    NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CORNER				0x00020000
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_SHIFT				24
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_MASK				0xff000000
+#define    NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE			0x00000000
+#define    NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_BILINEAR				0x01000000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_OFFSET						0x00000408
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_POINT						0x0000040c
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_POINT_U_SHIFT					0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_POINT_U_MASK					0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_POINT_V_SHIFT					16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_POINT_V_MASK					0xffff0000
+
+
+#define NV04_SCALED_IMAGE_FROM_MEMORY							0x00000077
+/* Values introduced under the NV04_ prefix (0x194, 0x198); presumably method offsets of the 0x77 object -- TODO confirm. */
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_BETA4						0x00000194
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE						0x00000198
+
+
+#define NV05_SCALED_IMAGE_FROM_MEMORY							0x00000063
+/* COLOR_CONVERSION (0x2fc) is followed by its three enumerated values 0, 1 and 2; no _SHIFT/_MASK pair is defined for it. */
+#define  NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION					0x000002fc
+#define   NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_DITHER				0x00000000
+#define   NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE			0x00000001
+#define   NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_SUBTR_TRUNCATE			0x00000002
+
+
+#define NV10_SCALED_IMAGE_FROM_MEMORY							0x00000089
+/* The NV10_ prefix adds a single value, WAIT_FOR_IDLE at 0x108. */
+#define  NV10_SCALED_IMAGE_FROM_MEMORY_WAIT_FOR_IDLE					0x00000108
+
+
+#define NV30_SCALED_IMAGE_FROM_MEMORY							0x00000389
+/* Identifier only: no further NV30_SCALED_IMAGE_FROM_MEMORY_* values are defined here. */
+
+
+#define NV40_SCALED_IMAGE_FROM_MEMORY							0x00003089
+
+
+
+#define NV04_DVD_SUBPICTURE								0x00000038
+/* Layout: one extra leading space marks a value at an offset (0x100-0x33c here); two extra spaces mark the _SHIFT/_MASK pair of a bitfield whose name extends the value defined above it. */
+#define  NV04_DVD_SUBPICTURE_NOP							0x00000100
+#define  NV04_DVD_SUBPICTURE_NOTIFY							0x00000104
+#define  NV04_DVD_SUBPICTURE_DMA_NOTIFY							0x00000180
+#define  NV04_DVD_SUBPICTURE_DMA_OVERLAY						0x00000184
+#define  NV04_DVD_SUBPICTURE_DMA_IMAGEIN						0x00000188
+#define  NV04_DVD_SUBPICTURE_DMA_IMAGEOUT						0x0000018c
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_POINT						0x00000300 /* X in bits 0-15, Y in bits 16-31 */
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_X_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_X_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_Y_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_Y_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE						0x00000304 /* W in bits 0-15, H in bits 16-31 */
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_W_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_W_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_H_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_H_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT						0x00000308 /* PITCH in bits 0-15, COLOR in bits 16-31 */
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_PITCH_SHIFT				0
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_PITCH_MASK				0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_COLOR_SHIFT				16
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_COLOR_MASK				0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_OFFSET						0x0000030c
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_DELTA_DU_DX					0x00000310 /* IMAGEIN values occupy 0x310-0x324 */
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_DELTA_DV_DY					0x00000314
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_SIZE						0x00000318
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_W_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_W_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_H_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_H_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT						0x0000031c
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_PITCH_SHIFT				0
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_PITCH_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_COLOR_SHIFT				16
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_COLOR_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_OFFSET						0x00000320
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_POINT						0x00000324 /* U in bits 0-15, V in bits 16-31 */
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_U_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_U_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_V_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_V_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_OVERLAY_DELTA_DU_DX					0x00000328 /* OVERLAY values (0x328-0x33c) repeat the IMAGEIN names and field layout */
+#define  NV04_DVD_SUBPICTURE_OVERLAY_DELTA_DV_DY					0x0000032c
+#define  NV04_DVD_SUBPICTURE_OVERLAY_SIZE						0x00000330
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_W_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_W_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_H_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_H_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_OVERLAY_FORMAT						0x00000334
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_PITCH_SHIFT				0
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_PITCH_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_COLOR_SHIFT				16
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_COLOR_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_OVERLAY_OFFSET						0x00000338
+#define  NV04_DVD_SUBPICTURE_OVERLAY_POINT						0x0000033c
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_U_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_U_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_V_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_V_MASK					0xffff0000
+
+
+#define NV10_DVD_SUBPICTURE								0x00000088
+/* The NV10_ prefix adds a single value, WAIT_FOR_IDLE at 0x108. */
+#define  NV10_DVD_SUBPICTURE_WAIT_FOR_IDLE						0x00000108
+
+
+#define NV04_MEMORY_TO_MEMORY_FORMAT							0x00000039
+/* Values at 0x100-0x188 and 0x30c-0x328; only FORMAT (0x324) has bitfields defined. */
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_NOP						0x00000100
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_NOTIFY						0x00000104
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY					0x00000180
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN					0x00000184
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_OUT					0x00000188
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN						0x0000030c
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT					0x00000310
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_IN						0x00000314
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT						0x00000318
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN					0x0000031c
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_LINE_COUNT					0x00000320
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT						0x00000324 /* INPUT_INC in bits 0-7, OUTPUT_INC in bits 8-15; no field is defined for bits 16-31 */
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_INPUT_INC_SHIFT				0
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_INPUT_INC_MASK				0x000000ff
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_OUTPUT_INC_SHIFT				8
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_OUTPUT_INC_MASK				0x0000ff00
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_BUF_NOTIFY					0x00000328
+
+
+#define NV50_MEMORY_TO_MEMORY_FORMAT							0x00005039
+/* Values at 0x110 and 0x200-0x23c; the _IN set (0x200-0x218) and the _OUT set (0x21c-0x234) list the same seven names in the same order. */
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_SERIALIZE						0x00000110
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN						0x00000200
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_MODE_IN					0x00000204
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_PITCH_IN					0x00000208
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_HEIGHT_IN					0x0000020c
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_DEPTH_IN					0x00000210
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Z				0x00000214
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN				0x00000218 /* X in bits 0-15, Y in bits 16-31 */
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_X_SHIFT			0
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_X_MASK			0x0000ffff
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Y_SHIFT			16
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Y_MASK			0xffff0000
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT					0x0000021c
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_MODE_OUT					0x00000220
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_PITCH_OUT					0x00000224
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_HEIGHT_OUT					0x00000228
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_DEPTH_OUT					0x0000022c
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Z				0x00000230
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT				0x00000234 /* X in bits 0-15, Y in bits 16-31 */
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_X_SHIFT			0
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_X_MASK			0x0000ffff
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Y_SHIFT			16
+#define   NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Y_MASK			0xffff0000
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH					0x00000238 /* NOTE(review): presumably the upper address bits for the input offset -- confirm */
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH					0x0000023c /* NOTE(review): presumably the upper address bits for the output offset -- confirm */
+
+
+#define NV01_MEMORY_LOCAL_BANKED							0x0000003d
+/* Five identifiers with consecutive values 0x3d-0x41 follow; none of them has further values defined beneath it here. */
+
+
+#define NV01_MAPPING_SYSTEM								0x0000003e
+
+
+
+#define NV03_MEMORY_LOCAL_CURSOR							0x0000003f
+
+
+
+#define NV01_MEMORY_LOCAL_LINEAR							0x00000040
+
+
+
+#define NV01_MAPPING_LOCAL								0x00000041
+
+
+
+#define NV04_CONTEXT_SURFACES_2D							0x00000042
+/* Values at 0x100-0x188 and 0x300-0x30c; FORMAT has enumerated values, PITCH has two 16-bit fields. */
+#define  NV04_CONTEXT_SURFACES_2D_NOP							0x00000100
+#define  NV04_CONTEXT_SURFACES_2D_NOTIFY						0x00000104
+#define  NV04_CONTEXT_SURFACES_2D_PM_TRIGGER						0x00000140
+#define  NV04_CONTEXT_SURFACES_2D_DMA_NOTIFY						0x00000180
+#define  NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE					0x00000184
+#define  NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_DESTIN					0x00000188
+#define  NV04_CONTEXT_SURFACES_2D_FORMAT						0x00000300 /* enumerated values 0x1-0xb follow; no _SHIFT/_MASK pair is defined for this one */
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_Y8						0x00000001
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_Z1R5G5B5				0x00000002
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_X1R5G5B5				0x00000003
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5					0x00000004
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_Y16						0x00000005
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_Z8R8G8B8				0x00000006
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_X8R8G8B8				0x00000007
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1A7R8G8B8_Z1A7R8G8B8				0x00000008
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1A7R8G8B8_X1A7R8G8B8				0x00000009
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8					0x0000000a
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_Y32						0x0000000b
+#define  NV04_CONTEXT_SURFACES_2D_PITCH							0x00000304 /* SOURCE in bits 0-15, DESTIN in bits 16-31 */
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_SOURCE_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_SOURCE_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_DESTIN_SHIFT					16
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_DESTIN_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_2D_OFFSET_SOURCE						0x00000308
+#define  NV04_CONTEXT_SURFACES_2D_OFFSET_DESTIN						0x0000030c
+
+
+#define NV10_CONTEXT_SURFACES_2D							0x00000062
+/* The NV10_, NV30_ and NV40_ CONTEXT_SURFACES_2D names are identifiers only: no further values are defined under these prefixes here. */
+
+
+#define NV30_CONTEXT_SURFACES_2D							0x00000362
+
+
+
+#define NV40_CONTEXT_SURFACES_2D							0x00003062
+
+
+
+#define NV03_CONTEXT_ROP								0x00000043
+/* ROP (0x300) carries two 4-bit fields; the enumerated values below are already positioned inside their field (each SRC value equals the same-named DST value << 4). */
+#define  NV03_CONTEXT_ROP_NOP								0x00000100
+#define  NV03_CONTEXT_ROP_NOTIFY							0x00000104
+#define  NV03_CONTEXT_ROP_DMA_NOTIFY							0x00000180
+#define  NV03_CONTEXT_ROP_ROP								0x00000300
+#define   NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_SHIFT					0 /* DST_LOGIC_OP: bits 0-3 */
+#define   NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_MASK					0x0000000f
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_CLEAR					0x00000000
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NOR					0x00000001
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND_INVERTED				0x00000002
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_COPY_INVERTED				0x00000003
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND_REVERSE				0x00000004
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_INVERT					0x00000005
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_XOR					0x00000006
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NAND					0x00000007
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND					0x00000008
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_EQUI					0x00000009
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NOOP					0x0000000a
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR_INVERTED				0x0000000b
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_COPY					0x0000000c
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR_REVERSE					0x0000000d
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR						0x0000000e
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_SET					0x0000000f
+#define   NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_SHIFT					4 /* SRC_LOGIC_OP: bits 4-7 */
+#define   NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_MASK					0x000000f0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_CLEAR					0x00000000
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NOR					0x00000010
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND_INVERTED				0x00000020
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_COPY_INVERTED				0x00000030
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND_REVERSE				0x00000040
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_INVERT					0x00000050
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_XOR					0x00000060
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NAND					0x00000070
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND					0x00000080
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_EQUI					0x00000090
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NOOP					0x000000a0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR_INVERTED				0x000000b0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_COPY					0x000000c0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR_REVERSE					0x000000d0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR						0x000000e0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_SET					0x000000f0
+
+
+#define NV04_IMAGE_PATTERN								0x00000044
+/* Scalar values at 0x100-0x31c, then four indexed arrays at 0x400/0x500/0x600/0x700; NAME(x) is base + x*4 and NAME__SIZE is presumably the element count (valid x in 0..SIZE-1) -- TODO confirm. */
+#define  NV04_IMAGE_PATTERN_NOP								0x00000100
+#define  NV04_IMAGE_PATTERN_NOTIFY							0x00000104
+#define  NV04_IMAGE_PATTERN_DMA_NOTIFY							0x00000180
+#define  NV04_IMAGE_PATTERN_COLOR_FORMAT						0x00000300
+#define   NV04_IMAGE_PATTERN_COLOR_FORMAT_A16R5G6B5					0x00000001
+#define   NV04_IMAGE_PATTERN_COLOR_FORMAT_X16A1R5G5B5					0x00000002
+#define   NV04_IMAGE_PATTERN_COLOR_FORMAT_A8R8G8B8					0x00000003
+#define  NV04_IMAGE_PATTERN_MONOCHROME_FORMAT						0x00000304
+#define   NV04_IMAGE_PATTERN_MONOCHROME_FORMAT_CGA6					0x00000001
+#define   NV04_IMAGE_PATTERN_MONOCHROME_FORMAT_LE					0x00000002
+#define  NV04_IMAGE_PATTERN_MONOCHROME_SHAPE						0x00000308
+#define   NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_8X8					0x00000000
+#define   NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_64X1					0x00000001
+#define   NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_1X64					0x00000002
+#define  NV04_IMAGE_PATTERN_PATTERN_SELECT						0x0000030c
+#define   NV04_IMAGE_PATTERN_PATTERN_SELECT_MONO					0x00000001
+#define   NV04_IMAGE_PATTERN_PATTERN_SELECT_COLOR					0x00000002
+#define  NV04_IMAGE_PATTERN_MONOCHROME_COLOR0						0x00000310
+#define  NV04_IMAGE_PATTERN_MONOCHROME_COLOR1						0x00000314
+#define  NV04_IMAGE_PATTERN_MONOCHROME_PATTERN0						0x00000318
+#define  NV04_IMAGE_PATTERN_MONOCHROME_PATTERN1						0x0000031c
+#define  NV04_IMAGE_PATTERN_PATTERN_Y8(x)						(0x00000400+((x)*4)) /* four 8-bit fields Y0..Y3 per word */
+#define  NV04_IMAGE_PATTERN_PATTERN_Y8__SIZE						0x00000010
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y0_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y0_MASK						0x000000ff
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y1_SHIFT					8
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y1_MASK						0x0000ff00
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y2_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y2_MASK						0x00ff0000
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y3_SHIFT					24
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y3_MASK						0xff000000
+#define  NV04_IMAGE_PATTERN_PATTERN_R5G6B5(x)						(0x00000500+((x)*4)) /* two 5/6/5 triples per word: *0 in bits 0-15, *1 in bits 16-31 */
+#define  NV04_IMAGE_PATTERN_PATTERN_R5G6B5__SIZE					0x00000020
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B0_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B0_MASK					0x0000001f
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G0_SHIFT					5
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G0_MASK					0x000007e0
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R0_SHIFT					11
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R0_MASK					0x0000f800
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B1_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B1_MASK					0x001f0000
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G1_SHIFT					21
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G1_MASK					0x07e00000
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R1_SHIFT					27
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R1_MASK					0xf8000000
+#define  NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5(x)						(0x00000600+((x)*4)) /* two 5/5/5 triples per word; bits 15 and 31 are covered by no mask */
+#define  NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5__SIZE					0x00000020
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B0_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B0_MASK					0x0000001f
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G0_SHIFT					5
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G0_MASK					0x000003e0
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R0_SHIFT					10
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R0_MASK					0x00007c00
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B1_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B1_MASK					0x001f0000
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G1_SHIFT					21
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G1_MASK					0x03e00000
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R1_SHIFT					26
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R1_MASK					0x7c000000
+#define  NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8(x)						(0x00000700+((x)*4)) /* one B/G/R triple per word in bits 0-23; bits 24-31 are covered by no mask */
+#define  NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8__SIZE					0x00000040
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_B_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_B_MASK					0x000000ff
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_G_SHIFT					8
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_G_MASK					0x0000ff00
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_R_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_R_MASK					0x00ff0000
+
+
+#define NV03_VIDEO_LUT_CURSOR_DAC							0x00000046
+/* Every indexed NAME(x) here has NAME__SIZE 2; the multiplier in each macro (4, 8, 12 or 16) is the byte distance between element 0 and element 1. */
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SYNCHRONIZE						0x00000100
+#define  NV03_VIDEO_LUT_CURSOR_DAC_STOP_IMAGE						0x00000104
+#define  NV03_VIDEO_LUT_CURSOR_DAC_STOP_CURSOR						0x00000108
+#define  NV03_VIDEO_LUT_CURSOR_DAC_STOP_DAC						0x0000010c
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_NOTIFY						0x00000180
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_IMAGE(x)						(0x00000184+((x)*4))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_IMAGE__SIZE					0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_LUT(x)						(0x0000018c+((x)*4))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_LUT__SIZE					0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_CURSOR(x)					(0x00000194+((x)*4))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_CURSOR__SIZE					0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_GET							0x000002fc
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_OFFSET(x)					(0x00000300+((x)*8)) /* SET_IMAGE_OFFSET/FORMAT interleave: 8 bytes per index, bases 0x300 and 0x304 */
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_OFFSET__SIZE				0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT(x)					(0x00000304+((x)*8))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_PITCH_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_PITCH_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_COLOR_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_COLOR_MASK				0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_NOTIFY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_NOTIFY_MASK			0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_OFFSET(x)					(0x00000340+((x)*12)) /* SET_CURSOR_OFFSET/POINT_OUT/FORMAT interleave: 12 bytes per index, bases 0x340/0x344/0x348 */
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_OFFSET__SIZE				0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT(x)				(0x00000344+((x)*12))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_X_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_X_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_Y_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_Y_MASK				0xffff0000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_FORMAT(x)					(0x00000348+((x)*12))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_FORMAT__SIZE				0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A				0x00000358 /* 0x358 == 0x340 + 2*12, i.e. the word right after the two 12-byte cursor entries */
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_X_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_X_MASK			0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_Y_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_Y_MASK			0xffff0000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE(x)				(0x00000380+((x)*16)) /* the four SET_DAC_* arrays interleave: 16 bytes per index, bases 0x380/0x384/0x388/0x38c */
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_W_SHIFT				0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_W_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_H_SHIFT				16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_H_MASK				0xffff0000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC(x)					(0x00000384+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC__SIZE					0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_START_SHIFT				0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_START_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_WIDTH_SHIFT				16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_WIDTH_MASK				0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_POLARITY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_POLARITY_MASK				0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC(x)					(0x00000388+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC__SIZE					0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_START_SHIFT				0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_START_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_WIDTH_SHIFT				16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_WIDTH_MASK				0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_POLARITY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_POLARITY_MASK				0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE(x)				(0x0000038c+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_WIDTH_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_WIDTH_MASK			0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_HEIGHT_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_HEIGHT_MASK			0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_NOTIFY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_NOTIFY_MASK			0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_PIXEL_CLOCK					0x000003a0
+
+
+#define NV03_TEXTURED_TRIANGLE								0x00000048
+/* Values at 0x100-0x18c and from 0x304 up; TEXTURE_FORMAT, FILTER and FOG_COLOR below are each split into _SHIFT/_MASK bitfields. */
+#define  NV03_TEXTURED_TRIANGLE_NOP							0x00000100
+#define  NV03_TEXTURED_TRIANGLE_NOTIFY							0x00000104
+#define  NV03_TEXTURED_TRIANGLE_PATCH							0x0000010c
+#define  NV03_TEXTURED_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV03_TEXTURED_TRIANGLE_DMA_TEXTURE						0x00000184
+#define  NV03_TEXTURED_TRIANGLE_CLIP_RECTANGLE						0x00000188
+#define  NV03_TEXTURED_TRIANGLE_SURFACE							0x0000018c
+#define  NV03_TEXTURED_TRIANGLE_TEXTURE_OFFSET						0x00000304
+#define  NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT						0x00000308 /* five fields covering all 32 bits: 0-15, 16-19, 20-23, 24-27, 28-31 */
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_MASK_SHIFT			0
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_MASK_MASK			0x0000ffff
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_ENABLE_SHIFT			16
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_ENABLE_MASK			0x000f0000
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_SHIFT				20
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_MASK				0x00f00000
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MIN_SHIFT				24
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MIN_MASK				0x0f000000
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MAX_SHIFT				28
+#define   NV03_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MAX_MASK				0xf0000000
+#define  NV03_TEXTURED_TRIANGLE_FILTER							0x0000030c /* SPREAD_X bits 0-4, SPREAD_Y bits 8-12, SIZE_ADJUST bits 16-23; the remaining bits have no field defined */
+#define   NV03_TEXTURED_TRIANGLE_FILTER_SPREAD_X_SHIFT					0
+#define   NV03_TEXTURED_TRIANGLE_FILTER_SPREAD_X_MASK					0x0000001f
+#define   NV03_TEXTURED_TRIANGLE_FILTER_SPREAD_Y_SHIFT					8
+#define   NV03_TEXTURED_TRIANGLE_FILTER_SPREAD_Y_MASK					0x00001f00
+#define   NV03_TEXTURED_TRIANGLE_FILTER_SIZE_ADJUST_SHIFT				16
+#define   NV03_TEXTURED_TRIANGLE_FILTER_SIZE_ADJUST_MASK				0x00ff0000
+#define  NV03_TEXTURED_TRIANGLE_FOG_COLOR						0x00000310 /* B, G, R as 8-bit fields in bits 0-23; bits 24-31 have no field defined */
+#define   NV03_TEXTURED_TRIANGLE_FOG_COLOR_B_SHIFT					0
+#define   NV03_TEXTURED_TRIANGLE_FOG_COLOR_B_MASK					0x000000ff
+#define   NV03_TEXTURED_TRIANGLE_FOG_COLOR_G_SHIFT					8
+#define   NV03_TEXTURED_TRIANGLE_FOG_COLOR_G_MASK					0x0000ff00
+#define   NV03_TEXTURED_TRIANGLE_FOG_COLOR_R_SHIFT					16
+#define   NV03_TEXTURED_TRIANGLE_FOG_COLOR_R_MASK					0x00ff0000
+#define  NV03_TEXTURED_TRIANGLE_CONTROL_OUT						0x00000314
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_INTERPOLATOR_SHIFT				0
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_INTERPOLATOR_MASK				0x0000000f
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_U_SHIFT				4
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_U_MASK				0x00000030
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_V_SHIFT				6
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_V_MASK				0x000000c0
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_SOURCE_COLOR_SHIFT				8
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_SOURCE_COLOR_MASK				0x00000f00
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_CULLING_SHIFT				12
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_CULLING_MASK				0x00007000
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_Z_PERSPECTIVE_ENABLE			(1 << 15)
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_Z_FUNC_SHIFT				16
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_Z_FUNC_MASK				0x000f0000
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_Z_WRITE_ENABLE_SHIFT			20
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_Z_WRITE_ENABLE_MASK			0x00f00000
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_COLOR_WRITE_ENABLE_SHIFT			24
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_COLOR_WRITE_ENABLE_MASK			0x07000000
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_ROP_SHIFT					27
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_ROP_MASK					0x18000000
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_BETA					(1 << 29)
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_DST_BLEND					(1 << 30)
+#define   NV03_TEXTURED_TRIANGLE_CONTROL_OUT_SRC_BLEND					(1 << 31)
+#define  NV03_TEXTURED_TRIANGLE_ALPHA_CONTROL						0x00000318 /* ALPHA_REF in bits 0-7, ALPHA_FUNC in bits 8-31 */
+#define   NV03_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_REF_SHIFT				0
+#define   NV03_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_REF_MASK				0x000000ff
+#define   NV03_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_FUNC_SHIFT				8
+#define   NV03_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_FUNC_MASK				0xffffff00
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR(x)					(0x00001000+((x)*32)) /* the eight TLVERTEX_* arrays interleave: 32 bytes per index, bases 0x1000-0x101c, each with __SIZE 0x80 */
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR__SIZE					0x00000080
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I0_SHIFT				0 /* I0..I5 are 4-bit fields in bits 0-23; FOG is the top byte */
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I0_MASK				0x0000000f
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I1_SHIFT				4
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I1_MASK				0x000000f0
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I2_SHIFT				8
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I2_MASK				0x00000f00
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I3_SHIFT				12
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I3_MASK				0x0000f000
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I4_SHIFT				16
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I4_MASK				0x000f0000
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I5_SHIFT				20
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_I5_MASK				0x00f00000
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_SHIFT				24
+#define   NV03_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_MASK				0xff000000
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_COLOR(x)					(0x00001004+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_COLOR__SIZE					0x00000080
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SX(x)						(0x00001008+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SX__SIZE					0x00000080
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SY(x)						(0x0000100c+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SY__SIZE					0x00000080
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SZ(x)						(0x00001010+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_SZ__SIZE					0x00000080
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_RHW(x)						(0x00001014+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_RHW__SIZE					0x00000080
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_TU(x)						(0x00001018+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_TU__SIZE					0x00000080
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_TV(x)						(0x0000101c+((x)*32))
+#define  NV03_TEXTURED_TRIANGLE_TLVERTEX_TV__SIZE					0x00000080
+
+
+#define NV04_GDI_RECTANGLE_TEXT								0x0000004a
+
+#define  NV04_GDI_RECTANGLE_TEXT_NOP							0x00000100
+#define  NV04_GDI_RECTANGLE_TEXT_NOTIFY							0x00000104
+#define  NV04_GDI_RECTANGLE_TEXT_PATCH							0x0000010c
+#define  NV04_GDI_RECTANGLE_TEXT_PM_TRIGGER						0x00000140
+#define  NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY						0x00000180
+#define  NV04_GDI_RECTANGLE_TEXT_DMA_FONTS						0x00000184
+#define  NV04_GDI_RECTANGLE_TEXT_PATTERN						0x00000188
+#define  NV04_GDI_RECTANGLE_TEXT_ROP							0x0000018c
+#define  NV04_GDI_RECTANGLE_TEXT_BETA1							0x00000190
+#define  NV04_GDI_RECTANGLE_TEXT_BETA4							0x00000194
+#define  NV04_GDI_RECTANGLE_TEXT_SURFACE						0x00000198
+#define  NV04_GDI_RECTANGLE_TEXT_OPERATION						0x000002fc
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY_AND					0x00000000
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_ROP_AND					0x00000001
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_BLEND_AND					0x00000002
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY					0x00000003
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT						0x00000300
+#define   NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5				0x00000001
+#define   NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_X16A1R5G5B5				0x00000002
+#define   NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8					0x00000003
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT					0x00000304
+#define   NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_CGA6				0x00000001
+#define   NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE					0x00000002
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_A						0x000003fc
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(x)				(0x00000400+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT__SIZE			0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE(x)				(0x00000404+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE__SIZE				0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0						0x000005f4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1						0x000005f8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_B						0x000005fc
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0(x)				(0x00000600+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0__SIZE			0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1(x)				(0x00000604+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1__SIZE			0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0						0x000007ec
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1						0x000007f0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_C						0x000007f4
+#define  NV04_GDI_RECTANGLE_TEXT_SIZE_C							0x000007f8
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_W_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_W_MASK						0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_H_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_H_MASK						0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_POINT_C						0x000007fc
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_X_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_X_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_Y_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_Y_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C(x)					(0x00000800+((x)*4))
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C__SIZE				0x00000080
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0						0x00000be4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1						0x00000be8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR0_E						0x00000bec
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_E						0x00000bf0
+#define  NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E						0x00000bf4
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E						0x00000bf8
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_POINT_E						0x00000bfc
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_X_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_X_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_Y_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_Y_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E(x)				(0x00000c00+((x)*4))
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E__SIZE				0x00000080
+#define  NV04_GDI_RECTANGLE_TEXT_FONT_F							0x00000ff0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_OFFSET_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_OFFSET_MASK					0x0fffffff
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_PITCH_SHIFT					28
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_PITCH_MASK					0xf0000000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0						0x00000ff4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1						0x00000ff8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_F						0x00000ffc
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F(x)					(0x00001000+((x)*4))
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F__SIZE				0x00000100
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_INDEX_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_INDEX_MASK				0x000000ff
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_X_SHIFT				8
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_X_MASK				0x000fff00
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_Y_SHIFT				20
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_Y_MASK				0xfff00000
+#define  NV04_GDI_RECTANGLE_TEXT_FONT_G							0x000017f0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_OFFSET_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_OFFSET_MASK					0x0fffffff
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_PITCH_SHIFT					28
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_PITCH_MASK					0xf0000000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0						0x000017f4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1						0x000017f8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_G						0x000017fc
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT(x)				(0x00001800+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT__SIZE				0x00000100
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_X_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_X_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_Y_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_Y_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_INDEX(x)				(0x00001804+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_INDEX__SIZE				0x00000100
+
+
+#define NV03_GDI_RECTANGLE_TEXT								0x0000004b /* presumably the object class id; the indented names below share its prefix -- TODO confirm */
+
+#define  NV03_GDI_RECTANGLE_TEXT_NOP							0x00000100 /* NOTE(review): the values from here on look like per-object method offsets -- confirm */
+#define  NV03_GDI_RECTANGLE_TEXT_NOTIFY							0x00000104
+#define  NV03_GDI_RECTANGLE_TEXT_DMA_NOTIFY						0x00000180
+#define  NV03_GDI_RECTANGLE_TEXT_PATTERN						0x00000184
+#define  NV03_GDI_RECTANGLE_TEXT_ROP							0x00000188
+#define  NV03_GDI_RECTANGLE_TEXT_BETA1							0x0000018c
+#define  NV03_GDI_RECTANGLE_TEXT_SURFACE						0x00000190
+#define  NV03_GDI_RECTANGLE_TEXT_OPERATION						0x000002fc
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR_FORMAT						0x00000300
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT					0x00000304
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_A						0x000003fc
+#define  NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT				0x00000400 /* fields: Y = bits 15:0, X = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE				0x00000404 /* fields: H = bits 15:0, W = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B						0x000007f4 /* fields: L = bits 15:0, T = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B						0x000007f8 /* fields: R = bits 15:0, B = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_B						0x000007fc
+#define  NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0				0x00000800 /* fields: L = bits 15:0, T = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1				0x00000804 /* fields: R = bits 15:0, B = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0						0x00000bec /* fields: L = bits 15:0, T = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1						0x00000bf0 /* fields: R = bits 15:0, B = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_C						0x00000bf4
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_C							0x00000bf8 /* fields: W = bits 15:0, H = bits 31:16 (opposite order to UNCLIPPED_RECTANGLE_SIZE) */
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_W_MASK						0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_H_MASK						0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_POINT_C						0x00000bfc /* fields: X = bits 15:0, Y = bits 31:16 (opposite order to UNCLIPPED_RECTANGLE_POINT) */
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_X_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_X_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_Y_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_Y_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C(x)					(0x00000c00+((x)*4)) /* indexed: 0xc00 + 4*x */
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C__SIZE				0x00000020 /* presumably the number of valid x values (0x20) -- TODO confirm */
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0						0x00000fe8 /* fields: L = bits 15:0, T = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1						0x00000fec /* fields: R = bits 15:0, B = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_D						0x00000ff0
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D						0x00000ff4 /* fields: W = bits 15:0, H = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D						0x00000ff8 /* fields: W = bits 15:0, H = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_POINT_D						0x00000ffc /* fields: X = bits 15:0, Y = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_X_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_X_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_Y_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_Y_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_D(x)					(0x00001000+((x)*4)) /* indexed: 0x1000 + 4*x */
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_D__SIZE				0x00000020 /* presumably the number of valid x values (0x20) -- TODO confirm */
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0						0x000013e4 /* fields: L = bits 15:0, T = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1						0x000013e8 /* fields: R = bits 15:0, B = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR0_E						0x000013ec
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_E						0x000013f0
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E						0x000013f4 /* fields: W = bits 15:0, H = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E						0x000013f8 /* fields: W = bits 15:0, H = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_POINT_E						0x000013fc /* fields: X = bits 15:0, Y = bits 31:16 */
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_X_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_X_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_Y_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_Y_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E(x)				(0x00001400+((x)*4)) /* indexed: 0x1400 + 4*x */
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E__SIZE				0x00000020 /* presumably the number of valid x values (0x20) -- TODO confirm */
+
+
+#define NV04_SWIZZLED_SURFACE								0x00000052 /* presumably the object class id; the indented names below share its prefix -- TODO confirm */
+
+#define  NV04_SWIZZLED_SURFACE_NOP							0x00000100 /* NOTE(review): the values from here on look like per-object method offsets -- confirm */
+#define  NV04_SWIZZLED_SURFACE_NOTIFY							0x00000104
+#define  NV04_SWIZZLED_SURFACE_DMA_NOTIFY						0x00000180
+#define  NV04_SWIZZLED_SURFACE_DMA_IMAGE						0x00000184
+#define  NV04_SWIZZLED_SURFACE_FORMAT							0x00000300 /* fields: COLOR = bits 7:0, BASE_SIZE_U = bits 23:16, BASE_SIZE_V = bits 31:24 */
+#define   NV04_SWIZZLED_SURFACE_FORMAT_COLOR_SHIFT					0
+#define   NV04_SWIZZLED_SURFACE_FORMAT_COLOR_MASK					0x000000ff
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y8					0x00000001 /* COLOR enumerants run 0x1..0xb; the field shift is 0 so they need no shifting */
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5				0x00000002
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1R5G5B5_X1R5G5B5				0x00000003
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_R5G6B5					0x00000004
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y16					0x00000005
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8				0x00000006
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X8R8G8B8_X8R8G8B8				0x00000007
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8			0x00000008
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8			0x00000009
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_A8R8G8B8					0x0000000a
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y32					0x0000000b
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_MASK					0x00ff0000
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT				24
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_MASK					0xff000000
+#define  NV04_SWIZZLED_SURFACE_OFFSET							0x00000304
+
+
+#define NV20_SWIZZLED_SURFACE								0x0000009e /* no NV20_SWIZZLED_SURFACE_* members follow; presumably reuses the NV04_SWIZZLED_SURFACE names -- TODO confirm */
+
+
+
+#define NV30_SWIZZLED_SURFACE								0x0000039e /* no NV30_SWIZZLED_SURFACE_* members follow; presumably reuses the NV04_SWIZZLED_SURFACE names -- TODO confirm */
+
+
+
+#define NV40_SWIZZLED_SURFACE								0x0000309e /* no NV40_SWIZZLED_SURFACE_* members follow; presumably reuses the NV04_SWIZZLED_SURFACE names -- TODO confirm */
+
+
+
+#define NV04_CONTEXT_SURFACES_3D							0x00000053 /* presumably the object class id; the indented names below share its prefix -- TODO confirm */
+
+#define  NV04_CONTEXT_SURFACES_3D_NOP							0x00000100 /* NOTE(review): the values from here on look like per-object method offsets -- confirm */
+#define  NV04_CONTEXT_SURFACES_3D_NOTIFY						0x00000104
+#define  NV04_CONTEXT_SURFACES_3D_DMA_NOTIFY						0x00000180
+#define  NV04_CONTEXT_SURFACES_3D_DMA_COLOR						0x00000184
+#define  NV04_CONTEXT_SURFACES_3D_DMA_ZETA						0x00000188
+#define  NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL					0x000002f8 /* fields: X = bits 15:0, W = bits 31:16 */
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_X_SHIFT				0
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_X_MASK				0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_W_SHIFT				16
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_W_MASK				0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL						0x000002fc /* fields: Y = bits 15:0, H = bits 31:16 */
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_Y_SHIFT				0
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_Y_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_H_SHIFT				16
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_H_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_FORMAT						0x00000300 /* fields: COLOR = bits 7:0, TYPE = bits 15:8, BASE_SIZE_U = bits 23:16, BASE_SIZE_V = bits 31:24 */
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_MASK					0x000000ff
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5			0x00000001
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1R5G5B5_X1R5G5B5			0x00000002
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_R5G6B5					0x00000003
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8			0x00000004
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X8R8G8B8_X8R8G8B8			0x00000005
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8			0x00000006
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8			0x00000007
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_A8R8G8B8				0x00000008
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SHIFT					8
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_MASK					0x0000ff00
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_PITCH					0x00000100 /* TYPE enumerants are already shifted into bits 15:8; OR them in directly */
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SWIZZLE					0x00000200
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_U_MASK				0x00ff0000
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_V_SHIFT				24
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_V_MASK				0xff000000
+#define  NV04_CONTEXT_SURFACES_3D_CLIP_SIZE						0x00000304 /* fields: W = bits 15:0, H = bits 31:16 */
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_W_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_W_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_H_SHIFT					16
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_H_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_PITCH							0x00000308 /* fields: COLOR = bits 15:0, ZETA = bits 31:16 */
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_COLOR_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_COLOR_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_ZETA_SHIFT					16
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_ZETA_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_OFFSET_COLOR						0x0000030c
+#define  NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA						0x00000310
+
+
+#define NV10_CONTEXT_SURFACES_3D							0x00000093 /* no NV10_CONTEXT_SURFACES_3D_* members follow; presumably reuses the NV04_CONTEXT_SURFACES_3D names -- TODO confirm */
+
+
+
+#define NV04_TEXTURED_TRIANGLE								0x00000054 /* presumably the object class id; the indented names below share its prefix -- TODO confirm */
+
+#define  NV04_TEXTURED_TRIANGLE_NOP							0x00000100 /* NOTE(review): the values from here on look like per-object method offsets -- confirm */
+#define  NV04_TEXTURED_TRIANGLE_NOTIFY							0x00000104
+#define  NV04_TEXTURED_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV04_TEXTURED_TRIANGLE_DMA_A							0x00000184
+#define  NV04_TEXTURED_TRIANGLE_DMA_B							0x00000188
+#define  NV04_TEXTURED_TRIANGLE_SURFACE							0x0000018c
+#define  NV04_TEXTURED_TRIANGLE_COLORKEY						0x00000300
+#define  NV04_TEXTURED_TRIANGLE_OFFSET							0x00000304
+#define  NV04_TEXTURED_TRIANGLE_FORMAT							0x00000308 /* single-bit flags plus multi-bit fields; field enumerants below are already shifted into place */
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_DMA_A						(1 <<  0)
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_DMA_B						(1 <<  1)
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_KEY_MATCH_SHIFT				2
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_KEY_MATCH_MASK				0x0000000c
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_SHIFT				4
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_MASK					0x00000030
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CENTER				0x00000010
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER				0x00000020
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_SHIFT				6
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_MASK					0x000000c0
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CENTER				0x00000040
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER				0x00000080
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_SHIFT					8
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_MASK					0x00000f00
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_Y8					0x00000100
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_A1R5G5B5					0x00000200
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_X1R5G5B5					0x00000300
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_A4R4G4B4					0x00000400
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_R5G6B5					0x00000500
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_A8R8G8B8					0x00000600
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_X8R8G8B8					0x00000700
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT				12
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_MASK				0x0000f000
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_MASK				0x000f0000
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT				20
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_MASK				0x00f00000
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT					24
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MASK					0x07000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT				0x01000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT			0x02000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE				0x03000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER			0x04000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP					0x05000000
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_WRAPU						(1 << 27)
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT					28
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_MASK					0x70000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_REPEAT				0x10000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_MIRRORED_REPEAT			0x20000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE				0x30000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_BORDER			0x40000000
+#define    NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP					0x50000000
+#define   NV04_TEXTURED_TRIANGLE_FORMAT_WRAPV						(1U << 31) /* unsigned: 1 << 31 overflows a 32-bit int and would sign-extend when widened */
+#define  NV04_TEXTURED_TRIANGLE_FILTER							0x0000030c /* single-bit flags plus multi-bit fields; field enumerants below are already shifted into place */
+#define   NV04_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_X_SHIFT				0
+#define   NV04_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_X_MASK				0x000000ff
+#define   NV04_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_Y_SHIFT				8
+#define   NV04_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_Y_MASK				0x00007f00
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MIPMAP_DITHER_ENABLE				(1 << 15)
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MIPMAP_LODBIAS_SHIFT				16
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MIPMAP_LODBIAS_MASK				0x00ff0000
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_SHIFT					24
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_MASK					0x07000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST					0x01000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR					0x02000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST			0x03000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST			0x04000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR			0x05000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR			0x06000000
+#define   NV04_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE			(1 << 27)
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_SHIFT					28
+#define   NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_MASK					0x70000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST				0x10000000
+#define    NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR					0x20000000
+#define   NV04_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE			(1U << 31) /* unsigned: 1 << 31 overflows a 32-bit int and would sign-extend when widened */
+#define  NV04_TEXTURED_TRIANGLE_BLEND							0x00000310
+#define   NV04_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_SHIFT				0
+#define   NV04_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_MASK					0x0000000f
+#define   NV04_TEXTURED_TRIANGLE_BLEND_MASK_BIT_SHIFT					4
+#define   NV04_TEXTURED_TRIANGLE_BLEND_MASK_BIT_MASK					0x00000030
+#define   NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_SHIFT					6
+#define   NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_MASK					0x000000c0
+#define    NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT					0x00000040
+#define    NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD				0x00000080
+#define    NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_PHONG				0x000000c0
+#define   NV04_TEXTURED_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE			(1 <<  8)
+#define   NV04_TEXTURED_TRIANGLE_BLEND_SPECULAR_ENABLE					(1 << 12)
+#define   NV04_TEXTURED_TRIANGLE_BLEND_FOG_ENABLE					(1 << 16)
+#define   NV04_TEXTURED_TRIANGLE_BLEND_BLEND_ENABLE					(1 << 20)
+#define   NV04_TEXTURED_TRIANGLE_BLEND_SRC_SHIFT					24
+#define   NV04_TEXTURED_TRIANGLE_BLEND_SRC_MASK						0x0f000000
+#define   NV04_TEXTURED_TRIANGLE_BLEND_DST_SHIFT					28
+#define   NV04_TEXTURED_TRIANGLE_BLEND_DST_MASK						0xf0000000
+#define  NV04_TEXTURED_TRIANGLE_CONTROL							0x00000314
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_REF_SHIFT				0
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_REF_MASK					0x000000ff
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT				8
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_MASK				0x00000f00
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_ENABLE					(1 << 12)
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_ORIGIN						(1 << 13)
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE					(1 << 14)
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT					16
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_MASK					0x000f0000
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT				20
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_MASK					0x00300000
+#define    NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_BOTH				0x00000000 /* BOTH is the all-zero encoding of the CULL_MODE field */
+#define    NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_NONE				0x00100000
+#define    NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_CW					0x00200000
+#define    NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_CCW					0x00300000
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE					(1 << 22)
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE				(1 << 23)
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_WRITE					(1 << 24)
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT					30
+#define   NV04_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_MASK					0xc0000000
+#define  NV04_TEXTURED_TRIANGLE_FOGCOLOR						0x00000318 /* fields: B = bits 7:0, G = bits 15:8, R = bits 23:16, A = bits 31:24 */
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_B_SHIFT					0
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_B_MASK					0x000000ff
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_G_SHIFT					8
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_G_MASK					0x0000ff00
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_R_SHIFT					16
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_R_MASK					0x00ff0000
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_A_SHIFT					24
+#define   NV04_TEXTURED_TRIANGLE_FOGCOLOR_A_MASK					0xff000000
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(x)						(0x00000400+((x)*32)) /* TLVERTEX_* share a stride of 32 per x; SX..TV sit at +0x00..+0x1c within each record */
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SX__SIZE					0x00000010 /* presumably the number of valid x values; 0x10 records of 32 end exactly at DRAWPRIMITIVE's 0x600 -- TODO confirm */
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SY(x)						(0x00000404+((x)*32))
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SY__SIZE					0x00000010
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SZ(x)						(0x00000408+((x)*32))
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SZ__SIZE					0x00000010
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_RHW(x)						(0x0000040c+((x)*32))
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_RHW__SIZE					0x00000010
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR(x)					(0x00000410+((x)*32)) /* fields: B = bits 7:0, G = bits 15:8, R = bits 23:16, A = bits 31:24 */
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR__SIZE					0x00000010
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_B_SHIFT					0
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_B_MASK					0x000000ff
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_G_SHIFT					8
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_G_MASK					0x0000ff00
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_R_SHIFT					16
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_R_MASK					0x00ff0000
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_A_SHIFT					24
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_COLOR_A_MASK					0xff000000
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR(x)					(0x00000414+((x)*32)) /* fields: B = bits 7:0, G = bits 15:8, R = bits 23:16, FOG = bits 31:24 */
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR__SIZE					0x00000010
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_B_SHIFT				0
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_B_MASK				0x000000ff
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_G_SHIFT				8
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_G_MASK				0x0000ff00
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_R_SHIFT				16
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_R_MASK				0x00ff0000
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_SHIFT				24
+#define   NV04_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_MASK				0xff000000
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_TU(x)						(0x00000418+((x)*32))
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_TU__SIZE					0x00000010
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_TV(x)						(0x0000041c+((x)*32))
+#define  NV04_TEXTURED_TRIANGLE_TLVERTEX_TV__SIZE					0x00000010
+#define  NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(x)					(0x00000600+((x)*4)) /* six 4-bit fields I0..I5 in bits 23:0; presumably TLVERTEX indices -- TODO confirm */
+#define  NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE__SIZE					0x00000040
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I0_SHIFT					0
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I0_MASK					0x0000000f
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I1_SHIFT					4
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I1_MASK					0x000000f0
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I2_SHIFT					8
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I2_MASK					0x00000f00
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I3_SHIFT					12
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I3_MASK					0x0000f000
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I4_SHIFT					16
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I4_MASK					0x000f0000
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I5_SHIFT					20
+#define   NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE_I5_MASK					0x00f00000
+
+
+#define NV10_TEXTURED_TRIANGLE								0x00000094 /* no NV10_TEXTURED_TRIANGLE_* members follow; presumably reuses the NV04_TEXTURED_TRIANGLE names -- TODO confirm */
+
+
+
+#define NV04_MULTITEX_TRIANGLE								0x00000055	/* presumably the object class id - confirm */
+/* Naming: NAME(x) = base + x*stride (NAME__SIZE looks like the index count - confirm); X_SHIFT/X_MASK delimit a bitfield; deeper-indented X_<value> names are already shifted into that field. */
+#define  NV04_MULTITEX_TRIANGLE_NOP							0x00000100
+#define  NV04_MULTITEX_TRIANGLE_NOTIFY							0x00000104
+#define  NV04_MULTITEX_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV04_MULTITEX_TRIANGLE_DMA_A							0x00000184
+#define  NV04_MULTITEX_TRIANGLE_DMA_B							0x00000188
+#define  NV04_MULTITEX_TRIANGLE_SURFACE							0x0000018c
+#define  NV04_MULTITEX_TRIANGLE_OFFSET(x)						(0x00000308+((x)*4))
+#define  NV04_MULTITEX_TRIANGLE_OFFSET__SIZE						0x00000002
+#define  NV04_MULTITEX_TRIANGLE_FORMAT(x)						(0x00000310+((x)*4))
+#define  NV04_MULTITEX_TRIANGLE_FORMAT__SIZE						0x00000002
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_DMA_A						(1 <<  0)
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_DMA_B						(1 <<  1)
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ORIGIN_ZOH_SHIFT				4
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ORIGIN_ZOH_MASK					0x00000030
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ORIGIN_FOH_SHIFT				6
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ORIGIN_FOH_MASK					0x000000c0
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_COLOR_SHIFT					8
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_COLOR_MASK					0x00000f00
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT				12
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_MIPMAP_LEVELS_MASK				0x0000f000
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_U_MASK				0x000f0000
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT				20
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_V_MASK				0x00f00000
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ADDRESSU_SHIFT					24
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ADDRESSU_MASK					0x07000000
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_WRAPU						(1 << 27)
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ADDRESSV_SHIFT					28
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_ADDRESSV_MASK					0x70000000
+#define   NV04_MULTITEX_TRIANGLE_FORMAT_WRAPV						(1U << 31)	/* 1U: (1 << 31) overflows a 32-bit signed int */
+#define  NV04_MULTITEX_TRIANGLE_FILTER(x)						(0x00000318+((x)*4))
+#define  NV04_MULTITEX_TRIANGLE_FILTER__SIZE						0x00000002
+#define   NV04_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_X_SHIFT				0
+#define   NV04_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_X_MASK				0x000000ff
+#define   NV04_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_Y_SHIFT				8
+#define   NV04_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_Y_MASK				0x00007f00
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MIPMAP_DITHER_ENABLE				(1 << 15)
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MIPMAP_LODBIAS_SHIFT				16
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MIPMAP_LODBIAS_MASK				0x00ff0000
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MINIFY_SHIFT					24
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MINIFY_MASK					0x07000000
+#define   NV04_MULTITEX_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE			(1 << 27)
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MAGNIFY_SHIFT					28
+#define   NV04_MULTITEX_TRIANGLE_FILTER_MAGNIFY_MASK					0x70000000
+#define   NV04_MULTITEX_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE			(1U << 31)	/* 1U: (1 << 31) overflows a 32-bit signed int */
+#define  NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA(x)					(0x00000320+((x)*12))	/* stride 12: ALPHA(x)/COLOR(x) interleave; the third word of each slot (0x328) is not named here */
+#define  NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA__SIZE					0x00000002
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_INVERSE0					(1 <<  0)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_SHIFT				2
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_MASK				0x000000fc
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_ZERO				0x00000004
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_CONSTANT			0x00000008
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_PRIMARY_COLOR			0x0000000c
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_PREVIOUS			0x00000010
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_TEXTURE0			0x00000014
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT0_TEXTURE1			0x00000018
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_INVERSE1					(1 <<  8)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_SHIFT				10
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_MASK				0x0000fc00
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_ZERO				0x00000400
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_CONSTANT			0x00000800
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_PRIMARY_COLOR			0x00000c00
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_PREVIOUS			0x00001000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_TEXTURE0			0x00001400
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT1_TEXTURE1			0x00001800
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_INVERSE2					(1 << 16)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_SHIFT				18
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_MASK				0x00fc0000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_ZERO				0x00040000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_CONSTANT			0x00080000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_PRIMARY_COLOR			0x000c0000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_PREVIOUS			0x00100000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_TEXTURE0			0x00140000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT2_TEXTURE1			0x00180000
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_INVERSE3					(1 << 24)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_SHIFT				26
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_MASK				0x1c000000	/* 3 bits, narrower than ARGUMENT0-2 (6 bits): MAP occupies bits 29-31 */
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_ZERO				0x04000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_CONSTANT			0x08000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_PRIMARY_COLOR			0x0c000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_PREVIOUS			0x10000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_TEXTURE0			0x14000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_ARGUMENT3_TEXTURE1			0x18000000
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_SHIFT				29
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_MASK					0xe0000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_IDENTITY				0x20000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_SCALE2				0x40000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_SCALE4				0x60000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_BIAS				0x80000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_ALPHA_MAP_BIAS_SCALE2				0xe0000000
+#define  NV04_MULTITEX_TRIANGLE_COMBINE_COLOR(x)					(0x00000324+((x)*12))	/* same stride-12 slots as COMBINE_ALPHA(x), one word later */
+#define  NV04_MULTITEX_TRIANGLE_COMBINE_COLOR__SIZE					0x00000002
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_INVERSE0					(1 <<  0)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ALPHA0					(1 <<  1)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_SHIFT				2
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_MASK				0x000000fc
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_ZERO				0x00000004
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_CONSTANT			0x00000008
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_PRIMARY_COLOR			0x0000000c
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_PREVIOUS			0x00000010
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_TEXTURE0			0x00000014
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT0_TEXTURE1			0x00000018
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_INVERSE1					(1 <<  8)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ALPHA1					(1 <<  9)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_SHIFT				10
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_MASK				0x0000fc00
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_ZERO				0x00000400
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_CONSTANT			0x00000800
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_PRIMARY_COLOR			0x00000c00
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_PREVIOUS			0x00001000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_TEXTURE0			0x00001400
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT1_TEXTURE1			0x00001800
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_INVERSE2					(1 << 16)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ALPHA2					(1 << 17)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_SHIFT				18
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_MASK				0x00fc0000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_ZERO				0x00040000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_CONSTANT			0x00080000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_PRIMARY_COLOR			0x000c0000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_PREVIOUS			0x00100000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_TEXTURE0			0x00140000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT2_TEXTURE1			0x00180000
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_INVERSE3					(1 << 24)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ALPHA3					(1 << 25)
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_SHIFT				26
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_MASK				0x1c000000	/* 3 bits, narrower than ARGUMENT0-2 (6 bits): MAP occupies bits 29-31 */
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_ZERO				0x04000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_CONSTANT			0x08000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_PRIMARY_COLOR			0x0c000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_PREVIOUS			0x10000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_TEXTURE0			0x14000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_ARGUMENT3_TEXTURE1			0x18000000
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_SHIFT				29
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_MASK					0xe0000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_IDENTITY				0x20000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_SCALE2				0x40000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_SCALE4				0x60000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_BIAS				0x80000000
+#define    NV04_MULTITEX_TRIANGLE_COMBINE_COLOR_MAP_BIAS_SCALE2				0xe0000000
+#define  NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR						0x00000334
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_B_SHIFT					0
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_B_MASK					0x000000ff
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_G_SHIFT					8
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_G_MASK					0x0000ff00
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_R_SHIFT					16
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_R_MASK					0x00ff0000
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_A_SHIFT					24
+#define   NV04_MULTITEX_TRIANGLE_COMBINE_FACTOR_A_MASK					0xff000000
+#define  NV04_MULTITEX_TRIANGLE_BLEND							0x00000338
+#define   NV04_MULTITEX_TRIANGLE_BLEND_MASK_BIT_SHIFT					4
+#define   NV04_MULTITEX_TRIANGLE_BLEND_MASK_BIT_MASK					0x00000030
+#define   NV04_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_SHIFT					6
+#define   NV04_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_MASK					0x000000c0
+#define    NV04_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_FLAT					0x00000040
+#define    NV04_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_GOURAUD				0x00000080
+#define    NV04_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_PHONG				0x000000c0
+#define   NV04_MULTITEX_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE			(1 <<  8)
+#define   NV04_MULTITEX_TRIANGLE_BLEND_SPECULAR_ENABLE					(1 << 12)
+#define   NV04_MULTITEX_TRIANGLE_BLEND_FOG_ENABLE					(1 << 16)
+#define   NV04_MULTITEX_TRIANGLE_BLEND_BLEND_ENABLE					(1 << 20)
+#define   NV04_MULTITEX_TRIANGLE_BLEND_SRC_SHIFT					24
+#define   NV04_MULTITEX_TRIANGLE_BLEND_SRC_MASK						0x0f000000
+#define   NV04_MULTITEX_TRIANGLE_BLEND_DST_SHIFT					28
+#define   NV04_MULTITEX_TRIANGLE_BLEND_DST_MASK						0xf0000000
+#define  NV04_MULTITEX_TRIANGLE_CONTROL0						0x0000033c
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ALPHA_REF_SHIFT				0
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ALPHA_REF_MASK				0x000000ff
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ALPHA_FUNC_SHIFT				8
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ALPHA_FUNC_MASK				0x00000f00
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ALPHA_ENABLE					(1 << 12)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ORIGIN					(1 << 13)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_ENABLE					(1 << 14)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_FUNC_SHIFT					16
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_FUNC_MASK					0x000f0000
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_SHIFT				20
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_MASK				0x00300000
+#define    NV04_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_BOTH				0x00000000
+#define    NV04_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_NONE				0x00100000
+#define    NV04_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_CW					0x00200000
+#define    NV04_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_CCW				0x00300000
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_DITHER_ENABLE					(1 << 22)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_PERSPECTIVE_ENABLE				(1 << 23)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_WRITE					(1 << 24)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_STENCIL_WRITE					(1 << 25)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_ALPHA_WRITE					(1 << 26)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_RED_WRITE					(1 << 27)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_GREEN_WRITE					(1 << 28)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_BLUE_WRITE					(1 << 29)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_FORMAT_SHIFT				30
+#define   NV04_MULTITEX_TRIANGLE_CONTROL0_Z_FORMAT_MASK					0xc0000000
+#define  NV04_MULTITEX_TRIANGLE_CONTROL1						0x00000340
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_ENABLE				(1 <<  0)
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_FUNC_SHIFT				4
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_FUNC_MASK				0x000000f0
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_REF_SHIFT				8
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_REF_MASK				0x0000ff00
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_READ_SHIFT			16
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_READ_MASK			0x00ff0000
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_WRITE_SHIFT			24
+#define   NV04_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_WRITE_MASK			0xff000000
+#define  NV04_MULTITEX_TRIANGLE_CONTROL2						0x00000344
+#define   NV04_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_FAIL_SHIFT				0
+#define   NV04_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_FAIL_MASK				0x0000000f
+#define   NV04_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZFAIL_SHIFT			4
+#define   NV04_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZFAIL_MASK				0x000000f0
+#define   NV04_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZPASS_SHIFT			8
+#define   NV04_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZPASS_MASK				0x00000f00
+#define  NV04_MULTITEX_TRIANGLE_FOGCOLOR						0x00000348
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_B_SHIFT					0
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_B_MASK					0x000000ff
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_G_SHIFT					8
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_G_MASK					0x0000ff00
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_R_SHIFT					16
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_R_MASK					0x00ff0000
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_A_SHIFT					24
+#define   NV04_MULTITEX_TRIANGLE_FOGCOLOR_A_MASK					0xff000000
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SX(x)					(0x00000400+((x)*40))	/* stride 40 = the ten 4-byte TLMTVERTEX_* slots SX..TV1 of one index */
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SX__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SY(x)					(0x00000404+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SY__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SZ(x)					(0x00000408+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SZ__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_RHW(x)					(0x0000040c+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_RHW__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR(x)					(0x00000410+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR__SIZE					0x00000008
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_B_SHIFT				0
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_B_MASK				0x000000ff
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_G_SHIFT				8
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_G_MASK				0x0000ff00
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_R_SHIFT				16
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_R_MASK				0x00ff0000
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_A_SHIFT				24
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_A_MASK				0xff000000
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR(x)					(0x00000414+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR__SIZE				0x00000008
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_B_SHIFT				0
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_B_MASK				0x000000ff
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_G_SHIFT				8
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_G_MASK				0x0000ff00
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_R_SHIFT				16
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_R_MASK				0x00ff0000
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_FOG_SHIFT				24
+#define   NV04_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_FOG_MASK				0xff000000
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TU0(x)					(0x00000418+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TU0__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TV0(x)					(0x0000041c+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TV0__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TU1(x)					(0x00000420+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TU1__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TV1(x)					(0x00000424+((x)*40))
+#define  NV04_MULTITEX_TRIANGLE_TLMTVERTEX_TV1__SIZE					0x00000008
+#define  NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE(x)					(0x00000540+((x)*4))	/* base 0x540 = 0x400 + 8*40, i.e. directly after the last TLMTVERTEX slot */
+#define  NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE__SIZE					0x00000030
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I0_SHIFT					0
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I0_MASK					0x0000000f
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I1_SHIFT					4
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I1_MASK					0x000000f0
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I2_SHIFT					8
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I2_MASK					0x00000f00
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I3_SHIFT					12
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I3_MASK					0x0000f000
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I4_SHIFT					16
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I4_MASK					0x000f0000
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I5_SHIFT					20
+#define   NV04_MULTITEX_TRIANGLE_DRAWPRIMITIVE_I5_MASK					0x00f00000
+
+
+#define NV10_MULTITEX_TRIANGLE								0x00000095	/* no NV10_MULTITEX_TRIANGLE_* constants follow this id; presumably it reuses the NV04_MULTITEX_TRIANGLE_* layout - confirm */
+
+
+
+#define NV10TCL										0x00000056
+
+#define  NV10TCL_NOP									0x00000100
+#define  NV10TCL_NOTIFY									0x00000104
+#define  NV10TCL_DMA_NOTIFY								0x00000180
+#define  NV10TCL_DMA_IN_MEMORY0								0x00000184
+#define  NV10TCL_DMA_IN_MEMORY1								0x00000188
+#define  NV10TCL_DMA_VTXBUF0								0x0000018c
+#define  NV10TCL_DMA_IN_MEMORY2								0x00000194
+#define  NV10TCL_DMA_IN_MEMORY3								0x00000198
+#define  NV10TCL_RT_HORIZ								0x00000200
+#define   NV10TCL_RT_HORIZ_X_SHIFT							0
+#define   NV10TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define   NV10TCL_RT_HORIZ_W_SHIFT							16
+#define   NV10TCL_RT_HORIZ_W_MASK							0xffff0000
+#define  NV10TCL_RT_VERT								0x00000204
+#define   NV10TCL_RT_VERT_Y_SHIFT							0
+#define   NV10TCL_RT_VERT_Y_MASK							0x0000ffff
+#define   NV10TCL_RT_VERT_H_SHIFT							16
+#define   NV10TCL_RT_VERT_H_MASK							0xffff0000
+#define  NV10TCL_RT_FORMAT								0x00000208
+#define   NV10TCL_RT_FORMAT_TYPE_SHIFT							8
+#define   NV10TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV10TCL_RT_FORMAT_TYPE_LINEAR						0x00000100
+#define    NV10TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV10TCL_RT_FORMAT_COLOR_SHIFT							0
+#define   NV10TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV10TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV10TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV10TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV10TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV10TCL_RT_FORMAT_COLOR_UNKNOWN						0x0000000d
+#define    NV10TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV10TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV10TCL_RT_PITCH								0x0000020c
+#define   NV10TCL_RT_PITCH_COLOR_PITCH_SHIFT						0
+#define   NV10TCL_RT_PITCH_COLOR_PITCH_MASK						0x0000ffff
+#define   NV10TCL_RT_PITCH_ZETA_PITCH_SHIFT						16
+#define   NV10TCL_RT_PITCH_ZETA_PITCH_MASK						0xffff0000
+#define  NV10TCL_COLOR_OFFSET								0x00000210
+#define  NV10TCL_ZETA_OFFSET								0x00000214
+#define  NV10TCL_TX_OFFSET(x)								(0x00000218+((x)*4))
+#define  NV10TCL_TX_OFFSET__SIZE							0x00000002
+#define  NV10TCL_TX_FORMAT(x)								(0x00000220+((x)*4))
+#define  NV10TCL_TX_FORMAT__SIZE							0x00000002
+#define   NV10TCL_TX_FORMAT_DMA0							(1 <<  0)
+#define   NV10TCL_TX_FORMAT_DMA1							(1 <<  1)
+#define   NV10TCL_TX_FORMAT_CUBE_MAP							(1 <<  2)
+#define   NV10TCL_TX_FORMAT_FORMAT_SHIFT						7
+#define   NV10TCL_TX_FORMAT_FORMAT_MASK							0x00000f80
+#define    NV10TCL_TX_FORMAT_FORMAT_L8							0x00000000
+#define    NV10TCL_TX_FORMAT_FORMAT_A8							0x00000080
+#define    NV10TCL_TX_FORMAT_FORMAT_A1R5G5B5						0x00000100
+#define    NV10TCL_TX_FORMAT_FORMAT_A4R4G4B4						0x00000200
+#define    NV10TCL_TX_FORMAT_FORMAT_R5G6B5						0x00000280
+#define    NV10TCL_TX_FORMAT_FORMAT_A8R8G8B8						0x00000300
+#define    NV10TCL_TX_FORMAT_FORMAT_X8R8G8B8						0x00000380
+#define    NV10TCL_TX_FORMAT_FORMAT_INDEX8						0x00000580
+#define    NV10TCL_TX_FORMAT_FORMAT_DXT1						0x00000600
+#define    NV10TCL_TX_FORMAT_FORMAT_DXT3						0x00000700
+#define    NV10TCL_TX_FORMAT_FORMAT_DXT5						0x00000780
+#define    NV10TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT					0x00000800
+#define    NV10TCL_TX_FORMAT_FORMAT_R5G6B5_RECT						0x00000880
+#define    NV10TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT					0x00000900
+#define    NV10TCL_TX_FORMAT_FORMAT_A8_RECT						0x00000980
+#define   NV10TCL_TX_FORMAT_MIPMAP							(1 << 15)
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_U_SHIFT						16
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_U_MASK						0x000f0000
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_V_SHIFT						20
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_V_MASK						0x00f00000
+#define   NV10TCL_TX_FORMAT_WRAP_S_SHIFT						24
+#define   NV10TCL_TX_FORMAT_WRAP_S_MASK							0x0f000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_REPEAT						0x01000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT					0x02000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE					0x03000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER					0x04000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_CLAMP						0x05000000
+#define   NV10TCL_TX_FORMAT_WRAP_T_SHIFT						28
+#define   NV10TCL_TX_FORMAT_WRAP_T_MASK							0xf0000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_REPEAT						0x10000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_MIRRORED_REPEAT					0x20000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_CLAMP_TO_EDGE					0x30000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_CLAMP_TO_BORDER					0x40000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_CLAMP						0x50000000
+#define  NV10TCL_TX_ENABLE(x)								(0x00000228+((x)*4))
+#define  NV10TCL_TX_ENABLE__SIZE							0x00000002
+#define   NV10TCL_TX_ENABLE_CULL_SHIFT							0
+#define   NV10TCL_TX_ENABLE_CULL_MASK							0x0000000f
+#define    NV10TCL_TX_ENABLE_CULL_DISABLED						0x00000000
+#define    NV10TCL_TX_ENABLE_CULL_TEST_ALL						0x00000003
+#define    NV10TCL_TX_ENABLE_CULL_TEST_ALPHA						0x00000004
+#define   NV10TCL_TX_ENABLE_ANISOTROPY_SHIFT						4
+#define   NV10TCL_TX_ENABLE_ANISOTROPY_MASK						0x00000030
+#define   NV10TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT					14
+#define   NV10TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK						0x0003c000
+#define   NV10TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT					26
+#define   NV10TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK						0x3c000000
+#define   NV10TCL_TX_ENABLE_ENABLE							(1 << 30)
+#define  NV10TCL_TX_NPOT_PITCH(x)							(0x00000230+((x)*4))
+#define  NV10TCL_TX_NPOT_PITCH__SIZE							0x00000002
+#define   NV10TCL_TX_NPOT_PITCH_PITCH_SHIFT						16
+#define   NV10TCL_TX_NPOT_PITCH_PITCH_MASK						0xffff0000
+#define  NV10TCL_TX_NPOT_SIZE(x)							(0x00000240+((x)*4))
+#define  NV10TCL_TX_NPOT_SIZE__SIZE							0x00000002
+#define   NV10TCL_TX_NPOT_SIZE_H_SHIFT							0
+#define   NV10TCL_TX_NPOT_SIZE_H_MASK							0x0000ffff
+#define   NV10TCL_TX_NPOT_SIZE_W_SHIFT							16
+#define   NV10TCL_TX_NPOT_SIZE_W_MASK							0xffff0000
+#define  NV10TCL_TX_FILTER(x)								(0x00000248+((x)*4))
+#define  NV10TCL_TX_FILTER__SIZE							0x00000002
+#define   NV10TCL_TX_FILTER_LOD_BIAS_SHIFT						8
+#define   NV10TCL_TX_FILTER_LOD_BIAS_MASK						0x00000f00
+#define   NV10TCL_TX_FILTER_MINIFY_SHIFT						24
+#define   NV10TCL_TX_FILTER_MINIFY_MASK							0x0f000000
+#define    NV10TCL_TX_FILTER_MINIFY_NEAREST						0x01000000
+#define    NV10TCL_TX_FILTER_MINIFY_LINEAR						0x02000000
+#define    NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST				0x03000000
+#define    NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST				0x04000000
+#define    NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR				0x05000000
+#define    NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR				0x06000000
+#define   NV10TCL_TX_FILTER_MAGNIFY_SHIFT						28
+#define   NV10TCL_TX_FILTER_MAGNIFY_MASK						0xf0000000
+#define    NV10TCL_TX_FILTER_MAGNIFY_NEAREST						0x10000000
+#define    NV10TCL_TX_FILTER_MAGNIFY_LINEAR						0x20000000
+#define  NV10TCL_TX_PALETTE_OFFSET(x)							(0x00000250+((x)*4))
+#define  NV10TCL_TX_PALETTE_OFFSET__SIZE						0x00000002
+#define  NV10TCL_RC_IN_ALPHA(x)								(0x00000260+((x)*4))
+#define  NV10TCL_RC_IN_ALPHA__SIZE							0x00000002
+#define   NV10TCL_RC_IN_ALPHA_D_INPUT_SHIFT						0
+#define   NV10TCL_RC_IN_ALPHA_D_INPUT_MASK						0x0000000f
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_FOG						0x00000003
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0						0x00000008
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1						0x00000009
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE0						0x0000000c
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE1						0x0000000d
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F					0x0000000f
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE2						0x0000000a
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE3						0x0000000b
+#define   NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV10TCL_RC_IN_ALPHA_D_MAPPING_SHIFT						5
+#define   NV10TCL_RC_IN_ALPHA_D_MAPPING_MASK						0x000000e0
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT				0x00000020
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL				0x00000080
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE				0x000000a0
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY				0x000000c0
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV10TCL_RC_IN_ALPHA_C_INPUT_SHIFT						8
+#define   NV10TCL_RC_IN_ALPHA_C_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0						0x00000800
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1						0x00000900
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE0						0x00000c00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE1						0x00000d00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F					0x00000f00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE2						0x00000a00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE3						0x00000b00
+#define   NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_IN_ALPHA_C_MAPPING_SHIFT						13
+#define   NV10TCL_RC_IN_ALPHA_C_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT				0x00002000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL				0x00008000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE				0x0000a000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY				0x0000c000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV10TCL_RC_IN_ALPHA_B_INPUT_SHIFT						16
+#define   NV10TCL_RC_IN_ALPHA_B_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0						0x00080000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1						0x00090000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE0						0x000c0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE1						0x000d0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F					0x000f0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE2						0x000a0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE3						0x000b0000
+#define   NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_IN_ALPHA_B_MAPPING_SHIFT						21
+#define   NV10TCL_RC_IN_ALPHA_B_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT				0x00200000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL				0x00800000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE				0x00a00000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY				0x00c00000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV10TCL_RC_IN_ALPHA_A_INPUT_SHIFT						24
+#define   NV10TCL_RC_IN_ALPHA_A_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0						0x08000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1						0x09000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE0						0x0c000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE1						0x0d000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F					0x0f000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE2						0x0a000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE3						0x0b000000
+#define   NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_IN_ALPHA_A_MAPPING_SHIFT						29
+#define   NV10TCL_RC_IN_ALPHA_A_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT				0x20000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL				0x80000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE				0xa0000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY				0xc0000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV10TCL_RC_IN_RGB(x)								(0x00000268+((x)*4))
+#define  NV10TCL_RC_IN_RGB__SIZE							0x00000002
+#define   NV10TCL_RC_IN_RGB_D_INPUT_SHIFT						0
+#define   NV10TCL_RC_IN_RGB_D_INPUT_MASK						0x0000000f
+#define    NV10TCL_RC_IN_RGB_D_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV10TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV10TCL_RC_IN_RGB_D_INPUT_FOG						0x00000003
+#define    NV10TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE0						0x00000008
+#define    NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE1						0x00000009
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SPARE0						0x0000000c
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SPARE1						0x0000000d
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV10TCL_RC_IN_RGB_D_INPUT_E_TIMES_F						0x0000000f
+#define    NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE2						0x0000000a
+#define    NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE3						0x0000000b
+#define   NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV10TCL_RC_IN_RGB_D_MAPPING_SHIFT						5
+#define   NV10TCL_RC_IN_RGB_D_MAPPING_MASK						0x000000e0
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT					0x00000020
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL					0x00000080
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE					0x000000a0
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY					0x000000c0
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV10TCL_RC_IN_RGB_C_INPUT_SHIFT						8
+#define   NV10TCL_RC_IN_RGB_C_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV10TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV10TCL_RC_IN_RGB_C_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE0						0x00000800
+#define    NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE1						0x00000900
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SPARE0						0x00000c00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SPARE1						0x00000d00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_E_TIMES_F						0x00000f00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE2						0x00000a00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE3						0x00000b00
+#define   NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_IN_RGB_C_MAPPING_SHIFT						13
+#define   NV10TCL_RC_IN_RGB_C_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV10TCL_RC_IN_RGB_B_INPUT_SHIFT						16
+#define   NV10TCL_RC_IN_RGB_B_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE0						0x00080000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE1						0x00090000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SPARE0						0x000c0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SPARE1						0x000d0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_E_TIMES_F						0x000f0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE2						0x000a0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE3						0x000b0000
+#define   NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_IN_RGB_B_MAPPING_SHIFT						21
+#define   NV10TCL_RC_IN_RGB_B_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV10TCL_RC_IN_RGB_A_INPUT_SHIFT						24
+#define   NV10TCL_RC_IN_RGB_A_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE0						0x08000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE1						0x09000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SPARE0						0x0c000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SPARE1						0x0d000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_E_TIMES_F						0x0f000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE2						0x0a000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE3						0x0b000000
+#define   NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_IN_RGB_A_MAPPING_SHIFT						29
+#define   NV10TCL_RC_IN_RGB_A_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV10TCL_RC_COLOR(x)								(0x00000270+((x)*4))
+#define  NV10TCL_RC_COLOR__SIZE								0x00000002
+#define   NV10TCL_RC_COLOR_B_SHIFT							0
+#define   NV10TCL_RC_COLOR_B_MASK							0x000000ff
+#define   NV10TCL_RC_COLOR_G_SHIFT							8
+#define   NV10TCL_RC_COLOR_G_MASK							0x0000ff00
+#define   NV10TCL_RC_COLOR_R_SHIFT							16
+#define   NV10TCL_RC_COLOR_R_MASK							0x00ff0000
+#define   NV10TCL_RC_COLOR_A_SHIFT							24
+#define   NV10TCL_RC_COLOR_A_MASK							0xff000000
+#define  NV10TCL_RC_OUT_ALPHA(x)							(0x00000278+((x)*4))
+#define  NV10TCL_RC_OUT_ALPHA__SIZE							0x00000002
+#define   NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT						0
+#define   NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK						0x0000000f
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0				0x00000001
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1				0x00000002
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG						0x00000003
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR					0x00000004
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR				0x00000005
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0					0x00000008
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1					0x00000009
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0					0x0000000c
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1					0x0000000d
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F					0x0000000f
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE2					0x0000000a
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE3					0x0000000b
+#define   NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT						4
+#define   NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK						0x000000f0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0				0x00000010
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1				0x00000020
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG						0x00000030
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR					0x00000040
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR				0x00000050
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0					0x00000080
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1					0x00000090
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0					0x000000c0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1					0x000000d0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000000e0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F					0x000000f0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE2					0x000000a0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE3					0x000000b0
+#define   NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT						8
+#define   NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK						0x00000f00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0				0x00000100
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1				0x00000200
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG						0x00000300
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR				0x00000400
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR				0x00000500
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0					0x00000800
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1					0x00000900
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0					0x00000c00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1					0x00000d00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F					0x00000f00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE2					0x00000a00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE3					0x00000b00
+#define   NV10TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT						(1 << 12)
+#define   NV10TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT						(1 << 13)
+#define   NV10TCL_RC_OUT_ALPHA_MUX_SUM							(1 << 14)
+#define   NV10TCL_RC_OUT_ALPHA_BIAS							(1 << 15)
+#define    NV10TCL_RC_OUT_ALPHA_BIAS_NONE						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF				0x00008000
+#define   NV10TCL_RC_OUT_ALPHA_SCALE_SHIFT						17	/* lowest bit of the SCALE field */
+#define   NV10TCL_RC_OUT_ALPHA_SCALE_MASK						0x00060000	/* bits 17-18; was 0, which masked out every SCALE_* value below */
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_NONE						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO					0x00020000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR					0x00040000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF					0x00060000
+#define  NV10TCL_RC_OUT_RGB(x)								(0x00000280+((x)*4))
+#define  NV10TCL_RC_OUT_RGB__SIZE							0x00000002
+#define   NV10TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT						0
+#define   NV10TCL_RC_OUT_RGB_CD_OUTPUT_MASK						0x0000000f
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0					0x00000001
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1					0x00000002
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_FOG						0x00000003
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR					0x00000004
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR					0x00000005
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0					0x00000008
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1					0x00000009
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0						0x0000000c
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1						0x0000000d
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F					0x0000000f
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE2					0x0000000a
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE3					0x0000000b
+#define   NV10TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT						4
+#define   NV10TCL_RC_OUT_RGB_AB_OUTPUT_MASK						0x000000f0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0					0x00000010
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1					0x00000020
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_FOG						0x00000030
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR					0x00000040
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR					0x00000050
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0					0x00000080
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1					0x00000090
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0						0x000000c0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1						0x000000d0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000000e0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F					0x000000f0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE2					0x000000a0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE3					0x000000b0
+#define   NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT						8
+#define   NV10TCL_RC_OUT_RGB_SUM_OUTPUT_MASK						0x00000f00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0				0x00000100
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1				0x00000200
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_FOG						0x00000300
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR					0x00000400
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR				0x00000500
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0					0x00000800
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1					0x00000900
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0						0x00000c00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1						0x00000d00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F					0x00000f00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE2					0x00000a00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE3					0x00000b00
+#define   NV10TCL_RC_OUT_RGB_CD_DOT_PRODUCT						(1 << 12)
+#define   NV10TCL_RC_OUT_RGB_AB_DOT_PRODUCT						(1 << 13)
+#define   NV10TCL_RC_OUT_RGB_MUX_SUM							(1 << 14)
+#define   NV10TCL_RC_OUT_RGB_BIAS							(1 << 15)
+#define    NV10TCL_RC_OUT_RGB_BIAS_NONE							0x00000000
+#define    NV10TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF				0x00008000
+#define   NV10TCL_RC_OUT_RGB_SCALE_SHIFT						17	/* lowest bit of the SCALE field */
+#define   NV10TCL_RC_OUT_RGB_SCALE_MASK							0x00060000	/* bits 17-18; was 0, which masked out every SCALE_* value below */
+#define    NV10TCL_RC_OUT_RGB_SCALE_NONE						0x00000000
+#define    NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO					0x00020000
+#define    NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR					0x00040000
+#define    NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF					0x00060000
+#define   NV10TCL_RC_OUT_RGB_OPERATION_SHIFT						27
+#define   NV10TCL_RC_OUT_RGB_OPERATION_MASK						0x38000000
+#define  NV10TCL_RC_FINAL0								0x00000288
+#define   NV10TCL_RC_FINAL0_D_INPUT_SHIFT						0
+#define   NV10TCL_RC_FINAL0_D_INPUT_MASK						0x0000000f
+#define    NV10TCL_RC_FINAL0_D_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV10TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV10TCL_RC_FINAL0_D_INPUT_FOG						0x00000003
+#define    NV10TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV10TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV10TCL_RC_FINAL0_D_INPUT_TEXTURE0						0x00000008
+#define    NV10TCL_RC_FINAL0_D_INPUT_TEXTURE1						0x00000009
+#define    NV10TCL_RC_FINAL0_D_INPUT_SPARE0						0x0000000c
+#define    NV10TCL_RC_FINAL0_D_INPUT_SPARE1						0x0000000d
+#define    NV10TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV10TCL_RC_FINAL0_D_INPUT_E_TIMES_F						0x0000000f
+#define    NV10TCL_RC_FINAL0_D_INPUT_TEXTURE2						0x0000000a
+#define    NV10TCL_RC_FINAL0_D_INPUT_TEXTURE3						0x0000000b
+#define   NV10TCL_RC_FINAL0_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV10TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV10TCL_RC_FINAL0_D_MAPPING_SHIFT						5
+#define   NV10TCL_RC_FINAL0_D_MAPPING_MASK						0x000000e0
+#define    NV10TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT					0x00000020
+#define    NV10TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV10TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV10TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL					0x00000080
+#define    NV10TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE					0x000000a0
+#define    NV10TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY					0x000000c0
+#define    NV10TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV10TCL_RC_FINAL0_C_INPUT_SHIFT						8
+#define   NV10TCL_RC_FINAL0_C_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_FINAL0_C_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV10TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV10TCL_RC_FINAL0_C_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV10TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV10TCL_RC_FINAL0_C_INPUT_TEXTURE0						0x00000800
+#define    NV10TCL_RC_FINAL0_C_INPUT_TEXTURE1						0x00000900
+#define    NV10TCL_RC_FINAL0_C_INPUT_SPARE0						0x00000c00
+#define    NV10TCL_RC_FINAL0_C_INPUT_SPARE1						0x00000d00
+#define    NV10TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV10TCL_RC_FINAL0_C_INPUT_E_TIMES_F						0x00000f00
+#define    NV10TCL_RC_FINAL0_C_INPUT_TEXTURE2						0x00000a00
+#define    NV10TCL_RC_FINAL0_C_INPUT_TEXTURE3						0x00000b00
+#define   NV10TCL_RC_FINAL0_C_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_FINAL0_C_MAPPING_SHIFT						13
+#define   NV10TCL_RC_FINAL0_C_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV10TCL_RC_FINAL0_B_INPUT_SHIFT						16
+#define   NV10TCL_RC_FINAL0_B_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV10TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV10TCL_RC_FINAL0_B_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV10TCL_RC_FINAL0_B_INPUT_TEXTURE0						0x00080000
+#define    NV10TCL_RC_FINAL0_B_INPUT_TEXTURE1						0x00090000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SPARE0						0x000c0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SPARE1						0x000d0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_E_TIMES_F						0x000f0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_TEXTURE2						0x000a0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_TEXTURE3						0x000b0000
+#define   NV10TCL_RC_FINAL0_B_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_FINAL0_B_MAPPING_SHIFT						21
+#define   NV10TCL_RC_FINAL0_B_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV10TCL_RC_FINAL0_A_INPUT_SHIFT						24
+#define   NV10TCL_RC_FINAL0_A_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_TEXTURE0						0x08000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_TEXTURE1						0x09000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SPARE0						0x0c000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SPARE1						0x0d000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_E_TIMES_F						0x0f000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_TEXTURE2						0x0a000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_TEXTURE3						0x0b000000
+#define   NV10TCL_RC_FINAL0_A_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_FINAL0_A_MAPPING_SHIFT						29
+#define   NV10TCL_RC_FINAL0_A_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV10TCL_RC_FINAL1								0x0000028c
+#define   NV10TCL_RC_FINAL1_COLOR_SUM_CLAMP						(1 <<  7)
+#define   NV10TCL_RC_FINAL1_G_INPUT_SHIFT						8
+#define   NV10TCL_RC_FINAL1_G_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_FINAL1_G_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV10TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV10TCL_RC_FINAL1_G_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV10TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV10TCL_RC_FINAL1_G_INPUT_TEXTURE0						0x00000800
+#define    NV10TCL_RC_FINAL1_G_INPUT_TEXTURE1						0x00000900
+#define    NV10TCL_RC_FINAL1_G_INPUT_SPARE0						0x00000c00
+#define    NV10TCL_RC_FINAL1_G_INPUT_SPARE1						0x00000d00
+#define    NV10TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV10TCL_RC_FINAL1_G_INPUT_E_TIMES_F						0x00000f00
+#define    NV10TCL_RC_FINAL1_G_INPUT_TEXTURE2						0x00000a00
+#define    NV10TCL_RC_FINAL1_G_INPUT_TEXTURE3						0x00000b00
+#define   NV10TCL_RC_FINAL1_G_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_FINAL1_G_MAPPING_SHIFT						13
+#define   NV10TCL_RC_FINAL1_G_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV10TCL_RC_FINAL1_F_INPUT_SHIFT						16
+#define   NV10TCL_RC_FINAL1_F_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV10TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV10TCL_RC_FINAL1_F_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV10TCL_RC_FINAL1_F_INPUT_TEXTURE0						0x00080000
+#define    NV10TCL_RC_FINAL1_F_INPUT_TEXTURE1						0x00090000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SPARE0						0x000c0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SPARE1						0x000d0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_E_TIMES_F						0x000f0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_TEXTURE2						0x000a0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_TEXTURE3						0x000b0000
+#define   NV10TCL_RC_FINAL1_F_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_FINAL1_F_MAPPING_SHIFT						21
+#define   NV10TCL_RC_FINAL1_F_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV10TCL_RC_FINAL1_E_INPUT_SHIFT						24
+#define   NV10TCL_RC_FINAL1_E_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_TEXTURE0						0x08000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_TEXTURE1						0x09000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SPARE0						0x0c000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SPARE1						0x0d000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_E_TIMES_F						0x0f000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_TEXTURE2						0x0a000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_TEXTURE3						0x0b000000
+#define   NV10TCL_RC_FINAL1_E_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_FINAL1_E_MAPPING_SHIFT						29
+#define   NV10TCL_RC_FINAL1_E_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV10TCL_LIGHT_MODEL								0x00000294
+#define   NV10TCL_LIGHT_MODEL_VERTEX_SPECULAR						(1 <<  0)
+#define   NV10TCL_LIGHT_MODEL_SEPARATE_SPECULAR						(1 <<  1)
+#define   NV10TCL_LIGHT_MODEL_LOCAL_VIEWER						(1 << 16)
+#define  NV10TCL_COLOR_MATERIAL								0x00000298
+#define   NV10TCL_COLOR_MATERIAL_EMISSION						(1 <<  0)
+#define   NV10TCL_COLOR_MATERIAL_AMBIENT						(1 <<  1)
+#define   NV10TCL_COLOR_MATERIAL_DIFFUSE						(1 <<  2)
+#define   NV10TCL_COLOR_MATERIAL_SPECULAR						(1 <<  3)
+#define  NV10TCL_FOG_MODE								0x0000029c
+#define   NV10TCL_FOG_MODE_LINEAR							0x00002601
+#define   NV10TCL_FOG_MODE_EXP								0x00000800
+#define   NV10TCL_FOG_MODE_EXP_ABS							0x00000802
+#define   NV10TCL_FOG_MODE_EXP2								0x00000803
+#define  NV10TCL_FOG_COORD								0x000002a0
+#define   NV10TCL_FOG_COORD_FOG								0x00000000
+#define   NV10TCL_FOG_COORD_DIST_RADIAL							0x00000001
+#define   NV10TCL_FOG_COORD_DIST_ORTHOGONAL						0x00000002
+#define   NV10TCL_FOG_COORD_DIST_ORTHOGONAL_ABS						0x00000003
+#define  NV10TCL_FOG_ENABLE								0x000002a4
+#define  NV10TCL_FOG_COLOR								0x000002a8
+#define   NV10TCL_FOG_COLOR_R_SHIFT							0
+#define   NV10TCL_FOG_COLOR_R_MASK							0x000000ff
+#define   NV10TCL_FOG_COLOR_G_SHIFT							8
+#define   NV10TCL_FOG_COLOR_G_MASK							0x0000ff00
+#define   NV10TCL_FOG_COLOR_B_SHIFT							16
+#define   NV10TCL_FOG_COLOR_B_MASK							0x00ff0000
+#define   NV10TCL_FOG_COLOR_A_SHIFT							24
+#define   NV10TCL_FOG_COLOR_A_MASK							0xff000000
+#define  NV10TCL_VIEWPORT_CLIP_MODE							0x000002b4
+#define  NV10TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*4))
+#define  NV10TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_SHIFT					0
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_MASK					0x000007ff
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_LEFT_ENABLE					(1 << 11)
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_SHIFT					16
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_MASK					0x07ff0000
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_RIGHT_ENABLE					(1 << 27)
+#define  NV10TCL_VIEWPORT_CLIP_VERT(x)							(0x000002e0+((x)*4))
+#define  NV10TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_T_SHIFT					0
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_T_MASK					0x000007ff
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_TOP_ENABLE					(1 << 11)
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_B_SHIFT					16
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_B_MASK					0x07ff0000
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_BOTTOM_ENABLE					(1 << 27)
+#define  NV10TCL_ALPHA_FUNC_ENABLE							0x00000300
+#define  NV10TCL_BLEND_FUNC_ENABLE							0x00000304
+#define  NV10TCL_CULL_FACE_ENABLE							0x00000308
+#define  NV10TCL_DEPTH_TEST_ENABLE							0x0000030c
+#define  NV10TCL_DITHER_ENABLE								0x00000310
+#define  NV10TCL_LIGHTING_ENABLE							0x00000314
+#define  NV10TCL_POINT_PARAMETERS_ENABLE						0x00000318
+#define  NV10TCL_POINT_SMOOTH_ENABLE							0x0000031c
+#define  NV10TCL_LINE_SMOOTH_ENABLE							0x00000320
+#define  NV10TCL_POLYGON_SMOOTH_ENABLE							0x00000324
+#define  NV10TCL_VERTEX_WEIGHT_ENABLE							0x00000328
+#define  NV10TCL_STENCIL_ENABLE								0x0000032c
+#define  NV10TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000330
+#define  NV10TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000334
+#define  NV10TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000338
+#define  NV10TCL_ALPHA_FUNC_FUNC							0x0000033c
+#define   NV10TCL_ALPHA_FUNC_FUNC_NEVER							0x00000200
+#define   NV10TCL_ALPHA_FUNC_FUNC_LESS							0x00000201
+#define   NV10TCL_ALPHA_FUNC_FUNC_EQUAL							0x00000202
+#define   NV10TCL_ALPHA_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV10TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV10TCL_ALPHA_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV10TCL_ALPHA_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV10TCL_ALPHA_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV10TCL_ALPHA_FUNC_REF								0x00000340
+#define  NV10TCL_BLEND_FUNC_SRC								0x00000344
+#define   NV10TCL_BLEND_FUNC_SRC_ZERO							0x00000000
+#define   NV10TCL_BLEND_FUNC_SRC_ONE							0x00000001
+#define   NV10TCL_BLEND_FUNC_SRC_SRC_COLOR						0x00000300
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV10TCL_BLEND_FUNC_SRC_SRC_ALPHA						0x00000302
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV10TCL_BLEND_FUNC_SRC_DST_ALPHA						0x00000304
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV10TCL_BLEND_FUNC_SRC_DST_COLOR						0x00000306
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV10TCL_BLEND_FUNC_SRC_SRC_ALPHA_SATURATE					0x00000308
+#define   NV10TCL_BLEND_FUNC_SRC_CONSTANT_COLOR						0x00008001
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV10TCL_BLEND_FUNC_SRC_CONSTANT_ALPHA						0x00008003
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV10TCL_BLEND_FUNC_DST								0x00000348
+#define   NV10TCL_BLEND_FUNC_DST_ZERO							0x00000000
+#define   NV10TCL_BLEND_FUNC_DST_ONE							0x00000001
+#define   NV10TCL_BLEND_FUNC_DST_SRC_COLOR						0x00000300
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV10TCL_BLEND_FUNC_DST_SRC_ALPHA						0x00000302
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV10TCL_BLEND_FUNC_DST_DST_ALPHA						0x00000304
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV10TCL_BLEND_FUNC_DST_DST_COLOR						0x00000306
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV10TCL_BLEND_FUNC_DST_SRC_ALPHA_SATURATE					0x00000308
+#define   NV10TCL_BLEND_FUNC_DST_CONSTANT_COLOR						0x00008001
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV10TCL_BLEND_FUNC_DST_CONSTANT_ALPHA						0x00008003
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV10TCL_BLEND_COLOR								0x0000034c
+#define   NV10TCL_BLEND_COLOR_B_SHIFT							0
+#define   NV10TCL_BLEND_COLOR_B_MASK							0x000000ff
+#define   NV10TCL_BLEND_COLOR_G_SHIFT							8
+#define   NV10TCL_BLEND_COLOR_G_MASK							0x0000ff00
+#define   NV10TCL_BLEND_COLOR_R_SHIFT							16
+#define   NV10TCL_BLEND_COLOR_R_MASK							0x00ff0000
+#define   NV10TCL_BLEND_COLOR_A_SHIFT							24
+#define   NV10TCL_BLEND_COLOR_A_MASK							0xff000000
+#define  NV10TCL_BLEND_EQUATION								0x00000350
+#define   NV10TCL_BLEND_EQUATION_FUNC_ADD						0x00008006
+#define   NV10TCL_BLEND_EQUATION_MIN							0x00008007
+#define   NV10TCL_BLEND_EQUATION_MAX							0x00008008
+#define   NV10TCL_BLEND_EQUATION_FUNC_SUBTRACT						0x0000800a
+#define   NV10TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT					0x0000800b
+#define  NV10TCL_DEPTH_FUNC								0x00000354
+#define   NV10TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV10TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV10TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV10TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV10TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV10TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV10TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV10TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV10TCL_COLOR_MASK								0x00000358
+#define   NV10TCL_COLOR_MASK_B								(1 <<  0)
+#define   NV10TCL_COLOR_MASK_G								(1 <<  8)
+#define   NV10TCL_COLOR_MASK_R								(1 << 16)
+#define   NV10TCL_COLOR_MASK_A								(1 << 24)
+#define  NV10TCL_DEPTH_WRITE_ENABLE							0x0000035c
+#define  NV10TCL_STENCIL_MASK								0x00000360
+#define  NV10TCL_STENCIL_FUNC_FUNC							0x00000364
+#define   NV10TCL_STENCIL_FUNC_FUNC_NEVER						0x00000200
+#define   NV10TCL_STENCIL_FUNC_FUNC_LESS						0x00000201
+#define   NV10TCL_STENCIL_FUNC_FUNC_EQUAL						0x00000202
+#define   NV10TCL_STENCIL_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV10TCL_STENCIL_FUNC_FUNC_GREATER						0x00000204
+#define   NV10TCL_STENCIL_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV10TCL_STENCIL_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV10TCL_STENCIL_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV10TCL_STENCIL_FUNC_REF							0x00000368
+#define  NV10TCL_STENCIL_FUNC_MASK							0x0000036c
+#define  NV10TCL_STENCIL_OP_FAIL							0x00000370
+#define   NV10TCL_STENCIL_OP_FAIL_ZERO							0x00000000
+#define   NV10TCL_STENCIL_OP_FAIL_INVERT						0x0000150a
+#define   NV10TCL_STENCIL_OP_FAIL_KEEP							0x00001e00
+#define   NV10TCL_STENCIL_OP_FAIL_REPLACE						0x00001e01
+#define   NV10TCL_STENCIL_OP_FAIL_INCR							0x00001e02
+#define   NV10TCL_STENCIL_OP_FAIL_DECR							0x00001e03
+#define   NV10TCL_STENCIL_OP_FAIL_INCR_WRAP						0x00008507
+#define   NV10TCL_STENCIL_OP_FAIL_DECR_WRAP						0x00008508
+#define  NV10TCL_STENCIL_OP_ZFAIL							0x00000374
+#define   NV10TCL_STENCIL_OP_ZFAIL_ZERO							0x00000000
+#define   NV10TCL_STENCIL_OP_ZFAIL_INVERT						0x0000150a
+#define   NV10TCL_STENCIL_OP_ZFAIL_KEEP							0x00001e00
+#define   NV10TCL_STENCIL_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV10TCL_STENCIL_OP_ZFAIL_INCR							0x00001e02
+#define   NV10TCL_STENCIL_OP_ZFAIL_DECR							0x00001e03
+#define   NV10TCL_STENCIL_OP_ZFAIL_INCR_WRAP						0x00008507
+#define   NV10TCL_STENCIL_OP_ZFAIL_DECR_WRAP						0x00008508
+#define  NV10TCL_STENCIL_OP_ZPASS							0x00000378
+#define   NV10TCL_STENCIL_OP_ZPASS_ZERO							0x00000000
+#define   NV10TCL_STENCIL_OP_ZPASS_INVERT						0x0000150a
+#define   NV10TCL_STENCIL_OP_ZPASS_KEEP							0x00001e00
+#define   NV10TCL_STENCIL_OP_ZPASS_REPLACE						0x00001e01
+#define   NV10TCL_STENCIL_OP_ZPASS_INCR							0x00001e02
+#define   NV10TCL_STENCIL_OP_ZPASS_DECR							0x00001e03
+#define   NV10TCL_STENCIL_OP_ZPASS_INCR_WRAP						0x00008507
+#define   NV10TCL_STENCIL_OP_ZPASS_DECR_WRAP						0x00008508
+#define  NV10TCL_SHADE_MODEL								0x0000037c
+#define   NV10TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV10TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV10TCL_LINE_WIDTH								0x00000380
+#define  NV10TCL_POLYGON_OFFSET_FACTOR							0x00000384
+#define  NV10TCL_POLYGON_OFFSET_UNITS							0x00000388
+#define  NV10TCL_POLYGON_MODE_FRONT							0x0000038c
+#define   NV10TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV10TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV10TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV10TCL_POLYGON_MODE_BACK							0x00000390
+#define   NV10TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV10TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV10TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV10TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV10TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV10TCL_CULL_FACE								0x0000039c
+#define   NV10TCL_CULL_FACE_FRONT							0x00000404
+#define   NV10TCL_CULL_FACE_BACK							0x00000405
+#define   NV10TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV10TCL_FRONT_FACE								0x000003a0
+#define   NV10TCL_FRONT_FACE_CW								0x00000900
+#define   NV10TCL_FRONT_FACE_CCW							0x00000901
+#define  NV10TCL_NORMALIZE_ENABLE							0x000003a4
+#define  NV10TCL_MATERIAL_FACTOR_R							0x000003a8
+#define  NV10TCL_MATERIAL_FACTOR_G							0x000003ac
+#define  NV10TCL_MATERIAL_FACTOR_B							0x000003b0
+#define  NV10TCL_MATERIAL_FACTOR_A							0x000003b4
+#define  NV10TCL_SEPARATE_SPECULAR_ENABLE						0x000003b8
+#define  NV10TCL_ENABLED_LIGHTS								0x000003bc
+#define   NV10TCL_ENABLED_LIGHTS_0_SHIFT						0
+#define   NV10TCL_ENABLED_LIGHTS_0_MASK							0x00000003
+#define    NV10TCL_ENABLED_LIGHTS_0_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_0_NONPOSITIONAL					0x00000001
+#define    NV10TCL_ENABLED_LIGHTS_0_POSITIONAL						0x00000002
+#define    NV10TCL_ENABLED_LIGHTS_0_DIRECTIONAL						0x00000003
+#define   NV10TCL_ENABLED_LIGHTS_1_SHIFT						2
+#define   NV10TCL_ENABLED_LIGHTS_1_MASK							0x0000000c
+#define    NV10TCL_ENABLED_LIGHTS_1_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_1_NONPOSITIONAL					0x00000004
+#define    NV10TCL_ENABLED_LIGHTS_1_POSITIONAL						0x00000008
+#define    NV10TCL_ENABLED_LIGHTS_1_DIRECTIONAL						0x0000000c
+#define   NV10TCL_ENABLED_LIGHTS_2_SHIFT						4
+#define   NV10TCL_ENABLED_LIGHTS_2_MASK							0x00000030
+#define    NV10TCL_ENABLED_LIGHTS_2_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_2_NONPOSITIONAL					0x00000010
+#define    NV10TCL_ENABLED_LIGHTS_2_POSITIONAL						0x00000020
+#define    NV10TCL_ENABLED_LIGHTS_2_DIRECTIONAL						0x00000030
+#define   NV10TCL_ENABLED_LIGHTS_3_SHIFT						6
+#define   NV10TCL_ENABLED_LIGHTS_3_MASK							0x000000c0
+#define    NV10TCL_ENABLED_LIGHTS_3_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_3_NONPOSITIONAL					0x00000040
+#define    NV10TCL_ENABLED_LIGHTS_3_POSITIONAL						0x00000080
+#define    NV10TCL_ENABLED_LIGHTS_3_DIRECTIONAL						0x000000c0
+#define   NV10TCL_ENABLED_LIGHTS_4_SHIFT						8
+#define   NV10TCL_ENABLED_LIGHTS_4_MASK							0x00000300
+#define    NV10TCL_ENABLED_LIGHTS_4_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_4_NONPOSITIONAL					0x00000100
+#define    NV10TCL_ENABLED_LIGHTS_4_POSITIONAL						0x00000200
+#define    NV10TCL_ENABLED_LIGHTS_4_DIRECTIONAL						0x00000300
+#define   NV10TCL_ENABLED_LIGHTS_5_SHIFT						10
+#define   NV10TCL_ENABLED_LIGHTS_5_MASK							0x00000c00
+#define    NV10TCL_ENABLED_LIGHTS_5_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_5_NONPOSITIONAL					0x00000400
+#define    NV10TCL_ENABLED_LIGHTS_5_POSITIONAL						0x00000800
+#define    NV10TCL_ENABLED_LIGHTS_5_DIRECTIONAL						0x00000c00
+#define   NV10TCL_ENABLED_LIGHTS_6_SHIFT						12
+#define   NV10TCL_ENABLED_LIGHTS_6_MASK							0x00003000
+#define    NV10TCL_ENABLED_LIGHTS_6_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_6_NONPOSITIONAL					0x00001000
+#define    NV10TCL_ENABLED_LIGHTS_6_POSITIONAL						0x00002000
+#define    NV10TCL_ENABLED_LIGHTS_6_DIRECTIONAL						0x00003000
+#define   NV10TCL_ENABLED_LIGHTS_7_SHIFT						14
+#define   NV10TCL_ENABLED_LIGHTS_7_MASK							0x0000c000
+#define    NV10TCL_ENABLED_LIGHTS_7_DISABLED						0x00000000
+#define    NV10TCL_ENABLED_LIGHTS_7_NONPOSITIONAL					0x00004000
+#define    NV10TCL_ENABLED_LIGHTS_7_POSITIONAL						0x00008000
+#define    NV10TCL_ENABLED_LIGHTS_7_DIRECTIONAL						0x0000c000
+#define  NV10TCL_TX_GEN_MODE_S(x)							(0x000003c0+((x)*16))
+#define  NV10TCL_TX_GEN_MODE_S__SIZE							0x00000002
+#define   NV10TCL_TX_GEN_MODE_S_FALSE							0x00000000
+#define   NV10TCL_TX_GEN_MODE_S_EYE_LINEAR						0x00002400
+#define   NV10TCL_TX_GEN_MODE_S_OBJECT_LINEAR						0x00002401
+#define   NV10TCL_TX_GEN_MODE_S_SPHERE_MAP						0x00002402
+#define   NV10TCL_TX_GEN_MODE_S_NORMAL_MAP						0x00008511
+#define   NV10TCL_TX_GEN_MODE_S_REFLECTION_MAP						0x00008512
+#define  NV10TCL_TX_GEN_MODE_T(x)							(0x000003c4+((x)*16))
+#define  NV10TCL_TX_GEN_MODE_T__SIZE							0x00000002
+#define   NV10TCL_TX_GEN_MODE_T_FALSE							0x00000000
+#define   NV10TCL_TX_GEN_MODE_T_EYE_LINEAR						0x00002400
+#define   NV10TCL_TX_GEN_MODE_T_OBJECT_LINEAR						0x00002401
+#define   NV10TCL_TX_GEN_MODE_T_SPHERE_MAP						0x00002402
+#define   NV10TCL_TX_GEN_MODE_T_NORMAL_MAP						0x00008511
+#define   NV10TCL_TX_GEN_MODE_T_REFLECTION_MAP						0x00008512
+#define  NV10TCL_TX_GEN_MODE_R(x)							(0x000003c8+((x)*16))
+#define  NV10TCL_TX_GEN_MODE_R__SIZE							0x00000002
+#define   NV10TCL_TX_GEN_MODE_R_FALSE							0x00000000
+#define   NV10TCL_TX_GEN_MODE_R_EYE_LINEAR						0x00002400
+#define   NV10TCL_TX_GEN_MODE_R_OBJECT_LINEAR						0x00002401
+#define   NV10TCL_TX_GEN_MODE_R_SPHERE_MAP						0x00002402
+#define   NV10TCL_TX_GEN_MODE_R_NORMAL_MAP						0x00008511
+#define   NV10TCL_TX_GEN_MODE_R_REFLECTION_MAP						0x00008512
+#define  NV10TCL_TX_GEN_MODE_Q(x)							(0x000003cc+((x)*16))
+#define  NV10TCL_TX_GEN_MODE_Q__SIZE							0x00000002
+#define   NV10TCL_TX_GEN_MODE_Q_FALSE							0x00000000
+#define   NV10TCL_TX_GEN_MODE_Q_EYE_LINEAR						0x00002400
+#define   NV10TCL_TX_GEN_MODE_Q_OBJECT_LINEAR						0x00002401
+#define   NV10TCL_TX_GEN_MODE_Q_SPHERE_MAP						0x00002402
+#define   NV10TCL_TX_GEN_MODE_Q_NORMAL_MAP						0x00008511
+#define   NV10TCL_TX_GEN_MODE_Q_REFLECTION_MAP						0x00008512
+#define  NV10TCL_TX_MATRIX_ENABLE(x)							(0x000003e0+((x)*4))
+#define  NV10TCL_TX_MATRIX_ENABLE__SIZE							0x00000002
+#define  NV10TCL_VIEW_MATRIX_ENABLE							0x000003e8
+#define   NV10TCL_VIEW_MATRIX_ENABLE_MODELVIEW1						(1 <<  0)
+#define   NV10TCL_VIEW_MATRIX_ENABLE_MODELVIEW0						(1 <<  1)
+#define   NV10TCL_VIEW_MATRIX_ENABLE_PROJECTION						(1 <<  2)
+#define  NV10TCL_POINT_SIZE								0x000003ec
+#define  NV10TCL_MODELVIEW0_MATRIX(x)							(0x00000400+((x)*4))
+#define  NV10TCL_MODELVIEW0_MATRIX__SIZE						0x00000010
+#define  NV10TCL_MODELVIEW1_MATRIX(x)							(0x00000440+((x)*4))
+#define  NV10TCL_MODELVIEW1_MATRIX__SIZE						0x00000010
+#define  NV10TCL_INVERSE_MODELVIEW0_MATRIX(x)						(0x00000480+((x)*4))
+#define  NV10TCL_INVERSE_MODELVIEW0_MATRIX__SIZE					0x00000010
+#define  NV10TCL_INVERSE_MODELVIEW1_MATRIX(x)						(0x000004c0+((x)*4))
+#define  NV10TCL_INVERSE_MODELVIEW1_MATRIX__SIZE					0x00000010
+#define  NV10TCL_PROJECTION_MATRIX(x)							(0x00000500+((x)*4))
+#define  NV10TCL_PROJECTION_MATRIX__SIZE						0x00000010
+#define  NV10TCL_TX0_MATRIX(x)								(0x00000540+((x)*4))
+#define  NV10TCL_TX0_MATRIX__SIZE							0x00000010
+#define  NV10TCL_TX1_MATRIX(x)								(0x00000580+((x)*4))
+#define  NV10TCL_TX1_MATRIX__SIZE							0x00000010
+#define  NV10TCL_TX_GEN_COEFF_S_A(x)							(0x00000600+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_S_A__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_S_B(x)							(0x00000604+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_S_B__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_S_C(x)							(0x00000608+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_S_C__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_S_D(x)							(0x0000060c+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_S_D__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_T_A(x)							(0x00000610+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_T_A__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_T_B(x)							(0x00000614+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_T_B__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_T_C(x)							(0x00000618+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_T_C__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_T_D(x)							(0x0000061c+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_T_D__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_R_A(x)							(0x00000620+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_R_A__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_R_B(x)							(0x00000624+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_R_B__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_R_C(x)							(0x00000628+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_R_C__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_R_D(x)							(0x0000062c+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_R_D__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_Q_A(x)							(0x00000630+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_Q_A__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_Q_B(x)							(0x00000634+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_Q_B__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_Q_C(x)							(0x00000638+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_Q_C__SIZE							0x00000002
+#define  NV10TCL_TX_GEN_COEFF_Q_D(x)							(0x0000063c+((x)*64))
+#define  NV10TCL_TX_GEN_COEFF_Q_D__SIZE							0x00000002
+#define  NV10TCL_FOG_EQUATION_CONSTANT							0x00000680
+#define  NV10TCL_FOG_EQUATION_LINEAR							0x00000684
+#define  NV10TCL_FOG_EQUATION_QUADRATIC							0x00000688
+#define  NV10TCL_MATERIAL_SHININESS(x)							(0x000006a0+((x)*4))
+#define  NV10TCL_MATERIAL_SHININESS__SIZE						0x00000006
+#define  NV10TCL_LIGHT_MODEL_AMBIENT_R							0x000006c4
+#define  NV10TCL_LIGHT_MODEL_AMBIENT_G							0x000006c8
+#define  NV10TCL_LIGHT_MODEL_AMBIENT_B							0x000006cc
+#define  NV10TCL_VIEWPORT_TRANSLATE_X							0x000006e8
+#define  NV10TCL_VIEWPORT_TRANSLATE_Y							0x000006ec
+#define  NV10TCL_VIEWPORT_TRANSLATE_Z							0x000006f0
+#define  NV10TCL_VIEWPORT_TRANSLATE_W							0x000006f4
+#define  NV10TCL_POINT_PARAMETER(x)							(0x000006f8+((x)*4))
+#define  NV10TCL_POINT_PARAMETER__SIZE							0x00000008
+#define  NV10TCL_LIGHT_AMBIENT_R(x)							(0x00000800+((x)*128))
+#define  NV10TCL_LIGHT_AMBIENT_R__SIZE							0x00000008
+#define  NV10TCL_LIGHT_AMBIENT_G(x)							(0x00000804+((x)*128))
+#define  NV10TCL_LIGHT_AMBIENT_G__SIZE							0x00000008
+#define  NV10TCL_LIGHT_AMBIENT_B(x)							(0x00000808+((x)*128))
+#define  NV10TCL_LIGHT_AMBIENT_B__SIZE							0x00000008
+#define  NV10TCL_LIGHT_DIFFUSE_R(x)							(0x0000080c+((x)*128))
+#define  NV10TCL_LIGHT_DIFFUSE_R__SIZE							0x00000008
+#define  NV10TCL_LIGHT_DIFFUSE_G(x)							(0x00000810+((x)*128))
+#define  NV10TCL_LIGHT_DIFFUSE_G__SIZE							0x00000008
+#define  NV10TCL_LIGHT_DIFFUSE_B(x)							(0x00000814+((x)*128))
+#define  NV10TCL_LIGHT_DIFFUSE_B__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPECULAR_R(x)							(0x00000818+((x)*128))
+#define  NV10TCL_LIGHT_SPECULAR_R__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPECULAR_G(x)							(0x0000081c+((x)*128))
+#define  NV10TCL_LIGHT_SPECULAR_G__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPECULAR_B(x)							(0x00000820+((x)*128))
+#define  NV10TCL_LIGHT_SPECULAR_B__SIZE							0x00000008
+#define  NV10TCL_LIGHT_HALF_VECTOR_X(x)							(0x00000828+((x)*128))
+#define  NV10TCL_LIGHT_HALF_VECTOR_X__SIZE						0x00000008
+#define  NV10TCL_LIGHT_HALF_VECTOR_Y(x)							(0x0000082c+((x)*128))
+#define  NV10TCL_LIGHT_HALF_VECTOR_Y__SIZE						0x00000008
+#define  NV10TCL_LIGHT_HALF_VECTOR_Z(x)							(0x00000830+((x)*128))
+#define  NV10TCL_LIGHT_HALF_VECTOR_Z__SIZE						0x00000008
+#define  NV10TCL_LIGHT_DIRECTION_X(x)							(0x00000834+((x)*128))
+#define  NV10TCL_LIGHT_DIRECTION_X__SIZE						0x00000008
+#define  NV10TCL_LIGHT_DIRECTION_Y(x)							(0x00000838+((x)*128))
+#define  NV10TCL_LIGHT_DIRECTION_Y__SIZE						0x00000008
+#define  NV10TCL_LIGHT_DIRECTION_Z(x)							(0x0000083c+((x)*128))
+#define  NV10TCL_LIGHT_DIRECTION_Z__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_A(x)							(0x00000840+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_A__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_B(x)							(0x00000844+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_B__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_C(x)							(0x00000848+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_C__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_DIR_X(x)							(0x0000084c+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_DIR_X__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPOT_DIR_Y(x)							(0x00000850+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_DIR_Y__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPOT_DIR_Z(x)							(0x00000854+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_DIR_Z__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_D(x)							(0x00000858+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_D__SIZE						0x00000008
+#define  NV10TCL_LIGHT_POSITION_X(x)							(0x0000085c+((x)*128))
+#define  NV10TCL_LIGHT_POSITION_X__SIZE							0x00000008
+#define  NV10TCL_LIGHT_POSITION_Y(x)							(0x00000860+((x)*128))
+#define  NV10TCL_LIGHT_POSITION_Y__SIZE							0x00000008
+#define  NV10TCL_LIGHT_POSITION_Z(x)							(0x00000864+((x)*128))
+#define  NV10TCL_LIGHT_POSITION_Z__SIZE							0x00000008
+#define  NV10TCL_LIGHT_ATTENUATION_CONSTANT(x)						(0x00000868+((x)*128))
+#define  NV10TCL_LIGHT_ATTENUATION_CONSTANT__SIZE					0x00000008
+#define  NV10TCL_LIGHT_ATTENUATION_LINEAR(x)						(0x0000086c+((x)*128))
+#define  NV10TCL_LIGHT_ATTENUATION_LINEAR__SIZE						0x00000008
+#define  NV10TCL_LIGHT_ATTENUATION_QUADRATIC(x)						(0x00000870+((x)*128))
+#define  NV10TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE					0x00000008
+#define  NV10TCL_VERTEX_POS_3F_X							0x00000c00
+#define  NV10TCL_VERTEX_POS_3F_Y							0x00000c04
+#define  NV10TCL_VERTEX_POS_3F_Z							0x00000c08
+#define  NV10TCL_VERTEX_POS_4F_X							0x00000c18
+#define  NV10TCL_VERTEX_POS_4F_Y							0x00000c1c
+#define  NV10TCL_VERTEX_POS_4F_Z							0x00000c20
+#define  NV10TCL_VERTEX_POS_4F_W							0x00000c24
+#define  NV10TCL_VERTEX_NOR_3F_X							0x00000c30
+#define  NV10TCL_VERTEX_NOR_3F_Y							0x00000c34
+#define  NV10TCL_VERTEX_NOR_3F_Z							0x00000c38
+#define  NV10TCL_VERTEX_NOR_3I_XY							0x00000c40
+#define   NV10TCL_VERTEX_NOR_3I_XY_X_SHIFT						0
+#define   NV10TCL_VERTEX_NOR_3I_XY_X_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_NOR_3I_XY_Y_SHIFT						16
+#define   NV10TCL_VERTEX_NOR_3I_XY_Y_MASK						0xffff0000
+#define  NV10TCL_VERTEX_NOR_3I_Z							0x00000c44
+#define   NV10TCL_VERTEX_NOR_3I_Z_Z_SHIFT						0
+#define   NV10TCL_VERTEX_NOR_3I_Z_Z_MASK						0x0000ffff
+#define  NV10TCL_VERTEX_COL_4F_R							0x00000c50
+#define  NV10TCL_VERTEX_COL_4F_G							0x00000c54
+#define  NV10TCL_VERTEX_COL_4F_B							0x00000c58
+#define  NV10TCL_VERTEX_COL_4F_A							0x00000c5c
+#define  NV10TCL_VERTEX_COL_3F_R							0x00000c60
+#define  NV10TCL_VERTEX_COL_3F_G							0x00000c64
+#define  NV10TCL_VERTEX_COL_3F_B							0x00000c68
+#define  NV10TCL_VERTEX_COL_4I								0x00000c6c
+#define   NV10TCL_VERTEX_COL_4I_R_SHIFT							0
+#define   NV10TCL_VERTEX_COL_4I_R_MASK							0x000000ff
+#define   NV10TCL_VERTEX_COL_4I_G_SHIFT							8
+#define   NV10TCL_VERTEX_COL_4I_G_MASK							0x0000ff00
+#define   NV10TCL_VERTEX_COL_4I_B_SHIFT							16
+#define   NV10TCL_VERTEX_COL_4I_B_MASK							0x00ff0000
+#define   NV10TCL_VERTEX_COL_4I_A_SHIFT							24
+#define   NV10TCL_VERTEX_COL_4I_A_MASK							0xff000000
+#define  NV10TCL_VERTEX_COL2_3F_R							0x00000c80
+#define  NV10TCL_VERTEX_COL2_3F_G							0x00000c84
+#define  NV10TCL_VERTEX_COL2_3F_B							0x00000c88
+#define  NV10TCL_VERTEX_COL2_3I								0x00000c8c
+#define   NV10TCL_VERTEX_COL2_3I_R_SHIFT						0
+#define   NV10TCL_VERTEX_COL2_3I_R_MASK							0x000000ff
+#define   NV10TCL_VERTEX_COL2_3I_G_SHIFT						8
+#define   NV10TCL_VERTEX_COL2_3I_G_MASK							0x0000ff00
+#define   NV10TCL_VERTEX_COL2_3I_B_SHIFT						16
+#define   NV10TCL_VERTEX_COL2_3I_B_MASK							0x00ff0000
+#define  NV10TCL_VERTEX_TX0_2F_S							0x00000c90
+#define  NV10TCL_VERTEX_TX0_2F_T							0x00000c94
+#define  NV10TCL_VERTEX_TX0_2I								0x00000c98
+#define   NV10TCL_VERTEX_TX0_2I_S_SHIFT							0
+#define   NV10TCL_VERTEX_TX0_2I_S_MASK							0x0000ffff
+#define   NV10TCL_VERTEX_TX0_2I_T_SHIFT							16
+#define   NV10TCL_VERTEX_TX0_2I_T_MASK							0xffff0000
+#define  NV10TCL_VERTEX_TX0_4F_S							0x00000ca0
+#define  NV10TCL_VERTEX_TX0_4F_T							0x00000ca4
+#define  NV10TCL_VERTEX_TX0_4F_R							0x00000ca8
+#define  NV10TCL_VERTEX_TX0_4F_Q							0x00000cac
+#define  NV10TCL_VERTEX_TX0_4I_ST							0x00000cb0
+#define   NV10TCL_VERTEX_TX0_4I_ST_S_SHIFT						0
+#define   NV10TCL_VERTEX_TX0_4I_ST_S_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX0_4I_ST_T_SHIFT						16
+#define   NV10TCL_VERTEX_TX0_4I_ST_T_MASK						0xffff0000
+#define  NV10TCL_VERTEX_TX0_4I_RQ							0x00000cb4
+#define   NV10TCL_VERTEX_TX0_4I_RQ_R_SHIFT						0
+#define   NV10TCL_VERTEX_TX0_4I_RQ_R_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX0_4I_RQ_Q_SHIFT						16
+#define   NV10TCL_VERTEX_TX0_4I_RQ_Q_MASK						0xffff0000
+#define  NV10TCL_VERTEX_TX1_2F_S							0x00000cb8
+#define  NV10TCL_VERTEX_TX1_2F_T							0x00000cbc
+#define  NV10TCL_VERTEX_TX1_2I								0x00000cc0
+#define   NV10TCL_VERTEX_TX1_2I_S_SHIFT							0
+#define   NV10TCL_VERTEX_TX1_2I_S_MASK							0x0000ffff
+#define   NV10TCL_VERTEX_TX1_2I_T_SHIFT							16
+#define   NV10TCL_VERTEX_TX1_2I_T_MASK							0xffff0000
+#define  NV10TCL_VERTEX_TX1_4F_S							0x00000cc8
+#define  NV10TCL_VERTEX_TX1_4F_T							0x00000ccc
+#define  NV10TCL_VERTEX_TX1_4F_R							0x00000cd0
+#define  NV10TCL_VERTEX_TX1_4F_Q							0x00000cd4
+#define  NV10TCL_VERTEX_TX1_4I_ST							0x00000cd8
+#define   NV10TCL_VERTEX_TX1_4I_ST_S_SHIFT						0
+#define   NV10TCL_VERTEX_TX1_4I_ST_S_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX1_4I_ST_T_SHIFT						16
+#define   NV10TCL_VERTEX_TX1_4I_ST_T_MASK						0xffff0000
+#define  NV10TCL_VERTEX_TX1_4I_RQ							0x00000cdc
+#define   NV10TCL_VERTEX_TX1_4I_RQ_R_SHIFT						0
+#define   NV10TCL_VERTEX_TX1_4I_RQ_R_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX1_4I_RQ_Q_SHIFT						16
+#define   NV10TCL_VERTEX_TX1_4I_RQ_Q_MASK						0xffff0000
+#define  NV10TCL_VERTEX_FOG_1F								0x00000ce0
+#define  NV10TCL_VERTEX_WGH_1F								0x00000ce4
+#define  NV10TCL_EDGEFLAG_ENABLE							0x00000cec
+#define  NV10TCL_VERTEX_ARRAY_VALIDATE							0x00000cf0
+#define  NV10TCL_VTXBUF_ADDRESS(x)							(0x00000d00+((x)*8))
+#define  NV10TCL_VTXBUF_ADDRESS__SIZE							0x00000008
+#define  NV10TCL_VTXFMT(x)								(0x00000d04+((x)*8))
+#define  NV10TCL_VTXFMT__SIZE								0x00000008
+#define   NV10TCL_VTXFMT_TYPE_SHIFT							0
+#define   NV10TCL_VTXFMT_TYPE_MASK							0x0000000f
+#define    NV10TCL_VTXFMT_TYPE_BYTE_BGRA						0x00000000
+#define    NV10TCL_VTXFMT_TYPE_SHORT							0x00000001
+#define    NV10TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV10TCL_VTXFMT_TYPE_BYTE_RGBA						0x00000004
+#define   NV10TCL_VTXFMT_FIELDS_SHIFT							4
+#define   NV10TCL_VTXFMT_FIELDS_MASK							0x000000f0
+#define   NV10TCL_VTXFMT_STRIDE_SHIFT							8
+#define   NV10TCL_VTXFMT_STRIDE_MASK							0x0000ff00
+#define   NV10TCL_VTXFMT_POS_HOMOGENEOUS						(1 << 24)
+#define  NV10TCL_VERTEX_BEGIN_END							0x00000dfc
+#define   NV10TCL_VERTEX_BEGIN_END_STOP							0x00000000
+#define   NV10TCL_VERTEX_BEGIN_END_POINTS						0x00000001
+#define   NV10TCL_VERTEX_BEGIN_END_LINES						0x00000002
+#define   NV10TCL_VERTEX_BEGIN_END_LINE_LOOP						0x00000003
+#define   NV10TCL_VERTEX_BEGIN_END_LINE_STRIP						0x00000004
+#define   NV10TCL_VERTEX_BEGIN_END_TRIANGLES						0x00000005
+#define   NV10TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP					0x00000006
+#define   NV10TCL_VERTEX_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV10TCL_VERTEX_BEGIN_END_QUADS						0x00000008
+#define   NV10TCL_VERTEX_BEGIN_END_QUAD_STRIP						0x00000009
+#define   NV10TCL_VERTEX_BEGIN_END_POLYGON						0x0000000a
+#define  NV10TCL_VB_ELEMENT_U16								0x00000e00
+#define   NV10TCL_VB_ELEMENT_U16_I0_SHIFT						0
+#define   NV10TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NV10TCL_VB_ELEMENT_U16_I1_SHIFT						16
+#define   NV10TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV10TCL_VB_ELEMENT_U32								0x00001100
+#define  NV10TCL_VERTEX_BUFFER_BEGIN_END						0x000013fc
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_STOP						0x00000000
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_POINTS					0x00000001
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_LINES						0x00000002
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_LINE_LOOP					0x00000003
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_LINE_STRIP					0x00000004
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLES					0x00000005
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLE_STRIP				0x00000006
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLE_FAN					0x00000007
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_QUADS						0x00000008
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_QUAD_STRIP					0x00000009
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_POLYGON					0x0000000a
+#define  NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS						0x00001400
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_FIRST_SHIFT					0
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_FIRST_MASK					0x0000ffff
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_LAST_SHIFT					24
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_LAST_MASK					0xff000000
+#define  NV10TCL_VERTEX_ARRAY_DATA							0x00001800
+
+
+#define NV11TCL										0x00000096
+/* NV11TCL_* constants: one-space names carry 0xNNN offsets, two-space names are values for the entry above them (presumably method offsets and data words -- TODO confirm). */
+#define  NV11TCL_COLOR_LOGIC_OP_ENABLE							0x00000d40
+#define  NV11TCL_COLOR_LOGIC_OP_OP							0x00000d44 /* the 16 values below are numerically the OpenGL logic-op enums GL_CLEAR (0x1500) .. GL_SET (0x150f) */
+#define   NV11TCL_COLOR_LOGIC_OP_OP_CLEAR						0x00001500
+#define   NV11TCL_COLOR_LOGIC_OP_OP_AND							0x00001501
+#define   NV11TCL_COLOR_LOGIC_OP_OP_AND_REVERSE						0x00001502
+#define   NV11TCL_COLOR_LOGIC_OP_OP_COPY						0x00001503
+#define   NV11TCL_COLOR_LOGIC_OP_OP_AND_INVERTED					0x00001504
+#define   NV11TCL_COLOR_LOGIC_OP_OP_NOOP						0x00001505
+#define   NV11TCL_COLOR_LOGIC_OP_OP_XOR							0x00001506
+#define   NV11TCL_COLOR_LOGIC_OP_OP_OR							0x00001507
+#define   NV11TCL_COLOR_LOGIC_OP_OP_NOR							0x00001508
+#define   NV11TCL_COLOR_LOGIC_OP_OP_EQUIV						0x00001509
+#define   NV11TCL_COLOR_LOGIC_OP_OP_INVERT						0x0000150a
+#define   NV11TCL_COLOR_LOGIC_OP_OP_OR_REVERSE						0x0000150b
+#define   NV11TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED					0x0000150c
+#define   NV11TCL_COLOR_LOGIC_OP_OP_OR_INVERTED						0x0000150d
+#define   NV11TCL_COLOR_LOGIC_OP_OP_NAND						0x0000150e
+#define   NV11TCL_COLOR_LOGIC_OP_OP_SET							0x0000150f
+
+
+#define NV17TCL										0x00000099
+/* NV17TCL_* offsets; none of them has SHIFT/MASK fields or enumerated values defined here. NOTE(review): presumably 0x99 is the object class id -- confirm. */
+#define  NV17TCL_DMA_IN_MEMORY4								0x000001ac
+#define  NV17TCL_DMA_IN_MEMORY5								0x000001b0
+#define  NV17TCL_COLOR_MASK_ENABLE							0x000002bc
+#define  NV17TCL_LMA_DEPTH_BUFFER_PITCH							0x00000d5c
+#define  NV17TCL_LMA_DEPTH_BUFFER_OFFSET						0x00000d60
+#define  NV17TCL_LMA_DEPTH_FILL_VALUE							0x00000d68
+#define  NV17TCL_LMA_DEPTH_BUFFER_CLEAR							0x00000d6c
+#define  NV17TCL_LMA_DEPTH_WINDOW_X							0x00001638
+#define  NV17TCL_LMA_DEPTH_WINDOW_Y							0x0000163c
+#define  NV17TCL_LMA_DEPTH_WINDOW_Z							0x00001640
+#define  NV17TCL_LMA_DEPTH_WINDOW_W							0x00001644
+#define  NV17TCL_LMA_DEPTH_ENABLE							0x00001658
+
+
+#define NV03_CONTEXT_SURFACES_2D							0x00000058
+/* NV03_CONTEXT_SURFACES_2D_* offsets; for each field, <FIELD>_MASK selects its bits and <FIELD>_SHIFT is the position of its lowest bit. */
+#define  NV03_CONTEXT_SURFACES_2D_SYNCHRONIZE						0x00000100
+#define  NV03_CONTEXT_SURFACES_2D_DMA_NOTIFY						0x00000180
+#define  NV03_CONTEXT_SURFACES_2D_DMA_SOURCE						0x00000184
+#define  NV03_CONTEXT_SURFACES_2D_DMA_DESTIN						0x00000188
+#define  NV03_CONTEXT_SURFACES_2D_COLOR_FORMAT						0x00000300
+#define  NV03_CONTEXT_SURFACES_2D_PITCH							0x00000304	/* packs two 16-bit fields: SOURCE in bits 15:0, DESTIN in bits 31:16 */
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_SOURCE_SHIFT					0
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_SOURCE_MASK					0x0000ffff
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_DESTIN_SHIFT					16
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_DESTIN_MASK					0xffff0000
+#define  NV03_CONTEXT_SURFACES_2D_OFFSET_SOURCE						0x00000308
+#define  NV03_CONTEXT_SURFACES_2D_OFFSET_DESTIN						0x0000030c
+
+
+#define NV03_CONTEXT_SURFACES_3D							0x0000005a
+/* NV03_CONTEXT_SURFACES_3D_* offsets; PITCH has no SHIFT/MASK sub-fields defined here, and there are separate OFFSET_COLOR and OFFSET_ZETA entries. */
+#define  NV03_CONTEXT_SURFACES_3D_SYNCHRONIZE						0x00000100
+#define  NV03_CONTEXT_SURFACES_3D_DMA_NOTIFY						0x00000180
+#define  NV03_CONTEXT_SURFACES_3D_DMA_SURFACE						0x00000184
+#define  NV03_CONTEXT_SURFACES_3D_PITCH							0x00000300
+#define  NV03_CONTEXT_SURFACES_3D_OFFSET_COLOR						0x00000304
+#define  NV03_CONTEXT_SURFACES_3D_OFFSET_ZETA						0x00000308
+
+
+#define NV04_INDEXED_IMAGE_FROM_CPU							0x00000060
+/* NV04_INDEXED_IMAGE_FROM_CPU_* offsets; COLOR(x) at the end is an indexed range rather than a single offset. */
+#define  NV04_INDEXED_IMAGE_FROM_CPU_NOP						0x00000100
+#define  NV04_INDEXED_IMAGE_FROM_CPU_NOTIFY						0x00000104
+#define  NV04_INDEXED_IMAGE_FROM_CPU_PATCH						0x0000010c
+#define  NV04_INDEXED_IMAGE_FROM_CPU_DMA_NOTIFY						0x00000180
+#define  NV04_INDEXED_IMAGE_FROM_CPU_DMA_LUT						0x00000184
+#define  NV04_INDEXED_IMAGE_FROM_CPU_COLOR_KEY						0x00000188
+#define  NV04_INDEXED_IMAGE_FROM_CPU_CLIP_RECTANGLE					0x0000018c
+#define  NV04_INDEXED_IMAGE_FROM_CPU_PATTERN						0x00000190
+#define  NV04_INDEXED_IMAGE_FROM_CPU_ROP						0x00000194
+#define  NV04_INDEXED_IMAGE_FROM_CPU_BETA1						0x00000198
+#define  NV04_INDEXED_IMAGE_FROM_CPU_BETA4						0x0000019c
+#define  NV04_INDEXED_IMAGE_FROM_CPU_SURFACE						0x000001a0
+#define  NV04_INDEXED_IMAGE_FROM_CPU_OPERATION						0x000003e4
+#define  NV04_INDEXED_IMAGE_FROM_CPU_COLOR_FORMAT					0x000003e8
+#define  NV04_INDEXED_IMAGE_FROM_CPU_INDEX_FORMAT					0x000003ec
+#define  NV04_INDEXED_IMAGE_FROM_CPU_LUT_OFFSET						0x000003f0
+#define  NV04_INDEXED_IMAGE_FROM_CPU_POINT						0x000003f4
+#define  NV04_INDEXED_IMAGE_FROM_CPU_SIZE_OUT						0x000003f8
+#define  NV04_INDEXED_IMAGE_FROM_CPU_SIZE_IN						0x000003fc
+#define  NV04_INDEXED_IMAGE_FROM_CPU_COLOR(x)						(0x00000400+((x)*4))	/* offset of entry x: base 0x400, stride 4 */
+#define  NV04_INDEXED_IMAGE_FROM_CPU_COLOR__SIZE					0x00000700	/* NOTE(review): presumably the number of valid x values for COLOR(x) -- confirm */
+
+
+#define NV05_INDEXED_IMAGE_FROM_CPU							0x00000064
+/* Only COLOR_CONVERSION carries the NV05_ prefix here. NOTE(review): presumably the other offsets are shared with NV04_INDEXED_IMAGE_FROM_CPU -- confirm. */
+#define  NV05_INDEXED_IMAGE_FROM_CPU_COLOR_CONVERSION					0x000003e0
+
+
+#define NV03_CHANNEL_PIO								0x0000006a
+/* No NV03_CHANNEL_PIO_* offsets are defined; only the id above exists. */
+
+
+#define NV03_CHANNEL_DMA								0x0000006b
+/* No NV03_CHANNEL_DMA_* offsets are defined; only the id above exists. */
+
+
+#define NV04_BETA_SOLID									0x00000072
+/* NV04_BETA_SOLID_* offsets; none has SHIFT/MASK fields or enumerated values defined here. */
+#define  NV04_BETA_SOLID_NOP								0x00000100
+#define  NV04_BETA_SOLID_NOTIFY								0x00000104
+#define  NV04_BETA_SOLID_DMA_NOTIFY							0x00000180
+#define  NV04_BETA_SOLID_BETA_OUTPUT							0x00000200
+#define  NV04_BETA_SOLID_BETA_FACTOR							0x00000300
+
+
+#define NV10_TEXTURE_FROM_CPU								0x0000007b
+/* NV10_TEXTURE_FROM_CPU_* offsets; POINT, SIZE, CLIP_HORIZONTAL and CLIP_VERTICAL each pack two 16-bit fields (bits 15:0 and bits 31:16) described by the SHIFT/MASK pairs below them. */
+#define  NV10_TEXTURE_FROM_CPU_NOP							0x00000100
+#define  NV10_TEXTURE_FROM_CPU_NOTIFY							0x00000104
+#define  NV10_TEXTURE_FROM_CPU_WAIT_FOR_IDLE						0x00000108
+#define  NV10_TEXTURE_FROM_CPU_PM_TRIGGER						0x00000140
+#define  NV10_TEXTURE_FROM_CPU_DMA_NOTIFY						0x00000180
+#define  NV10_TEXTURE_FROM_CPU_SURFACE							0x00000184
+#define  NV10_TEXTURE_FROM_CPU_COLOR_FORMAT						0x00000300
+#define  NV10_TEXTURE_FROM_CPU_POINT							0x00000304	/* X in bits 15:0, Y in bits 31:16 */
+#define   NV10_TEXTURE_FROM_CPU_POINT_X_SHIFT						0
+#define   NV10_TEXTURE_FROM_CPU_POINT_X_MASK						0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_POINT_Y_SHIFT						16
+#define   NV10_TEXTURE_FROM_CPU_POINT_Y_MASK						0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_SIZE							0x00000308	/* W in bits 15:0, H in bits 31:16 */
+#define   NV10_TEXTURE_FROM_CPU_SIZE_W_SHIFT						0
+#define   NV10_TEXTURE_FROM_CPU_SIZE_W_MASK						0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_SIZE_H_SHIFT						16
+#define   NV10_TEXTURE_FROM_CPU_SIZE_H_MASK						0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL						0x0000030c	/* X in bits 15:0, W in bits 31:16 */
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_X_SHIFT					0
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_X_MASK					0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_W_SHIFT					16
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_W_MASK					0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL						0x00000310	/* Y in bits 15:0, H in bits 31:16 */
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_Y_SHIFT					0
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_Y_MASK					0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_H_SHIFT					16
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_H_MASK					0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_COLOR(x)							(0x00000400+((x)*4))	/* offset of entry x: base 0x400, stride 4 */
+#define  NV10_TEXTURE_FROM_CPU_COLOR__SIZE						0x00000700	/* NOTE(review): presumably the number of valid x values for COLOR(x) -- confirm */
+
+
+#define NV30_TEXTURE_FROM_CPU								0x0000037b
+/* No NV30_TEXTURE_FROM_CPU_* offsets are defined; the low byte 0x7b equals NV10_TEXTURE_FROM_CPU. NOTE(review): presumably the NV10 offsets apply -- confirm. */
+
+
+#define NV40_TEXTURE_FROM_CPU								0x0000307b
+/* No NV40_TEXTURE_FROM_CPU_* offsets are defined; the low byte 0x7b equals NV10_TEXTURE_FROM_CPU. NOTE(review): presumably the NV10 offsets apply -- confirm. */
+
+
+#define NV10_VIDEO_DISPLAY								0x0000007c
+/* No NV10_VIDEO_DISPLAY_* offsets are defined; only the id above exists. */
+
+
+#define NV20TCL										0x00000097
+
+#define  NV20TCL_NOP									0x00000100
+#define  NV20TCL_NOTIFY									0x00000104
+#define  NV20TCL_DMA_NOTIFY								0x00000180
+#define  NV20TCL_DMA_TEXTURE0								0x00000184
+#define  NV20TCL_DMA_TEXTURE1								0x00000188
+#define  NV20TCL_DMA_COLOR								0x00000194
+#define  NV20TCL_DMA_ZETA								0x00000198
+#define  NV20TCL_DMA_VTXBUF0								0x0000019c
+#define  NV20TCL_DMA_VTXBUF1								0x000001a0
+#define  NV20TCL_DMA_FENCE								0x000001a4
+#define  NV20TCL_DMA_QUERY								0x000001a8
+#define  NV20TCL_RT_HORIZ								0x00000200
+#define   NV20TCL_RT_HORIZ_X_SHIFT							0
+#define   NV20TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define   NV20TCL_RT_HORIZ_W_SHIFT							16
+#define   NV20TCL_RT_HORIZ_W_MASK							0xffff0000
+#define  NV20TCL_RT_VERT								0x00000204
+#define   NV20TCL_RT_VERT_Y_SHIFT							0
+#define   NV20TCL_RT_VERT_Y_MASK							0x0000ffff
+#define   NV20TCL_RT_VERT_H_SHIFT							16
+#define   NV20TCL_RT_VERT_H_MASK							0xffff0000
+#define  NV20TCL_RT_FORMAT								0x00000208
+#define   NV20TCL_RT_FORMAT_TYPE_SHIFT							8
+#define   NV20TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV20TCL_RT_FORMAT_TYPE_LINEAR						0x00000100
+#define    NV20TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV20TCL_RT_FORMAT_COLOR_SHIFT							0
+#define   NV20TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV20TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV20TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV20TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV20TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV20TCL_RT_FORMAT_COLOR_UNKNOWN						0x0000000d
+#define    NV20TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV20TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV20TCL_RT_PITCH								0x0000020c
+#define   NV20TCL_RT_PITCH_COLOR_PITCH_SHIFT						0
+#define   NV20TCL_RT_PITCH_COLOR_PITCH_MASK						0x0000ffff
+#define   NV20TCL_RT_PITCH_ZETA_PITCH_SHIFT						16
+#define   NV20TCL_RT_PITCH_ZETA_PITCH_MASK						0xffff0000
+#define  NV20TCL_COLOR_OFFSET								0x00000210
+#define  NV20TCL_ZETA_OFFSET								0x00000214
+#define  NV20TCL_RC_IN_ALPHA(x)								(0x00000260+((x)*4))
+#define  NV20TCL_RC_IN_ALPHA__SIZE							0x00000008
+#define   NV20TCL_RC_IN_ALPHA_D_INPUT_SHIFT						0
+#define   NV20TCL_RC_IN_ALPHA_D_INPUT_MASK						0x0000000f
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_FOG						0x00000003
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0						0x00000008
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1						0x00000009
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_SPARE0						0x0000000c
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_SPARE1						0x0000000d
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F					0x0000000f
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_TEXTURE2						0x0000000a
+#define    NV20TCL_RC_IN_ALPHA_D_INPUT_TEXTURE3						0x0000000b
+#define   NV20TCL_RC_IN_ALPHA_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV20TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV20TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV20TCL_RC_IN_ALPHA_D_MAPPING_SHIFT						5
+#define   NV20TCL_RC_IN_ALPHA_D_MAPPING_MASK						0x000000e0
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT				0x00000020
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL				0x00000080
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE				0x000000a0
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY				0x000000c0
+#define    NV20TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV20TCL_RC_IN_ALPHA_C_INPUT_SHIFT						8
+#define   NV20TCL_RC_IN_ALPHA_C_INPUT_MASK						0x00000f00
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_FOG						0x00000300
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0						0x00000800
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1						0x00000900
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_SPARE0						0x00000c00
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_SPARE1						0x00000d00
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F					0x00000f00
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_TEXTURE2						0x00000a00
+#define    NV20TCL_RC_IN_ALPHA_C_INPUT_TEXTURE3						0x00000b00
+#define   NV20TCL_RC_IN_ALPHA_C_COMPONENT_USAGE						(1 << 12)
+#define    NV20TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV20TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV20TCL_RC_IN_ALPHA_C_MAPPING_SHIFT						13
+#define   NV20TCL_RC_IN_ALPHA_C_MAPPING_MASK						0x0000e000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT				0x00002000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL				0x00008000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE				0x0000a000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY				0x0000c000
+#define    NV20TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV20TCL_RC_IN_ALPHA_B_INPUT_SHIFT						16
+#define   NV20TCL_RC_IN_ALPHA_B_INPUT_MASK						0x000f0000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_FOG						0x00030000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0						0x00080000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1						0x00090000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_SPARE0						0x000c0000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_SPARE1						0x000d0000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F					0x000f0000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_TEXTURE2						0x000a0000
+#define    NV20TCL_RC_IN_ALPHA_B_INPUT_TEXTURE3						0x000b0000
+#define   NV20TCL_RC_IN_ALPHA_B_COMPONENT_USAGE						(1 << 20)
+#define    NV20TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV20TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV20TCL_RC_IN_ALPHA_B_MAPPING_SHIFT						21
+#define   NV20TCL_RC_IN_ALPHA_B_MAPPING_MASK						0x00e00000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT				0x00200000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL				0x00800000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE				0x00a00000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY				0x00c00000
+#define    NV20TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV20TCL_RC_IN_ALPHA_A_INPUT_SHIFT						24
+#define   NV20TCL_RC_IN_ALPHA_A_INPUT_MASK						0x0f000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_FOG						0x03000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0						0x08000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1						0x09000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_SPARE0						0x0c000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_SPARE1						0x0d000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F					0x0f000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_TEXTURE2						0x0a000000
+#define    NV20TCL_RC_IN_ALPHA_A_INPUT_TEXTURE3						0x0b000000
+#define   NV20TCL_RC_IN_ALPHA_A_COMPONENT_USAGE						(1 << 28)
+#define    NV20TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV20TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV20TCL_RC_IN_ALPHA_A_MAPPING_SHIFT						29
+#define   NV20TCL_RC_IN_ALPHA_A_MAPPING_MASK						0xe0000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT				0x20000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL				0x80000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE				0xa0000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY				0xc0000000
+#define    NV20TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV20TCL_RC_FINAL0								0x00000288
+#define   NV20TCL_RC_FINAL0_D_INPUT_SHIFT						0
+#define   NV20TCL_RC_FINAL0_D_INPUT_MASK						0x0000000f
+#define    NV20TCL_RC_FINAL0_D_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV20TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV20TCL_RC_FINAL0_D_INPUT_FOG						0x00000003
+#define    NV20TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV20TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV20TCL_RC_FINAL0_D_INPUT_TEXTURE0						0x00000008
+#define    NV20TCL_RC_FINAL0_D_INPUT_TEXTURE1						0x00000009
+#define    NV20TCL_RC_FINAL0_D_INPUT_SPARE0						0x0000000c
+#define    NV20TCL_RC_FINAL0_D_INPUT_SPARE1						0x0000000d
+#define    NV20TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV20TCL_RC_FINAL0_D_INPUT_E_TIMES_F						0x0000000f
+#define    NV20TCL_RC_FINAL0_D_INPUT_TEXTURE2						0x0000000a
+#define    NV20TCL_RC_FINAL0_D_INPUT_TEXTURE3						0x0000000b
+#define   NV20TCL_RC_FINAL0_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV20TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV20TCL_RC_FINAL0_D_MAPPING_SHIFT						5
+#define   NV20TCL_RC_FINAL0_D_MAPPING_MASK						0x000000e0
+#define    NV20TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT					0x00000020
+#define    NV20TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV20TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV20TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL					0x00000080
+#define    NV20TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE					0x000000a0
+#define    NV20TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY					0x000000c0
+#define    NV20TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV20TCL_RC_FINAL0_C_INPUT_SHIFT						8
+#define   NV20TCL_RC_FINAL0_C_INPUT_MASK						0x00000f00
+#define    NV20TCL_RC_FINAL0_C_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV20TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV20TCL_RC_FINAL0_C_INPUT_FOG						0x00000300
+#define    NV20TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV20TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV20TCL_RC_FINAL0_C_INPUT_TEXTURE0						0x00000800
+#define    NV20TCL_RC_FINAL0_C_INPUT_TEXTURE1						0x00000900
+#define    NV20TCL_RC_FINAL0_C_INPUT_SPARE0						0x00000c00
+#define    NV20TCL_RC_FINAL0_C_INPUT_SPARE1						0x00000d00
+#define    NV20TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV20TCL_RC_FINAL0_C_INPUT_E_TIMES_F						0x00000f00
+#define    NV20TCL_RC_FINAL0_C_INPUT_TEXTURE2						0x00000a00
+#define    NV20TCL_RC_FINAL0_C_INPUT_TEXTURE3						0x00000b00
+#define   NV20TCL_RC_FINAL0_C_COMPONENT_USAGE						(1 << 12)
+#define    NV20TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV20TCL_RC_FINAL0_C_MAPPING_SHIFT						13
+#define   NV20TCL_RC_FINAL0_C_MAPPING_MASK						0x0000e000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV20TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV20TCL_RC_FINAL0_B_INPUT_SHIFT						16
+#define   NV20TCL_RC_FINAL0_B_INPUT_MASK						0x000f0000
+#define    NV20TCL_RC_FINAL0_B_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV20TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV20TCL_RC_FINAL0_B_INPUT_FOG						0x00030000
+#define    NV20TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV20TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV20TCL_RC_FINAL0_B_INPUT_TEXTURE0						0x00080000
+#define    NV20TCL_RC_FINAL0_B_INPUT_TEXTURE1						0x00090000
+#define    NV20TCL_RC_FINAL0_B_INPUT_SPARE0						0x000c0000
+#define    NV20TCL_RC_FINAL0_B_INPUT_SPARE1						0x000d0000
+#define    NV20TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV20TCL_RC_FINAL0_B_INPUT_E_TIMES_F						0x000f0000
+#define    NV20TCL_RC_FINAL0_B_INPUT_TEXTURE2						0x000a0000
+#define    NV20TCL_RC_FINAL0_B_INPUT_TEXTURE3						0x000b0000
+#define   NV20TCL_RC_FINAL0_B_COMPONENT_USAGE						(1 << 20)
+#define    NV20TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV20TCL_RC_FINAL0_B_MAPPING_SHIFT						21
+#define   NV20TCL_RC_FINAL0_B_MAPPING_MASK						0x00e00000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV20TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV20TCL_RC_FINAL0_A_INPUT_SHIFT						24
+#define   NV20TCL_RC_FINAL0_A_INPUT_MASK						0x0f000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_FOG						0x03000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_TEXTURE0						0x08000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_TEXTURE1						0x09000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_SPARE0						0x0c000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_SPARE1						0x0d000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_E_TIMES_F						0x0f000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_TEXTURE2						0x0a000000
+#define    NV20TCL_RC_FINAL0_A_INPUT_TEXTURE3						0x0b000000
+#define   NV20TCL_RC_FINAL0_A_COMPONENT_USAGE						(1 << 28)
+#define    NV20TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV20TCL_RC_FINAL0_A_MAPPING_SHIFT						29
+#define   NV20TCL_RC_FINAL0_A_MAPPING_MASK						0xe0000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV20TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV20TCL_RC_FINAL1								0x0000028c
+#define   NV20TCL_RC_FINAL1_COLOR_SUM_CLAMP						(1 <<  7)
+#define   NV20TCL_RC_FINAL1_G_INPUT_SHIFT						8
+#define   NV20TCL_RC_FINAL1_G_INPUT_MASK						0x00000f00
+#define    NV20TCL_RC_FINAL1_G_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV20TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV20TCL_RC_FINAL1_G_INPUT_FOG						0x00000300
+#define    NV20TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV20TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV20TCL_RC_FINAL1_G_INPUT_TEXTURE0						0x00000800
+#define    NV20TCL_RC_FINAL1_G_INPUT_TEXTURE1						0x00000900
+#define    NV20TCL_RC_FINAL1_G_INPUT_SPARE0						0x00000c00
+#define    NV20TCL_RC_FINAL1_G_INPUT_SPARE1						0x00000d00
+#define    NV20TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV20TCL_RC_FINAL1_G_INPUT_E_TIMES_F						0x00000f00
+#define    NV20TCL_RC_FINAL1_G_INPUT_TEXTURE2						0x00000a00
+#define    NV20TCL_RC_FINAL1_G_INPUT_TEXTURE3						0x00000b00
+#define   NV20TCL_RC_FINAL1_G_COMPONENT_USAGE						(1 << 12)
+#define    NV20TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV20TCL_RC_FINAL1_G_MAPPING_SHIFT						13
+#define   NV20TCL_RC_FINAL1_G_MAPPING_MASK						0x0000e000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV20TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV20TCL_RC_FINAL1_F_INPUT_SHIFT						16
+#define   NV20TCL_RC_FINAL1_F_INPUT_MASK						0x000f0000
+#define    NV20TCL_RC_FINAL1_F_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV20TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV20TCL_RC_FINAL1_F_INPUT_FOG						0x00030000
+#define    NV20TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV20TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV20TCL_RC_FINAL1_F_INPUT_TEXTURE0						0x00080000
+#define    NV20TCL_RC_FINAL1_F_INPUT_TEXTURE1						0x00090000
+#define    NV20TCL_RC_FINAL1_F_INPUT_SPARE0						0x000c0000
+#define    NV20TCL_RC_FINAL1_F_INPUT_SPARE1						0x000d0000
+#define    NV20TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV20TCL_RC_FINAL1_F_INPUT_E_TIMES_F						0x000f0000
+#define    NV20TCL_RC_FINAL1_F_INPUT_TEXTURE2						0x000a0000
+#define    NV20TCL_RC_FINAL1_F_INPUT_TEXTURE3						0x000b0000
+#define   NV20TCL_RC_FINAL1_F_COMPONENT_USAGE						(1 << 20)
+#define    NV20TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV20TCL_RC_FINAL1_F_MAPPING_SHIFT						21
+#define   NV20TCL_RC_FINAL1_F_MAPPING_MASK						0x00e00000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV20TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV20TCL_RC_FINAL1_E_INPUT_SHIFT						24
+#define   NV20TCL_RC_FINAL1_E_INPUT_MASK						0x0f000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_FOG						0x03000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_TEXTURE0						0x08000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_TEXTURE1						0x09000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_SPARE0						0x0c000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_SPARE1						0x0d000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_E_TIMES_F						0x0f000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_TEXTURE2						0x0a000000
+#define    NV20TCL_RC_FINAL1_E_INPUT_TEXTURE3						0x0b000000
+#define   NV20TCL_RC_FINAL1_E_COMPONENT_USAGE						(1 << 28)
+#define    NV20TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV20TCL_RC_FINAL1_E_MAPPING_SHIFT						29
+#define   NV20TCL_RC_FINAL1_E_MAPPING_MASK						0xe0000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV20TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV20TCL_LIGHT_MODEL								0x00000294	/* VIEWER field in bits 17:16, SEPARATE_SPECULAR flag in bit 0 */
+#define   NV20TCL_LIGHT_MODEL_VIEWER_SHIFT						16
+#define   NV20TCL_LIGHT_MODEL_VIEWER_MASK						0x00030000
+#define    NV20TCL_LIGHT_MODEL_VIEWER_NONLOCAL						0x00020000	/* field value 2, already shifted into place */
+#define    NV20TCL_LIGHT_MODEL_VIEWER_LOCAL						0x00030000	/* field value 3, already shifted into place */
+#define   NV20TCL_LIGHT_MODEL_SEPARATE_SPECULAR						(1 <<  0)	/* listed after VIEWER although it is the lower bit */
+#define  NV20TCL_COLOR_MATERIAL								0x00000298
+#define   NV20TCL_COLOR_MATERIAL_FRONT_EMISSION_SHIFT					0
+#define   NV20TCL_COLOR_MATERIAL_FRONT_EMISSION_MASK					0x00000003
+#define    NV20TCL_COLOR_MATERIAL_FRONT_EMISSION_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_FRONT_EMISSION_COL1					0x00000001
+#define    NV20TCL_COLOR_MATERIAL_FRONT_EMISSION_COL2					0x00000002
+#define   NV20TCL_COLOR_MATERIAL_FRONT_AMBIENT_SHIFT					2
+#define   NV20TCL_COLOR_MATERIAL_FRONT_AMBIENT_MASK					0x0000000c
+#define    NV20TCL_COLOR_MATERIAL_FRONT_AMBIENT_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_FRONT_AMBIENT_COL1					0x00000004
+#define    NV20TCL_COLOR_MATERIAL_FRONT_AMBIENT_COL2					0x00000008
+#define   NV20TCL_COLOR_MATERIAL_FRONT_DIFFUSE_SHIFT					4
+#define   NV20TCL_COLOR_MATERIAL_FRONT_DIFFUSE_MASK					0x00000030
+#define    NV20TCL_COLOR_MATERIAL_FRONT_DIFFUSE_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_FRONT_DIFFUSE_COL1					0x00000010
+#define    NV20TCL_COLOR_MATERIAL_FRONT_DIFFUSE_COL2					0x00000020
+#define   NV20TCL_COLOR_MATERIAL_FRONT_SPECULAR_SHIFT					6
+#define   NV20TCL_COLOR_MATERIAL_FRONT_SPECULAR_MASK					0x000000c0
+#define    NV20TCL_COLOR_MATERIAL_FRONT_SPECULAR_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_FRONT_SPECULAR_COL1					0x00000040
+#define    NV20TCL_COLOR_MATERIAL_FRONT_SPECULAR_COL2					0x00000080
+#define   NV20TCL_COLOR_MATERIAL_BACK_EMISSION_SHIFT					8
+#define   NV20TCL_COLOR_MATERIAL_BACK_EMISSION_MASK					0x00000300
+#define    NV20TCL_COLOR_MATERIAL_BACK_EMISSION_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_BACK_EMISSION_COL1					0x00000100
+#define    NV20TCL_COLOR_MATERIAL_BACK_EMISSION_COL2					0x00000200
+#define   NV20TCL_COLOR_MATERIAL_BACK_AMBIENT_SHIFT					10
+#define   NV20TCL_COLOR_MATERIAL_BACK_AMBIENT_MASK					0x00000c00
+#define    NV20TCL_COLOR_MATERIAL_BACK_AMBIENT_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_BACK_AMBIENT_COL1					0x00000400
+#define    NV20TCL_COLOR_MATERIAL_BACK_AMBIENT_COL2					0x00000800
+#define   NV20TCL_COLOR_MATERIAL_BACK_DIFFUSE_SHIFT					12
+#define   NV20TCL_COLOR_MATERIAL_BACK_DIFFUSE_MASK					0x00003000
+#define    NV20TCL_COLOR_MATERIAL_BACK_DIFFUSE_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_BACK_DIFFUSE_COL1					0x00001000
+#define    NV20TCL_COLOR_MATERIAL_BACK_DIFFUSE_COL2					0x00002000
+#define   NV20TCL_COLOR_MATERIAL_BACK_SPECULAR_SHIFT					14
+#define   NV20TCL_COLOR_MATERIAL_BACK_SPECULAR_MASK					0x0000c000
+#define    NV20TCL_COLOR_MATERIAL_BACK_SPECULAR_OFF					0x00000000
+#define    NV20TCL_COLOR_MATERIAL_BACK_SPECULAR_COL1					0x00004000
+#define    NV20TCL_COLOR_MATERIAL_BACK_SPECULAR_COL2					0x00008000
+#define  NV20TCL_FOG_MODE								0x0000029c /* offset 0x29c; the indented defines below are its named values */
+#define   NV20TCL_FOG_MODE_LINEAR_UNSIGNED						0x00000804
+#define   NV20TCL_FOG_MODE_LINEAR_SIGNED						0x00002601 /* same number as OpenGL GL_LINEAR; the only value outside 0x080x */
+#define   NV20TCL_FOG_MODE_EXP_UNSIGNED							0x00000802
+#define   NV20TCL_FOG_MODE_EXP_SIGNED							0x00000800 /* same number as OpenGL GL_EXP */
+#define   NV20TCL_FOG_MODE_EXP2_UNSIGNED						0x00000803
+#define   NV20TCL_FOG_MODE_EXP2_SIGNED							0x00000801 /* same number as OpenGL GL_EXP2 */
+#define  NV20TCL_FOG_COORD								0x000002a0 /* offset 0x2a0; named values follow (1, 2, 3 and 6 - 0, 4 and 5 have no name here) */
+#define   NV20TCL_FOG_COORD_DIST_RADIAL							0x00000001
+#define   NV20TCL_FOG_COORD_DIST_ORTHOGONAL						0x00000002
+#define   NV20TCL_FOG_COORD_DIST_ORTHOGONAL_ABS						0x00000003
+#define   NV20TCL_FOG_COORD_FOG								0x00000006
+#define  NV20TCL_FOG_ENABLE								0x000002a4
+#define  NV20TCL_FOG_COLOR								0x000002a8 /* four packed 8-bit channels; R is the low byte here, whereas BLEND_COLOR puts B in the low byte */
+#define   NV20TCL_FOG_COLOR_R_SHIFT							0
+#define   NV20TCL_FOG_COLOR_R_MASK							0x000000ff
+#define   NV20TCL_FOG_COLOR_G_SHIFT							8
+#define   NV20TCL_FOG_COLOR_G_MASK							0x0000ff00
+#define   NV20TCL_FOG_COLOR_B_SHIFT							16
+#define   NV20TCL_FOG_COLOR_B_MASK							0x00ff0000
+#define   NV20TCL_FOG_COLOR_A_SHIFT							24
+#define   NV20TCL_FOG_COLOR_A_MASK							0xff000000
+#define  NV20TCL_VIEWPORT_CLIP_MODE							0x000002b4
+#define  NV20TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*4)) /* indexed offset, 4-byte stride */
+#define  NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008 /* 8 entries * 4 bytes = 0x20, exactly the gap to VIEWPORT_CLIP_VERT, so __SIZE reads as an element count */
+#define  NV20TCL_VIEWPORT_CLIP_VERT(x)							(0x000002e0+((x)*4)) /* indexed offset, 4-byte stride */
+#define  NV20TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define  NV20TCL_ALPHA_FUNC_ENABLE							0x00000300 /* start of a run of *_ENABLE offsets spaced 4 bytes apart; presumably each takes 0/1 - confirm */
+#define  NV20TCL_BLEND_FUNC_ENABLE							0x00000304
+#define  NV20TCL_CULL_FACE_ENABLE							0x00000308
+#define  NV20TCL_DEPTH_TEST_ENABLE							0x0000030c
+#define  NV20TCL_DITHER_ENABLE								0x00000310
+#define  NV20TCL_LIGHTING_ENABLE							0x00000314
+#define  NV20TCL_POINT_PARAMETERS_ENABLE						0x00000318
+#define  NV20TCL_POINT_SMOOTH_ENABLE							0x0000031c
+#define  NV20TCL_LINE_SMOOTH_ENABLE							0x00000320
+#define  NV20TCL_POLYGON_SMOOTH_ENABLE							0x00000324
+#define  NV20TCL_STENCIL_ENABLE								0x0000032c /* note: offset 0x328 has no name in this list */
+#define  NV20TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000330
+#define  NV20TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000334
+#define  NV20TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000338
+#define  NV20TCL_ALPHA_FUNC_FUNC							0x0000033c /* comparison selector; values 0x200..0x207 are numerically the OpenGL GL_NEVER..GL_ALWAYS enumerants */
+#define   NV20TCL_ALPHA_FUNC_FUNC_NEVER							0x00000200
+#define   NV20TCL_ALPHA_FUNC_FUNC_LESS							0x00000201
+#define   NV20TCL_ALPHA_FUNC_FUNC_EQUAL							0x00000202
+#define   NV20TCL_ALPHA_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV20TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV20TCL_ALPHA_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV20TCL_ALPHA_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV20TCL_ALPHA_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV20TCL_ALPHA_FUNC_REF								0x00000340 /* NOTE(review): value format/range is not shown in this header - confirm before use */
+#define  NV20TCL_BLEND_FUNC_SRC								0x00000344 /* blend factor selector; values are numerically the OpenGL blend-factor enumerants (GL_ZERO, GL_ONE, GL_SRC_COLOR, ...) */
+#define   NV20TCL_BLEND_FUNC_SRC_ZERO							0x00000000
+#define   NV20TCL_BLEND_FUNC_SRC_ONE							0x00000001
+#define   NV20TCL_BLEND_FUNC_SRC_SRC_COLOR						0x00000300
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV20TCL_BLEND_FUNC_SRC_SRC_ALPHA						0x00000302
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV20TCL_BLEND_FUNC_SRC_DST_ALPHA						0x00000304
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV20TCL_BLEND_FUNC_SRC_DST_COLOR						0x00000306
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV20TCL_BLEND_FUNC_SRC_SRC_ALPHA_SATURATE					0x00000308
+#define   NV20TCL_BLEND_FUNC_SRC_CONSTANT_COLOR						0x00008001
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV20TCL_BLEND_FUNC_SRC_CONSTANT_ALPHA						0x00008003
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV20TCL_BLEND_FUNC_DST								0x00000348 /* same value list as BLEND_FUNC_SRC, under the _DST_ prefix */
+#define   NV20TCL_BLEND_FUNC_DST_ZERO							0x00000000
+#define   NV20TCL_BLEND_FUNC_DST_ONE							0x00000001
+#define   NV20TCL_BLEND_FUNC_DST_SRC_COLOR						0x00000300
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV20TCL_BLEND_FUNC_DST_SRC_ALPHA						0x00000302
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV20TCL_BLEND_FUNC_DST_DST_ALPHA						0x00000304
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV20TCL_BLEND_FUNC_DST_DST_COLOR						0x00000306
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV20TCL_BLEND_FUNC_DST_SRC_ALPHA_SATURATE					0x00000308
+#define   NV20TCL_BLEND_FUNC_DST_CONSTANT_COLOR						0x00008001
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV20TCL_BLEND_FUNC_DST_CONSTANT_ALPHA						0x00008003
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV20TCL_BLEND_COLOR								0x0000034c /* four packed 8-bit channels, B in the low byte and A in the high byte */
+#define   NV20TCL_BLEND_COLOR_B_SHIFT							0
+#define   NV20TCL_BLEND_COLOR_B_MASK							0x000000ff
+#define   NV20TCL_BLEND_COLOR_G_SHIFT							8
+#define   NV20TCL_BLEND_COLOR_G_MASK							0x0000ff00
+#define   NV20TCL_BLEND_COLOR_R_SHIFT							16
+#define   NV20TCL_BLEND_COLOR_R_MASK							0x00ff0000
+#define   NV20TCL_BLEND_COLOR_A_SHIFT							24
+#define   NV20TCL_BLEND_COLOR_A_MASK							0xff000000
+#define  NV20TCL_BLEND_EQUATION								0x00000350 /* values are numerically OpenGL GL_FUNC_ADD, GL_MIN, GL_MAX, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT */
+#define   NV20TCL_BLEND_EQUATION_FUNC_ADD						0x00008006
+#define   NV20TCL_BLEND_EQUATION_MIN							0x00008007
+#define   NV20TCL_BLEND_EQUATION_MAX							0x00008008
+#define   NV20TCL_BLEND_EQUATION_FUNC_SUBTRACT						0x0000800a /* 0x8009 is intentionally absent from this list */
+#define   NV20TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT					0x0000800b
+#define  NV20TCL_DEPTH_FUNC								0x00000354 /* comparison selector; same 0x200..0x207 value set as ALPHA_FUNC_FUNC (numerically GL_NEVER..GL_ALWAYS) */
+#define   NV20TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV20TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV20TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV20TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV20TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV20TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV20TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV20TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV20TCL_COLOR_MASK								0x00000358 /* one flag bit per channel, at the lowest bit of each byte: B=bit 0, G=bit 8, R=bit 16, A=bit 24 */
+#define   NV20TCL_COLOR_MASK_B								(1 <<  0)
+#define   NV20TCL_COLOR_MASK_G								(1 <<  8)
+#define   NV20TCL_COLOR_MASK_R								(1 << 16)
+#define   NV20TCL_COLOR_MASK_A								(1 << 24)
+#define  NV20TCL_DEPTH_WRITE_ENABLE							0x0000035c
+#define  NV20TCL_STENCIL_MASK								0x00000360 /* distinct offset from STENCIL_FUNC_MASK (0x36c) below - do not confuse the two */
+#define  NV20TCL_STENCIL_FUNC_FUNC							0x00000364 /* comparison selector; same 0x200..0x207 value set as ALPHA_FUNC_FUNC / DEPTH_FUNC */
+#define   NV20TCL_STENCIL_FUNC_FUNC_NEVER						0x00000200
+#define   NV20TCL_STENCIL_FUNC_FUNC_LESS						0x00000201
+#define   NV20TCL_STENCIL_FUNC_FUNC_EQUAL						0x00000202
+#define   NV20TCL_STENCIL_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV20TCL_STENCIL_FUNC_FUNC_GREATER						0x00000204
+#define   NV20TCL_STENCIL_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV20TCL_STENCIL_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV20TCL_STENCIL_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV20TCL_STENCIL_FUNC_REF							0x00000368
+#define  NV20TCL_STENCIL_FUNC_MASK							0x0000036c
+#define  NV20TCL_STENCIL_OP_FAIL							0x00000370 /* FAIL/ZFAIL/ZPASS share one value list, numerically OpenGL GL_ZERO, GL_INVERT, GL_KEEP, GL_REPLACE, GL_INCR, GL_DECR, GL_INCR_WRAP, GL_DECR_WRAP */
+#define   NV20TCL_STENCIL_OP_FAIL_ZERO							0x00000000
+#define   NV20TCL_STENCIL_OP_FAIL_INVERT						0x0000150a
+#define   NV20TCL_STENCIL_OP_FAIL_KEEP							0x00001e00
+#define   NV20TCL_STENCIL_OP_FAIL_REPLACE						0x00001e01
+#define   NV20TCL_STENCIL_OP_FAIL_INCR							0x00001e02
+#define   NV20TCL_STENCIL_OP_FAIL_DECR							0x00001e03
+#define   NV20TCL_STENCIL_OP_FAIL_INCR_WRAP						0x00008507
+#define   NV20TCL_STENCIL_OP_FAIL_DECR_WRAP						0x00008508
+#define  NV20TCL_STENCIL_OP_ZFAIL							0x00000374 /* same value list as STENCIL_OP_FAIL */
+#define   NV20TCL_STENCIL_OP_ZFAIL_ZERO							0x00000000
+#define   NV20TCL_STENCIL_OP_ZFAIL_INVERT						0x0000150a
+#define   NV20TCL_STENCIL_OP_ZFAIL_KEEP							0x00001e00
+#define   NV20TCL_STENCIL_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV20TCL_STENCIL_OP_ZFAIL_INCR							0x00001e02
+#define   NV20TCL_STENCIL_OP_ZFAIL_DECR							0x00001e03
+#define   NV20TCL_STENCIL_OP_ZFAIL_INCR_WRAP						0x00008507
+#define   NV20TCL_STENCIL_OP_ZFAIL_DECR_WRAP						0x00008508
+#define  NV20TCL_STENCIL_OP_ZPASS							0x00000378 /* same value list as STENCIL_OP_FAIL */
+#define   NV20TCL_STENCIL_OP_ZPASS_ZERO							0x00000000
+#define   NV20TCL_STENCIL_OP_ZPASS_INVERT						0x0000150a
+#define   NV20TCL_STENCIL_OP_ZPASS_KEEP							0x00001e00
+#define   NV20TCL_STENCIL_OP_ZPASS_REPLACE						0x00001e01
+#define   NV20TCL_STENCIL_OP_ZPASS_INCR							0x00001e02
+#define   NV20TCL_STENCIL_OP_ZPASS_DECR							0x00001e03
+#define   NV20TCL_STENCIL_OP_ZPASS_INCR_WRAP						0x00008507
+#define   NV20TCL_STENCIL_OP_ZPASS_DECR_WRAP						0x00008508
+#define  NV20TCL_SHADE_MODEL								0x0000037c /* values are numerically OpenGL GL_FLAT / GL_SMOOTH */
+#define   NV20TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV20TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV20TCL_LINE_WIDTH								0x00000380 /* NOTE(review): value encoding (integer/fixed/float) is not shown in this header - confirm */
+#define  NV20TCL_POLYGON_OFFSET_FACTOR							0x00000384
+#define  NV20TCL_POLYGON_OFFSET_UNITS							0x00000388
+#define  NV20TCL_POLYGON_MODE_FRONT							0x0000038c /* FRONT and BACK share one value list, numerically OpenGL GL_POINT / GL_LINE / GL_FILL */
+#define   NV20TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV20TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV20TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV20TCL_POLYGON_MODE_BACK							0x00000390
+#define   NV20TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV20TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV20TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV20TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV20TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV20TCL_CULL_FACE								0x0000039c /* values are numerically OpenGL GL_FRONT / GL_BACK / GL_FRONT_AND_BACK; separate from CULL_FACE_ENABLE (0x308) */
+#define   NV20TCL_CULL_FACE_FRONT							0x00000404
+#define   NV20TCL_CULL_FACE_BACK							0x00000405
+#define   NV20TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV20TCL_FRONT_FACE								0x000003a0 /* values are numerically OpenGL GL_CW / GL_CCW */
+#define   NV20TCL_FRONT_FACE_CW								0x00000900
+#define   NV20TCL_FRONT_FACE_CCW							0x00000901
+#define  NV20TCL_NORMALIZE_ENABLE							0x000003a4
+#define  NV20TCL_MATERIAL_FACTOR_FRONT_R						0x000003a8 /* R, G, B, A occupy four consecutive 4-byte offsets (0x3a8..0x3b4) */
+#define  NV20TCL_MATERIAL_FACTOR_FRONT_G						0x000003ac
+#define  NV20TCL_MATERIAL_FACTOR_FRONT_B						0x000003b0
+#define  NV20TCL_MATERIAL_FACTOR_FRONT_A						0x000003b4
+#define  NV20TCL_SEPARATE_SPECULAR_ENABLE						0x000003b8
+#define  NV20TCL_ENABLED_LIGHTS								0x000003bc /* eight 2-bit fields (lights 0..7) in bits 0..15; light n sits at shift 2*n */
+#define   NV20TCL_ENABLED_LIGHTS_0_SHIFT						0
+#define   NV20TCL_ENABLED_LIGHTS_0_MASK							0x00000003
+#define    NV20TCL_ENABLED_LIGHTS_0_DISABLED						0x00000000 /* per-light values are already shifted into place: OR them together, do not shift again */
+#define    NV20TCL_ENABLED_LIGHTS_0_NONPOSITIONAL					0x00000001
+#define    NV20TCL_ENABLED_LIGHTS_0_POSITIONAL						0x00000002
+#define    NV20TCL_ENABLED_LIGHTS_0_DIRECTIONAL						0x00000003
+#define   NV20TCL_ENABLED_LIGHTS_1_SHIFT						2
+#define   NV20TCL_ENABLED_LIGHTS_1_MASK							0x0000000c
+#define    NV20TCL_ENABLED_LIGHTS_1_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_1_NONPOSITIONAL					0x00000004
+#define    NV20TCL_ENABLED_LIGHTS_1_POSITIONAL						0x00000008
+#define    NV20TCL_ENABLED_LIGHTS_1_DIRECTIONAL						0x0000000c
+#define   NV20TCL_ENABLED_LIGHTS_2_SHIFT						4
+#define   NV20TCL_ENABLED_LIGHTS_2_MASK							0x00000030
+#define    NV20TCL_ENABLED_LIGHTS_2_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_2_NONPOSITIONAL					0x00000010
+#define    NV20TCL_ENABLED_LIGHTS_2_POSITIONAL						0x00000020
+#define    NV20TCL_ENABLED_LIGHTS_2_DIRECTIONAL						0x00000030
+#define   NV20TCL_ENABLED_LIGHTS_3_SHIFT						6
+#define   NV20TCL_ENABLED_LIGHTS_3_MASK							0x000000c0
+#define    NV20TCL_ENABLED_LIGHTS_3_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_3_NONPOSITIONAL					0x00000040
+#define    NV20TCL_ENABLED_LIGHTS_3_POSITIONAL						0x00000080
+#define    NV20TCL_ENABLED_LIGHTS_3_DIRECTIONAL						0x000000c0
+#define   NV20TCL_ENABLED_LIGHTS_4_SHIFT						8
+#define   NV20TCL_ENABLED_LIGHTS_4_MASK							0x00000300
+#define    NV20TCL_ENABLED_LIGHTS_4_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_4_NONPOSITIONAL					0x00000100
+#define    NV20TCL_ENABLED_LIGHTS_4_POSITIONAL						0x00000200
+#define    NV20TCL_ENABLED_LIGHTS_4_DIRECTIONAL						0x00000300
+#define   NV20TCL_ENABLED_LIGHTS_5_SHIFT						10
+#define   NV20TCL_ENABLED_LIGHTS_5_MASK							0x00000c00
+#define    NV20TCL_ENABLED_LIGHTS_5_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_5_NONPOSITIONAL					0x00000400
+#define    NV20TCL_ENABLED_LIGHTS_5_POSITIONAL						0x00000800
+#define    NV20TCL_ENABLED_LIGHTS_5_DIRECTIONAL						0x00000c00
+#define   NV20TCL_ENABLED_LIGHTS_6_SHIFT						12
+#define   NV20TCL_ENABLED_LIGHTS_6_MASK							0x00003000
+#define    NV20TCL_ENABLED_LIGHTS_6_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_6_NONPOSITIONAL					0x00001000
+#define    NV20TCL_ENABLED_LIGHTS_6_POSITIONAL						0x00002000
+#define    NV20TCL_ENABLED_LIGHTS_6_DIRECTIONAL						0x00003000
+#define   NV20TCL_ENABLED_LIGHTS_7_SHIFT						14
+#define   NV20TCL_ENABLED_LIGHTS_7_MASK							0x0000c000
+#define    NV20TCL_ENABLED_LIGHTS_7_DISABLED						0x00000000
+#define    NV20TCL_ENABLED_LIGHTS_7_NONPOSITIONAL					0x00004000
+#define    NV20TCL_ENABLED_LIGHTS_7_POSITIONAL						0x00008000
+#define    NV20TCL_ENABLED_LIGHTS_7_DIRECTIONAL						0x0000c000
+#define  NV20TCL_TX_GEN_MODE_S(x)							(0x000003c0+((x)*16)) /* x strides by 16 bytes; S, T, R, Q sit at +0, +4, +8, +12 within each 16-byte group */
+#define  NV20TCL_TX_GEN_MODE_S__SIZE							0x00000004 /* 4 values of x (presumably one per texture unit - confirm) */
+#define   NV20TCL_TX_GEN_MODE_S_FALSE							0x00000000 /* non-zero modes are numerically OpenGL GL_EYE_LINEAR, GL_OBJECT_LINEAR, GL_SPHERE_MAP, GL_NORMAL_MAP, GL_REFLECTION_MAP */
+#define   NV20TCL_TX_GEN_MODE_S_EYE_LINEAR						0x00002400
+#define   NV20TCL_TX_GEN_MODE_S_OBJECT_LINEAR						0x00002401
+#define   NV20TCL_TX_GEN_MODE_S_SPHERE_MAP						0x00002402
+#define   NV20TCL_TX_GEN_MODE_S_NORMAL_MAP						0x00008511
+#define   NV20TCL_TX_GEN_MODE_S_REFLECTION_MAP						0x00008512
+#define  NV20TCL_TX_GEN_MODE_T(x)							(0x000003c4+((x)*16))
+#define  NV20TCL_TX_GEN_MODE_T__SIZE							0x00000004
+#define   NV20TCL_TX_GEN_MODE_T_FALSE							0x00000000
+#define   NV20TCL_TX_GEN_MODE_T_EYE_LINEAR						0x00002400
+#define   NV20TCL_TX_GEN_MODE_T_OBJECT_LINEAR						0x00002401
+#define   NV20TCL_TX_GEN_MODE_T_SPHERE_MAP						0x00002402
+#define   NV20TCL_TX_GEN_MODE_T_NORMAL_MAP						0x00008511
+#define   NV20TCL_TX_GEN_MODE_T_REFLECTION_MAP						0x00008512
+#define  NV20TCL_TX_GEN_MODE_R(x)							(0x000003c8+((x)*16))
+#define  NV20TCL_TX_GEN_MODE_R__SIZE							0x00000004
+#define   NV20TCL_TX_GEN_MODE_R_FALSE							0x00000000
+#define   NV20TCL_TX_GEN_MODE_R_EYE_LINEAR						0x00002400
+#define   NV20TCL_TX_GEN_MODE_R_OBJECT_LINEAR						0x00002401
+#define   NV20TCL_TX_GEN_MODE_R_SPHERE_MAP						0x00002402
+#define   NV20TCL_TX_GEN_MODE_R_NORMAL_MAP						0x00008511
+#define   NV20TCL_TX_GEN_MODE_R_REFLECTION_MAP						0x00008512
+#define  NV20TCL_TX_GEN_MODE_Q(x)							(0x000003cc+((x)*16))
+#define  NV20TCL_TX_GEN_MODE_Q__SIZE							0x00000004
+#define   NV20TCL_TX_GEN_MODE_Q_FALSE							0x00000000
+#define   NV20TCL_TX_GEN_MODE_Q_EYE_LINEAR						0x00002400
+#define   NV20TCL_TX_GEN_MODE_Q_OBJECT_LINEAR						0x00002401
+#define   NV20TCL_TX_GEN_MODE_Q_SPHERE_MAP						0x00002402
+#define   NV20TCL_TX_GEN_MODE_Q_NORMAL_MAP						0x00008511
+#define   NV20TCL_TX_GEN_MODE_Q_REFLECTION_MAP						0x00008512
+#define  NV20TCL_TX_MATRIX_ENABLE(x)							(0x00000420+((x)*4)) /* indexed offset, 4-byte stride, 4 entries */
+#define  NV20TCL_TX_MATRIX_ENABLE__SIZE							0x00000004
+#define  NV20TCL_POINT_SIZE								0x0000043c
+#define  NV20TCL_MODELVIEW0_MATRIX(x)							(0x00000480+((x)*4)) /* each matrix below is 0x10 (16) entries of 4 bytes = 0x40 bytes, which is exactly the spacing between consecutive matrix bases */
+#define  NV20TCL_MODELVIEW0_MATRIX__SIZE						0x00000010
+#define  NV20TCL_MODELVIEW1_MATRIX(x)							(0x000004c0+((x)*4))
+#define  NV20TCL_MODELVIEW1_MATRIX__SIZE						0x00000010
+#define  NV20TCL_MODELVIEW2_MATRIX(x)							(0x00000500+((x)*4))
+#define  NV20TCL_MODELVIEW2_MATRIX__SIZE						0x00000010
+#define  NV20TCL_MODELVIEW3_MATRIX(x)							(0x00000540+((x)*4))
+#define  NV20TCL_MODELVIEW3_MATRIX__SIZE						0x00000010
+#define  NV20TCL_INVERSE_MODELVIEW0_MATRIX(x)						(0x00000580+((x)*4))
+#define  NV20TCL_INVERSE_MODELVIEW0_MATRIX__SIZE					0x00000010
+#define  NV20TCL_INVERSE_MODELVIEW1_MATRIX(x)						(0x000005c0+((x)*4))
+#define  NV20TCL_INVERSE_MODELVIEW1_MATRIX__SIZE					0x00000010
+#define  NV20TCL_INVERSE_MODELVIEW2_MATRIX(x)						(0x00000600+((x)*4))
+#define  NV20TCL_INVERSE_MODELVIEW2_MATRIX__SIZE					0x00000010
+#define  NV20TCL_INVERSE_MODELVIEW3_MATRIX(x)						(0x00000640+((x)*4))
+#define  NV20TCL_INVERSE_MODELVIEW3_MATRIX__SIZE					0x00000010
+#define  NV20TCL_PROJECTION_MATRIX(x)							(0x00000680+((x)*4))
+#define  NV20TCL_PROJECTION_MATRIX__SIZE						0x00000010
+#define  NV20TCL_TX0_MATRIX(x)								(0x000006c0+((x)*4)) /* NOTE(review): element order (row- vs column-major) is not stated in this header - confirm */
+#define  NV20TCL_TX0_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX1_MATRIX(x)								(0x00000700+((x)*4))
+#define  NV20TCL_TX1_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX2_MATRIX(x)								(0x00000740+((x)*4))
+#define  NV20TCL_TX2_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX3_MATRIX(x)								(0x00000780+((x)*4))
+#define  NV20TCL_TX3_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX_GEN_COEFF_S_A(x)							(0x00000840+((x)*64)) /* x strides by 64 bytes; the 16 slots S_A..Q_D fill one 64-byte group at 4-byte steps from +0x00 to +0x3c */
+#define  NV20TCL_TX_GEN_COEFF_S_A__SIZE							0x00000004 /* 4 values of x, matching TX_GEN_MODE_*__SIZE */
+#define  NV20TCL_TX_GEN_COEFF_S_B(x)							(0x00000844+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_S_B__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_S_C(x)							(0x00000848+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_S_C__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_S_D(x)							(0x0000084c+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_S_D__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_T_A(x)							(0x00000850+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_T_A__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_T_B(x)							(0x00000854+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_T_B__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_T_C(x)							(0x00000858+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_T_C__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_T_D(x)							(0x0000085c+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_T_D__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_R_A(x)							(0x00000860+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_R_A__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_R_B(x)							(0x00000864+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_R_B__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_R_C(x)							(0x00000868+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_R_C__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_R_D(x)							(0x0000086c+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_R_D__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_Q_A(x)							(0x00000870+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_Q_A__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_Q_B(x)							(0x00000874+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_Q_B__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_Q_C(x)							(0x00000878+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_Q_C__SIZE							0x00000004
+#define  NV20TCL_TX_GEN_COEFF_Q_D(x)							(0x0000087c+((x)*64))
+#define  NV20TCL_TX_GEN_COEFF_Q_D__SIZE							0x00000004
+#define  NV20TCL_FOG_EQUATION_CONSTANT							0x000009c0 /* CONSTANT, LINEAR, QUADRATIC occupy three consecutive 4-byte offsets */
+#define  NV20TCL_FOG_EQUATION_LINEAR							0x000009c4
+#define  NV20TCL_FOG_EQUATION_QUADRATIC							0x000009c8
+#define  NV20TCL_FRONT_MATERIAL_SHININESS(x)						(0x000009e0+((x)*4)) /* indexed offset, 4-byte stride, 6 entries; NOTE(review): meaning of each entry is not shown here - confirm */
+#define  NV20TCL_FRONT_MATERIAL_SHININESS__SIZE						0x00000006
+#define  NV20TCL_LIGHT_MODEL_FRONT_AMBIENT_R						0x00000a10 /* only R, G, B are defined; there is no _A offset in this list */
+#define  NV20TCL_LIGHT_MODEL_FRONT_AMBIENT_G						0x00000a14
+#define  NV20TCL_LIGHT_MODEL_FRONT_AMBIENT_B						0x00000a18
+#define  NV20TCL_VIEWPORT_TRANSLATE_X							0x00000a20 /* X, Y, Z, W at consecutive 4-byte offsets; the matching VIEWPORT_SCALE_* set is at 0xaf0 */
+#define  NV20TCL_VIEWPORT_TRANSLATE_Y							0x00000a24
+#define  NV20TCL_VIEWPORT_TRANSLATE_Z							0x00000a28
+#define  NV20TCL_VIEWPORT_TRANSLATE_W							0x00000a2c
+#define  NV20TCL_POINT_PARAMETER(x)							(0x00000a30+((x)*4)) /* indexed offset, 4-byte stride, 8 entries */
+#define  NV20TCL_POINT_PARAMETER__SIZE							0x00000008
+#define  NV20TCL_RC_CONSTANT_COLOR0(x)							(0x00000a60+((x)*4)) /* 8 entries of 4 bytes (0xa60..0xa7f); each is four packed 8-bit channels, B in the low byte */
+#define  NV20TCL_RC_CONSTANT_COLOR0__SIZE						0x00000008
+#define   NV20TCL_RC_CONSTANT_COLOR0_B_SHIFT						0
+#define   NV20TCL_RC_CONSTANT_COLOR0_B_MASK						0x000000ff
+#define   NV20TCL_RC_CONSTANT_COLOR0_G_SHIFT						8
+#define   NV20TCL_RC_CONSTANT_COLOR0_G_MASK						0x0000ff00
+#define   NV20TCL_RC_CONSTANT_COLOR0_R_SHIFT						16
+#define   NV20TCL_RC_CONSTANT_COLOR0_R_MASK						0x00ff0000
+#define   NV20TCL_RC_CONSTANT_COLOR0_A_SHIFT						24
+#define   NV20TCL_RC_CONSTANT_COLOR0_A_MASK						0xff000000
+#define  NV20TCL_RC_CONSTANT_COLOR1(x)							(0x00000a80+((x)*4)) /* same layout as RC_CONSTANT_COLOR0, immediately after it (0xa80..0xa9f) */
+#define  NV20TCL_RC_CONSTANT_COLOR1__SIZE						0x00000008
+#define   NV20TCL_RC_CONSTANT_COLOR1_B_SHIFT						0
+#define   NV20TCL_RC_CONSTANT_COLOR1_B_MASK						0x000000ff
+#define   NV20TCL_RC_CONSTANT_COLOR1_G_SHIFT						8
+#define   NV20TCL_RC_CONSTANT_COLOR1_G_MASK						0x0000ff00
+#define   NV20TCL_RC_CONSTANT_COLOR1_R_SHIFT						16
+#define   NV20TCL_RC_CONSTANT_COLOR1_R_MASK						0x00ff0000
+#define   NV20TCL_RC_CONSTANT_COLOR1_A_SHIFT						24
+#define   NV20TCL_RC_CONSTANT_COLOR1_A_MASK						0xff000000
+#define  NV20TCL_RC_OUT_ALPHA(x)							(0x00000aa0+((x)*4)) /* 8 entries of 4 bytes; layout: CD_OUTPUT bits 0-3, AB_OUTPUT bits 4-7, SUM_OUTPUT bits 8-11, flag bits 12-15, SCALE bits 17-18 */
+#define  NV20TCL_RC_OUT_ALPHA__SIZE							0x00000008
+#define   NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT						0
+#define   NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK						0x0000000f
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO						0x00000000 /* field values in this group are already shifted into place */
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0				0x00000001
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1				0x00000002
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG						0x00000003
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR					0x00000004
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR				0x00000005
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0					0x00000008
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1					0x00000009
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0					0x0000000c
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1					0x0000000d
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F					0x0000000f
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE2					0x0000000a /* TEXTURE2/3 are listed out of numeric order (0xa, 0xb) */
+#define    NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE3					0x0000000b
+#define   NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT						4
+#define   NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK						0x000000f0
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO						0x00000000
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0				0x00000010
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1				0x00000020
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG						0x00000030
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR					0x00000040
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR				0x00000050
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0					0x00000080
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1					0x00000090
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0					0x000000c0
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1					0x000000d0
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000000e0
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F					0x000000f0
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE2					0x000000a0
+#define    NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE3					0x000000b0
+#define   NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT						8
+#define   NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK						0x00000f00
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO						0x00000000
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0				0x00000100
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1				0x00000200
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG						0x00000300
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR				0x00000400
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR				0x00000500
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0					0x00000800
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1					0x00000900
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0					0x00000c00
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1					0x00000d00
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F					0x00000f00
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE2					0x00000a00
+#define    NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE3					0x00000b00
+#define   NV20TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT						(1 << 12) /* single-bit flags occupy bits 12..15 */
+#define   NV20TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT						(1 << 13)
+#define   NV20TCL_RC_OUT_ALPHA_MUX_SUM							(1 << 14)
+#define   NV20TCL_RC_OUT_ALPHA_BIAS							(1 << 15)
+#define    NV20TCL_RC_OUT_ALPHA_BIAS_NONE						0x00000000
+#define    NV20TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF				0x00008000
+#define   NV20TCL_RC_OUT_ALPHA_SCALE_SHIFT						17 /* bit 16 is not named by any define in this group */
+#define   NV20TCL_RC_OUT_ALPHA_SCALE_MASK						0x00060000 /* was 0x00000000, which zeroed every (v & MASK); 0x00060000 = 3 << SCALE_SHIFT covers the three SCALE values below - derived from this header, confirm against hardware docs */
+#define    NV20TCL_RC_OUT_ALPHA_SCALE_NONE						0x00000000
+#define    NV20TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO					0x00020000
+#define    NV20TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR					0x00040000
+#define    NV20TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF					0x00060000
+#define  NV20TCL_RC_IN_RGB(x)								(0x00000ac0+((x)*4)) /* 8 entries of 4 bytes; four 8-bit lanes D (bits 0-7), C (8-15), B (16-23), A (24-31), each = 4-bit INPUT + 1-bit COMPONENT_USAGE + 3-bit MAPPING */
+#define  NV20TCL_RC_IN_RGB__SIZE							0x00000008
+#define   NV20TCL_RC_IN_RGB_D_INPUT_SHIFT						0
+#define   NV20TCL_RC_IN_RGB_D_INPUT_MASK						0x0000000f
+#define    NV20TCL_RC_IN_RGB_D_INPUT_ZERO						0x00000000 /* all field values in this group are already shifted into their lane */
+#define    NV20TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV20TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV20TCL_RC_IN_RGB_D_INPUT_FOG						0x00000003
+#define    NV20TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV20TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV20TCL_RC_IN_RGB_D_INPUT_TEXTURE0						0x00000008
+#define    NV20TCL_RC_IN_RGB_D_INPUT_TEXTURE1						0x00000009
+#define    NV20TCL_RC_IN_RGB_D_INPUT_SPARE0						0x0000000c
+#define    NV20TCL_RC_IN_RGB_D_INPUT_SPARE1						0x0000000d
+#define    NV20TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV20TCL_RC_IN_RGB_D_INPUT_E_TIMES_F						0x0000000f
+#define    NV20TCL_RC_IN_RGB_D_INPUT_TEXTURE2						0x0000000a
+#define    NV20TCL_RC_IN_RGB_D_INPUT_TEXTURE3						0x0000000b
+#define   NV20TCL_RC_IN_RGB_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV20TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV20TCL_RC_IN_RGB_D_MAPPING_SHIFT						5
+#define   NV20TCL_RC_IN_RGB_D_MAPPING_MASK						0x000000e0
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT					0x00000020
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL					0x00000080
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE					0x000000a0
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY					0x000000c0
+#define    NV20TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV20TCL_RC_IN_RGB_C_INPUT_SHIFT						8 /* lane C: same field list as lane D, shifted up by 8 bits */
+#define   NV20TCL_RC_IN_RGB_C_INPUT_MASK						0x00000f00
+#define    NV20TCL_RC_IN_RGB_C_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV20TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV20TCL_RC_IN_RGB_C_INPUT_FOG						0x00000300
+#define    NV20TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV20TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV20TCL_RC_IN_RGB_C_INPUT_TEXTURE0						0x00000800
+#define    NV20TCL_RC_IN_RGB_C_INPUT_TEXTURE1						0x00000900
+#define    NV20TCL_RC_IN_RGB_C_INPUT_SPARE0						0x00000c00
+#define    NV20TCL_RC_IN_RGB_C_INPUT_SPARE1						0x00000d00
+#define    NV20TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV20TCL_RC_IN_RGB_C_INPUT_E_TIMES_F						0x00000f00
+#define    NV20TCL_RC_IN_RGB_C_INPUT_TEXTURE2						0x00000a00
+#define    NV20TCL_RC_IN_RGB_C_INPUT_TEXTURE3						0x00000b00
+#define   NV20TCL_RC_IN_RGB_C_COMPONENT_USAGE						(1 << 12)
+#define    NV20TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV20TCL_RC_IN_RGB_C_MAPPING_SHIFT						13
+#define   NV20TCL_RC_IN_RGB_C_MAPPING_MASK						0x0000e000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV20TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV20TCL_RC_IN_RGB_B_INPUT_SHIFT						16 /* lane B: same field list, shifted up by 16 bits */
+#define   NV20TCL_RC_IN_RGB_B_INPUT_MASK						0x000f0000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_FOG						0x00030000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_TEXTURE0						0x00080000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_TEXTURE1						0x00090000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_SPARE0						0x000c0000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_SPARE1						0x000d0000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_E_TIMES_F						0x000f0000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_TEXTURE2						0x000a0000
+#define    NV20TCL_RC_IN_RGB_B_INPUT_TEXTURE3						0x000b0000
+#define   NV20TCL_RC_IN_RGB_B_COMPONENT_USAGE						(1 << 20)
+#define    NV20TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV20TCL_RC_IN_RGB_B_MAPPING_SHIFT						21
+#define   NV20TCL_RC_IN_RGB_B_MAPPING_MASK						0x00e00000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV20TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV20TCL_RC_IN_RGB_A_INPUT_SHIFT						24 /* lane A: same field list, shifted up by 24 bits */
+#define   NV20TCL_RC_IN_RGB_A_INPUT_MASK						0x0f000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_ZERO						0x00000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_FOG						0x03000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_TEXTURE0						0x08000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_TEXTURE1						0x09000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_SPARE0						0x0c000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_SPARE1						0x0d000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_E_TIMES_F						0x0f000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_TEXTURE2						0x0a000000
+#define    NV20TCL_RC_IN_RGB_A_INPUT_TEXTURE3						0x0b000000
+#define   NV20TCL_RC_IN_RGB_A_COMPONENT_USAGE						(1 << 28)
+#define    NV20TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV20TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV20TCL_RC_IN_RGB_A_MAPPING_SHIFT						29
+#define   NV20TCL_RC_IN_RGB_A_MAPPING_MASK						0xe0000000 /* bit 31 is in use: keep values unsigned when combining/shifting */
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV20TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV20TCL_VIEWPORT_SCALE_X							0x00000af0 /* X, Y, Z, W words follow each other 4 bytes apart (0xaf0..0xafc) */
+#define  NV20TCL_VIEWPORT_SCALE_Y							0x00000af4
+#define  NV20TCL_VIEWPORT_SCALE_Z							0x00000af8
+#define  NV20TCL_VIEWPORT_SCALE_W							0x00000afc
+#define  NV20TCL_VP_UPLOAD_INST(x)							(0x00000b00+((x)*4)) /* offset of slot x; slots are 4 bytes apart starting at 0xb00 */
+#define  NV20TCL_VP_UPLOAD_INST__SIZE							0x00000004 /* presumably the number of valid x values (0..3) - confirm against the generator */
+#define  NV20TCL_VP_UPLOAD_CONST(x)							(0x00000b80+((x)*4)) /* same layout as VP_UPLOAD_INST, based at 0xb80 */
+#define  NV20TCL_VP_UPLOAD_CONST__SIZE							0x00000004
+#define  NV20TCL_LIGHT_BACK_AMBIENT_R(x)						(0x00000c00+((x)*64)) /* LIGHT_BACK_* group: element x is 64 bytes after element x-1; the nine words run 0xc00..0xc20 */
+#define  NV20TCL_LIGHT_BACK_AMBIENT_R__SIZE						0x00000008 /* every __SIZE in this group is 8 (presumably the element count - confirm) */
+#define  NV20TCL_LIGHT_BACK_AMBIENT_G(x)						(0x00000c04+((x)*64))
+#define  NV20TCL_LIGHT_BACK_AMBIENT_G__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_AMBIENT_B(x)						(0x00000c08+((x)*64))
+#define  NV20TCL_LIGHT_BACK_AMBIENT_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_DIFFUSE_R(x)						(0x00000c0c+((x)*64))
+#define  NV20TCL_LIGHT_BACK_DIFFUSE_R__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_DIFFUSE_G(x)						(0x00000c10+((x)*64))
+#define  NV20TCL_LIGHT_BACK_DIFFUSE_G__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_DIFFUSE_B(x)						(0x00000c14+((x)*64))
+#define  NV20TCL_LIGHT_BACK_DIFFUSE_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_SPECULAR_R(x)						(0x00000c18+((x)*64))
+#define  NV20TCL_LIGHT_BACK_SPECULAR_R__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_SPECULAR_G(x)						(0x00000c1c+((x)*64))
+#define  NV20TCL_LIGHT_BACK_SPECULAR_G__SIZE						0x00000008
+#define  NV20TCL_LIGHT_BACK_SPECULAR_B(x)						(0x00000c20+((x)*64))
+#define  NV20TCL_LIGHT_BACK_SPECULAR_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_AMBIENT_R(x)						(0x00001000+((x)*128)) /* per-light words 0x1000..0x1070: element x is 128 bytes after element x-1 */
+#define  NV20TCL_LIGHT_FRONT_AMBIENT_R__SIZE						0x00000008 /* every __SIZE in this group is 8 (presumably the element count - confirm) */
+#define  NV20TCL_LIGHT_FRONT_AMBIENT_G(x)						(0x00001004+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_AMBIENT_G__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_AMBIENT_B(x)						(0x00001008+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_AMBIENT_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_DIFFUSE_R(x)						(0x0000100c+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_DIFFUSE_R__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_DIFFUSE_G(x)						(0x00001010+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_DIFFUSE_G__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_DIFFUSE_B(x)						(0x00001014+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_DIFFUSE_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_SPECULAR_R(x)						(0x00001018+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_SPECULAR_R__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_SPECULAR_G(x)						(0x0000101c+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_SPECULAR_G__SIZE						0x00000008
+#define  NV20TCL_LIGHT_FRONT_SPECULAR_B(x)						(0x00001020+((x)*128))
+#define  NV20TCL_LIGHT_FRONT_SPECULAR_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_HALF_VECTOR_X(x)							(0x00001028+((x)*128)) /* 0x1024 has no define: the previous word is 0x1020, this one is 0x1028 */
+#define  NV20TCL_LIGHT_HALF_VECTOR_X__SIZE						0x00000008
+#define  NV20TCL_LIGHT_HALF_VECTOR_Y(x)							(0x0000102c+((x)*128))
+#define  NV20TCL_LIGHT_HALF_VECTOR_Y__SIZE						0x00000008
+#define  NV20TCL_LIGHT_HALF_VECTOR_Z(x)							(0x00001030+((x)*128))
+#define  NV20TCL_LIGHT_HALF_VECTOR_Z__SIZE						0x00000008
+#define  NV20TCL_LIGHT_DIRECTION_X(x)							(0x00001034+((x)*128))
+#define  NV20TCL_LIGHT_DIRECTION_X__SIZE						0x00000008
+#define  NV20TCL_LIGHT_DIRECTION_Y(x)							(0x00001038+((x)*128))
+#define  NV20TCL_LIGHT_DIRECTION_Y__SIZE						0x00000008
+#define  NV20TCL_LIGHT_DIRECTION_Z(x)							(0x0000103c+((x)*128))
+#define  NV20TCL_LIGHT_DIRECTION_Z__SIZE						0x00000008
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_A(x)							(0x00001040+((x)*128))
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_A__SIZE						0x00000008
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_B(x)							(0x00001044+((x)*128))
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_B__SIZE						0x00000008
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_C(x)							(0x00001048+((x)*128))
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_C__SIZE						0x00000008
+#define  NV20TCL_LIGHT_SPOT_DIR_X(x)							(0x0000104c+((x)*128))
+#define  NV20TCL_LIGHT_SPOT_DIR_X__SIZE							0x00000008
+#define  NV20TCL_LIGHT_SPOT_DIR_Y(x)							(0x00001050+((x)*128))
+#define  NV20TCL_LIGHT_SPOT_DIR_Y__SIZE							0x00000008
+#define  NV20TCL_LIGHT_SPOT_DIR_Z(x)							(0x00001054+((x)*128))
+#define  NV20TCL_LIGHT_SPOT_DIR_Z__SIZE							0x00000008
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_D(x)							(0x00001058+((x)*128)) /* CUTOFF_D is separated from CUTOFF_A..C by the three SPOT_DIR words */
+#define  NV20TCL_LIGHT_SPOT_CUTOFF_D__SIZE						0x00000008
+#define  NV20TCL_LIGHT_POSITION_X(x)							(0x0000105c+((x)*128))
+#define  NV20TCL_LIGHT_POSITION_X__SIZE							0x00000008
+#define  NV20TCL_LIGHT_POSITION_Y(x)							(0x00001060+((x)*128))
+#define  NV20TCL_LIGHT_POSITION_Y__SIZE							0x00000008
+#define  NV20TCL_LIGHT_POSITION_Z(x)							(0x00001064+((x)*128))
+#define  NV20TCL_LIGHT_POSITION_Z__SIZE							0x00000008
+#define  NV20TCL_LIGHT_ATTENUATION_CONSTANT(x)						(0x00001068+((x)*128))
+#define  NV20TCL_LIGHT_ATTENUATION_CONSTANT__SIZE					0x00000008
+#define  NV20TCL_LIGHT_ATTENUATION_LINEAR(x)						(0x0000106c+((x)*128))
+#define  NV20TCL_LIGHT_ATTENUATION_LINEAR__SIZE						0x00000008
+#define  NV20TCL_LIGHT_ATTENUATION_QUADRATIC(x)						(0x00001070+((x)*128))
+#define  NV20TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE					0x00000008
+#define  NV20TCL_POLYGON_STIPPLE_ENABLE							0x0000147c
+#define  NV20TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001480+((x)*4)) /* word x of the pattern; words are 4 bytes apart and start right after STIPPLE_ENABLE */
+#define  NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020 /* 0x20 = 32; presumably the word count - confirm */
+#define  NV20TCL_VERTEX_POS_3F_X							0x00001500 /* VERTEX_POS_* and VERTEX_NOR_* words; the _3I_ variants pack 16-bit fields into one word */
+#define  NV20TCL_VERTEX_POS_3F_Y							0x00001504
+#define  NV20TCL_VERTEX_POS_3F_Z							0x00001508
+#define  NV20TCL_VERTEX_POS_4F_X							0x00001518 /* only X, Y, Z are named for the 4F form; 0x1524 has no define in this table */
+#define  NV20TCL_VERTEX_POS_4F_Y							0x0000151c
+#define  NV20TCL_VERTEX_POS_4F_Z							0x00001520
+#define  NV20TCL_VERTEX_POS_3I_XY							0x00001528
+#define   NV20TCL_VERTEX_POS_3I_XY_X_SHIFT						0
+#define   NV20TCL_VERTEX_POS_3I_XY_X_MASK						0x0000ffff /* X in bits 0-15 */
+#define   NV20TCL_VERTEX_POS_3I_XY_Y_SHIFT						16
+#define   NV20TCL_VERTEX_POS_3I_XY_Y_MASK						0xffff0000 /* Y in bits 16-31 */
+#define  NV20TCL_VERTEX_POS_3I_Z							0x0000152c
+#define   NV20TCL_VERTEX_POS_3I_Z_Z_SHIFT						0
+#define   NV20TCL_VERTEX_POS_3I_Z_Z_MASK						0x0000ffff /* Z in bits 0-15; the upper half has no named field */
+#define  NV20TCL_VERTEX_NOR_3F_X							0x00001530
+#define  NV20TCL_VERTEX_NOR_3F_Y							0x00001534
+#define  NV20TCL_VERTEX_NOR_3F_Z							0x00001538
+#define  NV20TCL_VERTEX_NOR_3I_XY							0x00001540 /* same X/Y packing as VERTEX_POS_3I_XY */
+#define   NV20TCL_VERTEX_NOR_3I_XY_X_SHIFT						0
+#define   NV20TCL_VERTEX_NOR_3I_XY_X_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_NOR_3I_XY_Y_SHIFT						16
+#define   NV20TCL_VERTEX_NOR_3I_XY_Y_MASK						0xffff0000
+#define  NV20TCL_VERTEX_NOR_3I_Z							0x00001544
+#define   NV20TCL_VERTEX_NOR_3I_Z_Z_SHIFT						0
+#define   NV20TCL_VERTEX_NOR_3I_Z_Z_MASK						0x0000ffff
+#define  NV20TCL_VERTEX_COL_4F_X							0x00001550 /* VERTEX_COL_* and VERTEX_COL2_* words; the _4I forms pack four 8-bit channels */
+#define  NV20TCL_VERTEX_COL_4F_Y							0x00001554
+#define  NV20TCL_VERTEX_COL_4F_Z							0x00001558
+#define  NV20TCL_VERTEX_COL_4F_W							0x0000155c
+#define  NV20TCL_VERTEX_COL_3F_X							0x00001560
+#define  NV20TCL_VERTEX_COL_3F_Y							0x00001564
+#define  NV20TCL_VERTEX_COL_3F_Z							0x00001568
+#define  NV20TCL_VERTEX_COL_4I								0x0000156c /* R in bits 0-7, G 8-15, B 16-23, A 24-31 */
+#define   NV20TCL_VERTEX_COL_4I_R_SHIFT							0
+#define   NV20TCL_VERTEX_COL_4I_R_MASK							0x000000ff
+#define   NV20TCL_VERTEX_COL_4I_G_SHIFT							8
+#define   NV20TCL_VERTEX_COL_4I_G_MASK							0x0000ff00
+#define   NV20TCL_VERTEX_COL_4I_B_SHIFT							16
+#define   NV20TCL_VERTEX_COL_4I_B_MASK							0x00ff0000
+#define   NV20TCL_VERTEX_COL_4I_A_SHIFT							24
+#define   NV20TCL_VERTEX_COL_4I_A_MASK							0xff000000
+#define  NV20TCL_VERTEX_COL2_3F_X							0x00001580 /* COL2 has 3F and 4I forms only in this table (no COL2_4F defines) */
+#define  NV20TCL_VERTEX_COL2_3F_Y							0x00001584
+#define  NV20TCL_VERTEX_COL2_3F_Z							0x00001588
+#define  NV20TCL_VERTEX_COL2_4I								0x0000158c /* same channel packing as VERTEX_COL_4I */
+#define   NV20TCL_VERTEX_COL2_4I_R_SHIFT						0
+#define   NV20TCL_VERTEX_COL2_4I_R_MASK							0x000000ff
+#define   NV20TCL_VERTEX_COL2_4I_G_SHIFT						8
+#define   NV20TCL_VERTEX_COL2_4I_G_MASK							0x0000ff00
+#define   NV20TCL_VERTEX_COL2_4I_B_SHIFT						16
+#define   NV20TCL_VERTEX_COL2_4I_B_MASK							0x00ff0000
+#define   NV20TCL_VERTEX_COL2_4I_A_SHIFT						24
+#define   NV20TCL_VERTEX_COL2_4I_A_MASK							0xff000000
+#define  NV20TCL_VERTEX_TX0_2F_S							0x00001590 /* TX0..TX3 share one field layout; each unit's 2F_S word is 0x28 after the previous unit's */
+#define  NV20TCL_VERTEX_TX0_2F_T							0x00001594
+#define  NV20TCL_VERTEX_TX0_2I								0x00001598 /* S in bits 0-15, T in bits 16-31 */
+#define   NV20TCL_VERTEX_TX0_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX0_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX0_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX0_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX0_4F_S							0x000015a0
+#define  NV20TCL_VERTEX_TX0_4F_T							0x000015a4
+#define  NV20TCL_VERTEX_TX0_4F_R							0x000015a8
+#define  NV20TCL_VERTEX_TX0_4F_Q							0x000015ac
+#define  NV20TCL_VERTEX_TX0_4I_ST							0x000015b0 /* 4I form is split over two words: ST here, RQ in the next word */
+#define   NV20TCL_VERTEX_TX0_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX0_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX0_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX0_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX0_4I_RQ							0x000015b4
+#define   NV20TCL_VERTEX_TX0_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX0_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX0_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX0_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX1_2F_S							0x000015b8
+#define  NV20TCL_VERTEX_TX1_2F_T							0x000015bc
+#define  NV20TCL_VERTEX_TX1_2I								0x000015c0
+#define   NV20TCL_VERTEX_TX1_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX1_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX1_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX1_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX1_4F_S							0x000015c8
+#define  NV20TCL_VERTEX_TX1_4F_T							0x000015cc
+#define  NV20TCL_VERTEX_TX1_4F_R							0x000015d0
+#define  NV20TCL_VERTEX_TX1_4F_Q							0x000015d4
+#define  NV20TCL_VERTEX_TX1_4I_ST							0x000015d8
+#define   NV20TCL_VERTEX_TX1_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX1_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX1_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX1_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX1_4I_RQ							0x000015dc
+#define   NV20TCL_VERTEX_TX1_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX1_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX1_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX1_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX2_2F_S							0x000015e0
+#define  NV20TCL_VERTEX_TX2_2F_T							0x000015e4
+#define  NV20TCL_VERTEX_TX2_2I								0x000015e8
+#define   NV20TCL_VERTEX_TX2_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX2_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX2_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX2_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX2_4F_S							0x000015f0
+#define  NV20TCL_VERTEX_TX2_4F_T							0x000015f4
+#define  NV20TCL_VERTEX_TX2_4F_R							0x000015f8
+#define  NV20TCL_VERTEX_TX2_4F_Q							0x000015fc
+#define  NV20TCL_VERTEX_TX2_4I_ST							0x00001600
+#define   NV20TCL_VERTEX_TX2_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX2_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX2_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX2_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX2_4I_RQ							0x00001604
+#define   NV20TCL_VERTEX_TX2_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX2_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX2_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX2_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX3_2F_S							0x00001608
+#define  NV20TCL_VERTEX_TX3_2F_T							0x0000160c
+#define  NV20TCL_VERTEX_TX3_2I								0x00001610
+#define   NV20TCL_VERTEX_TX3_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX3_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX3_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX3_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX3_4F_S							0x00001620 /* NOTE(review): 0x18 past TX3_2F_S, whereas TX0..TX2 place 4F_S 0x10 past 2F_S - do not derive TX3 offsets arithmetically */
+#define  NV20TCL_VERTEX_TX3_4F_T							0x00001624
+#define  NV20TCL_VERTEX_TX3_4F_R							0x00001628
+#define  NV20TCL_VERTEX_TX3_4F_Q							0x0000162c
+#define  NV20TCL_VERTEX_TX3_4I_ST							0x00001630
+#define   NV20TCL_VERTEX_TX3_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX3_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX3_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX3_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX3_4I_RQ							0x00001634
+#define   NV20TCL_VERTEX_TX3_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX3_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX3_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX3_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_FOG_1F								0x00001698 /* three single-word entries with no sub-fields defined in this table */
+#define  NV20TCL_EDGEFLAG_ENABLE							0x000016bc
+#define  NV20TCL_VTX_CACHE_INVALIDATE							0x00001710
+#define  NV20TCL_VTXBUF_ADDRESS(x)							(0x00001720+((x)*4))
+#define  NV20TCL_VTXBUF_ADDRESS__SIZE							0x00000010
+#define   NV20TCL_VTXBUF_ADDRESS_DMA1							(1 << 31)
+#define   NV20TCL_VTXBUF_ADDRESS_OFFSET_SHIFT						0
+#define   NV20TCL_VTXBUF_ADDRESS_OFFSET_MASK						0x0fffffff
+#define  NV20TCL_VTXFMT(x)								(0x00001760+((x)*4)) /* offset of element x; elements are 4 bytes apart */
+#define  NV20TCL_VTXFMT__SIZE								0x00000010 /* same value (16) as VTXBUF_ADDRESS__SIZE */
+#define   NV20TCL_VTXFMT_TYPE_SHIFT							0
+#define   NV20TCL_VTXFMT_TYPE_MASK							0x0000000f /* TYPE in bits 0-3; only the values 2, 4 and 5 are named below */
+#define    NV20TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV20TCL_VTXFMT_TYPE_UBYTE							0x00000004
+#define    NV20TCL_VTXFMT_TYPE_USHORT							0x00000005
+#define   NV20TCL_VTXFMT_SIZE_SHIFT							4
+#define   NV20TCL_VTXFMT_SIZE_MASK							0x000000f0 /* SIZE in bits 4-7 */
+#define   NV20TCL_VTXFMT_STRIDE_SHIFT							8
+#define   NV20TCL_VTXFMT_STRIDE_MASK							0x0000ff00 /* STRIDE in bits 8-15 */
+#define  NV20TCL_LIGHT_MODEL_BACK_AMBIENT_R						0x000017a0 /* seven consecutive words 0x17a0..0x17b8: ambient R, G, B then material factor A, R, G, B */
+#define  NV20TCL_LIGHT_MODEL_BACK_AMBIENT_G						0x000017a4
+#define  NV20TCL_LIGHT_MODEL_BACK_AMBIENT_B						0x000017a8
+#define  NV20TCL_MATERIAL_FACTOR_BACK_A							0x000017ac /* note the order: A comes before R, G, B */
+#define  NV20TCL_MATERIAL_FACTOR_BACK_R							0x000017b0
+#define  NV20TCL_MATERIAL_FACTOR_BACK_G							0x000017b4
+#define  NV20TCL_MATERIAL_FACTOR_BACK_B							0x000017b8
+#define  NV20TCL_COLOR_LOGIC_OP_ENABLE							0x000017bc
+#define  NV20TCL_COLOR_LOGIC_OP_OP							0x000017c0 /* values below are 0x1500..0x150f, the same numbers as OpenGL's GL_CLEAR..GL_SET logic-op enums */
+#define   NV20TCL_COLOR_LOGIC_OP_OP_CLEAR						0x00001500
+#define   NV20TCL_COLOR_LOGIC_OP_OP_AND							0x00001501
+#define   NV20TCL_COLOR_LOGIC_OP_OP_AND_REVERSE						0x00001502
+#define   NV20TCL_COLOR_LOGIC_OP_OP_COPY						0x00001503
+#define   NV20TCL_COLOR_LOGIC_OP_OP_AND_INVERTED					0x00001504
+#define   NV20TCL_COLOR_LOGIC_OP_OP_NOOP						0x00001505
+#define   NV20TCL_COLOR_LOGIC_OP_OP_XOR							0x00001506
+#define   NV20TCL_COLOR_LOGIC_OP_OP_OR							0x00001507
+#define   NV20TCL_COLOR_LOGIC_OP_OP_NOR							0x00001508
+#define   NV20TCL_COLOR_LOGIC_OP_OP_EQUIV						0x00001509
+#define   NV20TCL_COLOR_LOGIC_OP_OP_INVERT						0x0000150a
+#define   NV20TCL_COLOR_LOGIC_OP_OP_OR_REVERSE						0x0000150b
+#define   NV20TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED					0x0000150c
+#define   NV20TCL_COLOR_LOGIC_OP_OP_OR_INVERTED						0x0000150d
+#define   NV20TCL_COLOR_LOGIC_OP_OP_NAND						0x0000150e
+#define   NV20TCL_COLOR_LOGIC_OP_OP_SET							0x0000150f
+#define  NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE						0x000017c4
+#define  NV20TCL_TX_SHADER_CULL_MODE							0x000017f8 /* one flag per (unit, coordinate): bit = 4*unit + {S:0, T:1, R:2, Q:3}, bits 0-15 */
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX0_S						(1 <<  0)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_S_GEQUAL					0x00000000 /* every *_GEQUAL value is the flag cleared */
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_S_LESS					0x00000001 /* every *_LESS value is the flag set */
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX0_T						(1 <<  1)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_T_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_T_LESS					0x00000002
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX0_R						(1 <<  2)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_R_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_R_LESS					0x00000004
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX0_Q						(1 <<  3)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_Q_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX0_Q_LESS					0x00000008
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX1_S						(1 <<  4)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_S_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_S_LESS					0x00000010
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX1_T						(1 <<  5)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_T_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_T_LESS					0x00000020
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX1_R						(1 <<  6)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_R_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_R_LESS					0x00000040
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX1_Q						(1 <<  7)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_Q_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX1_Q_LESS					0x00000080
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX2_S						(1 <<  8)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_S_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_S_LESS					0x00000100
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX2_T						(1 <<  9)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_T_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_T_LESS					0x00000200
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX2_R						(1 << 10)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_R_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_R_LESS					0x00000400
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX2_Q						(1 << 11)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_Q_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX2_Q_LESS					0x00000800
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX3_S						(1 << 12)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_S_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_S_LESS					0x00001000
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX3_T						(1 << 13)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_T_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_T_LESS					0x00002000
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX3_R						(1 << 14)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_R_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_R_LESS					0x00004000
+#define   NV20TCL_TX_SHADER_CULL_MODE_TX3_Q						(1 << 15)
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_Q_GEQUAL					0x00000000
+#define    NV20TCL_TX_SHADER_CULL_MODE_TX3_Q_LESS					0x00008000
+#define  NV20TCL_VERTEX_BEGIN_END							0x000017fc /* values: 0 is STOP, the ten primitive kinds are numbered 1..0xa */
+#define   NV20TCL_VERTEX_BEGIN_END_STOP							0x00000000
+#define   NV20TCL_VERTEX_BEGIN_END_POINTS						0x00000001
+#define   NV20TCL_VERTEX_BEGIN_END_LINES						0x00000002
+#define   NV20TCL_VERTEX_BEGIN_END_LINE_LOOP						0x00000003
+#define   NV20TCL_VERTEX_BEGIN_END_LINE_STRIP						0x00000004
+#define   NV20TCL_VERTEX_BEGIN_END_TRIANGLES						0x00000005
+#define   NV20TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP					0x00000006
+#define   NV20TCL_VERTEX_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV20TCL_VERTEX_BEGIN_END_QUADS						0x00000008
+#define   NV20TCL_VERTEX_BEGIN_END_QUAD_STRIP						0x00000009
+#define   NV20TCL_VERTEX_BEGIN_END_POLYGON						0x0000000a
+#define  NV20TCL_VB_ELEMENT_U16								0x00001800 /* two 16-bit fields per word: I0 in bits 0-15, I1 in bits 16-31 */
+#define   NV20TCL_VB_ELEMENT_U16_I0_SHIFT						0
+#define   NV20TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NV20TCL_VB_ELEMENT_U16_I1_SHIFT						16
+#define   NV20TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV20TCL_VB_ELEMENT_U32								0x00001808
+#define  NV20TCL_VB_VERTEX_BATCH							0x00001810 /* OFFSET in bits 0-23, COUNT in bits 24-31 */
+#define   NV20TCL_VB_VERTEX_BATCH_OFFSET_SHIFT						0
+#define   NV20TCL_VB_VERTEX_BATCH_OFFSET_MASK						0x00ffffff
+#define   NV20TCL_VB_VERTEX_BATCH_COUNT_SHIFT						24
+#define   NV20TCL_VB_VERTEX_BATCH_COUNT_MASK						0xff000000 /* 8-bit field, so at most 255 can be encoded; whether the stored value is count or count-1 is not shown here */
+#define  NV20TCL_VERTEX_DATA								0x00001818
+#define  NV20TCL_TX_SHADER_CONST_EYE_X							0x0000181c
+#define  NV20TCL_TX_SHADER_CONST_EYE_Y							0x00001820
+#define  NV20TCL_TX_SHADER_CONST_EYE_Z							0x00001824
+#define  NV20TCL_VTX_ATTR_4F_X(x)							(0x00001a00+((x)*16)) /* element x is 16 bytes after element x-1; X, Y, Z, W are its four consecutive words */
+#define  NV20TCL_VTX_ATTR_4F_X__SIZE							0x00000010 /* every __SIZE in this group is 0x10 = 16 (presumably the element count - confirm) */
+#define  NV20TCL_VTX_ATTR_4F_Y(x)							(0x00001a04+((x)*16))
+#define  NV20TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV20TCL_VTX_ATTR_4F_Z(x)							(0x00001a08+((x)*16))
+#define  NV20TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV20TCL_VTX_ATTR_4F_W(x)							(0x00001a0c+((x)*16))
+#define  NV20TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV20TCL_TX_OFFSET(x)								(0x00001b00+((x)*64)) /* element x is 64 bytes after element x-1 */
+#define  NV20TCL_TX_OFFSET__SIZE							0x00000004
+#define  NV20TCL_TX_FORMAT(x)								(0x00001b04+((x)*64)) /* same 64-byte element spacing as TX_OFFSET */
+#define  NV20TCL_TX_FORMAT__SIZE							0x00000004
+#define   NV20TCL_TX_FORMAT_DMA0							(1 <<  0) /* bits 0-3 are four single-bit flags */
+#define   NV20TCL_TX_FORMAT_DMA1							(1 <<  1)
+#define   NV20TCL_TX_FORMAT_CUBIC							(1 <<  2)
+#define   NV20TCL_TX_FORMAT_NO_BORDER							(1 <<  3)
+#define   NV20TCL_TX_FORMAT_DIMS_SHIFT							4
+#define   NV20TCL_TX_FORMAT_DIMS_MASK							0x000000f0 /* DIMS in bits 4-7; named values are 1, 2, 3 (pre-shifted) */
+#define    NV20TCL_TX_FORMAT_DIMS_1D							0x00000010
+#define    NV20TCL_TX_FORMAT_DIMS_2D							0x00000020
+#define    NV20TCL_TX_FORMAT_DIMS_3D							0x00000030
+#define   NV20TCL_TX_FORMAT_FORMAT_SHIFT						8
+#define   NV20TCL_TX_FORMAT_FORMAT_MASK							0x0000ff00 /* FORMAT in bits 8-15; values below are pre-shifted and not listed in numeric order */
+#define    NV20TCL_TX_FORMAT_FORMAT_L8							0x00000000
+#define    NV20TCL_TX_FORMAT_FORMAT_A8							0x00000100
+#define    NV20TCL_TX_FORMAT_FORMAT_A1R5G5B5						0x00000200
+#define    NV20TCL_TX_FORMAT_FORMAT_A4R4G4B4						0x00000400
+#define    NV20TCL_TX_FORMAT_FORMAT_R5G6B5						0x00000500
+#define    NV20TCL_TX_FORMAT_FORMAT_A8R8G8B8						0x00000600
+#define    NV20TCL_TX_FORMAT_FORMAT_X8R8G8B8						0x00000700
+#define    NV20TCL_TX_FORMAT_FORMAT_INDEX8						0x00000b00
+#define    NV20TCL_TX_FORMAT_FORMAT_DXT1						0x00000c00
+#define    NV20TCL_TX_FORMAT_FORMAT_DXT3						0x00000e00
+#define    NV20TCL_TX_FORMAT_FORMAT_DXT5						0x00000f00
+#define    NV20TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT					0x00001000
+#define    NV20TCL_TX_FORMAT_FORMAT_R5G6B5_RECT						0x00001100
+#define    NV20TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT					0x00001200
+#define    NV20TCL_TX_FORMAT_FORMAT_L8_RECT						0x00001300
+#define    NV20TCL_TX_FORMAT_FORMAT_DSDT8_RECT						0x00001700
+#define    NV20TCL_TX_FORMAT_FORMAT_A8L8						0x00001a00
+#define    NV20TCL_TX_FORMAT_FORMAT_A8_RECT						0x00001b00
+#define    NV20TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT					0x00001d00
+#define    NV20TCL_TX_FORMAT_FORMAT_R8G8B8_RECT						0x00001e00
+#define    NV20TCL_TX_FORMAT_FORMAT_A8L8_RECT						0x00002000
+#define    NV20TCL_TX_FORMAT_FORMAT_DSDT8						0x00002800
+#define    NV20TCL_TX_FORMAT_FORMAT_HILO16						0x00003300
+#define    NV20TCL_TX_FORMAT_FORMAT_HILO16_RECT						0x00003600
+#define    NV20TCL_TX_FORMAT_FORMAT_HILO8						0x00004400
+#define    NV20TCL_TX_FORMAT_FORMAT_SIGNED_HILO8					0x00004500
+#define    NV20TCL_TX_FORMAT_FORMAT_HILO8_RECT						0x00004600
+#define    NV20TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT					0x00004700
+#define    NV20TCL_TX_FORMAT_FORMAT_A16							0x00003200 /* out of numeric order: 0x32 and 0x35 are listed after 0x47 */
+#define    NV20TCL_TX_FORMAT_FORMAT_A16_RECT						0x00003500
+#define    NV20TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV					0x00004a00
+#define    NV20TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV					0x00004b00
+#define    NV20TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV					0x00004c00
+#define   NV20TCL_TX_FORMAT_MIPMAP							(1 << 19) /* bits 16-18 have no named field in this table */
+#define   NV20TCL_TX_FORMAT_BASE_SIZE_U_SHIFT						20
+#define   NV20TCL_TX_FORMAT_BASE_SIZE_U_MASK						0x00f00000 /* U, V, W are 4-bit fields at bits 20-23, 24-27, 28-31 */
+#define   NV20TCL_TX_FORMAT_BASE_SIZE_V_SHIFT						24
+#define   NV20TCL_TX_FORMAT_BASE_SIZE_V_MASK						0x0f000000
+#define   NV20TCL_TX_FORMAT_BASE_SIZE_W_SHIFT						28
+#define   NV20TCL_TX_FORMAT_BASE_SIZE_W_MASK						0xf0000000
+#define  NV20TCL_TX_WRAP(x)								(0x00001b08+((x)*64)) /* element x is 64 bytes after element x-1; S, T, R share the value set 1..5 */
+#define  NV20TCL_TX_WRAP__SIZE								0x00000004
+#define   NV20TCL_TX_WRAP_S_SHIFT							0
+#define   NV20TCL_TX_WRAP_S_MASK							0x000000ff /* NOTE(review): 8 bits wide and overlaps nothing named, but T and R masks are 4 bits - confirm the width */
+#define    NV20TCL_TX_WRAP_S_REPEAT							0x00000001
+#define    NV20TCL_TX_WRAP_S_MIRRORED_REPEAT						0x00000002
+#define    NV20TCL_TX_WRAP_S_CLAMP_TO_EDGE						0x00000003
+#define    NV20TCL_TX_WRAP_S_CLAMP_TO_BORDER						0x00000004
+#define    NV20TCL_TX_WRAP_S_CLAMP							0x00000005
+#define   NV20TCL_TX_WRAP_T_SHIFT							8
+#define   NV20TCL_TX_WRAP_T_MASK							0x00000f00 /* T in bits 8-11 */
+#define    NV20TCL_TX_WRAP_T_REPEAT							0x00000100
+#define    NV20TCL_TX_WRAP_T_MIRRORED_REPEAT						0x00000200
+#define    NV20TCL_TX_WRAP_T_CLAMP_TO_EDGE						0x00000300
+#define    NV20TCL_TX_WRAP_T_CLAMP_TO_BORDER						0x00000400
+#define    NV20TCL_TX_WRAP_T_CLAMP							0x00000500
+#define   NV20TCL_TX_WRAP_R_SHIFT							16
+#define   NV20TCL_TX_WRAP_R_MASK							0x000f0000 /* R in bits 16-19 */
+#define    NV20TCL_TX_WRAP_R_REPEAT							0x00010000
+#define    NV20TCL_TX_WRAP_R_MIRRORED_REPEAT						0x00020000
+#define    NV20TCL_TX_WRAP_R_CLAMP_TO_EDGE						0x00030000
+#define    NV20TCL_TX_WRAP_R_CLAMP_TO_BORDER						0x00040000
+#define    NV20TCL_TX_WRAP_R_CLAMP							0x00050000
+#define  NV20TCL_TX_ENABLE(x)								(0x00001b0c+((x)*64)) /* element x is 64 bytes after element x-1 */
+#define  NV20TCL_TX_ENABLE__SIZE							0x00000004
+#define   NV20TCL_TX_ENABLE_ANISO_SHIFT							4
+#define   NV20TCL_TX_ENABLE_ANISO_MASK							0x00000030 /* ANISO in bits 4-5; the four values below are 0..3 pre-shifted */
+#define    NV20TCL_TX_ENABLE_ANISO_NONE							0x00000000
+#define    NV20TCL_TX_ENABLE_ANISO_2X							0x00000010
+#define    NV20TCL_TX_ENABLE_ANISO_4X							0x00000020
+#define    NV20TCL_TX_ENABLE_ANISO_8X							0x00000030
+#define   NV20TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT					14
+#define   NV20TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK						0x0003c000 /* MAX_LOD in bits 14-17 */
+#define   NV20TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT					26
+#define   NV20TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK						0x3c000000 /* MIN_LOD in bits 26-29, i.e. above MAX_LOD */
+#define   NV20TCL_TX_ENABLE_ENABLE							(1 << 30)
+#define  NV20TCL_TX_NPOT_PITCH(x)							(0x00001b10+((x)*64)) /* element x is 64 bytes after element x-1 (same for TX_FILTER and TX_NPOT_SIZE) */
+#define  NV20TCL_TX_NPOT_PITCH__SIZE							0x00000004
+#define   NV20TCL_TX_NPOT_PITCH_PITCH_SHIFT						16
+#define   NV20TCL_TX_NPOT_PITCH_PITCH_MASK						0xffff0000 /* PITCH in bits 16-31; the low half has no named field */
+#define  NV20TCL_TX_FILTER(x)								(0x00001b14+((x)*64))
+#define  NV20TCL_TX_FILTER__SIZE							0x00000004
+#define   NV20TCL_TX_FILTER_LOD_BIAS_SHIFT						8
+#define   NV20TCL_TX_FILTER_LOD_BIAS_MASK						0x00000f00 /* LOD_BIAS in bits 8-11 */
+#define   NV20TCL_TX_FILTER_MINIFY_SHIFT						16
+#define   NV20TCL_TX_FILTER_MINIFY_MASK							0x000f0000 /* MINIFY in bits 16-19; named values are 1..6 pre-shifted */
+#define    NV20TCL_TX_FILTER_MINIFY_NEAREST						0x00010000
+#define    NV20TCL_TX_FILTER_MINIFY_LINEAR						0x00020000
+#define    NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST				0x00030000
+#define    NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST				0x00040000
+#define    NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR				0x00050000
+#define    NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR				0x00060000
+#define   NV20TCL_TX_FILTER_MAGNIFY_SHIFT						24
+#define   NV20TCL_TX_FILTER_MAGNIFY_MASK						0x0f000000 /* MAGNIFY in bits 24-27; only NEAREST (1) and LINEAR (2) are named */
+#define    NV20TCL_TX_FILTER_MAGNIFY_NEAREST						0x01000000
+#define    NV20TCL_TX_FILTER_MAGNIFY_LINEAR						0x02000000
+#define  NV20TCL_TX_NPOT_SIZE(x)							(0x00001b1c+((x)*64)) /* 0x1b18 has no define: TX_FILTER is 0x1b14, this is 0x1b1c */
+#define  NV20TCL_TX_NPOT_SIZE__SIZE							0x00000004
+#define   NV20TCL_TX_NPOT_SIZE_H_SHIFT							0
+#define   NV20TCL_TX_NPOT_SIZE_H_MASK							0x0000ffff /* H in bits 0-15 */
+#define   NV20TCL_TX_NPOT_SIZE_W_SHIFT							16
+#define   NV20TCL_TX_NPOT_SIZE_W_MASK							0xffff0000 /* W in bits 16-31 */
+#define  NV20TCL_TX_PALETTE_OFFSET(x)							(0x00001b20+((x)*64)) /* element x is 64 bytes after element x-1 (same for every macro in this group) */
+#define  NV20TCL_TX_PALETTE_OFFSET__SIZE						0x00000004
+#define  NV20TCL_TX_BORDER_COLOR(x)							(0x00001b24+((x)*64))
+#define  NV20TCL_TX_BORDER_COLOR__SIZE							0x00000004
+#define   NV20TCL_TX_BORDER_COLOR_B_SHIFT						0
+#define   NV20TCL_TX_BORDER_COLOR_B_MASK						0x000000ff /* four 8-bit channels: B in the low byte, then G, R, A */
+#define   NV20TCL_TX_BORDER_COLOR_G_SHIFT						8
+#define   NV20TCL_TX_BORDER_COLOR_G_MASK						0x0000ff00
+#define   NV20TCL_TX_BORDER_COLOR_R_SHIFT						16
+#define   NV20TCL_TX_BORDER_COLOR_R_MASK						0x00ff0000
+#define   NV20TCL_TX_BORDER_COLOR_A_SHIFT						24
+#define   NV20TCL_TX_BORDER_COLOR_A_MASK						0xff000000
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX00(x)						(0x00001b28+((x)*64)) /* ascending offsets are 00, 01, 11, 10 - note that 11 comes before 10 */
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX00__SIZE					0x00000004
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX01(x)						(0x00001b2c+((x)*64))
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX01__SIZE					0x00000004
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX11(x)						(0x00001b30+((x)*64))
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX11__SIZE					0x00000004
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX10(x)						(0x00001b34+((x)*64))
+#define  NV20TCL_TX_SHADER_OFFSET_MATRIX10__SIZE					0x00000004
+#define  NV20TCL_DEPTH_UNK17D8								0x00001d78 /* NOTE(review): the name says 17D8 but the offset is 0x1d78 - confirm which is intended */
+#define   NV20TCL_DEPTH_UNK17D8_CLAMP_SHIFT						4
+#define   NV20TCL_DEPTH_UNK17D8_CLAMP_MASK						0x000000f0 /* CLAMP in bits 4-7 */
+#define  NV20TCL_MULTISAMPLE_CONTROL							0x00001d7c
+#define  NV20TCL_CLEAR_DEPTH_VALUE							0x00001d8c
+#define  NV20TCL_CLEAR_VALUE								0x00001d90
+#define  NV20TCL_CLEAR_BUFFERS								0x00001d94 /* flag word: DEPTH bit 0, STENCIL bit 1, colour R/G/B/A bits 4-7; bits 2-3 are unnamed */
+#define   NV20TCL_CLEAR_BUFFERS_COLOR_A							(1 <<  7)
+#define   NV20TCL_CLEAR_BUFFERS_COLOR_B							(1 <<  6)
+#define   NV20TCL_CLEAR_BUFFERS_COLOR_G							(1 <<  5)
+#define   NV20TCL_CLEAR_BUFFERS_COLOR_R							(1 <<  4)
+#define   NV20TCL_CLEAR_BUFFERS_STENCIL							(1 <<  1)
+#define   NV20TCL_CLEAR_BUFFERS_DEPTH							(1 <<  0)
+#define  NV20TCL_RC_COLOR0								0x00001e20 /* four 8-bit channels: B in the low byte, then G, R, A */
+#define   NV20TCL_RC_COLOR0_B_SHIFT							0
+#define   NV20TCL_RC_COLOR0_B_MASK							0x000000ff
+#define   NV20TCL_RC_COLOR0_G_SHIFT							8
+#define   NV20TCL_RC_COLOR0_G_MASK							0x0000ff00
+#define   NV20TCL_RC_COLOR0_R_SHIFT							16
+#define   NV20TCL_RC_COLOR0_R_MASK							0x00ff0000
+#define   NV20TCL_RC_COLOR0_A_SHIFT							24
+#define   NV20TCL_RC_COLOR0_A_MASK							0xff000000
+#define  NV20TCL_RC_COLOR1								0x00001e24 /* same channel layout as RC_COLOR0, next word */
+#define   NV20TCL_RC_COLOR1_B_SHIFT							0
+#define   NV20TCL_RC_COLOR1_B_MASK							0x000000ff
+#define   NV20TCL_RC_COLOR1_G_SHIFT							8
+#define   NV20TCL_RC_COLOR1_G_MASK							0x0000ff00
+#define   NV20TCL_RC_COLOR1_R_SHIFT							16
+#define   NV20TCL_RC_COLOR1_R_MASK							0x00ff0000
+#define   NV20TCL_RC_COLOR1_A_SHIFT							24
+#define   NV20TCL_RC_COLOR1_A_MASK							0xff000000
+#define  NV20TCL_BACK_MATERIAL_SHININESS(x)						(0x00001e28+((x)*4)) /* offset of word x; words are 4 bytes apart */
+#define  NV20TCL_BACK_MATERIAL_SHININESS__SIZE						0x00000006 /* if this is the word count, x = 0..5 spans 0x1e28..0x1e3c - confirm */
+#define  NV20TCL_RC_OUT_RGB(x)								(0x00001e40+((x)*4))	/* indexed offset: base 0x1e40, 4 bytes per index x */
+#define  NV20TCL_RC_OUT_RGB__SIZE							0x00000008	/* presumably the number of valid indices (8) -- TODO confirm */
+#define   NV20TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT						0	/* CD_OUTPUT selector: bits 3:0 */
+#define   NV20TCL_RC_OUT_RGB_CD_OUTPUT_MASK						0x0000000f
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_ZERO						0x00000000
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0					0x00000001
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1					0x00000002
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_FOG						0x00000003
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR					0x00000004
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR					0x00000005
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0					0x00000008
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1					0x00000009
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0						0x0000000c
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1						0x0000000d
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F					0x0000000f
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE2					0x0000000a
+#define    NV20TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE3					0x0000000b
+#define   NV20TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT						4	/* AB_OUTPUT selector: bits 7:4; values below are already shifted into place */
+#define   NV20TCL_RC_OUT_RGB_AB_OUTPUT_MASK						0x000000f0
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_ZERO						0x00000000
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0					0x00000010
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1					0x00000020
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_FOG						0x00000030
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR					0x00000040
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR					0x00000050
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0					0x00000080
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1					0x00000090
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0						0x000000c0
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1						0x000000d0
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000000e0
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F					0x000000f0
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE2					0x000000a0
+#define    NV20TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE3					0x000000b0
+#define   NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT						8	/* SUM_OUTPUT selector: bits 11:8; values below are already shifted into place */
+#define   NV20TCL_RC_OUT_RGB_SUM_OUTPUT_MASK						0x00000f00
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO						0x00000000
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0				0x00000100
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1				0x00000200
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_FOG						0x00000300
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR					0x00000400
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR				0x00000500
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0					0x00000800
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1					0x00000900
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0						0x00000c00
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1						0x00000d00
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F					0x00000f00
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE2					0x00000a00
+#define    NV20TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE3					0x00000b00
+#define   NV20TCL_RC_OUT_RGB_CD_DOT_PRODUCT						(1 << 12)	/* single-bit flags in bits 12..15 follow */
+#define   NV20TCL_RC_OUT_RGB_AB_DOT_PRODUCT						(1 << 13)
+#define   NV20TCL_RC_OUT_RGB_MUX_SUM							(1 << 14)
+#define   NV20TCL_RC_OUT_RGB_BIAS							(1 << 15)	/* BIAS_BY_NEGATIVE_ONE_HALF below is this same bit (0x8000) */
+#define    NV20TCL_RC_OUT_RGB_BIAS_NONE							0x00000000
+#define    NV20TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF				0x00008000
+#define   NV20TCL_RC_OUT_RGB_SCALE_SHIFT						17	/* SCALE selector: 2-bit field in bits 18:17 */
+#define   NV20TCL_RC_OUT_RGB_SCALE_MASK							0x00060000	/* must cover every SCALE_* value below; a zero mask would always extract 0 */
+#define    NV20TCL_RC_OUT_RGB_SCALE_NONE						0x00000000
+#define    NV20TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO					0x00020000
+#define    NV20TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR					0x00040000
+#define    NV20TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF					0x00060000
+#define  NV20TCL_RC_ENABLE								0x00001e60	/* NUM_COMBINERS field: bits 3:0 */
+#define   NV20TCL_RC_ENABLE_NUM_COMBINERS_SHIFT						0
+#define   NV20TCL_RC_ENABLE_NUM_COMBINERS_MASK						0x0000000f
+#define  NV20TCL_TX_RCOMP								0x00001e6c	/* values 0..7; NOTE(review): ordering (GREATER=1, LESS=4) differs from the 0x0200-based compare values used by NV34TCL_ALPHA_FUNC_FUNC -- do not interchange */
+#define   NV20TCL_TX_RCOMP_NEVER							0x00000000
+#define   NV20TCL_TX_RCOMP_GREATER							0x00000001
+#define   NV20TCL_TX_RCOMP_EQUAL							0x00000002
+#define   NV20TCL_TX_RCOMP_GEQUAL							0x00000003
+#define   NV20TCL_TX_RCOMP_LESS								0x00000004
+#define   NV20TCL_TX_RCOMP_NOTEQUAL							0x00000005
+#define   NV20TCL_TX_RCOMP_LEQUAL							0x00000006
+#define   NV20TCL_TX_RCOMP_ALWAYS							0x00000007
+#define  NV20TCL_TX_SHADER_OP								0x00001e70	/* four 5-bit op fields: TX0 4:0, TX1 9:5, TX2 14:10, TX3 19:15 */
+#define   NV20TCL_TX_SHADER_OP_TX0_SHIFT						0
+#define   NV20TCL_TX_SHADER_OP_TX0_MASK							0x0000001f
+#define    NV20TCL_TX_SHADER_OP_TX0_NONE						0x00000000
+#define    NV20TCL_TX_SHADER_OP_TX0_TEXTURE_2D						0x00000001
+#define    NV20TCL_TX_SHADER_OP_TX0_PASS_THROUGH					0x00000004
+#define    NV20TCL_TX_SHADER_OP_TX0_CULL_FRAGMENT					0x00000005
+#define    NV20TCL_TX_SHADER_OP_TX0_OFFSET_TEXTURE_2D					0x00000006
+#define    NV20TCL_TX_SHADER_OP_TX0_DOT_PRODUCT_TEXTURE_2D				0x00000009
+#define    NV20TCL_TX_SHADER_OP_TX0_DOT_PRODUCT_DEPTH_REPLACE				0x0000000a
+#define    NV20TCL_TX_SHADER_OP_TX0_DEPENDANT_AR_TEXTURE_2D				0x0000000f
+#define    NV20TCL_TX_SHADER_OP_TX0_DEPENDANT_GB_TEXTURE_2D				0x00000010
+#define    NV20TCL_TX_SHADER_OP_TX0_DOT_PRODUCT						0x00000011
+#define   NV20TCL_TX_SHADER_OP_TX1_SHIFT						5	/* TX1 values are the TX0 values shifted left by 5 */
+#define   NV20TCL_TX_SHADER_OP_TX1_MASK							0x000003e0
+#define    NV20TCL_TX_SHADER_OP_TX1_NONE						0x00000000
+#define    NV20TCL_TX_SHADER_OP_TX1_TEXTURE_2D						0x00000020
+#define    NV20TCL_TX_SHADER_OP_TX1_PASS_THROUGH					0x00000080
+#define    NV20TCL_TX_SHADER_OP_TX1_CULL_FRAGMENT					0x000000a0
+#define    NV20TCL_TX_SHADER_OP_TX1_OFFSET_TEXTURE_2D					0x000000c0
+#define    NV20TCL_TX_SHADER_OP_TX1_DOT_PRODUCT_TEXTURE_2D				0x00000120
+#define    NV20TCL_TX_SHADER_OP_TX1_DOT_PRODUCT_DEPTH_REPLACE				0x00000140
+#define    NV20TCL_TX_SHADER_OP_TX1_DEPENDANT_AR_TEXTURE_2D				0x000001e0
+#define    NV20TCL_TX_SHADER_OP_TX1_DEPENDANT_GB_TEXTURE_2D				0x00000200
+#define    NV20TCL_TX_SHADER_OP_TX1_DOT_PRODUCT						0x00000220
+#define   NV20TCL_TX_SHADER_OP_TX2_SHIFT						10	/* TX2 values are the TX0 values shifted left by 10 */
+#define   NV20TCL_TX_SHADER_OP_TX2_MASK							0x00007c00
+#define    NV20TCL_TX_SHADER_OP_TX2_NONE						0x00000000
+#define    NV20TCL_TX_SHADER_OP_TX2_TEXTURE_2D						0x00000400
+#define    NV20TCL_TX_SHADER_OP_TX2_PASS_THROUGH					0x00001000
+#define    NV20TCL_TX_SHADER_OP_TX2_CULL_FRAGMENT					0x00001400
+#define    NV20TCL_TX_SHADER_OP_TX2_OFFSET_TEXTURE_2D					0x00001800
+#define    NV20TCL_TX_SHADER_OP_TX2_DOT_PRODUCT_TEXTURE_2D				0x00002400
+#define    NV20TCL_TX_SHADER_OP_TX2_DOT_PRODUCT_DEPTH_REPLACE				0x00002800
+#define    NV20TCL_TX_SHADER_OP_TX2_DEPENDANT_AR_TEXTURE_2D				0x00003c00
+#define    NV20TCL_TX_SHADER_OP_TX2_DEPENDANT_GB_TEXTURE_2D				0x00004000
+#define    NV20TCL_TX_SHADER_OP_TX2_DOT_PRODUCT						0x00004400
+#define   NV20TCL_TX_SHADER_OP_TX3_SHIFT						15	/* TX3 values are the TX0 values shifted left by 15 */
+#define   NV20TCL_TX_SHADER_OP_TX3_MASK							0x000f8000
+#define    NV20TCL_TX_SHADER_OP_TX3_NONE						0x00000000
+#define    NV20TCL_TX_SHADER_OP_TX3_TEXTURE_2D						0x00008000
+#define    NV20TCL_TX_SHADER_OP_TX3_PASS_THROUGH					0x00020000
+#define    NV20TCL_TX_SHADER_OP_TX3_CULL_FRAGMENT					0x00028000
+#define    NV20TCL_TX_SHADER_OP_TX3_OFFSET_TEXTURE_2D					0x00030000
+#define    NV20TCL_TX_SHADER_OP_TX3_DOT_PRODUCT_TEXTURE_2D				0x00048000
+#define    NV20TCL_TX_SHADER_OP_TX3_DOT_PRODUCT_DEPTH_REPLACE				0x00050000
+#define    NV20TCL_TX_SHADER_OP_TX3_DEPENDANT_AR_TEXTURE_2D				0x00078000
+#define    NV20TCL_TX_SHADER_OP_TX3_DEPENDANT_GB_TEXTURE_2D				0x00080000
+#define    NV20TCL_TX_SHADER_OP_TX3_DOT_PRODUCT						0x00088000
+#define  NV20TCL_TX_SHADER_DOTMAPPING							0x00001e74	/* four 4-bit fields: TX0 3:0, TX1 7:4, TX2 11:8, TX3 15:12 */
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX0_SHIFT					0
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX0_MASK						0x0000000f
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX1_SHIFT					4
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX1_MASK						0x000000f0
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX2_SHIFT					8
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX2_MASK						0x00000f00
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX3_SHIFT					12
+#define   NV20TCL_TX_SHADER_DOTMAPPING_TX3_MASK						0x0000f000
+#define  NV20TCL_TX_SHADER_PREVIOUS							0x00001e78	/* fields: TX0 11:8, TX1 15:12, TX2 17:16, TX3 21:20 */
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX0_SHIFT						8
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX0_MASK						0x00000f00
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX1_SHIFT						12
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX1_MASK						0x0000f000
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX2_SHIFT						16
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX2_MASK						0x00030000	/* NOTE(review): only 2 bits wide, unlike the 4-bit TX0/TX1 fields -- confirm against hardware docs */
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX3_SHIFT						20
+#define   NV20TCL_TX_SHADER_PREVIOUS_TX3_MASK						0x00300000	/* NOTE(review): 2 bits wide, same as TX2 -- confirm */
+#define  NV20TCL_ENGINE									0x00001e94	/* flag word: VP bit 1, FIXED bit 2; bit 0 has no name here */
+#define   NV20TCL_ENGINE_VP								(1 <<  1)
+#define   NV20TCL_ENGINE_FIXED								(1 <<  2)
+#define  NV20TCL_VP_UPLOAD_FROM_ID							0x00001e9c	/* the three VP_* offsets have no bit-fields defined */
+#define  NV20TCL_VP_START_FROM_ID							0x00001ea0
+#define  NV20TCL_VP_UPLOAD_CONST_ID							0x00001ea4
+
+
+#define NV25TCL										0x00000597	/* presumably the object class id for the NV25TCL_ prefix -- TODO confirm */
+
+#define  NV25TCL_DMA_IN_MEMORY4								0x0000019c	/* only four NV25TCL_-prefixed offsets are defined in this section */
+#define  NV25TCL_DMA_IN_MEMORY5								0x000001a0
+#define  NV25TCL_DMA_IN_MEMORY8								0x000001ac
+#define  NV25TCL_DMA_IN_MEMORY9								0x000001b0
+
+
+#define NV30TCL										0x00000397	/* no NV30TCL_-prefixed definitions follow this line */
+
+
+
+#define NV35TCL										0x00000497	/* no NV35TCL_-prefixed definitions follow this line */
+
+
+
+#define NV34TCL										0x00000697	/* the NV34TCL_-prefixed definitions below belong to this value's prefix */
+
+#define  NV34TCL_NOP									0x00000100
+#define  NV34TCL_NOTIFY									0x00000104
+#define  NV34TCL_DMA_NOTIFY								0x00000180	/* DMA_* offsets run 0x180..0x1b0 in 4-byte steps, except 0x190 which is not defined here */
+#define  NV34TCL_DMA_TEXTURE0								0x00000184
+#define  NV34TCL_DMA_TEXTURE1								0x00000188
+#define  NV34TCL_DMA_COLOR1								0x0000018c	/* note: COLOR1 is at a lower offset than COLOR0 */
+#define  NV34TCL_DMA_COLOR0								0x00000194
+#define  NV34TCL_DMA_ZETA								0x00000198
+#define  NV34TCL_DMA_VTXBUF0								0x0000019c
+#define  NV34TCL_DMA_VTXBUF1								0x000001a0
+#define  NV34TCL_DMA_FENCE								0x000001a4
+#define  NV34TCL_DMA_QUERY								0x000001a8
+#define  NV34TCL_DMA_IN_MEMORY7								0x000001ac
+#define  NV34TCL_DMA_IN_MEMORY8								0x000001b0
+#define  NV34TCL_RT_HORIZ								0x00000200	/* X in bits 15:0, W in bits 31:16 */
+#define   NV34TCL_RT_HORIZ_X_SHIFT							0
+#define   NV34TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_RT_HORIZ_W_SHIFT							16
+#define   NV34TCL_RT_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_RT_VERT								0x00000204	/* Y in bits 15:0, H in bits 31:16 */
+#define   NV34TCL_RT_VERT_Y_SHIFT							0
+#define   NV34TCL_RT_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_RT_VERT_H_SHIFT							16
+#define   NV34TCL_RT_VERT_H_MASK							0xffff0000
+#define  NV34TCL_RT_FORMAT								0x00000208	/* fields: LOG2_HEIGHT 31:24, LOG2_WIDTH 23:16, TYPE 11:8, ZETA 7:5, COLOR 4:0 */
+#define   NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT						24
+#define   NV34TCL_RT_FORMAT_LOG2_HEIGHT_MASK						0xff000000
+#define   NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT						16
+#define   NV34TCL_RT_FORMAT_LOG2_WIDTH_MASK						0x00ff0000
+#define   NV34TCL_RT_FORMAT_TYPE_SHIFT							8
+#define   NV34TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV34TCL_RT_FORMAT_TYPE_LINEAR						0x00000100	/* TYPE values are already shifted into bits 11:8 */
+#define    NV34TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV34TCL_RT_FORMAT_ZETA_SHIFT							5
+#define   NV34TCL_RT_FORMAT_ZETA_MASK							0x000000e0
+#define    NV34TCL_RT_FORMAT_ZETA_Z16							0x00000020	/* ZETA values are already shifted into bits 7:5 */
+#define    NV34TCL_RT_FORMAT_ZETA_Z24S8							0x00000040
+#define   NV34TCL_RT_FORMAT_COLOR_SHIFT							0
+#define   NV34TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV34TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV34TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV34TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV34TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV34TCL_RT_FORMAT_COLOR_UNKNOWN						0x0000000d
+#define    NV34TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV34TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV34TCL_COLOR0_PITCH								0x0000020c	/* shared word: COLOR0 pitch in bits 15:0, ZETA pitch in bits 31:16 */
+#define   NV34TCL_COLOR0_PITCH_COLOR0_SHIFT						0
+#define   NV34TCL_COLOR0_PITCH_COLOR0_MASK						0x0000ffff
+#define   NV34TCL_COLOR0_PITCH_ZETA_SHIFT						16
+#define   NV34TCL_COLOR0_PITCH_ZETA_MASK						0xffff0000
+#define  NV34TCL_COLOR0_OFFSET								0x00000210
+#define  NV34TCL_ZETA_OFFSET								0x00000214
+#define  NV34TCL_COLOR1_OFFSET								0x00000218
+#define  NV34TCL_COLOR1_PITCH								0x0000021c	/* unlike COLOR0_PITCH, no sub-fields are defined for this offset */
+#define  NV34TCL_RT_ENABLE								0x00000220	/* flag word: COLOR0 bit 0, COLOR1 bit 1, MRT bit 4 */
+#define   NV34TCL_RT_ENABLE_MRT								(1 <<  4)
+#define   NV34TCL_RT_ENABLE_COLOR1							(1 <<  1)
+#define   NV34TCL_RT_ENABLE_COLOR0							(1 <<  0)
+#define  NV34TCL_LMA_DEPTH_PITCH							0x0000022c
+#define  NV34TCL_LMA_DEPTH_OFFSET							0x00000230
+#define  NV34TCL_TX_UNITS_ENABLE							0x0000023c	/* flag word: TXn is bit n, for n = 0..7 */
+#define   NV34TCL_TX_UNITS_ENABLE_TX0							(1 <<  0)
+#define   NV34TCL_TX_UNITS_ENABLE_TX1							(1 <<  1)
+#define   NV34TCL_TX_UNITS_ENABLE_TX2							(1 <<  2)
+#define   NV34TCL_TX_UNITS_ENABLE_TX3							(1 <<  3)
+#define   NV34TCL_TX_UNITS_ENABLE_TX4							(1 <<  4)
+#define   NV34TCL_TX_UNITS_ENABLE_TX5							(1 <<  5)
+#define   NV34TCL_TX_UNITS_ENABLE_TX6							(1 <<  6)
+#define   NV34TCL_TX_UNITS_ENABLE_TX7							(1 <<  7)
+#define  NV34TCL_TX_MATRIX_ENABLE(x)							(0x00000240+((x)*4))	/* indexed offset: base 0x240, 4 bytes per index x */
+#define  NV34TCL_TX_MATRIX_ENABLE__SIZE							0x00000008	/* presumably the number of valid indices (8) -- TODO confirm */
+#define  NV34TCL_VIEWPORT_TX_ORIGIN							0x000002b8	/* X in bits 15:0, Y in bits 31:16 */
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_X_SHIFT						0
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_X_MASK						0x0000ffff
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_Y_SHIFT						16
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_Y_MASK						0xffff0000
+#define  NV34TCL_VIEWPORT_CLIP_MODE							0x000002bc
+#define  NV34TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*8))	/* 8-byte stride: interleaved with CLIP_VERT(x), which sits 4 bytes higher */
+#define  NV34TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008	/* presumably the number of valid indices (8) -- TODO confirm */
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_L_SHIFT						0
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_L_MASK						0x0000ffff
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_R_SHIFT						16
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_R_MASK						0xffff0000
+#define  NV34TCL_VIEWPORT_CLIP_VERT(x)							(0x000002c4+((x)*8))	/* 8-byte stride, base 0x2c4 (CLIP_HORIZ base + 4) */
+#define  NV34TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define   NV34TCL_VIEWPORT_CLIP_VERT_T_SHIFT						0
+#define   NV34TCL_VIEWPORT_CLIP_VERT_T_MASK						0x0000ffff
+#define   NV34TCL_VIEWPORT_CLIP_VERT_D_SHIFT						16
+#define   NV34TCL_VIEWPORT_CLIP_VERT_D_MASK						0xffff0000
+#define  NV34TCL_DITHER_ENABLE								0x00000300
+#define  NV34TCL_ALPHA_FUNC_ENABLE							0x00000304
+#define  NV34TCL_ALPHA_FUNC_FUNC							0x00000308	/* values equal the OpenGL compare tokens GL_NEVER (0x0200) .. GL_ALWAYS (0x0207) */
+#define   NV34TCL_ALPHA_FUNC_FUNC_NEVER							0x00000200
+#define   NV34TCL_ALPHA_FUNC_FUNC_LESS							0x00000201
+#define   NV34TCL_ALPHA_FUNC_FUNC_EQUAL							0x00000202
+#define   NV34TCL_ALPHA_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV34TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV34TCL_ALPHA_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV34TCL_ALPHA_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV34TCL_ALPHA_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV34TCL_ALPHA_FUNC_REF								0x0000030c
+#define  NV34TCL_BLEND_FUNC_ENABLE							0x00000310
+#define  NV34TCL_BLEND_FUNC_SRC								0x00000314	/* RGB factor in bits 15:0, ALPHA factor in bits 31:16 */
+#define   NV34TCL_BLEND_FUNC_SRC_RGB_SHIFT						0
+#define   NV34TCL_BLEND_FUNC_SRC_RGB_MASK						0x0000ffff
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000	/* RGB values equal the OpenGL blend-factor tokens (GL_ZERO, GL_ONE, GL_SRC_COLOR = 0x0300, ...) */
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV34TCL_BLEND_FUNC_SRC_ALPHA_SHIFT						16
+#define   NV34TCL_BLEND_FUNC_SRC_ALPHA_MASK						0xffff0000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000	/* ALPHA values are the RGB values shifted left by 16 */
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00010000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x03000000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x03020000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x03040000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x03060000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV34TCL_BLEND_FUNC_DST								0x00000318	/* RGB factor in bits 15:0, ALPHA factor in bits 31:16 */
+#define   NV34TCL_BLEND_FUNC_DST_RGB_SHIFT						0
+#define   NV34TCL_BLEND_FUNC_DST_RGB_MASK						0x0000ffff
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000	/* RGB values equal the OpenGL blend-factor tokens (GL_ZERO, GL_ONE, GL_SRC_COLOR = 0x0300, ...) */
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
+#define    NV34TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV34TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV34TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV34TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV34TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV34TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV34TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV34TCL_BLEND_FUNC_DST_ALPHA_SHIFT						16
+#define   NV34TCL_BLEND_FUNC_DST_ALPHA_MASK						0xffff0000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000	/* ALPHA values are the RGB values shifted left by 16 */
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00010000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x03000000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x03020000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x03040000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x03060000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV34TCL_BLEND_COLOR								0x0000031c	/* packed 8-bit channels: B 7:0, G 15:8, R 23:16, A 31:24 */
+#define   NV34TCL_BLEND_COLOR_B_SHIFT							0
+#define   NV34TCL_BLEND_COLOR_B_MASK							0x000000ff
+#define   NV34TCL_BLEND_COLOR_G_SHIFT							8
+#define   NV34TCL_BLEND_COLOR_G_MASK							0x0000ff00
+#define   NV34TCL_BLEND_COLOR_R_SHIFT							16
+#define   NV34TCL_BLEND_COLOR_R_MASK							0x00ff0000
+#define   NV34TCL_BLEND_COLOR_A_SHIFT							24
+#define   NV34TCL_BLEND_COLOR_A_MASK							0xff000000
+#define  NV34TCL_BLEND_EQUATION								0x00000320	/* values equal the OpenGL tokens GL_FUNC_ADD (0x8006), GL_MIN, GL_MAX, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT */
+#define   NV34TCL_BLEND_EQUATION_FUNC_ADD						0x00008006
+#define   NV34TCL_BLEND_EQUATION_MIN							0x00008007
+#define   NV34TCL_BLEND_EQUATION_MAX							0x00008008
+#define   NV34TCL_BLEND_EQUATION_FUNC_SUBTRACT						0x0000800a
+#define   NV34TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT					0x0000800b
+#define  NV34TCL_COLOR_MASK								0x00000324	/* one 8-bit field per channel: B 7:0, G 15:8, R 23:16, A 31:24 */
+#define   NV34TCL_COLOR_MASK_B_SHIFT							0
+#define   NV34TCL_COLOR_MASK_B_MASK							0x000000ff
+#define   NV34TCL_COLOR_MASK_G_SHIFT							8
+#define   NV34TCL_COLOR_MASK_G_MASK							0x0000ff00
+#define   NV34TCL_COLOR_MASK_R_SHIFT							16
+#define   NV34TCL_COLOR_MASK_R_MASK							0x00ff0000
+#define   NV34TCL_COLOR_MASK_A_SHIFT							24
+#define   NV34TCL_COLOR_MASK_A_MASK							0xff000000
+#define  NV34TCL_STENCIL_FRONT_ENABLE							0x00000328	/* STENCIL_FRONT_* offsets run 0x328..0x344 in 4-byte steps */
+#define  NV34TCL_STENCIL_FRONT_MASK							0x0000032c
+#define  NV34TCL_STENCIL_FRONT_FUNC_FUNC						0x00000330	/* values equal the OpenGL compare tokens GL_NEVER (0x0200) .. GL_ALWAYS (0x0207) */
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NV34TCL_STENCIL_FRONT_FUNC_REF							0x00000334
+#define  NV34TCL_STENCIL_FRONT_FUNC_MASK						0x00000338
+#define  NV34TCL_STENCIL_FRONT_OP_FAIL							0x0000033c	/* OP_* values equal the OpenGL stencil-op tokens (GL_KEEP = 0x1e00, GL_INVERT = 0x150a, GL_INCR_WRAP = 0x8507, ...) */
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_FRONT_OP_ZFAIL							0x00000340	/* same value set as OP_FAIL */
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_FRONT_OP_ZPASS							0x00000344	/* same value set as OP_FAIL */
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_BACK_ENABLE							0x00000348	/* STENCIL_BACK_* offsets run 0x348..0x364, each 0x20 above its STENCIL_FRONT_* counterpart */
+#define  NV34TCL_STENCIL_BACK_MASK							0x0000034c
+#define  NV34TCL_STENCIL_BACK_FUNC_FUNC							0x00000350	/* values equal the OpenGL compare tokens GL_NEVER (0x0200) .. GL_ALWAYS (0x0207) */
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV34TCL_STENCIL_BACK_FUNC_REF							0x00000354
+#define  NV34TCL_STENCIL_BACK_FUNC_MASK							0x00000358
+#define  NV34TCL_STENCIL_BACK_OP_FAIL							0x0000035c	/* OP_* values equal the OpenGL stencil-op tokens (GL_KEEP = 0x1e00, GL_INVERT = 0x150a, GL_INCR_WRAP = 0x8507, ...) */
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_BACK_OP_ZFAIL							0x00000360	/* same value set as OP_FAIL */
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_BACK_OP_ZPASS							0x00000364	/* same value set as OP_FAIL */
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV34TCL_SHADE_MODEL								0x00000368	/* values equal the OpenGL tokens GL_FLAT (0x1d00) and GL_SMOOTH (0x1d01) */
+#define   NV34TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV34TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV34TCL_FOG_ENABLE								0x0000036c
+#define  NV34TCL_FOG_COLOR								0x00000370	/* R 7:0, G 15:8, B 23:16, A 31:24; NOTE(review): R/B order is the reverse of NV34TCL_BLEND_COLOR -- confirm against hardware docs */
+#define   NV34TCL_FOG_COLOR_R_SHIFT							0
+#define   NV34TCL_FOG_COLOR_R_MASK							0x000000ff
+#define   NV34TCL_FOG_COLOR_G_SHIFT							8
+#define   NV34TCL_FOG_COLOR_G_MASK							0x0000ff00
+#define   NV34TCL_FOG_COLOR_B_SHIFT							16
+#define   NV34TCL_FOG_COLOR_B_MASK							0x00ff0000
+#define   NV34TCL_FOG_COLOR_A_SHIFT							24
+#define   NV34TCL_FOG_COLOR_A_MASK							0xff000000
+#define  NV34TCL_COLOR_LOGIC_OP_ENABLE							0x00000374
+#define  NV34TCL_COLOR_LOGIC_OP_OP							0x00000378	/* values equal the OpenGL logic-op tokens GL_CLEAR (0x1500) .. GL_SET (0x150f) */
+#define   NV34TCL_COLOR_LOGIC_OP_OP_CLEAR						0x00001500
+#define   NV34TCL_COLOR_LOGIC_OP_OP_AND							0x00001501
+#define   NV34TCL_COLOR_LOGIC_OP_OP_AND_REVERSE						0x00001502
+#define   NV34TCL_COLOR_LOGIC_OP_OP_COPY						0x00001503
+#define   NV34TCL_COLOR_LOGIC_OP_OP_AND_INVERTED					0x00001504
+#define   NV34TCL_COLOR_LOGIC_OP_OP_NOOP						0x00001505
+#define   NV34TCL_COLOR_LOGIC_OP_OP_XOR							0x00001506
+#define   NV34TCL_COLOR_LOGIC_OP_OP_OR							0x00001507
+#define   NV34TCL_COLOR_LOGIC_OP_OP_NOR							0x00001508
+#define   NV34TCL_COLOR_LOGIC_OP_OP_EQUIV						0x00001509
+#define   NV34TCL_COLOR_LOGIC_OP_OP_INVERT						0x0000150a
+#define   NV34TCL_COLOR_LOGIC_OP_OP_OR_REVERSE						0x0000150b
+#define   NV34TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED					0x0000150c
+#define   NV34TCL_COLOR_LOGIC_OP_OP_OR_INVERTED						0x0000150d
+#define   NV34TCL_COLOR_LOGIC_OP_OP_NAND						0x0000150e
+#define   NV34TCL_COLOR_LOGIC_OP_OP_SET							0x0000150f
+#define  NV34TCL_NORMALIZE_ENABLE							0x0000037c
+#define  NV34TCL_COLOR_MATERIAL								0x00000390	/* flag word using even bits only: FRONT_* in bits 0/2/4/6, BACK_* in bits 8/10/12/14 */
+#define   NV34TCL_COLOR_MATERIAL_FRONT_EMISSION_ENABLE					(1 <<  0)
+#define   NV34TCL_COLOR_MATERIAL_FRONT_AMBIENT_ENABLE					(1 <<  2)
+#define   NV34TCL_COLOR_MATERIAL_FRONT_DIFFUSE_ENABLE					(1 <<  4)
+#define   NV34TCL_COLOR_MATERIAL_FRONT_SPECULAR_ENABLE					(1 <<  6)
+#define   NV34TCL_COLOR_MATERIAL_BACK_EMISSION_ENABLE					(1 <<  8)
+#define   NV34TCL_COLOR_MATERIAL_BACK_AMBIENT_ENABLE					(1 << 10)
+#define   NV34TCL_COLOR_MATERIAL_BACK_DIFFUSE_ENABLE					(1 << 12)
+#define   NV34TCL_COLOR_MATERIAL_BACK_SPECULAR_ENABLE					(1 << 14)
+#define  NV34TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV34TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV34TCL_COLOR_MATERIAL_FRONT_R							0x000003a0
+#define  NV34TCL_COLOR_MATERIAL_FRONT_G							0x000003a4
+#define  NV34TCL_COLOR_MATERIAL_FRONT_B							0x000003a8
+#define  NV34TCL_COLOR_MATERIAL_FRONT_A							0x000003b4	/* NOTE(review): not contiguous with R/G/B (0x3ac and 0x3b0 are skipped) -- confirm against hardware docs */
+#define  NV34TCL_LINE_WIDTH								0x000003b8
+#define  NV34TCL_LINE_SMOOTH_ENABLE							0x000003bc
+#define  NV34TCL_TX_GEN_S(x)								(0x00000400+((x)*16))	/* 16-byte stride per index x; S/T/R/Q sit at +0x0/+0x4/+0x8/+0xc within each group */
+#define  NV34TCL_TX_GEN_S__SIZE								0x00000008	/* presumably the number of valid indices (8) -- TODO confirm */
+#define   NV34TCL_TX_GEN_S_FALSE							0x00000000
+#define   NV34TCL_TX_GEN_S_EYE_LINEAR							0x00002400	/* non-zero values equal the OpenGL texgen tokens (GL_EYE_LINEAR = 0x2400, GL_NORMAL_MAP = 0x8511, ...) */
+#define   NV34TCL_TX_GEN_S_OBJECT_LINEAR						0x00002401
+#define   NV34TCL_TX_GEN_S_SPHERE_MAP							0x00002402
+#define   NV34TCL_TX_GEN_S_NORMAL_MAP							0x00008511
+#define   NV34TCL_TX_GEN_S_REFLECTION_MAP						0x00008512
+#define  NV34TCL_TX_GEN_T(x)								(0x00000404+((x)*16))	/* same value set as TX_GEN_S */
+#define  NV34TCL_TX_GEN_T__SIZE								0x00000008
+#define   NV34TCL_TX_GEN_T_FALSE							0x00000000
+#define   NV34TCL_TX_GEN_T_EYE_LINEAR							0x00002400
+#define   NV34TCL_TX_GEN_T_OBJECT_LINEAR						0x00002401
+#define   NV34TCL_TX_GEN_T_SPHERE_MAP							0x00002402
+#define   NV34TCL_TX_GEN_T_NORMAL_MAP							0x00008511
+#define   NV34TCL_TX_GEN_T_REFLECTION_MAP						0x00008512
+#define  NV34TCL_TX_GEN_R(x)								(0x00000408+((x)*16))	/* same value set as TX_GEN_S */
+#define  NV34TCL_TX_GEN_R__SIZE								0x00000008
+#define   NV34TCL_TX_GEN_R_FALSE							0x00000000
+#define   NV34TCL_TX_GEN_R_EYE_LINEAR							0x00002400
+#define   NV34TCL_TX_GEN_R_OBJECT_LINEAR						0x00002401
+#define   NV34TCL_TX_GEN_R_SPHERE_MAP							0x00002402
+#define   NV34TCL_TX_GEN_R_NORMAL_MAP							0x00008511
+#define   NV34TCL_TX_GEN_R_REFLECTION_MAP						0x00008512
+#define  NV34TCL_TX_GEN_Q(x)								(0x0000040c+((x)*16))	/* same value set as TX_GEN_S */
+#define  NV34TCL_TX_GEN_Q__SIZE								0x00000008
+#define   NV34TCL_TX_GEN_Q_FALSE							0x00000000
+#define   NV34TCL_TX_GEN_Q_EYE_LINEAR							0x00002400
+#define   NV34TCL_TX_GEN_Q_OBJECT_LINEAR						0x00002401
+#define   NV34TCL_TX_GEN_Q_SPHERE_MAP							0x00002402
+#define   NV34TCL_TX_GEN_Q_NORMAL_MAP							0x00008511
+#define   NV34TCL_TX_GEN_Q_REFLECTION_MAP						0x00008512
+#define  NV34TCL_MODELVIEW_MATRIX(x)							(0x00000480+((x)*4))	/* indexed offset: base 0x480, 4 bytes per index x */
+#define  NV34TCL_MODELVIEW_MATRIX__SIZE							0x00000010	/* presumably 16 elements -- TODO confirm */
+#define  NV34TCL_INVERSE_MODELVIEW_MATRIX(x)						(0x00000580+((x)*4))
+#define  NV34TCL_INVERSE_MODELVIEW_MATRIX__SIZE						0x0000000c	/* 0xc (12), unlike the 0x10 of the other matrices here */
+#define  NV34TCL_PROJECTION_MATRIX(x)							(0x00000680+((x)*4))
+#define  NV34TCL_PROJECTION_MATRIX__SIZE						0x00000010
+#define  NV34TCL_TX0_MATRIX(x)								(0x000006c0+((x)*4))	/* TXn_MATRIX bases are 0x40 apart: 0x6c0 + n*0x40 for n = 0..7 */
+#define  NV34TCL_TX0_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX1_MATRIX(x)								(0x00000700+((x)*4))
+#define  NV34TCL_TX1_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX2_MATRIX(x)								(0x00000740+((x)*4))
+#define  NV34TCL_TX2_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX3_MATRIX(x)								(0x00000780+((x)*4))
+#define  NV34TCL_TX3_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX4_MATRIX(x)								(0x000007c0+((x)*4))
+#define  NV34TCL_TX4_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX5_MATRIX(x)								(0x00000800+((x)*4))
+#define  NV34TCL_TX5_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX6_MATRIX(x)								(0x00000840+((x)*4))
+#define  NV34TCL_TX6_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX7_MATRIX(x)								(0x00000880+((x)*4))
+#define  NV34TCL_TX7_MATRIX__SIZE							0x00000010
+#define  NV34TCL_SCISSOR_HORIZ								0x000008c0
+#define   NV34TCL_SCISSOR_HORIZ_X_SHIFT							0
+#define   NV34TCL_SCISSOR_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_SCISSOR_HORIZ_W_SHIFT							16
+#define   NV34TCL_SCISSOR_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_SCISSOR_VERT								0x000008c4
+#define   NV34TCL_SCISSOR_VERT_Y_SHIFT							0
+#define   NV34TCL_SCISSOR_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_SCISSOR_VERT_H_SHIFT							16
+#define   NV34TCL_SCISSOR_VERT_H_MASK							0xffff0000
+#define  NV34TCL_FOG_COORD_DIST								0x000008c8 /* first of five consecutive fog offsets (0x8c8-0x8d8, step 4); no bitfields are defined for them here */
+#define  NV34TCL_FOG_MODE								0x000008cc
+#define  NV34TCL_FOG_EQUATION_CONSTANT							0x000008d0
+#define  NV34TCL_FOG_EQUATION_LINEAR							0x000008d4
+#define  NV34TCL_FOG_EQUATION_QUADRATIC							0x000008d8
+#define  NV34TCL_FP_ACTIVE_PROGRAM							0x000008e4 /* bit 0: DMA0, bit 1: DMA1, bits 2-31: OFFSET */
+#define   NV34TCL_FP_ACTIVE_PROGRAM_DMA0						(1 <<  0)
+#define   NV34TCL_FP_ACTIVE_PROGRAM_DMA1						(1 <<  1)
+#define   NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_SHIFT					2 /* NOTE(review): unclear from here whether callers shift the offset by 2 or store a 4-byte-aligned address in place - confirm */
+#define   NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_MASK						0xfffffffc
+#define  NV34TCL_RC_COLOR0								0x000008ec /* four 8-bit channels: B bits 0-7, G 8-15, R 16-23, A 24-31 */
+#define   NV34TCL_RC_COLOR0_B_SHIFT							0
+#define   NV34TCL_RC_COLOR0_B_MASK							0x000000ff
+#define   NV34TCL_RC_COLOR0_G_SHIFT							8
+#define   NV34TCL_RC_COLOR0_G_MASK							0x0000ff00
+#define   NV34TCL_RC_COLOR0_R_SHIFT							16
+#define   NV34TCL_RC_COLOR0_R_MASK							0x00ff0000
+#define   NV34TCL_RC_COLOR0_A_SHIFT							24
+#define   NV34TCL_RC_COLOR0_A_MASK							0xff000000
+#define  NV34TCL_RC_COLOR1								0x000008f0 /* four 8-bit channels: B bits 0-7, G 8-15, R 16-23, A 24-31 */
+#define   NV34TCL_RC_COLOR1_B_SHIFT							0
+#define   NV34TCL_RC_COLOR1_B_MASK							0x000000ff
+#define   NV34TCL_RC_COLOR1_G_SHIFT							8
+#define   NV34TCL_RC_COLOR1_G_MASK							0x0000ff00
+#define   NV34TCL_RC_COLOR1_R_SHIFT							16
+#define   NV34TCL_RC_COLOR1_R_MASK							0x00ff0000
+#define   NV34TCL_RC_COLOR1_A_SHIFT							24
+#define   NV34TCL_RC_COLOR1_A_MASK							0xff000000
+#define  NV34TCL_RC_FINAL0								0x000008f4 /* four selectors D, C, B, A, one per byte starting at the low byte */
+#define   NV34TCL_RC_FINAL0_D_INPUT_SHIFT						0 /* D input selector: bits 0-3 */
+#define   NV34TCL_RC_FINAL0_D_INPUT_MASK						0x0000000f
+#define    NV34TCL_RC_FINAL0_D_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV34TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV34TCL_RC_FINAL0_D_INPUT_FOG						0x00000003
+#define    NV34TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV34TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV34TCL_RC_FINAL0_D_INPUT_TEXTURE0						0x00000008
+#define    NV34TCL_RC_FINAL0_D_INPUT_TEXTURE1						0x00000009
+#define    NV34TCL_RC_FINAL0_D_INPUT_SPARE0						0x0000000c
+#define    NV34TCL_RC_FINAL0_D_INPUT_SPARE1						0x0000000d
+#define    NV34TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV34TCL_RC_FINAL0_D_INPUT_E_TIMES_F						0x0000000f
+#define    NV34TCL_RC_FINAL0_D_INPUT_TEXTURE2						0x0000000a
+#define    NV34TCL_RC_FINAL0_D_INPUT_TEXTURE3						0x0000000b
+#define   NV34TCL_RC_FINAL0_D_COMPONENT_USAGE						(1 <<  4) /* bit 4: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV34TCL_RC_FINAL0_D_MAPPING_SHIFT						5 /* D mapping: bits 5-7; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL0_D_MAPPING_MASK						0x000000e0
+#define    NV34TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT					0x00000020
+#define    NV34TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV34TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV34TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL					0x00000080
+#define    NV34TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE					0x000000a0
+#define    NV34TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY					0x000000c0
+#define    NV34TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV34TCL_RC_FINAL0_C_INPUT_SHIFT						8 /* C input selector: bits 8-11; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL0_C_INPUT_MASK						0x00000f00
+#define    NV34TCL_RC_FINAL0_C_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV34TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV34TCL_RC_FINAL0_C_INPUT_FOG						0x00000300
+#define    NV34TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV34TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV34TCL_RC_FINAL0_C_INPUT_TEXTURE0						0x00000800
+#define    NV34TCL_RC_FINAL0_C_INPUT_TEXTURE1						0x00000900
+#define    NV34TCL_RC_FINAL0_C_INPUT_SPARE0						0x00000c00
+#define    NV34TCL_RC_FINAL0_C_INPUT_SPARE1						0x00000d00
+#define    NV34TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV34TCL_RC_FINAL0_C_INPUT_E_TIMES_F						0x00000f00
+#define    NV34TCL_RC_FINAL0_C_INPUT_TEXTURE2						0x00000a00
+#define    NV34TCL_RC_FINAL0_C_INPUT_TEXTURE3						0x00000b00
+#define   NV34TCL_RC_FINAL0_C_COMPONENT_USAGE						(1 << 12) /* bit 12: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV34TCL_RC_FINAL0_C_MAPPING_SHIFT						13 /* C mapping: bits 13-15, values pre-shifted */
+#define   NV34TCL_RC_FINAL0_C_MAPPING_MASK						0x0000e000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV34TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV34TCL_RC_FINAL0_B_INPUT_SHIFT						16 /* B input selector: bits 16-19; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL0_B_INPUT_MASK						0x000f0000
+#define    NV34TCL_RC_FINAL0_B_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV34TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV34TCL_RC_FINAL0_B_INPUT_FOG						0x00030000
+#define    NV34TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV34TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV34TCL_RC_FINAL0_B_INPUT_TEXTURE0						0x00080000
+#define    NV34TCL_RC_FINAL0_B_INPUT_TEXTURE1						0x00090000
+#define    NV34TCL_RC_FINAL0_B_INPUT_SPARE0						0x000c0000
+#define    NV34TCL_RC_FINAL0_B_INPUT_SPARE1						0x000d0000
+#define    NV34TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV34TCL_RC_FINAL0_B_INPUT_E_TIMES_F						0x000f0000
+#define    NV34TCL_RC_FINAL0_B_INPUT_TEXTURE2						0x000a0000
+#define    NV34TCL_RC_FINAL0_B_INPUT_TEXTURE3						0x000b0000
+#define   NV34TCL_RC_FINAL0_B_COMPONENT_USAGE						(1 << 20) /* bit 20: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV34TCL_RC_FINAL0_B_MAPPING_SHIFT						21 /* B mapping: bits 21-23, values pre-shifted */
+#define   NV34TCL_RC_FINAL0_B_MAPPING_MASK						0x00e00000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV34TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV34TCL_RC_FINAL0_A_INPUT_SHIFT						24 /* A input selector: bits 24-27; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL0_A_INPUT_MASK						0x0f000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_FOG						0x03000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_TEXTURE0						0x08000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_TEXTURE1						0x09000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_SPARE0						0x0c000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_SPARE1						0x0d000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_E_TIMES_F						0x0f000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_TEXTURE2						0x0a000000
+#define    NV34TCL_RC_FINAL0_A_INPUT_TEXTURE3						0x0b000000
+#define   NV34TCL_RC_FINAL0_A_COMPONENT_USAGE						(1 << 28) /* bit 28: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV34TCL_RC_FINAL0_A_MAPPING_SHIFT						29 /* A mapping: bits 29-31, values pre-shifted */
+#define   NV34TCL_RC_FINAL0_A_MAPPING_MASK						0xe0000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV34TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV34TCL_RC_FINAL1								0x000008f8 /* COLOR_SUM_CLAMP at bit 7, then selectors G, F, E in the three upper bytes */
+#define   NV34TCL_RC_FINAL1_COLOR_SUM_CLAMP						(1 <<  7)
+#define   NV34TCL_RC_FINAL1_G_INPUT_SHIFT						8 /* G input selector: bits 8-11; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL1_G_INPUT_MASK						0x00000f00
+#define    NV34TCL_RC_FINAL1_G_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV34TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV34TCL_RC_FINAL1_G_INPUT_FOG						0x00000300
+#define    NV34TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV34TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV34TCL_RC_FINAL1_G_INPUT_TEXTURE0						0x00000800
+#define    NV34TCL_RC_FINAL1_G_INPUT_TEXTURE1						0x00000900
+#define    NV34TCL_RC_FINAL1_G_INPUT_SPARE0						0x00000c00
+#define    NV34TCL_RC_FINAL1_G_INPUT_SPARE1						0x00000d00
+#define    NV34TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV34TCL_RC_FINAL1_G_INPUT_E_TIMES_F						0x00000f00
+#define    NV34TCL_RC_FINAL1_G_INPUT_TEXTURE2						0x00000a00
+#define    NV34TCL_RC_FINAL1_G_INPUT_TEXTURE3						0x00000b00
+#define   NV34TCL_RC_FINAL1_G_COMPONENT_USAGE						(1 << 12) /* bit 12: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV34TCL_RC_FINAL1_G_MAPPING_SHIFT						13 /* G mapping: bits 13-15, values pre-shifted */
+#define   NV34TCL_RC_FINAL1_G_MAPPING_MASK						0x0000e000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV34TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV34TCL_RC_FINAL1_F_INPUT_SHIFT						16 /* F input selector: bits 16-19; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL1_F_INPUT_MASK						0x000f0000
+#define    NV34TCL_RC_FINAL1_F_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV34TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV34TCL_RC_FINAL1_F_INPUT_FOG						0x00030000
+#define    NV34TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV34TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV34TCL_RC_FINAL1_F_INPUT_TEXTURE0						0x00080000
+#define    NV34TCL_RC_FINAL1_F_INPUT_TEXTURE1						0x00090000
+#define    NV34TCL_RC_FINAL1_F_INPUT_SPARE0						0x000c0000
+#define    NV34TCL_RC_FINAL1_F_INPUT_SPARE1						0x000d0000
+#define    NV34TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV34TCL_RC_FINAL1_F_INPUT_E_TIMES_F						0x000f0000
+#define    NV34TCL_RC_FINAL1_F_INPUT_TEXTURE2						0x000a0000
+#define    NV34TCL_RC_FINAL1_F_INPUT_TEXTURE3						0x000b0000
+#define   NV34TCL_RC_FINAL1_F_COMPONENT_USAGE						(1 << 20) /* bit 20: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV34TCL_RC_FINAL1_F_MAPPING_SHIFT						21 /* F mapping: bits 21-23, values pre-shifted */
+#define   NV34TCL_RC_FINAL1_F_MAPPING_MASK						0x00e00000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV34TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV34TCL_RC_FINAL1_E_INPUT_SHIFT						24 /* E input selector: bits 24-27; the values below are already shifted into place */
+#define   NV34TCL_RC_FINAL1_E_INPUT_MASK						0x0f000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_FOG						0x03000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_TEXTURE0						0x08000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_TEXTURE1						0x09000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_SPARE0						0x0c000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_SPARE1						0x0d000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_E_TIMES_F						0x0f000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_TEXTURE2						0x0a000000
+#define    NV34TCL_RC_FINAL1_E_INPUT_TEXTURE3						0x0b000000
+#define   NV34TCL_RC_FINAL1_E_COMPONENT_USAGE						(1 << 28) /* bit 28: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV34TCL_RC_FINAL1_E_MAPPING_SHIFT						29 /* E mapping: bits 29-31, values pre-shifted */
+#define   NV34TCL_RC_FINAL1_E_MAPPING_MASK						0xe0000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV34TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV34TCL_RC_ENABLE								0x000008fc /* NUM_COMBINERS bits 0-3, STAGE_CONSTANT_COLOR0 bits 12-15, STAGE_CONSTANT_COLOR1 bits 16-19 */
+#define   NV34TCL_RC_ENABLE_NUM_COMBINERS_SHIFT						0
+#define   NV34TCL_RC_ENABLE_NUM_COMBINERS_MASK						0x0000000f
+#define   NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR0_SHIFT					12
+#define   NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR0_MASK					0x0000f000
+#define   NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR1_SHIFT					16
+#define   NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR1_MASK					0x000f0000
+#define  NV34TCL_RC_IN_ALPHA(x)								(0x00000900+((x)*32)) /* instance x is at 0x900 + 32*x; selectors D, C, B, A occupy one byte each from the low byte up */
+#define  NV34TCL_RC_IN_ALPHA__SIZE							0x00000008 /* presumably the count of valid x indices - TODO confirm */
+#define   NV34TCL_RC_IN_ALPHA_D_INPUT_SHIFT						0 /* D input selector: bits 0-3 */
+#define   NV34TCL_RC_IN_ALPHA_D_INPUT_MASK						0x0000000f
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_FOG						0x00000003
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0						0x00000008
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1						0x00000009
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_SPARE0						0x0000000c
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_SPARE1						0x0000000d
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F					0x0000000f
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_TEXTURE2						0x0000000a
+#define    NV34TCL_RC_IN_ALPHA_D_INPUT_TEXTURE3						0x0000000b
+#define   NV34TCL_RC_IN_ALPHA_D_COMPONENT_USAGE						(1 <<  4) /* bit 4: clear = BLUE, set = ALPHA */
+#define    NV34TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV34TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV34TCL_RC_IN_ALPHA_D_MAPPING_SHIFT						5 /* D mapping: bits 5-7; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_ALPHA_D_MAPPING_MASK						0x000000e0
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT				0x00000020
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL				0x00000080
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE				0x000000a0
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY				0x000000c0
+#define    NV34TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV34TCL_RC_IN_ALPHA_C_INPUT_SHIFT						8 /* C input selector: bits 8-11; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_ALPHA_C_INPUT_MASK						0x00000f00
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_FOG						0x00000300
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0						0x00000800
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1						0x00000900
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_SPARE0						0x00000c00
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_SPARE1						0x00000d00
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F					0x00000f00
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_TEXTURE2						0x00000a00
+#define    NV34TCL_RC_IN_ALPHA_C_INPUT_TEXTURE3						0x00000b00
+#define   NV34TCL_RC_IN_ALPHA_C_COMPONENT_USAGE						(1 << 12) /* bit 12: clear = BLUE, set = ALPHA */
+#define    NV34TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV34TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV34TCL_RC_IN_ALPHA_C_MAPPING_SHIFT						13 /* C mapping: bits 13-15, values pre-shifted */
+#define   NV34TCL_RC_IN_ALPHA_C_MAPPING_MASK						0x0000e000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT				0x00002000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL				0x00008000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE				0x0000a000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY				0x0000c000
+#define    NV34TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV34TCL_RC_IN_ALPHA_B_INPUT_SHIFT						16 /* B input selector: bits 16-19; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_ALPHA_B_INPUT_MASK						0x000f0000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_FOG						0x00030000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0						0x00080000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1						0x00090000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_SPARE0						0x000c0000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_SPARE1						0x000d0000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F					0x000f0000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_TEXTURE2						0x000a0000
+#define    NV34TCL_RC_IN_ALPHA_B_INPUT_TEXTURE3						0x000b0000
+#define   NV34TCL_RC_IN_ALPHA_B_COMPONENT_USAGE						(1 << 20) /* bit 20: clear = BLUE, set = ALPHA */
+#define    NV34TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV34TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV34TCL_RC_IN_ALPHA_B_MAPPING_SHIFT						21 /* B mapping: bits 21-23, values pre-shifted */
+#define   NV34TCL_RC_IN_ALPHA_B_MAPPING_MASK						0x00e00000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT				0x00200000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL				0x00800000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE				0x00a00000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY				0x00c00000
+#define    NV34TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV34TCL_RC_IN_ALPHA_A_INPUT_SHIFT						24 /* A input selector: bits 24-27; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_ALPHA_A_INPUT_MASK						0x0f000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_FOG						0x03000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0						0x08000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1						0x09000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_SPARE0						0x0c000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_SPARE1						0x0d000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F					0x0f000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_TEXTURE2						0x0a000000
+#define    NV34TCL_RC_IN_ALPHA_A_INPUT_TEXTURE3						0x0b000000
+#define   NV34TCL_RC_IN_ALPHA_A_COMPONENT_USAGE						(1 << 28) /* bit 28: clear = BLUE, set = ALPHA */
+#define    NV34TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV34TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV34TCL_RC_IN_ALPHA_A_MAPPING_SHIFT						29 /* A mapping: bits 29-31, values pre-shifted */
+#define   NV34TCL_RC_IN_ALPHA_A_MAPPING_MASK						0xe0000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT				0x20000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL				0x80000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE				0xa0000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY				0xc0000000
+#define    NV34TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV34TCL_RC_IN_RGB(x)								(0x00000904+((x)*32)) /* instance x is at 0x904 + 32*x; selectors D, C, B, A occupy one byte each from the low byte up */
+#define  NV34TCL_RC_IN_RGB__SIZE							0x00000008 /* presumably the count of valid x indices - TODO confirm */
+#define   NV34TCL_RC_IN_RGB_D_INPUT_SHIFT						0 /* D input selector: bits 0-3 */
+#define   NV34TCL_RC_IN_RGB_D_INPUT_MASK						0x0000000f
+#define    NV34TCL_RC_IN_RGB_D_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0					0x00000001
+#define    NV34TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1					0x00000002
+#define    NV34TCL_RC_IN_RGB_D_INPUT_FOG						0x00000003
+#define    NV34TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR					0x00000004
+#define    NV34TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR					0x00000005
+#define    NV34TCL_RC_IN_RGB_D_INPUT_TEXTURE0						0x00000008
+#define    NV34TCL_RC_IN_RGB_D_INPUT_TEXTURE1						0x00000009
+#define    NV34TCL_RC_IN_RGB_D_INPUT_SPARE0						0x0000000c
+#define    NV34TCL_RC_IN_RGB_D_INPUT_SPARE1						0x0000000d
+#define    NV34TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV34TCL_RC_IN_RGB_D_INPUT_E_TIMES_F						0x0000000f
+#define    NV34TCL_RC_IN_RGB_D_INPUT_TEXTURE2						0x0000000a
+#define    NV34TCL_RC_IN_RGB_D_INPUT_TEXTURE3						0x0000000b
+#define   NV34TCL_RC_IN_RGB_D_COMPONENT_USAGE						(1 <<  4) /* bit 4: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV34TCL_RC_IN_RGB_D_MAPPING_SHIFT						5 /* D mapping: bits 5-7; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_RGB_D_MAPPING_MASK						0x000000e0
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT					0x00000020
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL					0x00000040
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE					0x00000060
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL					0x00000080
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE					0x000000a0
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY					0x000000c0
+#define    NV34TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE					0x000000e0
+#define   NV34TCL_RC_IN_RGB_C_INPUT_SHIFT						8 /* C input selector: bits 8-11; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_RGB_C_INPUT_MASK						0x00000f00
+#define    NV34TCL_RC_IN_RGB_C_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0					0x00000100
+#define    NV34TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1					0x00000200
+#define    NV34TCL_RC_IN_RGB_C_INPUT_FOG						0x00000300
+#define    NV34TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR					0x00000400
+#define    NV34TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR					0x00000500
+#define    NV34TCL_RC_IN_RGB_C_INPUT_TEXTURE0						0x00000800
+#define    NV34TCL_RC_IN_RGB_C_INPUT_TEXTURE1						0x00000900
+#define    NV34TCL_RC_IN_RGB_C_INPUT_SPARE0						0x00000c00
+#define    NV34TCL_RC_IN_RGB_C_INPUT_SPARE1						0x00000d00
+#define    NV34TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV34TCL_RC_IN_RGB_C_INPUT_E_TIMES_F						0x00000f00
+#define    NV34TCL_RC_IN_RGB_C_INPUT_TEXTURE2						0x00000a00
+#define    NV34TCL_RC_IN_RGB_C_INPUT_TEXTURE3						0x00000b00
+#define   NV34TCL_RC_IN_RGB_C_COMPONENT_USAGE						(1 << 12) /* bit 12: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV34TCL_RC_IN_RGB_C_MAPPING_SHIFT						13 /* C mapping: bits 13-15, values pre-shifted */
+#define   NV34TCL_RC_IN_RGB_C_MAPPING_MASK						0x0000e000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT					0x00002000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL					0x00004000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE					0x00006000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL					0x00008000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE					0x0000a000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY					0x0000c000
+#define    NV34TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE					0x0000e000
+#define   NV34TCL_RC_IN_RGB_B_INPUT_SHIFT						16 /* B input selector: bits 16-19; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_RGB_B_INPUT_MASK						0x000f0000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0					0x00010000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1					0x00020000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_FOG						0x00030000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR					0x00040000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR					0x00050000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_TEXTURE0						0x00080000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_TEXTURE1						0x00090000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_SPARE0						0x000c0000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_SPARE1						0x000d0000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000e0000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_E_TIMES_F						0x000f0000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_TEXTURE2						0x000a0000
+#define    NV34TCL_RC_IN_RGB_B_INPUT_TEXTURE3						0x000b0000
+#define   NV34TCL_RC_IN_RGB_B_COMPONENT_USAGE						(1 << 20) /* bit 20: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV34TCL_RC_IN_RGB_B_MAPPING_SHIFT						21 /* B mapping: bits 21-23, values pre-shifted */
+#define   NV34TCL_RC_IN_RGB_B_MAPPING_MASK						0x00e00000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT					0x00200000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL					0x00400000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE					0x00600000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL					0x00800000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE					0x00a00000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY					0x00c00000
+#define    NV34TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE					0x00e00000
+#define   NV34TCL_RC_IN_RGB_A_INPUT_SHIFT						24 /* A input selector: bits 24-27; the values below are already shifted into place */
+#define   NV34TCL_RC_IN_RGB_A_INPUT_MASK						0x0f000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_ZERO						0x00000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0					0x01000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1					0x02000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_FOG						0x03000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR					0x04000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR					0x05000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_TEXTURE0						0x08000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_TEXTURE1						0x09000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_SPARE0						0x0c000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_SPARE1						0x0d000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0e000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_E_TIMES_F						0x0f000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_TEXTURE2						0x0a000000
+#define    NV34TCL_RC_IN_RGB_A_INPUT_TEXTURE3						0x0b000000
+#define   NV34TCL_RC_IN_RGB_A_COMPONENT_USAGE						(1 << 28) /* bit 28: clear = RGB, set = ALPHA */
+#define    NV34TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV34TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV34TCL_RC_IN_RGB_A_MAPPING_SHIFT						29 /* A mapping: bits 29-31, values pre-shifted */
+#define   NV34TCL_RC_IN_RGB_A_MAPPING_MASK						0xe0000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY				0x00000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT					0x20000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL					0x40000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE					0x60000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL					0x80000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE					0xa0000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY					0xc0000000
+#define    NV34TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE					0xe0000000
+#define  NV34TCL_RC_CONSTANT_COLOR0(x)							(0x00000908+((x)*32)) /* instance x is at 0x908 + 32*x; four 8-bit channels B, G, R, A from the low byte up */
+#define  NV34TCL_RC_CONSTANT_COLOR0__SIZE						0x00000008 /* presumably the count of valid x indices - TODO confirm */
+#define   NV34TCL_RC_CONSTANT_COLOR0_B_SHIFT						0
+#define   NV34TCL_RC_CONSTANT_COLOR0_B_MASK						0x000000ff
+#define   NV34TCL_RC_CONSTANT_COLOR0_G_SHIFT						8
+#define   NV34TCL_RC_CONSTANT_COLOR0_G_MASK						0x0000ff00
+#define   NV34TCL_RC_CONSTANT_COLOR0_R_SHIFT						16
+#define   NV34TCL_RC_CONSTANT_COLOR0_R_MASK						0x00ff0000
+#define   NV34TCL_RC_CONSTANT_COLOR0_A_SHIFT						24
+#define   NV34TCL_RC_CONSTANT_COLOR0_A_MASK						0xff000000
+#define  NV34TCL_RC_CONSTANT_COLOR1(x)							(0x0000090c+((x)*32)) /* instance x is at 0x90c + 32*x; four 8-bit channels B, G, R, A from the low byte up */
+#define  NV34TCL_RC_CONSTANT_COLOR1__SIZE						0x00000008 /* presumably the count of valid x indices - TODO confirm */
+#define   NV34TCL_RC_CONSTANT_COLOR1_B_SHIFT						0
+#define   NV34TCL_RC_CONSTANT_COLOR1_B_MASK						0x000000ff
+#define   NV34TCL_RC_CONSTANT_COLOR1_G_SHIFT						8
+#define   NV34TCL_RC_CONSTANT_COLOR1_G_MASK						0x0000ff00
+#define   NV34TCL_RC_CONSTANT_COLOR1_R_SHIFT						16
+#define   NV34TCL_RC_CONSTANT_COLOR1_R_MASK						0x00ff0000
+#define   NV34TCL_RC_CONSTANT_COLOR1_A_SHIFT						24
+#define   NV34TCL_RC_CONSTANT_COLOR1_A_MASK						0xff000000
+#define  NV34TCL_RC_OUT_ALPHA(x)							(0x00000910+((x)*32))	/* offset for index x; consecutive x are 32 bytes apart */
+#define  NV34TCL_RC_OUT_ALPHA__SIZE							0x00000008
+#define   NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT						0	/* CD_OUTPUT selector: bits 0-3 */
+#define   NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK						0x0000000f
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO						0x00000000
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0				0x00000001
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1				0x00000002
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG						0x00000003
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR					0x00000004
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR				0x00000005
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0					0x00000008
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1					0x00000009
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0					0x0000000c
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1					0x0000000d
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F					0x0000000f
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE2					0x0000000a	/* 0xa and 0xb are listed after 0xf; the list is not in numeric order */
+#define    NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE3					0x0000000b
+#define   NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT						4	/* AB_OUTPUT selector: bits 4-7; same codes as CD_OUTPUT, already shifted left by 4 */
+#define   NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK						0x000000f0
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO						0x00000000
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0				0x00000010
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1				0x00000020
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG						0x00000030
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR					0x00000040
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR				0x00000050
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0					0x00000080
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1					0x00000090
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0					0x000000c0
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1					0x000000d0
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000000e0
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F					0x000000f0
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE2					0x000000a0
+#define    NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE3					0x000000b0
+#define   NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT						8	/* SUM_OUTPUT selector: bits 8-11; same codes, already shifted left by 8 */
+#define   NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK						0x00000f00
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO						0x00000000
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0				0x00000100
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1				0x00000200
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG						0x00000300
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR				0x00000400
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR				0x00000500
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0					0x00000800
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1					0x00000900
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0					0x00000c00
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1					0x00000d00
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F					0x00000f00
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE2					0x00000a00
+#define    NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE3					0x00000b00
+#define   NV34TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT						(1 << 12)	/* single-bit flags occupy bits 12-15 */
+#define   NV34TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT						(1 << 13)
+#define   NV34TCL_RC_OUT_ALPHA_MUX_SUM							(1 << 14)
+#define   NV34TCL_RC_OUT_ALPHA_BIAS							(1 << 15)
+#define    NV34TCL_RC_OUT_ALPHA_BIAS_NONE						0x00000000	/* the two BIAS values are bit 15 clear and bit 15 set */
+#define    NV34TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF				0x00008000
+#define   NV34TCL_RC_OUT_ALPHA_SCALE_SHIFT						17	/* SCALE selector: 2-bit field; values below are already shifted */
+#define   NV34TCL_RC_OUT_ALPHA_SCALE_MASK						0x00060000	/* bits 17-18; was 0, so (value & MASK) dropped every SCALE_* setting */
+#define    NV34TCL_RC_OUT_ALPHA_SCALE_NONE						0x00000000
+#define    NV34TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO					0x00020000
+#define    NV34TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR					0x00040000
+#define    NV34TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF					0x00060000
+#define  NV34TCL_RC_OUT_RGB(x)								(0x00000914+((x)*32))	/* offset for index x; consecutive x are 32 bytes apart */
+#define  NV34TCL_RC_OUT_RGB__SIZE							0x00000008
+#define   NV34TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT						0	/* CD_OUTPUT selector: bits 0-3 */
+#define   NV34TCL_RC_OUT_RGB_CD_OUTPUT_MASK						0x0000000f
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_ZERO						0x00000000
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0					0x00000001
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1					0x00000002
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_FOG						0x00000003
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR					0x00000004
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR					0x00000005
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0					0x00000008
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1					0x00000009
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0						0x0000000c
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1						0x0000000d
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x0000000e
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F					0x0000000f
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE2					0x0000000a	/* 0xa and 0xb are listed after 0xf; the list is not in numeric order */
+#define    NV34TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE3					0x0000000b
+#define   NV34TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT						4	/* AB_OUTPUT selector: bits 4-7; same codes as CD_OUTPUT, already shifted left by 4 */
+#define   NV34TCL_RC_OUT_RGB_AB_OUTPUT_MASK						0x000000f0
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_ZERO						0x00000000
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0					0x00000010
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1					0x00000020
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_FOG						0x00000030
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR					0x00000040
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR					0x00000050
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0					0x00000080
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1					0x00000090
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0						0x000000c0
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1						0x000000d0
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x000000e0
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F					0x000000f0
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE2					0x000000a0
+#define    NV34TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE3					0x000000b0
+#define   NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT						8	/* SUM_OUTPUT selector: bits 8-11; same codes, already shifted left by 8 */
+#define   NV34TCL_RC_OUT_RGB_SUM_OUTPUT_MASK						0x00000f00
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO						0x00000000
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0				0x00000100
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1				0x00000200
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_FOG						0x00000300
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR					0x00000400
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR				0x00000500
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0					0x00000800
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1					0x00000900
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0						0x00000c00
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1						0x00000d00
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR			0x00000e00
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F					0x00000f00
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE2					0x00000a00
+#define    NV34TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE3					0x00000b00
+#define   NV34TCL_RC_OUT_RGB_CD_DOT_PRODUCT						(1 << 12)	/* single-bit flags occupy bits 12-15 */
+#define   NV34TCL_RC_OUT_RGB_AB_DOT_PRODUCT						(1 << 13)
+#define   NV34TCL_RC_OUT_RGB_MUX_SUM							(1 << 14)
+#define   NV34TCL_RC_OUT_RGB_BIAS							(1 << 15)
+#define    NV34TCL_RC_OUT_RGB_BIAS_NONE							0x00000000	/* the two BIAS values are bit 15 clear and bit 15 set */
+#define    NV34TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF				0x00008000
+#define   NV34TCL_RC_OUT_RGB_SCALE_SHIFT						17	/* SCALE selector: 2-bit field; values below are already shifted */
+#define   NV34TCL_RC_OUT_RGB_SCALE_MASK							0x00060000	/* bits 17-18; was 0, so (value & MASK) dropped every SCALE_* setting */
+#define    NV34TCL_RC_OUT_RGB_SCALE_NONE						0x00000000
+#define    NV34TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO					0x00020000
+#define    NV34TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR					0x00040000
+#define    NV34TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF					0x00060000
+#define  NV34TCL_VIEWPORT_HORIZ								0x00000a00	/* packs X in bits 0-15 and W in bits 16-31 */
+#define   NV34TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NV34TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NV34TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_VIEWPORT_VERT								0x00000a04	/* packs Y in bits 0-15 and H in bits 16-31 */
+#define   NV34TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NV34TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NV34TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R			0x00000a10	/* R,G,B are three consecutive 4-byte words */
+#define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G			0x00000a14
+#define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B			0x00000a18
+#define  NV34TCL_VIEWPORT_TRANSLATE_X							0x00000a20	/* X,Y,Z,W are four consecutive 4-byte words */
+#define  NV34TCL_VIEWPORT_TRANSLATE_Y							0x00000a24
+#define  NV34TCL_VIEWPORT_TRANSLATE_Z							0x00000a28
+#define  NV34TCL_VIEWPORT_TRANSLATE_W							0x00000a2c
+#define  NV34TCL_VIEWPORT_SCALE_X							0x00000a30	/* X,Y,Z,W follow VIEWPORT_TRANSLATE directly */
+#define  NV34TCL_VIEWPORT_SCALE_Y							0x00000a34
+#define  NV34TCL_VIEWPORT_SCALE_Z							0x00000a38
+#define  NV34TCL_VIEWPORT_SCALE_W							0x00000a3c
+#define  NV34TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000a60
+#define  NV34TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000a64
+#define  NV34TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000a68
+#define  NV34TCL_DEPTH_FUNC								0x00000a6c	/* values below coincide with OpenGL GL_NEVER..GL_ALWAYS (0x0200-0x0207) */
+#define   NV34TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV34TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV34TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV34TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV34TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV34TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV34TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV34TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV34TCL_DEPTH_WRITE_ENABLE							0x00000a70
+#define  NV34TCL_DEPTH_TEST_ENABLE							0x00000a74
+#define  NV34TCL_POLYGON_OFFSET_FACTOR							0x00000a78
+#define  NV34TCL_POLYGON_OFFSET_UNITS							0x00000a7c
+#define  NV34TCL_VTX_ATTR_3I_XY(x)							(0x00000a80+((x)*8))	/* XY and Z words interleave: each index x takes 8 bytes */
+#define  NV34TCL_VTX_ATTR_3I_XY__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_3I_XY_X_SHIFT						0
+#define   NV34TCL_VTX_ATTR_3I_XY_X_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_3I_XY_Y_SHIFT						16
+#define   NV34TCL_VTX_ATTR_3I_XY_Y_MASK							0xffff0000
+#define  NV34TCL_VTX_ATTR_3I_Z(x)							(0x00000a84+((x)*8))
+#define  NV34TCL_VTX_ATTR_3I_Z__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_3I_Z_Z_SHIFT							0
+#define   NV34TCL_VTX_ATTR_3I_Z_Z_MASK							0x0000ffff	/* only bits 0-15 are named; no field is defined here for bits 16-31 */
+#define  NV34TCL_VP_UPLOAD_INST(x)							(0x00000b80+((x)*4))	/* offset for index x, 4-byte stride */
+#define  NV34TCL_VP_UPLOAD_INST__SIZE							0x00000004
+#define  NV34TCL_TX0_CLIP_PLANE_A(x)							(0x00000e00+((x)*16))	/* banks TX0..TX7 start 0x40 apart; in a bank A,B,C,D are consecutive words and x strides by 16 bytes */
+#define  NV34TCL_TX0_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX0_CLIP_PLANE_B(x)							(0x00000e04+((x)*16))
+#define  NV34TCL_TX0_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX0_CLIP_PLANE_C(x)							(0x00000e08+((x)*16))
+#define  NV34TCL_TX0_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX0_CLIP_PLANE_D(x)							(0x00000e0c+((x)*16))
+#define  NV34TCL_TX0_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX1_CLIP_PLANE_A(x)							(0x00000e40+((x)*16))
+#define  NV34TCL_TX1_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX1_CLIP_PLANE_B(x)							(0x00000e44+((x)*16))
+#define  NV34TCL_TX1_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX1_CLIP_PLANE_C(x)							(0x00000e48+((x)*16))
+#define  NV34TCL_TX1_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX1_CLIP_PLANE_D(x)							(0x00000e4c+((x)*16))
+#define  NV34TCL_TX1_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX2_CLIP_PLANE_A(x)							(0x00000e80+((x)*16))
+#define  NV34TCL_TX2_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX2_CLIP_PLANE_B(x)							(0x00000e84+((x)*16))
+#define  NV34TCL_TX2_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX2_CLIP_PLANE_C(x)							(0x00000e88+((x)*16))
+#define  NV34TCL_TX2_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX2_CLIP_PLANE_D(x)							(0x00000e8c+((x)*16))
+#define  NV34TCL_TX2_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX3_CLIP_PLANE_A(x)							(0x00000ec0+((x)*16))
+#define  NV34TCL_TX3_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX3_CLIP_PLANE_B(x)							(0x00000ec4+((x)*16))
+#define  NV34TCL_TX3_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX3_CLIP_PLANE_C(x)							(0x00000ec8+((x)*16))
+#define  NV34TCL_TX3_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX3_CLIP_PLANE_D(x)							(0x00000ecc+((x)*16))
+#define  NV34TCL_TX3_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX4_CLIP_PLANE_A(x)							(0x00000f00+((x)*16))
+#define  NV34TCL_TX4_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX4_CLIP_PLANE_B(x)							(0x00000f04+((x)*16))
+#define  NV34TCL_TX4_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX4_CLIP_PLANE_C(x)							(0x00000f08+((x)*16))
+#define  NV34TCL_TX4_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX4_CLIP_PLANE_D(x)							(0x00000f0c+((x)*16))
+#define  NV34TCL_TX4_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX5_CLIP_PLANE_A(x)							(0x00000f40+((x)*16))
+#define  NV34TCL_TX5_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX5_CLIP_PLANE_B(x)							(0x00000f44+((x)*16))
+#define  NV34TCL_TX5_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX5_CLIP_PLANE_C(x)							(0x00000f48+((x)*16))
+#define  NV34TCL_TX5_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX5_CLIP_PLANE_D(x)							(0x00000f4c+((x)*16))
+#define  NV34TCL_TX5_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX6_CLIP_PLANE_A(x)							(0x00000f80+((x)*16))
+#define  NV34TCL_TX6_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX6_CLIP_PLANE_B(x)							(0x00000f84+((x)*16))
+#define  NV34TCL_TX6_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX6_CLIP_PLANE_C(x)							(0x00000f88+((x)*16))
+#define  NV34TCL_TX6_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX6_CLIP_PLANE_D(x)							(0x00000f8c+((x)*16))
+#define  NV34TCL_TX6_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_TX7_CLIP_PLANE_A(x)							(0x00000fc0+((x)*16))
+#define  NV34TCL_TX7_CLIP_PLANE_A__SIZE							0x00000004
+#define  NV34TCL_TX7_CLIP_PLANE_B(x)							(0x00000fc4+((x)*16))
+#define  NV34TCL_TX7_CLIP_PLANE_B__SIZE							0x00000004
+#define  NV34TCL_TX7_CLIP_PLANE_C(x)							(0x00000fc8+((x)*16))
+#define  NV34TCL_TX7_CLIP_PLANE_C__SIZE							0x00000004
+#define  NV34TCL_TX7_CLIP_PLANE_D(x)							(0x00000fcc+((x)*16))
+#define  NV34TCL_TX7_CLIP_PLANE_D__SIZE							0x00000004
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(x)					(0x00001000+((x)*64))	/* first per-index block: base 0x1000, 64 bytes per x, 4-byte words */
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(x)					(0x00001004+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(x)					(0x00001008+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(x)					(0x0000100c+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(x)					(0x00001010+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(x)					(0x00001014+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(x)					(0x00001018+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(x)					(0x0000101c+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(x)					(0x00001020+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B__SIZE				0x00000008
+#define  NV34TCL_LIGHT_HALF_VECTOR_X(x)							(0x00001028+((x)*64))	/* NOTE(review): word 0x1024 has no name between SPECULAR_B and HALF_VECTOR_X */
+#define  NV34TCL_LIGHT_HALF_VECTOR_X__SIZE						0x00000008
+#define  NV34TCL_LIGHT_HALF_VECTOR_Y(x)							(0x0000102c+((x)*64))
+#define  NV34TCL_LIGHT_HALF_VECTOR_Y__SIZE						0x00000008
+#define  NV34TCL_LIGHT_HALF_VECTOR_Z(x)							(0x00001030+((x)*64))
+#define  NV34TCL_LIGHT_HALF_VECTOR_Z__SIZE						0x00000008
+#define  NV34TCL_LIGHT_DIRECTION_X(x)							(0x00001034+((x)*64))
+#define  NV34TCL_LIGHT_DIRECTION_X__SIZE						0x00000008
+#define  NV34TCL_LIGHT_DIRECTION_Y(x)							(0x00001038+((x)*64))
+#define  NV34TCL_LIGHT_DIRECTION_Y__SIZE						0x00000008
+#define  NV34TCL_LIGHT_DIRECTION_Z(x)							(0x0000103c+((x)*64))
+#define  NV34TCL_LIGHT_DIRECTION_Z__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_A(x)							(0x00001200+((x)*64))	/* second per-index block: base 0x1200, 64 bytes per x */
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_A__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_B(x)							(0x00001204+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_B__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_C(x)							(0x00001208+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_C__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_DIR_X(x)							(0x0000120c+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_DIR_X__SIZE							0x00000008
+#define  NV34TCL_LIGHT_SPOT_DIR_Y(x)							(0x00001210+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_DIR_Y__SIZE							0x00000008
+#define  NV34TCL_LIGHT_SPOT_DIR_Z(x)							(0x00001214+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_DIR_Z__SIZE							0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_D(x)							(0x00001218+((x)*64))	/* CUTOFF_D sits after SPOT_DIR_Z, not next to CUTOFF_A..C */
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_D__SIZE						0x00000008
+#define  NV34TCL_LIGHT_POSITION_X(x)							(0x0000121c+((x)*64))
+#define  NV34TCL_LIGHT_POSITION_X__SIZE							0x00000008
+#define  NV34TCL_LIGHT_POSITION_Y(x)							(0x00001220+((x)*64))
+#define  NV34TCL_LIGHT_POSITION_Y__SIZE							0x00000008
+#define  NV34TCL_LIGHT_POSITION_Z(x)							(0x00001224+((x)*64))
+#define  NV34TCL_LIGHT_POSITION_Z__SIZE							0x00000008
+#define  NV34TCL_LIGHT_ATTENUATION_CONSTANT(x)						(0x00001228+((x)*64))
+#define  NV34TCL_LIGHT_ATTENUATION_CONSTANT__SIZE					0x00000008
+#define  NV34TCL_LIGHT_ATTENUATION_LINEAR(x)						(0x0000122c+((x)*64))
+#define  NV34TCL_LIGHT_ATTENUATION_LINEAR__SIZE						0x00000008
+#define  NV34TCL_LIGHT_ATTENUATION_QUADRATIC(x)						(0x00001230+((x)*64))
+#define  NV34TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE					0x00000008
+#define  NV34TCL_FRONT_MATERIAL_SHININESS(x)						(0x00001400+((x)*4))	/* offset for index x, 4-byte stride */
+#define  NV34TCL_FRONT_MATERIAL_SHININESS__SIZE						0x00000006
+#define  NV34TCL_ENABLED_LIGHTS								0x00001420
+#define  NV34TCL_VERTEX_TWO_SIDE_ENABLE							0x0000142c
+#define  NV34TCL_FP_REG_CONTROL								0x00001450	/* split into two 16-bit fields, both named UNK* (meaning not given here) */
+#define   NV34TCL_FP_REG_CONTROL_UNK1_SHIFT						16
+#define   NV34TCL_FP_REG_CONTROL_UNK1_MASK						0xffff0000
+#define   NV34TCL_FP_REG_CONTROL_UNK0_SHIFT						0
+#define   NV34TCL_FP_REG_CONTROL_UNK0_MASK						0x0000ffff
+#define  NV34TCL_VP_CLIP_PLANES_ENABLE							0x00001478	/* PLANEn is bit 4*n + 1 (bits 1, 5, 9, 13, 17, 21) */
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0						(1 <<  1)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1						(1 <<  5)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2						(1 <<  9)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3						(1 << 13)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4						(1 << 17)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5						(1 << 21)
+#define  NV34TCL_POLYGON_STIPPLE_ENABLE							0x0000147c
+#define  NV34TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001480+((x)*4))	/* offset for index x, 4-byte stride */
+#define  NV34TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV34TCL_VTX_ATTR_3F_X(x)							(0x00001500+((x)*16))	/* X,Y,Z are consecutive words; x strides by 16 bytes */
+#define  NV34TCL_VTX_ATTR_3F_X__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_3F_Y(x)							(0x00001504+((x)*16))
+#define  NV34TCL_VTX_ATTR_3F_Y__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_3F_Z(x)							(0x00001508+((x)*16))
+#define  NV34TCL_VTX_ATTR_3F_Z__SIZE							0x00000010
+#define  NV34TCL_VP_CLIP_PLANE_A(x)							(0x00001600+((x)*16))	/* A,B,C,D are consecutive words; x strides by 16 bytes */
+#define  NV34TCL_VP_CLIP_PLANE_A__SIZE							0x00000006
+#define  NV34TCL_VP_CLIP_PLANE_B(x)							(0x00001604+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_B__SIZE							0x00000006
+#define  NV34TCL_VP_CLIP_PLANE_C(x)							(0x00001608+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_C__SIZE							0x00000006
+#define  NV34TCL_VP_CLIP_PLANE_D(x)							(0x0000160c+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_D__SIZE							0x00000006
+#define  NV34TCL_VTXBUF_ADDRESS(x)							(0x00001680+((x)*4))	/* offset for index x, 4-byte stride */
+#define  NV34TCL_VTXBUF_ADDRESS__SIZE							0x00000010
+#define   NV34TCL_VTXBUF_ADDRESS_DMA1							(1 << 31)	/* flag in bit 31; OFFSET is bits 0-27, bits 28-30 have no name here */
+#define   NV34TCL_VTXBUF_ADDRESS_OFFSET_SHIFT						0
+#define   NV34TCL_VTXBUF_ADDRESS_OFFSET_MASK						0x0fffffff
+#define  NV34TCL_VTXFMT(x)								(0x00001740+((x)*4))	/* offset for index x, 4-byte stride */
+#define  NV34TCL_VTXFMT__SIZE								0x00000010
+#define   NV34TCL_VTXFMT_TYPE_SHIFT							0	/* TYPE bits 0-3, SIZE bits 4-7, STRIDE bits 8-15 */
+#define   NV34TCL_VTXFMT_TYPE_MASK							0x0000000f
+#define    NV34TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV34TCL_VTXFMT_TYPE_HALF							0x00000003
+#define    NV34TCL_VTXFMT_TYPE_UBYTE							0x00000004
+#define    NV34TCL_VTXFMT_TYPE_USHORT							0x00000005
+#define   NV34TCL_VTXFMT_SIZE_SHIFT							4
+#define   NV34TCL_VTXFMT_SIZE_MASK							0x000000f0
+#define   NV34TCL_VTXFMT_STRIDE_SHIFT							8
+#define   NV34TCL_VTXFMT_STRIDE_MASK							0x0000ff00
+#define  NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R			0x000017a0	/* R,G,B are three consecutive 4-byte words */
+#define  NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G			0x000017a4
+#define  NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B			0x000017a8
+#define  NV34TCL_COLOR_MATERIAL_BACK_R							0x000017b0
+#define  NV34TCL_COLOR_MATERIAL_BACK_G							0x000017b4
+#define  NV34TCL_COLOR_MATERIAL_BACK_B							0x000017b8
+#define  NV34TCL_COLOR_MATERIAL_BACK_A							0x000017c0	/* NOTE(review): 0x17bc is skipped after BACK_B (0x17b8) -- confirm this offset */
+#define  NV34TCL_QUERY_RESET								0x000017c8
+#define  NV34TCL_QUERY_UNK17CC								0x000017cc
+#define  NV34TCL_QUERY_GET								0x00001800	/* UNK24 in bits 24-31, OFFSET in bits 0-23 */
+#define   NV34TCL_QUERY_GET_UNK24_SHIFT							24
+#define   NV34TCL_QUERY_GET_UNK24_MASK							0xff000000
+#define   NV34TCL_QUERY_GET_OFFSET_SHIFT						0
+#define   NV34TCL_QUERY_GET_OFFSET_MASK							0x00ffffff
+#define  NV34TCL_VERTEX_BEGIN_END							0x00001808	/* value 0x0 is named STOP; 0x1-0xa name primitive types */
+#define   NV34TCL_VERTEX_BEGIN_END_STOP							0x00000000
+#define   NV34TCL_VERTEX_BEGIN_END_POINTS						0x00000001
+#define   NV34TCL_VERTEX_BEGIN_END_LINES						0x00000002
+#define   NV34TCL_VERTEX_BEGIN_END_LINE_LOOP						0x00000003
+#define   NV34TCL_VERTEX_BEGIN_END_LINE_STRIP						0x00000004
+#define   NV34TCL_VERTEX_BEGIN_END_TRIANGLES						0x00000005
+#define   NV34TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP					0x00000006
+#define   NV34TCL_VERTEX_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV34TCL_VERTEX_BEGIN_END_QUADS						0x00000008
+#define   NV34TCL_VERTEX_BEGIN_END_QUAD_STRIP						0x00000009
+#define   NV34TCL_VERTEX_BEGIN_END_POLYGON						0x0000000a
+#define  NV34TCL_VB_ELEMENT_U16								0x0000180c	/* two 16-bit fields per word: I0 in the low half, I1 in the high half */
+#define   NV34TCL_VB_ELEMENT_U16_I0_SHIFT						0
+#define   NV34TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NV34TCL_VB_ELEMENT_U16_I1_SHIFT						16
+#define   NV34TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV34TCL_VB_ELEMENT_U32								0x00001810
+#define  NV34TCL_VB_VERTEX_BATCH							0x00001814	/* OFFSET in bits 0-23, COUNT in bits 24-31 */
+#define   NV34TCL_VB_VERTEX_BATCH_OFFSET_SHIFT						0
+#define   NV34TCL_VB_VERTEX_BATCH_OFFSET_MASK						0x00ffffff
+#define   NV34TCL_VB_VERTEX_BATCH_COUNT_SHIFT						24
+#define   NV34TCL_VB_VERTEX_BATCH_COUNT_MASK						0xff000000
+#define  NV34TCL_VERTEX_DATA								0x00001818
+#define  NV34TCL_IDXBUF_ADDRESS								0x0000181c
+#define  NV34TCL_IDXBUF_FORMAT								0x00001820	/* TYPE is bits 4-7; the DMA1 flag is bit 0 */
+#define   NV34TCL_IDXBUF_FORMAT_TYPE_SHIFT						4
+#define   NV34TCL_IDXBUF_FORMAT_TYPE_MASK						0x000000f0
+#define    NV34TCL_IDXBUF_FORMAT_TYPE_U32						0x00000000
+#define    NV34TCL_IDXBUF_FORMAT_TYPE_U16						0x00000010
+#define   NV34TCL_IDXBUF_FORMAT_DMA1							(1 <<  0)
+#define  NV34TCL_VB_INDEX_BATCH								0x00001824	/* START in bits 0-23, COUNT in bits 24-31 */
+#define   NV34TCL_VB_INDEX_BATCH_COUNT_SHIFT						24
+#define   NV34TCL_VB_INDEX_BATCH_COUNT_MASK						0xff000000
+#define   NV34TCL_VB_INDEX_BATCH_START_SHIFT						0
+#define   NV34TCL_VB_INDEX_BATCH_START_MASK						0x00ffffff
+#define  NV34TCL_POLYGON_MODE_FRONT							0x00001828	/* values coincide with OpenGL GL_POINT/GL_LINE/GL_FILL (0x1b00-0x1b02) */
+#define   NV34TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV34TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV34TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV34TCL_POLYGON_MODE_BACK							0x0000182c	/* same value set as POLYGON_MODE_FRONT */
+#define   NV34TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV34TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV34TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV34TCL_CULL_FACE								0x00001830	/* values coincide with OpenGL GL_FRONT/GL_BACK/GL_FRONT_AND_BACK */
+#define   NV34TCL_CULL_FACE_FRONT							0x00000404
+#define   NV34TCL_CULL_FACE_BACK							0x00000405
+#define   NV34TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV34TCL_FRONT_FACE								0x00001834	/* values coincide with OpenGL GL_CW/GL_CCW */
+#define   NV34TCL_FRONT_FACE_CW								0x00000900
+#define   NV34TCL_FRONT_FACE_CCW							0x00000901
+#define  NV34TCL_POLYGON_SMOOTH_ENABLE							0x00001838
+#define  NV34TCL_CULL_FACE_ENABLE							0x0000183c
+#define  NV34TCL_TX_PALETTE_OFFSET(x)							(0x00001840+((x)*4))	/* offset for index x, 4-byte stride */
+#define  NV34TCL_TX_PALETTE_OFFSET__SIZE						0x00000008
+#define  NV34TCL_VTX_ATTR_2F_X(x)							(0x00001880+((x)*8))	/* X and Y words interleave: each index x takes 8 bytes */
+#define  NV34TCL_VTX_ATTR_2F_X__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_2F_Y(x)							(0x00001884+((x)*8))
+#define  NV34TCL_VTX_ATTR_2F_Y__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_2I(x)								(0x00001900+((x)*4))	/* one word per index: X in bits 0-15, Y in bits 16-31 */
+#define  NV34TCL_VTX_ATTR_2I__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_2I_X_SHIFT							0
+#define   NV34TCL_VTX_ATTR_2I_X_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_2I_Y_SHIFT							16
+#define   NV34TCL_VTX_ATTR_2I_Y_MASK							0xffff0000
+#define  NV34TCL_VTX_ATTR_4UB(x)							(0x00001940+((x)*4))	/* one word per index: X,Y,Z,W as four 8-bit lanes from bit 0 upward */
+#define  NV34TCL_VTX_ATTR_4UB__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_4UB_X_SHIFT							0
+#define   NV34TCL_VTX_ATTR_4UB_X_MASK							0x000000ff
+#define   NV34TCL_VTX_ATTR_4UB_Y_SHIFT							8
+#define   NV34TCL_VTX_ATTR_4UB_Y_MASK							0x0000ff00
+#define   NV34TCL_VTX_ATTR_4UB_Z_SHIFT							16
+#define   NV34TCL_VTX_ATTR_4UB_Z_MASK							0x00ff0000
+#define   NV34TCL_VTX_ATTR_4UB_W_SHIFT							24
+#define   NV34TCL_VTX_ATTR_4UB_W_MASK							0xff000000
+#define  NV34TCL_VTX_ATTR_4I_XY(x)							(0x00001980+((x)*8))	/* XY and ZW words interleave: each index x takes 8 bytes */
+#define  NV34TCL_VTX_ATTR_4I_XY__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_4I_XY_X_SHIFT						0
+#define   NV34TCL_VTX_ATTR_4I_XY_X_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_4I_XY_Y_SHIFT						16
+#define   NV34TCL_VTX_ATTR_4I_XY_Y_MASK							0xffff0000
+#define  NV34TCL_VTX_ATTR_4I_ZW(x)							(0x00001984+((x)*8))
+#define  NV34TCL_VTX_ATTR_4I_ZW__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_4I_ZW_Z_SHIFT						0
+#define   NV34TCL_VTX_ATTR_4I_ZW_Z_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_4I_ZW_W_SHIFT						16
+#define   NV34TCL_VTX_ATTR_4I_ZW_W_MASK							0xffff0000
+#define  NV34TCL_TX_OFFSET(x)								(0x00001a00+((x)*32))	/* offset for index x; consecutive x are 32 bytes apart */
+#define  NV34TCL_TX_OFFSET__SIZE							0x00000008
+#define  NV34TCL_TX_FORMAT(x)								(0x00001a04+((x)*32))	/* offset for index x; same 32-byte stride as TX_OFFSET */
+#define  NV34TCL_TX_FORMAT__SIZE							0x00000008
+#define   NV34TCL_TX_FORMAT_DMA0							(1 <<  0)	/* single-bit flags occupy bits 0-3 */
+#define   NV34TCL_TX_FORMAT_DMA1							(1 <<  1)
+#define   NV34TCL_TX_FORMAT_CUBIC							(1 <<  2)
+#define   NV34TCL_TX_FORMAT_NO_BORDER							(1 <<  3)
+#define   NV34TCL_TX_FORMAT_DIMS_SHIFT							4	/* DIMS: bits 4-7; values below are already shifted */
+#define   NV34TCL_TX_FORMAT_DIMS_MASK							0x000000f0
+#define    NV34TCL_TX_FORMAT_DIMS_1D							0x00000010
+#define    NV34TCL_TX_FORMAT_DIMS_2D							0x00000020
+#define    NV34TCL_TX_FORMAT_DIMS_3D							0x00000030
+#define   NV34TCL_TX_FORMAT_FORMAT_SHIFT						8	/* FORMAT: bits 8-15; values below are already shifted */
+#define   NV34TCL_TX_FORMAT_FORMAT_MASK							0x0000ff00
+#define    NV34TCL_TX_FORMAT_FORMAT_L8							0x00000000
+#define    NV34TCL_TX_FORMAT_FORMAT_A8							0x00000100
+#define    NV34TCL_TX_FORMAT_FORMAT_A1R5G5B5						0x00000200
+#define    NV34TCL_TX_FORMAT_FORMAT_A4R4G4B4						0x00000400
+#define    NV34TCL_TX_FORMAT_FORMAT_R5G6B5						0x00000500
+#define    NV34TCL_TX_FORMAT_FORMAT_A8R8G8B8						0x00000600
+#define    NV34TCL_TX_FORMAT_FORMAT_X8R8G8B8						0x00000700
+#define    NV34TCL_TX_FORMAT_FORMAT_INDEX8						0x00000b00
+#define    NV34TCL_TX_FORMAT_FORMAT_DXT1						0x00000c00
+#define    NV34TCL_TX_FORMAT_FORMAT_DXT3						0x00000e00
+#define    NV34TCL_TX_FORMAT_FORMAT_DXT5						0x00000f00
+#define    NV34TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT					0x00001000
+#define    NV34TCL_TX_FORMAT_FORMAT_R5G6B5_RECT						0x00001100
+#define    NV34TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT					0x00001200
+#define    NV34TCL_TX_FORMAT_FORMAT_L8_RECT						0x00001300
+#define    NV34TCL_TX_FORMAT_FORMAT_DSDT8_RECT						0x00001700
+#define    NV34TCL_TX_FORMAT_FORMAT_A8L8						0x00001a00
+#define    NV34TCL_TX_FORMAT_FORMAT_A8_RECT						0x00001b00
+#define    NV34TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT					0x00001d00
+#define    NV34TCL_TX_FORMAT_FORMAT_R8G8B8_RECT						0x00001e00
+#define    NV34TCL_TX_FORMAT_FORMAT_A8L8_RECT						0x00002000
+#define    NV34TCL_TX_FORMAT_FORMAT_DSDT8						0x00002800
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO16						0x00003300
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO16_RECT						0x00003600
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO8						0x00004400
+#define    NV34TCL_TX_FORMAT_FORMAT_SIGNED_HILO8					0x00004500
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO8_RECT						0x00004600
+#define    NV34TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT					0x00004700
+#define    NV34TCL_TX_FORMAT_FORMAT_A16							0x00003200	/* 0x32 and 0x35 are listed out of numeric order */
+#define    NV34TCL_TX_FORMAT_FORMAT_A16_RECT						0x00003500
+#define    NV34TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV					0x00004a00
+#define    NV34TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV					0x00004b00
+#define    NV34TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV					0x00004c00
+#define   NV34TCL_TX_FORMAT_MIPMAP							(1 << 19)
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT						20	/* BASE_SIZE_U/V/W: 4-bit fields starting at bits 20, 24 and 28 */
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_U_MASK						0x00f00000
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT						24
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_V_MASK						0x0f000000
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT						28
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_W_MASK						0xf0000000
+#define  NV34TCL_TX_WRAP(x)								(0x00001a08+((x)*32))	/* offset for index x; consecutive x are 32 bytes apart */
+#define  NV34TCL_TX_WRAP__SIZE								0x00000008
+#define   NV34TCL_TX_WRAP_S_SHIFT							0
+#define   NV34TCL_TX_WRAP_S_MASK							0x000000ff	/* NOTE(review): 8 bits wide while T and R are 4 bits; values below only use bits 0-2 -- confirm */
+#define    NV34TCL_TX_WRAP_S_REPEAT							0x00000001
+#define    NV34TCL_TX_WRAP_S_MIRRORED_REPEAT						0x00000002
+#define    NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE						0x00000003
+#define    NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER						0x00000004
+#define    NV34TCL_TX_WRAP_S_CLAMP							0x00000005
+#define   NV34TCL_TX_WRAP_T_SHIFT							8	/* T: bits 8-11; same codes as S, already shifted left by 8 */
+#define   NV34TCL_TX_WRAP_T_MASK							0x00000f00
+#define    NV34TCL_TX_WRAP_T_REPEAT							0x00000100
+#define    NV34TCL_TX_WRAP_T_MIRRORED_REPEAT						0x00000200
+#define    NV34TCL_TX_WRAP_T_CLAMP_TO_EDGE						0x00000300
+#define    NV34TCL_TX_WRAP_T_CLAMP_TO_BORDER						0x00000400
+#define    NV34TCL_TX_WRAP_T_CLAMP							0x00000500
+#define   NV34TCL_TX_WRAP_EXPAND_NORMAL_SHIFT						12	/* EXPAND_NORMAL: bits 12-15; no named values here */
+#define   NV34TCL_TX_WRAP_EXPAND_NORMAL_MASK						0x0000f000
+#define   NV34TCL_TX_WRAP_R_SHIFT							16	/* R: bits 16-19; same codes as S, already shifted left by 16 */
+#define   NV34TCL_TX_WRAP_R_MASK							0x000f0000
+#define    NV34TCL_TX_WRAP_R_REPEAT							0x00010000
+#define    NV34TCL_TX_WRAP_R_MIRRORED_REPEAT						0x00020000
+#define    NV34TCL_TX_WRAP_R_CLAMP_TO_EDGE						0x00030000
+#define    NV34TCL_TX_WRAP_R_CLAMP_TO_BORDER						0x00040000
+#define    NV34TCL_TX_WRAP_R_CLAMP							0x00050000
+#define   NV34TCL_TX_WRAP_RCOMP_SHIFT							28	/* RCOMP: bits 28-31; named values use only codes 0-7 */
+#define   NV34TCL_TX_WRAP_RCOMP_MASK							0xf0000000
+#define    NV34TCL_TX_WRAP_RCOMP_NEVER							0x00000000
+#define    NV34TCL_TX_WRAP_RCOMP_GREATER						0x10000000
+#define    NV34TCL_TX_WRAP_RCOMP_EQUAL							0x20000000
+#define    NV34TCL_TX_WRAP_RCOMP_GEQUAL							0x30000000
+#define    NV34TCL_TX_WRAP_RCOMP_LESS							0x40000000
+#define    NV34TCL_TX_WRAP_RCOMP_NOTEQUAL						0x50000000
+#define    NV34TCL_TX_WRAP_RCOMP_LEQUAL							0x60000000
+#define    NV34TCL_TX_WRAP_RCOMP_ALWAYS							0x70000000
+#define  NV34TCL_TX_ENABLE(x)								(0x00001a0c+((x)*32)) /* indexed by x: base 0x1a0c, stride 32 */
+#define  NV34TCL_TX_ENABLE__SIZE							0x00000008 /* presumably the number of valid x indices -- TODO confirm */
+#define   NV34TCL_TX_ENABLE_ANISO_SHIFT							4 /* ANISO field: bits 4-5; enumerants below are already shifted */
+#define   NV34TCL_TX_ENABLE_ANISO_MASK							0x00000030
+#define    NV34TCL_TX_ENABLE_ANISO_NONE							0x00000000
+#define    NV34TCL_TX_ENABLE_ANISO_2X							0x00000010
+#define    NV34TCL_TX_ENABLE_ANISO_4X							0x00000020
+#define    NV34TCL_TX_ENABLE_ANISO_8X							0x00000030
+#define   NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT					14 /* bits 14-17 */
+#define   NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK						0x0003c000
+#define   NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT					26 /* bits 26-29 */
+#define   NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK						0x3c000000
+#define   NV34TCL_TX_ENABLE_ENABLE							(1 << 30) /* single-bit flag, bit 30 (just above MIN_LOD) */
+#define  NV34TCL_TX_SWIZZLE(x)								(0x00001a10+((x)*32)) /* indexed by x: base 0x1a10, stride 32 */
+#define  NV34TCL_TX_SWIZZLE__SIZE							0x00000008 /* presumably the number of valid x indices -- TODO confirm */
+#define   NV34TCL_TX_SWIZZLE_S0_X_SHIFT							14 /* S0 selectors: four 2-bit fields in bits 8-15, each ZERO/ONE/S1 */
+#define   NV34TCL_TX_SWIZZLE_S0_X_MASK							0x0000c000
+#define    NV34TCL_TX_SWIZZLE_S0_X_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_X_ONE							0x00004000
+#define    NV34TCL_TX_SWIZZLE_S0_X_S1							0x00008000
+#define   NV34TCL_TX_SWIZZLE_S0_Y_SHIFT							12 /* bits 12-13 */
+#define   NV34TCL_TX_SWIZZLE_S0_Y_MASK							0x00003000
+#define    NV34TCL_TX_SWIZZLE_S0_Y_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_Y_ONE							0x00001000
+#define    NV34TCL_TX_SWIZZLE_S0_Y_S1							0x00002000
+#define   NV34TCL_TX_SWIZZLE_S0_Z_SHIFT							10 /* bits 10-11 */
+#define   NV34TCL_TX_SWIZZLE_S0_Z_MASK							0x00000c00
+#define    NV34TCL_TX_SWIZZLE_S0_Z_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_Z_ONE							0x00000400
+#define    NV34TCL_TX_SWIZZLE_S0_Z_S1							0x00000800
+#define   NV34TCL_TX_SWIZZLE_S0_W_SHIFT							8 /* bits 8-9 */
+#define   NV34TCL_TX_SWIZZLE_S0_W_MASK							0x00000300
+#define    NV34TCL_TX_SWIZZLE_S0_W_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_W_ONE							0x00000100
+#define    NV34TCL_TX_SWIZZLE_S0_W_S1							0x00000200
+#define   NV34TCL_TX_SWIZZLE_S1_X_SHIFT							6 /* S1 selectors: four 2-bit fields in bits 0-7, encoded W=0, Z=1, Y=2, X=3 */
+#define   NV34TCL_TX_SWIZZLE_S1_X_MASK							0x000000c0
+#define    NV34TCL_TX_SWIZZLE_S1_X_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_X_Z							0x00000040
+#define    NV34TCL_TX_SWIZZLE_S1_X_Y							0x00000080
+#define    NV34TCL_TX_SWIZZLE_S1_X_X							0x000000c0
+#define   NV34TCL_TX_SWIZZLE_S1_Y_SHIFT							4 /* bits 4-5 */
+#define   NV34TCL_TX_SWIZZLE_S1_Y_MASK							0x00000030
+#define    NV34TCL_TX_SWIZZLE_S1_Y_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_Y_Z							0x00000010
+#define    NV34TCL_TX_SWIZZLE_S1_Y_Y							0x00000020
+#define    NV34TCL_TX_SWIZZLE_S1_Y_X							0x00000030
+#define   NV34TCL_TX_SWIZZLE_S1_Z_SHIFT							2 /* bits 2-3 */
+#define   NV34TCL_TX_SWIZZLE_S1_Z_MASK							0x0000000c
+#define    NV34TCL_TX_SWIZZLE_S1_Z_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_Z_Z							0x00000004
+#define    NV34TCL_TX_SWIZZLE_S1_Z_Y							0x00000008
+#define    NV34TCL_TX_SWIZZLE_S1_Z_X							0x0000000c
+#define   NV34TCL_TX_SWIZZLE_S1_W_SHIFT							0 /* bits 0-1 */
+#define   NV34TCL_TX_SWIZZLE_S1_W_MASK							0x00000003
+#define    NV34TCL_TX_SWIZZLE_S1_W_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_W_Z							0x00000001
+#define    NV34TCL_TX_SWIZZLE_S1_W_Y							0x00000002
+#define    NV34TCL_TX_SWIZZLE_S1_W_X							0x00000003
+#define   NV34TCL_TX_SWIZZLE_RECT_PITCH_SHIFT						16 /* bits 16-31 (upper half of the word) */
+#define   NV34TCL_TX_SWIZZLE_RECT_PITCH_MASK						0xffff0000
+#define  NV34TCL_TX_FILTER(x)								(0x00001a14+((x)*32))
+#define  NV34TCL_TX_FILTER__SIZE							0x00000008
+#define   NV34TCL_TX_FILTER_LOD_BIAS_SHIFT						8
+#define   NV34TCL_TX_FILTER_LOD_BIAS_MASK						0x00000f00
+#define   NV34TCL_TX_FILTER_MINIFY_SHIFT						16
+#define   NV34TCL_TX_FILTER_MINIFY_MASK							0x000f0000
+#define    NV34TCL_TX_FILTER_MINIFY_NEAREST						0x00010000
+#define    NV34TCL_TX_FILTER_MINIFY_LINEAR						0x00020000
+#define    NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST				0x00030000
+#define    NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST				0x00040000
+#define    NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR				0x00050000
+#define    NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR				0x00060000
+#define   NV34TCL_TX_FILTER_MAGNIFY_SHIFT						24
+#define   NV34TCL_TX_FILTER_MAGNIFY_MASK						0x0f000000
+#define    NV34TCL_TX_FILTER_MAGNIFY_NEAREST						0x01000000
+#define    NV34TCL_TX_FILTER_MAGNIFY_LINEAR						0x02000000
+#define   NV34TCL_TX_FILTER_SIGNED_BLUE							(1 << 28)
+#define   NV34TCL_TX_FILTER_SIGNED_GREEN						(1 << 29)
+#define   NV34TCL_TX_FILTER_SIGNED_RED							(1 << 30)
+#define   NV34TCL_TX_FILTER_SIGNED_ALPHA						(1 << 31)
+#define  NV34TCL_TX_NPOT_SIZE(x)							(0x00001a18+((x)*32)) /* indexed by x: base 0x1a18, stride 32 */
+#define  NV34TCL_TX_NPOT_SIZE__SIZE							0x00000008 /* presumably the number of valid x indices -- TODO confirm */
+#define   NV34TCL_TX_NPOT_SIZE_H_SHIFT							0 /* H: bits 0-15 */
+#define   NV34TCL_TX_NPOT_SIZE_H_MASK							0x0000ffff
+#define   NV34TCL_TX_NPOT_SIZE_W_SHIFT							16 /* W: bits 16-31 */
+#define   NV34TCL_TX_NPOT_SIZE_W_MASK							0xffff0000
+#define  NV34TCL_TX_BORDER_COLOR(x)							(0x00001a1c+((x)*32)) /* indexed by x: base 0x1a1c, stride 32 */
+#define  NV34TCL_TX_BORDER_COLOR__SIZE							0x00000008 /* presumably the number of valid x indices -- TODO confirm */
+#define   NV34TCL_TX_BORDER_COLOR_B_SHIFT						0 /* four 8-bit channels packed B, G, R, A from the low byte up */
+#define   NV34TCL_TX_BORDER_COLOR_B_MASK						0x000000ff
+#define   NV34TCL_TX_BORDER_COLOR_G_SHIFT						8
+#define   NV34TCL_TX_BORDER_COLOR_G_MASK						0x0000ff00
+#define   NV34TCL_TX_BORDER_COLOR_R_SHIFT						16
+#define   NV34TCL_TX_BORDER_COLOR_R_MASK						0x00ff0000
+#define   NV34TCL_TX_BORDER_COLOR_A_SHIFT						24
+#define   NV34TCL_TX_BORDER_COLOR_A_MASK						0xff000000
+#define  NV34TCL_VTX_ATTR_4F_X(x)							(0x00001c00+((x)*16)) /* X/Y/Z/W sit at +0/+4/+8/+12 within each 16-wide slot x */
+#define  NV34TCL_VTX_ATTR_4F_X__SIZE							0x00000010 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV34TCL_VTX_ATTR_4F_Y(x)							(0x00001c04+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_4F_Z(x)							(0x00001c08+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_4F_W(x)							(0x00001c0c+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV34TCL_FP_CONTROL								0x00001d60
+#define   NV34TCL_FP_CONTROL_USES_KIL							(1 <<  7) /* single-bit flag, bit 7 */
+#define   NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_SHIFT				0 /* bits 0-3 */
+#define   NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_MASK					0x0000000f
+#define  NV34TCL_DEPTH_UNK17D8								0x00001d78 /* NOTE(review): name suffix says 17D8 but the value is 0x1d78 -- confirm which is intended */
+#define   NV34TCL_DEPTH_UNK17D8_CLAMP_SHIFT						4 /* bits 4-7 */
+#define   NV34TCL_DEPTH_UNK17D8_CLAMP_MASK						0x000000f0
+#define  NV34TCL_MULTISAMPLE_CONTROL							0x00001d7c
+#define   NV34TCL_MULTISAMPLE_CONTROL_ENABLE						(1 <<  0) /* single-bit flags at bits 0, 4 and 8 */
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_COVERAGE				(1 <<  4)
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_ONE				(1 <<  8)
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_SHIFT				16 /* bits 16-31 */
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_MASK				0xffff0000
+#define  NV34TCL_CLEAR_DEPTH_VALUE							0x00001d8c
+#define  NV34TCL_CLEAR_COLOR_VALUE							0x00001d90
+#define   NV34TCL_CLEAR_COLOR_VALUE_B_SHIFT						0 /* four 8-bit channels packed B, G, R, A from the low byte up */
+#define   NV34TCL_CLEAR_COLOR_VALUE_B_MASK						0x000000ff
+#define   NV34TCL_CLEAR_COLOR_VALUE_G_SHIFT						8
+#define   NV34TCL_CLEAR_COLOR_VALUE_G_MASK						0x0000ff00
+#define   NV34TCL_CLEAR_COLOR_VALUE_R_SHIFT						16
+#define   NV34TCL_CLEAR_COLOR_VALUE_R_MASK						0x00ff0000
+#define   NV34TCL_CLEAR_COLOR_VALUE_A_SHIFT						24
+#define   NV34TCL_CLEAR_COLOR_VALUE_A_MASK						0xff000000
+#define  NV34TCL_CLEAR_BUFFERS								0x00001d94
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_A							(1 <<  7) /* colour channel flags: bits 4-7 in R, G, B, A order */
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_B							(1 <<  6)
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_G							(1 <<  5)
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_R							(1 <<  4)
+#define   NV34TCL_CLEAR_BUFFERS_STENCIL							(1 <<  1) /* bits 2-3 have no flag defined here */
+#define   NV34TCL_CLEAR_BUFFERS_DEPTH							(1 <<  0)
+#define  NV34TCL_DO_VERTICES								0x00001dac
+#define  NV34TCL_LINE_STIPPLE_ENABLE							0x00001db4
+#define  NV34TCL_LINE_STIPPLE_PATTERN							0x00001db8
+#define   NV34TCL_LINE_STIPPLE_PATTERN_FACTOR_SHIFT					0 /* FACTOR: bits 0-15 */
+#define   NV34TCL_LINE_STIPPLE_PATTERN_FACTOR_MASK					0x0000ffff
+#define   NV34TCL_LINE_STIPPLE_PATTERN_PATTERN_SHIFT					16 /* PATTERN: bits 16-31 */
+#define   NV34TCL_LINE_STIPPLE_PATTERN_PATTERN_MASK					0xffff0000
+#define  NV34TCL_BACK_MATERIAL_SHININESS(x)						(0x00001e20+((x)*4)) /* indexed by x: base 0x1e20, stride 4 */
+#define  NV34TCL_BACK_MATERIAL_SHININESS__SIZE						0x00000006 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV34TCL_VTX_ATTR_1F(x)								(0x00001e40+((x)*4)) /* indexed by x: base 0x1e40, stride 4 */
+#define  NV34TCL_VTX_ATTR_1F__SIZE							0x00000010 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV34TCL_ENGINE									0x00001e94
+#define   NV34TCL_ENGINE_FP								(1 <<  0) /* three single-bit flags in bits 0-2 */
+#define   NV34TCL_ENGINE_VP								(1 <<  1)
+#define   NV34TCL_ENGINE_FIXED								(1 <<  2)
+#define  NV34TCL_VP_UPLOAD_FROM_ID							0x00001e9c
+#define  NV34TCL_VP_START_FROM_ID							0x00001ea0
+#define  NV34TCL_POINT_PARAMETERS(x)							(0x00001ec0+((x)*4)) /* indexed by x: base 0x1ec0, stride 4 */
+#define  NV34TCL_POINT_PARAMETERS__SIZE							0x00000008 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV34TCL_POINT_SIZE								0x00001ee0
+#define  NV34TCL_POINT_PARAMETERS_ENABLE						0x00001ee4
+#define  NV34TCL_POINT_SPRITE								0x00001ee8
+#define   NV34TCL_POINT_SPRITE_ENABLE							(1 <<  0) /* single-bit flag, bit 0 */
+#define   NV34TCL_POINT_SPRITE_R_MODE_SHIFT						1 /* R_MODE field: bits 1-2; enumerants below are already shifted */
+#define   NV34TCL_POINT_SPRITE_R_MODE_MASK						0x00000006
+#define    NV34TCL_POINT_SPRITE_R_MODE_ZERO						0x00000000
+#define    NV34TCL_POINT_SPRITE_R_MODE_R						0x00000002
+#define    NV34TCL_POINT_SPRITE_R_MODE_S						0x00000004
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_0						(1 <<  8) /* COORD_REPLACE_n is bit 8+n, for n = 0..7 */
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_1						(1 <<  9)
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_2						(1 << 10)
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_3						(1 << 11)
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_4						(1 << 12)
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_5						(1 << 13)
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_6						(1 << 14)
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE_7						(1 << 15)
+#define  NV34TCL_VP_UPLOAD_CONST_ID							0x00001efc
+#define  NV34TCL_VP_UPLOAD_CONST_X(x)							(0x00001f00+((x)*16)) /* X/Y/Z/W sit at +0/+4/+8/+12 within each 16-wide slot x */
+#define  NV34TCL_VP_UPLOAD_CONST_X__SIZE						0x00000004 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV34TCL_VP_UPLOAD_CONST_Y(x)							(0x00001f04+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_Y__SIZE						0x00000004
+#define  NV34TCL_VP_UPLOAD_CONST_Z(x)							(0x00001f08+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_Z__SIZE						0x00000004
+#define  NV34TCL_VP_UPLOAD_CONST_W(x)							(0x00001f0c+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_W__SIZE						0x00000004
+#define  NV34TCL_UNK1f80(x)								(0x00001f80+((x)*4)) /* indexed by x: base 0x1f80, stride 4; purpose not named in this header */
+#define  NV34TCL_UNK1f80__SIZE								0x00000010
+
+
+#define NV40TCL										0x00004097 /* presumably the object class identifier for the NV40TCL_* definitions below -- TODO confirm */
+
+#define  NV40TCL_REF_CNT								0x00000050
+#define  NV40TCL_NOP									0x00000100
+#define  NV40TCL_NOTIFY									0x00000104
+#define  NV40TCL_DMA_NOTIFY								0x00000180 /* DMA_* slots run from 0x180 to 0x1b8 in steps of 4, except 0x190 which has no name here */
+#define  NV40TCL_DMA_TEXTURE0								0x00000184
+#define  NV40TCL_DMA_TEXTURE1								0x00000188
+#define  NV40TCL_DMA_COLOR1								0x0000018c /* NOTE(review): COLOR1 (0x18c) sits below COLOR0 (0x194) -- looks intentional, confirm */
+#define  NV40TCL_DMA_COLOR0								0x00000194
+#define  NV40TCL_DMA_ZETA								0x00000198
+#define  NV40TCL_DMA_VTXBUF0								0x0000019c
+#define  NV40TCL_DMA_VTXBUF1								0x000001a0
+#define  NV40TCL_DMA_FENCE								0x000001a4
+#define  NV40TCL_DMA_QUERY								0x000001a8
+#define  NV40TCL_DMA_UNK01AC								0x000001ac
+#define  NV40TCL_DMA_UNK01B0								0x000001b0
+#define  NV40TCL_DMA_COLOR2								0x000001b4
+#define  NV40TCL_DMA_COLOR3								0x000001b8
+#define  NV40TCL_RT_HORIZ								0x00000200
+#define   NV40TCL_RT_HORIZ_W_SHIFT							16 /* W: bits 16-31 */
+#define   NV40TCL_RT_HORIZ_W_MASK							0xffff0000
+#define   NV40TCL_RT_HORIZ_X_SHIFT							0 /* X: bits 0-15 */
+#define   NV40TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define  NV40TCL_RT_VERT								0x00000204
+#define   NV40TCL_RT_VERT_H_SHIFT							16 /* H: bits 16-31 */
+#define   NV40TCL_RT_VERT_H_MASK							0xffff0000
+#define   NV40TCL_RT_VERT_Y_SHIFT							0 /* Y: bits 0-15 */
+#define   NV40TCL_RT_VERT_Y_MASK							0x0000ffff
+#define  NV40TCL_RT_FORMAT								0x00000208
+#define   NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT						24 /* bits 24-31 */
+#define   NV40TCL_RT_FORMAT_LOG2_HEIGHT_MASK						0xff000000
+#define   NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT						16 /* bits 16-23 */
+#define   NV40TCL_RT_FORMAT_LOG2_WIDTH_MASK						0x00ff0000
+#define   NV40TCL_RT_FORMAT_TYPE_SHIFT							8 /* TYPE field: bits 8-11; enumerants below are already shifted */
+#define   NV40TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV40TCL_RT_FORMAT_TYPE_LINEAR						0x00000100
+#define    NV40TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV40TCL_RT_FORMAT_ZETA_SHIFT							5 /* ZETA field: bits 5-7; enumerants below are already shifted */
+#define   NV40TCL_RT_FORMAT_ZETA_MASK							0x000000e0
+#define    NV40TCL_RT_FORMAT_ZETA_Z16							0x00000020
+#define    NV40TCL_RT_FORMAT_ZETA_Z24S8							0x00000040
+#define   NV40TCL_RT_FORMAT_COLOR_SHIFT							0 /* COLOR field: bits 0-4 */
+#define   NV40TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV40TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV40TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV40TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV40TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV40TCL_RT_FORMAT_COLOR_UNKNOWN						0x0000000d
+#define    NV40TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV40TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV40TCL_COLOR0_PITCH								0x0000020c /* note: COLOR0 is pitch-then-offset, COLOR1 below is offset-then-pitch */
+#define  NV40TCL_COLOR0_OFFSET								0x00000210
+#define  NV40TCL_ZETA_OFFSET								0x00000214
+#define  NV40TCL_COLOR1_OFFSET								0x00000218
+#define  NV40TCL_COLOR1_PITCH								0x0000021c
+#define  NV40TCL_RT_ENABLE								0x00000220
+#define   NV40TCL_RT_ENABLE_MRT								(1 <<  4) /* single-bit flag, bit 4 */
+#define   NV40TCL_RT_ENABLE_COLOR3							(1 <<  3) /* COLORn is bit n, for n = 0..3 */
+#define   NV40TCL_RT_ENABLE_COLOR2							(1 <<  2)
+#define   NV40TCL_RT_ENABLE_COLOR1							(1 <<  1)
+#define   NV40TCL_RT_ENABLE_COLOR0							(1 <<  0)
+#define  NV40TCL_ZETA_PITCH								0x0000022c
+#define  NV40TCL_COLOR2_PITCH								0x00000280 /* COLOR2/COLOR3: both pitches first, then both offsets */
+#define  NV40TCL_COLOR3_PITCH								0x00000284
+#define  NV40TCL_COLOR2_OFFSET								0x00000288
+#define  NV40TCL_COLOR3_OFFSET								0x0000028c
+#define  NV40TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*8)) /* HORIZ/VERT pairs interleave: +0/+4 within each 8-wide slot x */
+#define  NV40TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV40TCL_VIEWPORT_CLIP_VERT(x)							(0x000002c4+((x)*8))
+#define  NV40TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define  NV40TCL_DITHER_ENABLE								0x00000300
+#define  NV40TCL_ALPHA_TEST_ENABLE							0x00000304
+#define  NV40TCL_ALPHA_TEST_FUNC							0x00000308
+#define   NV40TCL_ALPHA_TEST_FUNC_NEVER							0x00000200 /* values 0x200-0x207 are numerically equal to OpenGL GL_NEVER..GL_ALWAYS */
+#define   NV40TCL_ALPHA_TEST_FUNC_LESS							0x00000201
+#define   NV40TCL_ALPHA_TEST_FUNC_EQUAL							0x00000202
+#define   NV40TCL_ALPHA_TEST_FUNC_LEQUAL						0x00000203
+#define   NV40TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NV40TCL_ALPHA_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NV40TCL_ALPHA_TEST_FUNC_GEQUAL						0x00000206
+#define   NV40TCL_ALPHA_TEST_FUNC_ALWAYS						0x00000207
+#define  NV40TCL_ALPHA_TEST_REF								0x0000030c
+#define  NV40TCL_BLEND_ENABLE								0x00000310
+#define  NV40TCL_BLEND_FUNC_SRC								0x00000314
+#define   NV40TCL_BLEND_FUNC_SRC_RGB_SHIFT						0 /* RGB factor: bits 0-15; values are numerically equal to the OpenGL blend-factor enums */
+#define   NV40TCL_BLEND_FUNC_SRC_RGB_MASK						0x0000ffff
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV40TCL_BLEND_FUNC_SRC_ALPHA_SHIFT						16 /* ALPHA factor: bits 16-31; same values as the RGB list, shifted left by 16 */
+#define   NV40TCL_BLEND_FUNC_SRC_ALPHA_MASK						0xffff0000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00010000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x03000000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x03020000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x03040000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x03060000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV40TCL_BLEND_FUNC_DST								0x00000318
+#define   NV40TCL_BLEND_FUNC_DST_RGB_SHIFT						0 /* RGB factor: bits 0-15; values are numerically equal to the OpenGL blend-factor enums */
+#define   NV40TCL_BLEND_FUNC_DST_RGB_MASK						0x0000ffff
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
+#define    NV40TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV40TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV40TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV40TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV40TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV40TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV40TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV40TCL_BLEND_FUNC_DST_ALPHA_SHIFT						16 /* ALPHA factor: bits 16-31; same values as the RGB list, shifted left by 16 */
+#define   NV40TCL_BLEND_FUNC_DST_ALPHA_MASK						0xffff0000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00010000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x03000000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x03020000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x03040000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x03060000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV40TCL_BLEND_COLOR								0x0000031c
+#define   NV40TCL_BLEND_COLOR_B_SHIFT							0 /* four 8-bit channels packed B, G, R, A from the low byte up */
+#define   NV40TCL_BLEND_COLOR_B_MASK							0x000000ff
+#define   NV40TCL_BLEND_COLOR_G_SHIFT							8
+#define   NV40TCL_BLEND_COLOR_G_MASK							0x0000ff00
+#define   NV40TCL_BLEND_COLOR_R_SHIFT							16
+#define   NV40TCL_BLEND_COLOR_R_MASK							0x00ff0000
+#define   NV40TCL_BLEND_COLOR_A_SHIFT							24
+#define   NV40TCL_BLEND_COLOR_A_MASK							0xff000000
+#define  NV40TCL_BLEND_EQUATION								0x00000320
+#define   NV40TCL_BLEND_EQUATION_RGB_SHIFT						0 /* RGB equation: bits 0-15; values are numerically equal to the OpenGL blend-equation enums */
+#define   NV40TCL_BLEND_EQUATION_RGB_MASK						0x0000ffff
+#define    NV40TCL_BLEND_EQUATION_RGB_FUNC_ADD						0x00008006
+#define    NV40TCL_BLEND_EQUATION_RGB_MIN						0x00008007
+#define    NV40TCL_BLEND_EQUATION_RGB_MAX						0x00008008
+#define    NV40TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT					0x0000800a
+#define    NV40TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define   NV40TCL_BLEND_EQUATION_ALPHA_SHIFT						16 /* ALPHA equation: bits 16-31; same values as the RGB list, shifted left by 16 */
+#define   NV40TCL_BLEND_EQUATION_ALPHA_MASK						0xffff0000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_FUNC_ADD					0x80060000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_MIN						0x80070000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_MAX						0x80080000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT					0x800a0000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT				0x800b0000
+#define  NV40TCL_COLOR_MASK								0x00000324
+#define   NV40TCL_COLOR_MASK_BUFFER0_B_SHIFT						0 /* BUFFER0 channels: one 8-bit field each, packed B, G, R, A from the low byte up */
+#define   NV40TCL_COLOR_MASK_BUFFER0_B_MASK						0x000000ff
+#define   NV40TCL_COLOR_MASK_BUFFER0_G_SHIFT						8
+#define   NV40TCL_COLOR_MASK_BUFFER0_G_MASK						0x0000ff00
+#define   NV40TCL_COLOR_MASK_BUFFER0_R_SHIFT						16
+#define   NV40TCL_COLOR_MASK_BUFFER0_R_MASK						0x00ff0000
+#define   NV40TCL_COLOR_MASK_BUFFER0_A_SHIFT						24
+#define   NV40TCL_COLOR_MASK_BUFFER0_A_MASK						0xff000000
+#define  NV40TCL_STENCIL_FRONT_ENABLE							0x00000328 /* front-face stencil state occupies 0x328-0x344 */
+#define  NV40TCL_STENCIL_FRONT_MASK							0x0000032c
+#define  NV40TCL_STENCIL_FRONT_FUNC_FUNC						0x00000330
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200 /* values 0x200-0x207 are numerically equal to OpenGL GL_NEVER..GL_ALWAYS */
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NV40TCL_STENCIL_FRONT_FUNC_REF							0x00000334
+#define  NV40TCL_STENCIL_FRONT_FUNC_MASK						0x00000338
+#define  NV40TCL_STENCIL_FRONT_OP_FAIL							0x0000033c /* FAIL/ZFAIL/ZPASS share one value list, numerically equal to the OpenGL stencil-op enums */
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_FRONT_OP_ZFAIL							0x00000340
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_FRONT_OP_ZPASS							0x00000344
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_BACK_ENABLE							0x00000348 /* back-face stencil state occupies 0x348-0x364, i.e. the FRONT layout + 0x20 */
+#define  NV40TCL_STENCIL_BACK_MASK							0x0000034c
+#define  NV40TCL_STENCIL_BACK_FUNC_FUNC							0x00000350
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200 /* values 0x200-0x207 are numerically equal to OpenGL GL_NEVER..GL_ALWAYS */
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV40TCL_STENCIL_BACK_FUNC_REF							0x00000354
+#define  NV40TCL_STENCIL_BACK_FUNC_MASK							0x00000358
+#define  NV40TCL_STENCIL_BACK_OP_FAIL							0x0000035c /* FAIL/ZFAIL/ZPASS share one value list, numerically equal to the OpenGL stencil-op enums */
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_BACK_OP_ZFAIL							0x00000360
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_BACK_OP_ZPASS							0x00000364
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV40TCL_SHADE_MODEL								0x00000368
+#define   NV40TCL_SHADE_MODEL_FLAT							0x00001d00 /* numerically equal to OpenGL GL_FLAT / GL_SMOOTH */
+#define   NV40TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV40TCL_MRT_COLOR_MASK								0x00000370
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_A						(1 <<  4) /* buffers 1-3 only: one bit per channel in A, R, G, B order, 4 bits per buffer starting at bit 4 */
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_R						(1 <<  5)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_G						(1 <<  6)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_B						(1 <<  7)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_A						(1 <<  8)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_R						(1 <<  9)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_G						(1 << 10)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_B						(1 << 11)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_A						(1 << 12)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_R						(1 << 13)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_G						(1 << 14)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_B						(1 << 15)
+#define  NV40TCL_COLOR_LOGIC_OP_ENABLE							0x00000374
+#define  NV40TCL_COLOR_LOGIC_OP								0x00000378
+#define   NV40TCL_COLOR_LOGIC_OP_CLEAR							0x00001500 /* values 0x1500-0x150f are numerically equal to OpenGL GL_CLEAR..GL_SET */
+#define   NV40TCL_COLOR_LOGIC_OP_AND							0x00001501
+#define   NV40TCL_COLOR_LOGIC_OP_AND_REVERSE						0x00001502
+#define   NV40TCL_COLOR_LOGIC_OP_COPY							0x00001503
+#define   NV40TCL_COLOR_LOGIC_OP_AND_INVERTED						0x00001504
+#define   NV40TCL_COLOR_LOGIC_OP_NOOP							0x00001505
+#define   NV40TCL_COLOR_LOGIC_OP_XOR							0x00001506
+#define   NV40TCL_COLOR_LOGIC_OP_OR							0x00001507
+#define   NV40TCL_COLOR_LOGIC_OP_NOR							0x00001508
+#define   NV40TCL_COLOR_LOGIC_OP_EQUIV							0x00001509
+#define   NV40TCL_COLOR_LOGIC_OP_INVERT							0x0000150a
+#define   NV40TCL_COLOR_LOGIC_OP_OR_REVERSE						0x0000150b
+#define   NV40TCL_COLOR_LOGIC_OP_COPY_INVERTED						0x0000150c
+#define   NV40TCL_COLOR_LOGIC_OP_OR_INVERTED						0x0000150d
+#define   NV40TCL_COLOR_LOGIC_OP_NAND							0x0000150e
+#define   NV40TCL_COLOR_LOGIC_OP_SET							0x0000150f
+#define  NV40TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV40TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV40TCL_LINE_WIDTH								0x000003b8
+#define  NV40TCL_LINE_SMOOTH_ENABLE							0x000003bc
+#define  NV40TCL_UNK03C0(x)								(0x000003c0+((x)*4)) /* UNK* arrays: stride 4 per index x; purpose not named in this header */
+#define  NV40TCL_UNK03C0__SIZE								0x00000010 /* presumably the number of valid x indices -- TODO confirm */
+#define  NV40TCL_UNK0400(x)								(0x00000400+((x)*4))
+#define  NV40TCL_UNK0400__SIZE								0x00000010
+#define  NV40TCL_UNK0440(x)								(0x00000440+((x)*4))
+#define  NV40TCL_UNK0440__SIZE								0x00000020
+#define  NV40TCL_SCISSOR_HORIZ								0x000008c0
+#define   NV40TCL_SCISSOR_HORIZ_X_SHIFT							0 /* X: bits 0-15 */
+#define   NV40TCL_SCISSOR_HORIZ_X_MASK							0x0000ffff
+#define   NV40TCL_SCISSOR_HORIZ_W_SHIFT							16 /* W: bits 16-31 */
+#define   NV40TCL_SCISSOR_HORIZ_W_MASK							0xffff0000
+#define  NV40TCL_SCISSOR_VERT								0x000008c4
+#define   NV40TCL_SCISSOR_VERT_Y_SHIFT							0 /* Y: bits 0-15 */
+#define   NV40TCL_SCISSOR_VERT_Y_MASK							0x0000ffff
+#define   NV40TCL_SCISSOR_VERT_H_SHIFT							16 /* H: bits 16-31 */
+#define   NV40TCL_SCISSOR_VERT_H_MASK							0xffff0000
+#define  NV40TCL_FOG_MODE								0x000008cc
+#define  NV40TCL_FOG_EQUATION_CONSTANT							0x000008d0
+#define  NV40TCL_FOG_EQUATION_LINEAR							0x000008d4
+#define  NV40TCL_FOG_EQUATION_QUADRATIC							0x000008d8
+#define  NV40TCL_FP_ADDRESS								0x000008e4
+#define   NV40TCL_FP_ADDRESS_OFFSET_SHIFT						8 /* OFFSET: bits 8-31 */
+#define   NV40TCL_FP_ADDRESS_OFFSET_MASK						0xffffff00
+#define   NV40TCL_FP_ADDRESS_DMA1							(1 <<  1) /* DMA0/DMA1 are single-bit flags in bits 0 and 1, below the OFFSET field */
+#define   NV40TCL_FP_ADDRESS_DMA0							(1 <<  0)
+#define  NV40TCL_VIEWPORT_HORIZ								0x00000a00
+#define   NV40TCL_VIEWPORT_HORIZ_W_SHIFT						16 /* W: bits 16-31 */
+#define   NV40TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define   NV40TCL_VIEWPORT_HORIZ_X_SHIFT						0 /* X: bits 0-15 */
+#define   NV40TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define  NV40TCL_VIEWPORT_VERT								0x00000a04
+#define   NV40TCL_VIEWPORT_VERT_H_SHIFT							16 /* H: bits 16-31 */
+#define   NV40TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define   NV40TCL_VIEWPORT_VERT_Y_SHIFT							0 /* Y: bits 0-15 */
+#define   NV40TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define  NV40TCL_VIEWPORT_TRANSLATE_X							0x00000a20 /* TRANSLATE X/Y/Z/W are four consecutive words at 0xa20-0xa2c */
+#define  NV40TCL_VIEWPORT_TRANSLATE_Y							0x00000a24
+#define  NV40TCL_VIEWPORT_TRANSLATE_Z							0x00000a28
+#define  NV40TCL_VIEWPORT_TRANSLATE_W							0x00000a2c
+#define  NV40TCL_VIEWPORT_SCALE_X							0x00000a30 /* SCALE X/Y/Z/W follow immediately at 0xa30-0xa3c */
+#define  NV40TCL_VIEWPORT_SCALE_Y							0x00000a34
+#define  NV40TCL_VIEWPORT_SCALE_Z							0x00000a38
+#define  NV40TCL_VIEWPORT_SCALE_W							0x00000a3c
+#define  NV40TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000a60 /* separate enables for point, line and fill at 0xa60-0xa68 */
+#define  NV40TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000a64
+#define  NV40TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000a68
+#define  NV40TCL_DEPTH_FUNC								0x00000a6c
+#define   NV40TCL_DEPTH_FUNC_NEVER							0x00000200 /* values 0x200-0x207 are numerically equal to OpenGL GL_NEVER..GL_ALWAYS */
+#define   NV40TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV40TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV40TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV40TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV40TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV40TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV40TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV40TCL_DEPTH_WRITE_ENABLE							0x00000a70
+#define  NV40TCL_DEPTH_TEST_ENABLE							0x00000a74
+#define  NV40TCL_POLYGON_OFFSET_FACTOR							0x00000a78
+#define  NV40TCL_POLYGON_OFFSET_UNITS							0x00000a7c
+#define  NV40TCL_VTX_ATTR_3I_XY(x)							(0x00000a80+((x)*8))
+#define  NV40TCL_VTX_ATTR_3I_XY__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_3I_XY_X_SHIFT						0
+#define   NV40TCL_VTX_ATTR_3I_XY_X_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_3I_XY_Y_SHIFT						16
+#define   NV40TCL_VTX_ATTR_3I_XY_Y_MASK							0xffff0000
+#define  NV40TCL_VTX_ATTR_3I_Z(x)							(0x00000a84+((x)*8))
+#define  NV40TCL_VTX_ATTR_3I_Z__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_3I_Z_Z_SHIFT							0
+#define   NV40TCL_VTX_ATTR_3I_Z_Z_MASK							0x0000ffff
+#define  NV40TCL_TEX_FILTER_OPTIMIZATION						0x00000b00
+#define   NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_SHIFT				0
+#define   NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_MASK				0x0000001f
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_OFF				0x00000000
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_HIGH_QUALITY			0x00000004
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_QUALITY				0x00000006
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_PERFORMANCE			0x00000008
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_TRILINEAR_HIGH_PERFORMANCE			0x00000018
+#define   NV40TCL_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_SHIFT				6
+#define   NV40TCL_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_MASK				0x000001c0
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_OFF				0x00000000
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_HIGH_QUALITY			0x000000c0
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_QUALITY				0x000001c0
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_PERFORMANCE			0x00000140
+#define   NV40TCL_TEX_FILTER_OPTIMIZATION_UNKNOWN_SHIFT					10
+#define   NV40TCL_TEX_FILTER_OPTIMIZATION_UNKNOWN_MASK					0x00007c00
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_UNKNOWN_OFF					0x00000000
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_UNKNOWN_PARTIAL				0x00002c00
+#define    NV40TCL_TEX_FILTER_OPTIMIZATION_UNKNOWN_FULL					0x00007c00
+#define  NV40TCL_UNK0B40(x)								(0x00000b40+((x)*4))
+#define  NV40TCL_UNK0B40__SIZE								0x00000008
+#define  NV40TCL_VP_UPLOAD_INST(x)							(0x00000b80+((x)*4))
+#define  NV40TCL_VP_UPLOAD_INST__SIZE							0x00000004
+#define  NV40TCL_VERTEX_TWO_SIDE_ENABLE							0x0000142c
+#define  NV40TCL_CLIP_PLANE_ENABLE							0x00001478 /* one flag per plane; flags sit 4 bits apart starting at bit 1 (bits 1, 5, 9, 13, 17, 21), not at consecutive bits */
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE0						(1 <<  1)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE1						(1 <<  5)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE2						(1 <<  9)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE3						(1 << 13)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE4						(1 << 17)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE5						(1 << 21)
+#define  NV40TCL_POLYGON_STIPPLE_ENABLE							0x0000147c
+#define  NV40TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001480+((x)*4))
+#define  NV40TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV40TCL_VTX_ATTR_3F_X(x)							(0x00001500+((x)*16))
+#define  NV40TCL_VTX_ATTR_3F_X__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_3F_Y(x)							(0x00001504+((x)*16))
+#define  NV40TCL_VTX_ATTR_3F_Y__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_3F_Z(x)							(0x00001508+((x)*16))
+#define  NV40TCL_VTX_ATTR_3F_Z__SIZE							0x00000010
+#define  NV40TCL_VTXBUF_ADDRESS(x)							(0x00001680+((x)*4)) /* indexed: element x is at 0x1680 + 4*x; packed word: DMA1 flag in bit 31, OFFSET in bits 27:0 */
+#define  NV40TCL_VTXBUF_ADDRESS__SIZE							0x00000010 /* presumably the number of valid indices for (x) - confirm against the generator */
+#define   NV40TCL_VTXBUF_ADDRESS_DMA1							(1U << 31) /* 1U, not 1: left-shifting a signed int into bit 31 is undefined behaviour in C */
+#define   NV40TCL_VTXBUF_ADDRESS_OFFSET_SHIFT						0
+#define   NV40TCL_VTXBUF_ADDRESS_OFFSET_MASK						0x0fffffff
+#define  NV40TCL_VTX_CACHE_INVALIDATE							0x00001714
+#define  NV40TCL_VTXFMT(x)								(0x00001740+((x)*4))
+#define  NV40TCL_VTXFMT__SIZE								0x00000010
+#define   NV40TCL_VTXFMT_TYPE_SHIFT							0
+#define   NV40TCL_VTXFMT_TYPE_MASK							0x0000000f
+#define    NV40TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV40TCL_VTXFMT_TYPE_UBYTE							0x00000004
+#define    NV40TCL_VTXFMT_TYPE_USHORT							0x00000005
+#define   NV40TCL_VTXFMT_SIZE_SHIFT							4
+#define   NV40TCL_VTXFMT_SIZE_MASK							0x000000f0
+#define   NV40TCL_VTXFMT_STRIDE_SHIFT							8
+#define   NV40TCL_VTXFMT_STRIDE_MASK							0x0000ff00
+#define  NV40TCL_QUERY_RESET								0x000017c8
+#define  NV40TCL_QUERY_UNK17CC								0x000017cc
+#define  NV40TCL_QUERY_GET								0x00001800
+#define   NV40TCL_QUERY_GET_UNK24_SHIFT							24
+#define   NV40TCL_QUERY_GET_UNK24_MASK							0xff000000
+#define   NV40TCL_QUERY_GET_OFFSET_SHIFT						0
+#define   NV40TCL_QUERY_GET_OFFSET_MASK							0x00ffffff
+#define  NV40TCL_BEGIN_END								0x00001808 /* 0 is STOP; primitive codes 1..0xa equal the OpenGL primitive enum plus one (GL_POINTS is 0, GL_POLYGON is 9) */
+#define   NV40TCL_BEGIN_END_STOP							0x00000000
+#define   NV40TCL_BEGIN_END_POINTS							0x00000001
+#define   NV40TCL_BEGIN_END_LINES							0x00000002
+#define   NV40TCL_BEGIN_END_LINE_LOOP							0x00000003
+#define   NV40TCL_BEGIN_END_LINE_STRIP							0x00000004
+#define   NV40TCL_BEGIN_END_TRIANGLES							0x00000005
+#define   NV40TCL_BEGIN_END_TRIANGLE_STRIP						0x00000006
+#define   NV40TCL_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV40TCL_BEGIN_END_QUADS							0x00000008
+#define   NV40TCL_BEGIN_END_QUAD_STRIP							0x00000009
+#define   NV40TCL_BEGIN_END_POLYGON							0x0000000a
+#define  NV40TCL_VB_ELEMENT_U16								0x0000180c
+#define   NV40TCL_VB_ELEMENT_U16_1_SHIFT						16
+#define   NV40TCL_VB_ELEMENT_U16_1_MASK							0xffff0000
+#define   NV40TCL_VB_ELEMENT_U16_0_SHIFT						0
+#define   NV40TCL_VB_ELEMENT_U16_0_MASK							0x0000ffff
+#define  NV40TCL_VB_ELEMENT_U32								0x00001810
+#define  NV40TCL_VB_VERTEX_BATCH							0x00001814
+#define   NV40TCL_VB_VERTEX_BATCH_COUNT_SHIFT						24
+#define   NV40TCL_VB_VERTEX_BATCH_COUNT_MASK						0xff000000
+#define   NV40TCL_VB_VERTEX_BATCH_START_SHIFT						0
+#define   NV40TCL_VB_VERTEX_BATCH_START_MASK						0x00ffffff
+#define  NV40TCL_VERTEX_DATA								0x00001818
+#define  NV40TCL_IDXBUF_ADDRESS								0x0000181c
+#define  NV40TCL_IDXBUF_FORMAT								0x00001820
+#define   NV40TCL_IDXBUF_FORMAT_TYPE_SHIFT						4
+#define   NV40TCL_IDXBUF_FORMAT_TYPE_MASK						0x000000f0
+#define    NV40TCL_IDXBUF_FORMAT_TYPE_U32						0x00000000
+#define    NV40TCL_IDXBUF_FORMAT_TYPE_U16						0x00000010
+#define   NV40TCL_IDXBUF_FORMAT_DMA1							(1 <<  0)
+#define  NV40TCL_VB_INDEX_BATCH								0x00001824
+#define   NV40TCL_VB_INDEX_BATCH_COUNT_SHIFT						24
+#define   NV40TCL_VB_INDEX_BATCH_COUNT_MASK						0xff000000
+#define   NV40TCL_VB_INDEX_BATCH_START_SHIFT						0
+#define   NV40TCL_VB_INDEX_BATCH_START_MASK						0x00ffffff
+#define  NV40TCL_POLYGON_MODE_FRONT							0x00001828 /* values are numerically equal to OpenGL's GL_POINT / GL_LINE / GL_FILL (0x1B00..0x1B02) */
+#define   NV40TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV40TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV40TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV40TCL_POLYGON_MODE_BACK							0x0000182c /* values are numerically equal to OpenGL's GL_POINT / GL_LINE / GL_FILL (0x1B00..0x1B02) */
+#define   NV40TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV40TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV40TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV40TCL_CULL_FACE								0x00001830 /* values are numerically equal to OpenGL's GL_FRONT (0x0404), GL_BACK (0x0405), GL_FRONT_AND_BACK (0x0408) */
+#define   NV40TCL_CULL_FACE_FRONT							0x00000404
+#define   NV40TCL_CULL_FACE_BACK							0x00000405
+#define   NV40TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV40TCL_FRONT_FACE								0x00001834 /* values are numerically equal to OpenGL's GL_CW (0x0900) and GL_CCW (0x0901) */
+#define   NV40TCL_FRONT_FACE_CW								0x00000900
+#define   NV40TCL_FRONT_FACE_CCW							0x00000901
+#define  NV40TCL_POLYGON_SMOOTH_ENABLE							0x00001838
+#define  NV40TCL_CULL_FACE_ENABLE							0x0000183c
+#define  NV40TCL_TEX_SIZE1(x)								(0x00001840+((x)*4))
+#define  NV40TCL_TEX_SIZE1__SIZE							0x00000008
+#define   NV40TCL_TEX_SIZE1_DEPTH_SHIFT							20
+#define   NV40TCL_TEX_SIZE1_DEPTH_MASK							0xfff00000
+#define   NV40TCL_TEX_SIZE1_PITCH_SHIFT							0
+#define   NV40TCL_TEX_SIZE1_PITCH_MASK							0x0000ffff
+#define  NV40TCL_VTX_ATTR_2F_X(x)							(0x00001880+((x)*8))
+#define  NV40TCL_VTX_ATTR_2F_X__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_2F_Y(x)							(0x00001884+((x)*8))
+#define  NV40TCL_VTX_ATTR_2F_Y__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_2I(x)								(0x00001900+((x)*4))
+#define  NV40TCL_VTX_ATTR_2I__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_2I_X_SHIFT							0
+#define   NV40TCL_VTX_ATTR_2I_X_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_2I_Y_SHIFT							16
+#define   NV40TCL_VTX_ATTR_2I_Y_MASK							0xffff0000
+#define  NV40TCL_VTX_ATTR_4UB(x)							(0x00001940+((x)*4))
+#define  NV40TCL_VTX_ATTR_4UB__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_4UB_X_SHIFT							0
+#define   NV40TCL_VTX_ATTR_4UB_X_MASK							0x000000ff
+#define   NV40TCL_VTX_ATTR_4UB_Y_SHIFT							8
+#define   NV40TCL_VTX_ATTR_4UB_Y_MASK							0x0000ff00
+#define   NV40TCL_VTX_ATTR_4UB_Z_SHIFT							16
+#define   NV40TCL_VTX_ATTR_4UB_Z_MASK							0x00ff0000
+#define   NV40TCL_VTX_ATTR_4UB_W_SHIFT							24
+#define   NV40TCL_VTX_ATTR_4UB_W_MASK							0xff000000
+#define  NV40TCL_VTX_ATTR_4I_XY(x)							(0x00001980+((x)*8))
+#define  NV40TCL_VTX_ATTR_4I_XY__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_4I_XY_X_SHIFT						0
+#define   NV40TCL_VTX_ATTR_4I_XY_X_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_4I_XY_Y_SHIFT						16
+#define   NV40TCL_VTX_ATTR_4I_XY_Y_MASK							0xffff0000
+#define  NV40TCL_VTX_ATTR_4I_ZW(x)							(0x00001984+((x)*8))
+#define  NV40TCL_VTX_ATTR_4I_ZW__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_4I_ZW_Z_SHIFT						0
+#define   NV40TCL_VTX_ATTR_4I_ZW_Z_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_4I_ZW_W_SHIFT						16
+#define   NV40TCL_VTX_ATTR_4I_ZW_W_MASK							0xffff0000
+#define  NV40TCL_TEX_OFFSET(x)								(0x00001a00+((x)*32))
+#define  NV40TCL_TEX_OFFSET__SIZE							0x00000010
+#define  NV40TCL_TEX_FORMAT(x)								(0x00001a04+((x)*32))
+#define  NV40TCL_TEX_FORMAT__SIZE							0x00000010
+#define   NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT						16
+#define   NV40TCL_TEX_FORMAT_MIPMAP_COUNT_MASK						0x000f0000
+#define   NV40TCL_TEX_FORMAT_RECT							(1 << 14)
+#define   NV40TCL_TEX_FORMAT_LINEAR							(1 << 13)
+#define   NV40TCL_TEX_FORMAT_FORMAT_SHIFT						8
+#define   NV40TCL_TEX_FORMAT_FORMAT_MASK						0x00001f00
+#define    NV40TCL_TEX_FORMAT_FORMAT_L8							0x00000100
+#define    NV40TCL_TEX_FORMAT_FORMAT_A1R5G5B5						0x00000200
+#define    NV40TCL_TEX_FORMAT_FORMAT_A4R4G4B4						0x00000300
+#define    NV40TCL_TEX_FORMAT_FORMAT_R5G6B5						0x00000400
+#define    NV40TCL_TEX_FORMAT_FORMAT_A8R8G8B8						0x00000500
+#define    NV40TCL_TEX_FORMAT_FORMAT_DXT1						0x00000600
+#define    NV40TCL_TEX_FORMAT_FORMAT_DXT3						0x00000700
+#define    NV40TCL_TEX_FORMAT_FORMAT_DXT5						0x00000800
+#define    NV40TCL_TEX_FORMAT_FORMAT_A8L8						0x00000b00
+#define    NV40TCL_TEX_FORMAT_FORMAT_Z24						0x00001000
+#define    NV40TCL_TEX_FORMAT_FORMAT_Z16						0x00001200
+#define    NV40TCL_TEX_FORMAT_FORMAT_A16						0x00001400
+#define    NV40TCL_TEX_FORMAT_FORMAT_A16L16						0x00001500
+#define    NV40TCL_TEX_FORMAT_FORMAT_HILO8						0x00001800
+#define    NV40TCL_TEX_FORMAT_FORMAT_RGBA16F						0x00001a00
+#define    NV40TCL_TEX_FORMAT_FORMAT_RGBA32F						0x00001b00
+#define   NV40TCL_TEX_FORMAT_DIMS_SHIFT							4
+#define   NV40TCL_TEX_FORMAT_DIMS_MASK							0x000000f0
+#define    NV40TCL_TEX_FORMAT_DIMS_1D							0x00000010
+#define    NV40TCL_TEX_FORMAT_DIMS_2D							0x00000020
+#define    NV40TCL_TEX_FORMAT_DIMS_3D							0x00000030
+#define   NV40TCL_TEX_FORMAT_NO_BORDER							(1 <<  3)
+#define   NV40TCL_TEX_FORMAT_CUBIC							(1 <<  2)
+#define   NV40TCL_TEX_FORMAT_DMA1							(1 <<  1)
+#define   NV40TCL_TEX_FORMAT_DMA0							(1 <<  0)
+#define  NV40TCL_TEX_WRAP(x)								(0x00001a08+((x)*32))
+#define  NV40TCL_TEX_WRAP__SIZE								0x00000010
+#define   NV40TCL_TEX_WRAP_S_SHIFT							0
+#define   NV40TCL_TEX_WRAP_S_MASK							0x0000000f
+#define    NV40TCL_TEX_WRAP_S_REPEAT							0x00000001
+#define    NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT						0x00000002
+#define    NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE						0x00000003
+#define    NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER						0x00000004
+#define    NV40TCL_TEX_WRAP_S_CLAMP							0x00000005
+#define    NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE					0x00000006
+#define    NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER					0x00000007
+#define    NV40TCL_TEX_WRAP_S_MIRROR_CLAMP						0x00000008
+#define   NV40TCL_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_SHIFT				4
+#define   NV40TCL_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_MASK				0x00000070
+#define    NV40TCL_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF				0x00000000
+#define    NV40TCL_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_QUALITY			0x00000020
+#define    NV40TCL_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_PERFORMANCE			0x00000030
+#define    NV40TCL_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_HIGH_PERFORMANCE		0x00000070
+#define   NV40TCL_TEX_WRAP_T_SHIFT							8
+#define   NV40TCL_TEX_WRAP_T_MASK							0x00000f00
+#define    NV40TCL_TEX_WRAP_T_REPEAT							0x00000100
+#define    NV40TCL_TEX_WRAP_T_MIRRORED_REPEAT						0x00000200
+#define    NV40TCL_TEX_WRAP_T_CLAMP_TO_EDGE						0x00000300
+#define    NV40TCL_TEX_WRAP_T_CLAMP_TO_BORDER						0x00000400
+#define    NV40TCL_TEX_WRAP_T_CLAMP							0x00000500
+#define    NV40TCL_TEX_WRAP_T_MIRROR_CLAMP_TO_EDGE					0x00000600
+#define    NV40TCL_TEX_WRAP_T_MIRROR_CLAMP_TO_BORDER					0x00000700
+#define    NV40TCL_TEX_WRAP_T_MIRROR_CLAMP						0x00000800
+#define   NV40TCL_TEX_WRAP_EXPAND_NORMAL_SHIFT						12
+#define   NV40TCL_TEX_WRAP_EXPAND_NORMAL_MASK						0x0000f000
+#define   NV40TCL_TEX_WRAP_R_SHIFT							16
+#define   NV40TCL_TEX_WRAP_R_MASK							0x000f0000
+#define    NV40TCL_TEX_WRAP_R_REPEAT							0x00010000
+#define    NV40TCL_TEX_WRAP_R_MIRRORED_REPEAT						0x00020000
+#define    NV40TCL_TEX_WRAP_R_CLAMP_TO_EDGE						0x00030000
+#define    NV40TCL_TEX_WRAP_R_CLAMP_TO_BORDER						0x00040000
+#define    NV40TCL_TEX_WRAP_R_CLAMP							0x00050000
+#define    NV40TCL_TEX_WRAP_R_MIRROR_CLAMP_TO_EDGE					0x00060000
+#define    NV40TCL_TEX_WRAP_R_MIRROR_CLAMP_TO_BORDER					0x00070000
+#define    NV40TCL_TEX_WRAP_R_MIRROR_CLAMP						0x00080000
+#define   NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_SHIFT					20
+#define   NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_MASK					0x00f00000
+#define    NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_NONE					0x00000000
+#define    NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_RED					0x00100000
+#define    NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_GREEN					0x00200000
+#define    NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_BLUE					0x00400000
+#define    NV40TCL_TEX_WRAP_GAMMA_DECREASE_FILTER_ALL					0x00f00000
+#define   NV40TCL_TEX_WRAP_RCOMP_SHIFT							28
+#define   NV40TCL_TEX_WRAP_RCOMP_MASK							0xf0000000
+#define    NV40TCL_TEX_WRAP_RCOMP_NEVER							0x00000000
+#define    NV40TCL_TEX_WRAP_RCOMP_GREATER						0x10000000
+#define    NV40TCL_TEX_WRAP_RCOMP_EQUAL							0x20000000
+#define    NV40TCL_TEX_WRAP_RCOMP_GEQUAL						0x30000000
+#define    NV40TCL_TEX_WRAP_RCOMP_LESS							0x40000000
+#define    NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL						0x50000000
+#define    NV40TCL_TEX_WRAP_RCOMP_LEQUAL						0x60000000
+#define    NV40TCL_TEX_WRAP_RCOMP_ALWAYS						0x70000000
+#define  NV40TCL_TEX_ENABLE(x)								(0x00001a0c+((x)*32)) /* indexed: element x is at 0x1a0c + 32*x; packed word: ENABLE bit 31, MIN_LOD bits 29:27, MAX_LOD bits 17:15, ANISO bits 7:4 */
+#define  NV40TCL_TEX_ENABLE__SIZE							0x00000010 /* presumably the number of valid indices for (x) - confirm against the generator */
+#define   NV40TCL_TEX_ENABLE_ENABLE							(1U << 31) /* 1U, not 1: left-shifting a signed int into bit 31 is undefined behaviour in C */
+#define   NV40TCL_TEX_ENABLE_MIPMAP_MIN_LOD_SHIFT					27
+#define   NV40TCL_TEX_ENABLE_MIPMAP_MIN_LOD_MASK					0x38000000
+#define   NV40TCL_TEX_ENABLE_MIPMAP_MAX_LOD_SHIFT					15
+#define   NV40TCL_TEX_ENABLE_MIPMAP_MAX_LOD_MASK					0x00038000
+#define   NV40TCL_TEX_ENABLE_ANISO_SHIFT						4
+#define   NV40TCL_TEX_ENABLE_ANISO_MASK							0x000000f0
+#define    NV40TCL_TEX_ENABLE_ANISO_NONE						0x00000000 /* ANISO_* values are already shifted into bits 7:4; OR them in directly */
+#define    NV40TCL_TEX_ENABLE_ANISO_2X							0x00000010
+#define    NV40TCL_TEX_ENABLE_ANISO_4X							0x00000020
+#define    NV40TCL_TEX_ENABLE_ANISO_6X							0x00000030
+#define    NV40TCL_TEX_ENABLE_ANISO_8X							0x00000040
+#define    NV40TCL_TEX_ENABLE_ANISO_10X							0x00000050
+#define    NV40TCL_TEX_ENABLE_ANISO_12X							0x00000060
+#define    NV40TCL_TEX_ENABLE_ANISO_16X							0x00000070 /* no 14X entry is defined: 16X directly follows 12X */
+#define  NV40TCL_TEX_SWIZZLE(x)								(0x00001a10+((x)*32)) /* indexed: element x is at 0x1a10 + 32*x; eight 2-bit fields: S0_X/Y/Z/W in bits 15:8, S1_X/Y/Z/W in bits 7:0 */
+#define  NV40TCL_TEX_SWIZZLE__SIZE							0x00000010
+#define   NV40TCL_TEX_SWIZZLE_S0_X_SHIFT						14
+#define   NV40TCL_TEX_SWIZZLE_S0_X_MASK							0x0000c000
+#define    NV40TCL_TEX_SWIZZLE_S0_X_ZERO						0x00000000 /* every S0_* field encodes ZERO=0, ONE=1, S1=2 (shown here pre-shifted) */
+#define    NV40TCL_TEX_SWIZZLE_S0_X_ONE							0x00004000
+#define    NV40TCL_TEX_SWIZZLE_S0_X_S1							0x00008000
+#define   NV40TCL_TEX_SWIZZLE_S0_Y_SHIFT						12
+#define   NV40TCL_TEX_SWIZZLE_S0_Y_MASK							0x00003000
+#define    NV40TCL_TEX_SWIZZLE_S0_Y_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_Y_ONE							0x00001000
+#define    NV40TCL_TEX_SWIZZLE_S0_Y_S1							0x00002000
+#define   NV40TCL_TEX_SWIZZLE_S0_Z_SHIFT						10
+#define   NV40TCL_TEX_SWIZZLE_S0_Z_MASK							0x00000c00
+#define    NV40TCL_TEX_SWIZZLE_S0_Z_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_Z_ONE							0x00000400
+#define    NV40TCL_TEX_SWIZZLE_S0_Z_S1							0x00000800
+#define   NV40TCL_TEX_SWIZZLE_S0_W_SHIFT						8
+#define   NV40TCL_TEX_SWIZZLE_S0_W_MASK							0x00000300
+#define    NV40TCL_TEX_SWIZZLE_S0_W_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_W_ONE							0x00000100
+#define    NV40TCL_TEX_SWIZZLE_S0_W_S1							0x00000200
+#define   NV40TCL_TEX_SWIZZLE_S1_X_SHIFT						6
+#define   NV40TCL_TEX_SWIZZLE_S1_X_MASK							0x000000c0
+#define    NV40TCL_TEX_SWIZZLE_S1_X_W							0x00000000 /* every S1_* field encodes components in reverse order: W=0, Z=1, Y=2, X=3 */
+#define    NV40TCL_TEX_SWIZZLE_S1_X_Z							0x00000040
+#define    NV40TCL_TEX_SWIZZLE_S1_X_Y							0x00000080
+#define    NV40TCL_TEX_SWIZZLE_S1_X_X							0x000000c0
+#define   NV40TCL_TEX_SWIZZLE_S1_Y_SHIFT						4
+#define   NV40TCL_TEX_SWIZZLE_S1_Y_MASK							0x00000030
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_Z							0x00000010
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_Y							0x00000020
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_X							0x00000030
+#define   NV40TCL_TEX_SWIZZLE_S1_Z_SHIFT						2
+#define   NV40TCL_TEX_SWIZZLE_S1_Z_MASK							0x0000000c
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_Z							0x00000004
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_Y							0x00000008
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_X							0x0000000c
+#define   NV40TCL_TEX_SWIZZLE_S1_W_SHIFT						0
+#define   NV40TCL_TEX_SWIZZLE_S1_W_MASK							0x00000003
+#define    NV40TCL_TEX_SWIZZLE_S1_W_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_W_Z							0x00000001
+#define    NV40TCL_TEX_SWIZZLE_S1_W_Y							0x00000002
+#define    NV40TCL_TEX_SWIZZLE_S1_W_X							0x00000003
+#define  NV40TCL_TEX_FILTER(x)								(0x00001a14+((x)*32)) /* indexed: element x is at 0x1a14 + 32*x; packed word: SIGNED_* flags in bits 31:28, MAG in bits 27:24, MIN in bits 19:16 */
+#define  NV40TCL_TEX_FILTER__SIZE							0x00000010 /* presumably the number of valid indices for (x) - confirm against the generator */
+#define   NV40TCL_TEX_FILTER_SIGNED_ALPHA						(1U << 31) /* 1U, not 1: left-shifting a signed int into bit 31 is undefined behaviour in C */
+#define   NV40TCL_TEX_FILTER_SIGNED_RED							(1 << 30)
+#define   NV40TCL_TEX_FILTER_SIGNED_GREEN						(1 << 29)
+#define   NV40TCL_TEX_FILTER_SIGNED_BLUE						(1 << 28)
+#define   NV40TCL_TEX_FILTER_MIN_SHIFT							16
+#define   NV40TCL_TEX_FILTER_MIN_MASK							0x000f0000
+#define    NV40TCL_TEX_FILTER_MIN_NEAREST						0x00010000 /* MIN_* values are pre-shifted; field codes run 1..6, 0 is not named */
+#define    NV40TCL_TEX_FILTER_MIN_LINEAR						0x00020000
+#define    NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST				0x00030000
+#define    NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST					0x00040000
+#define    NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR					0x00050000
+#define    NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR					0x00060000
+#define   NV40TCL_TEX_FILTER_MAG_SHIFT							24
+#define   NV40TCL_TEX_FILTER_MAG_MASK							0x0f000000
+#define    NV40TCL_TEX_FILTER_MAG_NEAREST						0x01000000
+#define    NV40TCL_TEX_FILTER_MAG_LINEAR						0x02000000
+#define  NV40TCL_TEX_SIZE0(x)								(0x00001a18+((x)*32))
+#define  NV40TCL_TEX_SIZE0__SIZE							0x00000010
+#define   NV40TCL_TEX_SIZE0_H_SHIFT							0
+#define   NV40TCL_TEX_SIZE0_H_MASK							0x0000ffff
+#define   NV40TCL_TEX_SIZE0_W_SHIFT							16
+#define   NV40TCL_TEX_SIZE0_W_MASK							0xffff0000
+#define  NV40TCL_TEX_BORDER_COLOR(x)							(0x00001a1c+((x)*32))
+#define  NV40TCL_TEX_BORDER_COLOR__SIZE							0x00000010
+#define   NV40TCL_TEX_BORDER_COLOR_B_SHIFT						0
+#define   NV40TCL_TEX_BORDER_COLOR_B_MASK						0x000000ff
+#define   NV40TCL_TEX_BORDER_COLOR_G_SHIFT						8
+#define   NV40TCL_TEX_BORDER_COLOR_G_MASK						0x0000ff00
+#define   NV40TCL_TEX_BORDER_COLOR_R_SHIFT						16
+#define   NV40TCL_TEX_BORDER_COLOR_R_MASK						0x00ff0000
+#define   NV40TCL_TEX_BORDER_COLOR_A_SHIFT						24
+#define   NV40TCL_TEX_BORDER_COLOR_A_MASK						0xff000000
+#define  NV40TCL_VTX_ATTR_4F_X(x)							(0x00001c00+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_X__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_4F_Y(x)							(0x00001c04+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_4F_Z(x)							(0x00001c08+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_4F_W(x)							(0x00001c0c+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV40TCL_FP_CONTROL								0x00001d60
+#define   NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT						24
+#define   NV40TCL_FP_CONTROL_TEMP_COUNT_MASK						0xff000000
+#define   NV40TCL_FP_CONTROL_KIL							(1 <<  7)
+#define  NV40TCL_MULTISAMPLE_CONTROL							0x00001d7c
+#define  NV40TCL_CLEAR_VALUE_DEPTH							0x00001d8c
+#define  NV40TCL_CLEAR_VALUE_COLOR							0x00001d90
+#define   NV40TCL_CLEAR_VALUE_COLOR_B_SHIFT						0
+#define   NV40TCL_CLEAR_VALUE_COLOR_B_MASK						0x000000ff
+#define   NV40TCL_CLEAR_VALUE_COLOR_G_SHIFT						8
+#define   NV40TCL_CLEAR_VALUE_COLOR_G_MASK						0x0000ff00
+#define   NV40TCL_CLEAR_VALUE_COLOR_R_SHIFT						16
+#define   NV40TCL_CLEAR_VALUE_COLOR_R_MASK						0x00ff0000
+#define   NV40TCL_CLEAR_VALUE_COLOR_A_SHIFT						24
+#define   NV40TCL_CLEAR_VALUE_COLOR_A_MASK						0xff000000
+#define  NV40TCL_CLEAR_BUFFERS								0x00001d94 /* single-bit flags: colour channels A,B,G,R in bits 7..4, STENCIL in bit 1, DEPTH in bit 0; bits 3:2 are not named here */
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_A							(1 <<  7)
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_B							(1 <<  6)
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_G							(1 <<  5)
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_R							(1 <<  4)
+#define   NV40TCL_CLEAR_BUFFERS_STENCIL							(1 <<  1)
+#define   NV40TCL_CLEAR_BUFFERS_DEPTH							(1 <<  0)
+#define  NV40TCL_LINE_STIPPLE_ENABLE							0x00001db4
+#define  NV40TCL_LINE_STIPPLE_PATTERN							0x00001db8
+#define   NV40TCL_LINE_STIPPLE_PATTERN_FACTOR_SHIFT					0
+#define   NV40TCL_LINE_STIPPLE_PATTERN_FACTOR_MASK					0x0000ffff
+#define   NV40TCL_LINE_STIPPLE_PATTERN_PATTERN_SHIFT					16
+#define   NV40TCL_LINE_STIPPLE_PATTERN_PATTERN_MASK					0xffff0000
+#define  NV40TCL_VTX_ATTR_1F(x)								(0x00001e40+((x)*4))
+#define  NV40TCL_VTX_ATTR_1F__SIZE							0x00000010
+#define  NV40TCL_VP_UPLOAD_FROM_ID							0x00001e9c
+#define  NV40TCL_VP_START_FROM_ID							0x00001ea0
+#define  NV40TCL_POINT_SIZE								0x00001ee0
+#define  NV40TCL_POINT_SPRITE								0x00001ee8 /* packed word: ENABLE in bit 0, R_MODE in bits 2:1, COORD_REPLACE_n flag in bit 8+n for n = 0..7 */
+#define   NV40TCL_POINT_SPRITE_ENABLE							(1 <<  0)
+#define   NV40TCL_POINT_SPRITE_R_MODE_SHIFT						1
+#define   NV40TCL_POINT_SPRITE_R_MODE_MASK						0x00000006
+#define    NV40TCL_POINT_SPRITE_R_MODE_ZERO						0x00000000 /* R_MODE_* values are pre-shifted into bits 2:1 */
+#define    NV40TCL_POINT_SPRITE_R_MODE_R						0x00000002
+#define    NV40TCL_POINT_SPRITE_R_MODE_S						0x00000004
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_0						(1 <<  8)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_1						(1 <<  9)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_2						(1 << 10)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_3						(1 << 11)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_4						(1 << 12)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_5						(1 << 13)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_6						(1 << 14)
+#define   NV40TCL_POINT_SPRITE_COORD_REPLACE_7						(1 << 15)
+#define  NV40TCL_VP_UPLOAD_CONST_ID							0x00001efc
+#define  NV40TCL_VP_UPLOAD_CONST_X(x)							(0x00001f00+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_X__SIZE						0x00000004
+#define  NV40TCL_VP_UPLOAD_CONST_Y(x)							(0x00001f04+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_Y__SIZE						0x00000004
+#define  NV40TCL_VP_UPLOAD_CONST_Z(x)							(0x00001f08+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_Z__SIZE						0x00000004
+#define  NV40TCL_VP_UPLOAD_CONST_W(x)							(0x00001f0c+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_W__SIZE						0x00000004
+#define  NV40TCL_TEX_CACHE_CTL								0x00001fd8
+#define  NV40TCL_VP_ATTRIB_EN								0x00001ff0
+#define  NV40TCL_VP_RESULT_EN								0x00001ff4
+
+
+#define NV44TCL										0x00004497
+
+
+
+#define NV50_2D										0x0000502d
+
+#define  NV50_2D_NOP									0x00000100
+#define  NV50_2D_NOTIFY									0x00000104
+#define  NV50_2D_SERIALIZE								0x00000110
+#define  NV50_2D_DMA_NOTIFY								0x00000180
+#define  NV50_2D_DMA_DST								0x00000184
+#define  NV50_2D_DMA_SRC								0x00000188
+#define  NV50_2D_DMA_COND								0x0000018c
+#define  NV50_2D_DST_FORMAT								0x00000200 /* format codes 0xc0..0xfa follow (with gaps); the NV50_2D_SRC_FORMAT_* list repeats the same suffix/value pairs */
+#define   NV50_2D_DST_FORMAT_R32G32B32A32_FLOAT						0x000000c0
+#define   NV50_2D_DST_FORMAT_R32G32B32A32_SINT						0x000000c1
+#define   NV50_2D_DST_FORMAT_R32G32B32A32_UINT						0x000000c2
+#define   NV50_2D_DST_FORMAT_R32G32B32X32_FLOAT						0x000000c3
+#define   NV50_2D_DST_FORMAT_R16G16B16A16_UNORM						0x000000c6
+#define   NV50_2D_DST_FORMAT_R16G16B16A16_SNORM						0x000000c7
+#define   NV50_2D_DST_FORMAT_R16G16B16A16_SINT						0x000000c8
+#define   NV50_2D_DST_FORMAT_R16G16B16A16_UINT						0x000000c9
+#define   NV50_2D_DST_FORMAT_R16G16B16A16_FLOAT						0x000000ca
+#define   NV50_2D_DST_FORMAT_R32G32_FLOAT						0x000000cb
+#define   NV50_2D_DST_FORMAT_R32G32_SINT						0x000000cc
+#define   NV50_2D_DST_FORMAT_R32G32_UINT						0x000000cd
+#define   NV50_2D_DST_FORMAT_R16G16B16X16_FLOAT						0x000000ce
+#define   NV50_2D_DST_FORMAT_A8R8G8B8_UNORM						0x000000cf
+#define   NV50_2D_DST_FORMAT_A8R8G8B8_SRGB						0x000000d0
+#define   NV50_2D_DST_FORMAT_A2B10G10R10_UNORM						0x000000d1
+#define   NV50_2D_DST_FORMAT_A2B10G10R10_UINT						0x000000d2
+#define   NV50_2D_DST_FORMAT_A8B8G8R8_UNORM						0x000000d5
+#define   NV50_2D_DST_FORMAT_A8B8G8R8_SRGB						0x000000d6
+#define   NV50_2D_DST_FORMAT_A8B8G8R8_SNORM						0x000000d7
+#define   NV50_2D_DST_FORMAT_A8B8G8R8_SINT						0x000000d8
+#define   NV50_2D_DST_FORMAT_A8B8G8R8_UINT						0x000000d9
+#define   NV50_2D_DST_FORMAT_R16G16_UNORM						0x000000da
+#define   NV50_2D_DST_FORMAT_R16G16_SNORM						0x000000db
+#define   NV50_2D_DST_FORMAT_R16G16_SINT						0x000000dc
+#define   NV50_2D_DST_FORMAT_R16G16_UINT						0x000000dd
+#define   NV50_2D_DST_FORMAT_R16G16_FLOAT						0x000000de
+#define   NV50_2D_DST_FORMAT_A2R10G10B10_UNORM						0x000000df
+#define   NV50_2D_DST_FORMAT_B10G11R11_FLOAT						0x000000e0
+#define   NV50_2D_DST_FORMAT_R32_FLOAT							0x000000e5
+#define   NV50_2D_DST_FORMAT_X8R8G8B8_UNORM						0x000000e6
+#define   NV50_2D_DST_FORMAT_X8R8G8B8_SRGB						0x000000e7
+#define   NV50_2D_DST_FORMAT_R5G6B5_UNORM						0x000000e8
+#define   NV50_2D_DST_FORMAT_A1R5G5B5_UNORM						0x000000e9
+#define   NV50_2D_DST_FORMAT_R8G8_UNORM							0x000000ea
+#define   NV50_2D_DST_FORMAT_R8G8_SNORM							0x000000eb
+#define   NV50_2D_DST_FORMAT_R8G8_SINT							0x000000ec
+#define   NV50_2D_DST_FORMAT_R8G8_UINT							0x000000ed
+#define   NV50_2D_DST_FORMAT_R16_UNORM							0x000000ee
+#define   NV50_2D_DST_FORMAT_R16_SNORM							0x000000ef
+#define   NV50_2D_DST_FORMAT_R16_SINT							0x000000f0
+#define   NV50_2D_DST_FORMAT_R16_UINT							0x000000f1
+#define   NV50_2D_DST_FORMAT_R16_FLOAT							0x000000f2
+#define   NV50_2D_DST_FORMAT_R8_UNORM							0x000000f3
+#define   NV50_2D_DST_FORMAT_R8_SNORM							0x000000f4
+#define   NV50_2D_DST_FORMAT_R8_SINT							0x000000f5
+#define   NV50_2D_DST_FORMAT_R8_UINT							0x000000f6
+#define   NV50_2D_DST_FORMAT_A8_UNORM							0x000000f7
+#define   NV50_2D_DST_FORMAT_X1R5G5B5_UNORM						0x000000f8
+#define   NV50_2D_DST_FORMAT_X8B8G8R8_UNORM						0x000000f9
+#define   NV50_2D_DST_FORMAT_X8B8G8R8_SRGB						0x000000fa
+#define  NV50_2D_DST_LINEAR								0x00000204
+#define  NV50_2D_DST_TILE_MODE								0x00000208
+#define  NV50_2D_DST_DEPTH								0x0000020c
+#define  NV50_2D_DST_LAYER								0x00000210
+#define  NV50_2D_DST_PITCH								0x00000214
+#define  NV50_2D_DST_WIDTH								0x00000218
+#define  NV50_2D_DST_HEIGHT								0x0000021c
+#define  NV50_2D_DST_ADDRESS_HIGH							0x00000220
+#define  NV50_2D_DST_ADDRESS_LOW							0x00000224
+#define  NV50_2D_SRC_FORMAT								0x00000230 /* format codes 0xc0..0xfa follow (with gaps); same suffix/value pairs as the NV50_2D_DST_FORMAT_* list */
+#define   NV50_2D_SRC_FORMAT_R32G32B32A32_FLOAT						0x000000c0
+#define   NV50_2D_SRC_FORMAT_R32G32B32A32_SINT						0x000000c1
+#define   NV50_2D_SRC_FORMAT_R32G32B32A32_UINT						0x000000c2
+#define   NV50_2D_SRC_FORMAT_R32G32B32X32_FLOAT						0x000000c3
+#define   NV50_2D_SRC_FORMAT_R16G16B16A16_UNORM						0x000000c6
+#define   NV50_2D_SRC_FORMAT_R16G16B16A16_SNORM						0x000000c7
+#define   NV50_2D_SRC_FORMAT_R16G16B16A16_SINT						0x000000c8
+#define   NV50_2D_SRC_FORMAT_R16G16B16A16_UINT						0x000000c9
+#define   NV50_2D_SRC_FORMAT_R16G16B16A16_FLOAT						0x000000ca
+#define   NV50_2D_SRC_FORMAT_R32G32_FLOAT						0x000000cb
+#define   NV50_2D_SRC_FORMAT_R32G32_SINT						0x000000cc
+#define   NV50_2D_SRC_FORMAT_R32G32_UINT						0x000000cd
+#define   NV50_2D_SRC_FORMAT_R16G16B16X16_FLOAT						0x000000ce
+#define   NV50_2D_SRC_FORMAT_A8R8G8B8_UNORM						0x000000cf
+#define   NV50_2D_SRC_FORMAT_A8R8G8B8_SRGB						0x000000d0
+#define   NV50_2D_SRC_FORMAT_A2B10G10R10_UNORM						0x000000d1
+#define   NV50_2D_SRC_FORMAT_A2B10G10R10_UINT						0x000000d2
+#define   NV50_2D_SRC_FORMAT_A8B8G8R8_UNORM						0x000000d5
+#define   NV50_2D_SRC_FORMAT_A8B8G8R8_SRGB						0x000000d6
+#define   NV50_2D_SRC_FORMAT_A8B8G8R8_SNORM						0x000000d7
+#define   NV50_2D_SRC_FORMAT_A8B8G8R8_SINT						0x000000d8
+#define   NV50_2D_SRC_FORMAT_A8B8G8R8_UINT						0x000000d9
+#define   NV50_2D_SRC_FORMAT_R16G16_UNORM						0x000000da
+#define   NV50_2D_SRC_FORMAT_R16G16_SNORM						0x000000db
+#define   NV50_2D_SRC_FORMAT_R16G16_SINT						0x000000dc
+#define   NV50_2D_SRC_FORMAT_R16G16_UINT						0x000000dd
+#define   NV50_2D_SRC_FORMAT_R16G16_FLOAT						0x000000de
+#define   NV50_2D_SRC_FORMAT_A2R10G10B10_UNORM						0x000000df
+#define   NV50_2D_SRC_FORMAT_B10G11R11_FLOAT						0x000000e0
+#define   NV50_2D_SRC_FORMAT_R32_FLOAT							0x000000e5
+#define   NV50_2D_SRC_FORMAT_X8R8G8B8_UNORM						0x000000e6
+#define   NV50_2D_SRC_FORMAT_X8R8G8B8_SRGB						0x000000e7
+#define   NV50_2D_SRC_FORMAT_R5G6B5_UNORM						0x000000e8
+#define   NV50_2D_SRC_FORMAT_A1R5G5B5_UNORM						0x000000e9
+#define   NV50_2D_SRC_FORMAT_R8G8_UNORM							0x000000ea
+#define   NV50_2D_SRC_FORMAT_R8G8_SNORM							0x000000eb
+#define   NV50_2D_SRC_FORMAT_R8G8_SINT							0x000000ec
+#define   NV50_2D_SRC_FORMAT_R8G8_UINT							0x000000ed
+#define   NV50_2D_SRC_FORMAT_R16_UNORM							0x000000ee
+#define   NV50_2D_SRC_FORMAT_R16_SNORM							0x000000ef
+#define   NV50_2D_SRC_FORMAT_R16_SINT							0x000000f0
+#define   NV50_2D_SRC_FORMAT_R16_UINT							0x000000f1
+#define   NV50_2D_SRC_FORMAT_R16_FLOAT							0x000000f2
+#define   NV50_2D_SRC_FORMAT_R8_UNORM							0x000000f3
+#define   NV50_2D_SRC_FORMAT_R8_SNORM							0x000000f4
+#define   NV50_2D_SRC_FORMAT_R8_SINT							0x000000f5
+#define   NV50_2D_SRC_FORMAT_R8_UINT							0x000000f6
+#define   NV50_2D_SRC_FORMAT_A8_UNORM							0x000000f7
+#define   NV50_2D_SRC_FORMAT_X1R5G5B5_UNORM						0x000000f8
+#define   NV50_2D_SRC_FORMAT_X8B8G8R8_UNORM						0x000000f9
+#define   NV50_2D_SRC_FORMAT_X8B8G8R8_SRGB						0x000000fa
+#define  NV50_2D_SRC_LINEAR								0x00000234 /* 0x234..0x254: nine method offsets, each 4 above the previous */
+#define  NV50_2D_SRC_TILE_MODE								0x00000238
+#define  NV50_2D_SRC_DEPTH								0x0000023c
+#define  NV50_2D_SRC_LAYER								0x00000240
+#define  NV50_2D_SRC_PITCH								0x00000244
+#define  NV50_2D_SRC_WIDTH								0x00000248
+#define  NV50_2D_SRC_HEIGHT								0x0000024c
+#define  NV50_2D_SRC_ADDRESS_HIGH							0x00000250 /* presumably the upper bits of one address whose low 32 bits go to _LOW -- TODO confirm */
+#define  NV50_2D_SRC_ADDRESS_LOW							0x00000254
+#define  NV50_2D_COND_ADDRESS_HIGH							0x00000264 /* HIGH/LOW are adjacent offsets; presumably two halves of one address -- TODO confirm */
+#define  NV50_2D_COND_ADDRESS_LOW							0x00000268
+#define  NV50_2D_COND_MODE								0x0000026c /* the five defines below enumerate values 0..4 under this name */
+#define   NV50_2D_COND_MODE_NEVER							0x00000000
+#define   NV50_2D_COND_MODE_ALWAYS							0x00000001
+#define   NV50_2D_COND_MODE_RES								0x00000002
+#define   NV50_2D_COND_MODE_NOT_RES_AND_NOT_ID						0x00000003
+#define   NV50_2D_COND_MODE_RES_OR_ID							0x00000004
+#define  NV50_2D_CLIP_X									0x00000280 /* X, Y, W, H, ENABLE: five consecutive offsets 0x280..0x290 */
+#define  NV50_2D_CLIP_Y									0x00000284
+#define  NV50_2D_CLIP_W									0x00000288
+#define  NV50_2D_CLIP_H									0x0000028c
+#define  NV50_2D_CLIP_ENABLE								0x00000290
+#define  NV50_2D_COLOR_KEY_FORMAT							0x00000294 /* enumerated values 0..6 follow; note they are not ordered by bit depth */
+#define   NV50_2D_COLOR_KEY_FORMAT_16BPP						0x00000000
+#define   NV50_2D_COLOR_KEY_FORMAT_15BPP						0x00000001
+#define   NV50_2D_COLOR_KEY_FORMAT_24BPP						0x00000002
+#define   NV50_2D_COLOR_KEY_FORMAT_30BPP						0x00000003
+#define   NV50_2D_COLOR_KEY_FORMAT_8BPP							0x00000004
+#define   NV50_2D_COLOR_KEY_FORMAT_16BPP2						0x00000005 /* NOTE(review): how this differs from _16BPP (0) is not visible here */
+#define   NV50_2D_COLOR_KEY_FORMAT_32BPP						0x00000006
+#define  NV50_2D_COLOR_KEY								0x00000298
+#define  NV50_2D_COLOR_KEY_ENABLE							0x0000029c
+#define  NV50_2D_ROP									0x000002a0
+#define  NV50_2D_OPERATION								0x000002ac /* not adjacent to ROP: offsets 0x2a4 and 0x2a8 have no name in this header */
+#define   NV50_2D_OPERATION_SRCCOPY_AND							0x00000000
+#define   NV50_2D_OPERATION_ROP_AND							0x00000001
+#define   NV50_2D_OPERATION_BLEND_AND							0x00000002
+#define   NV50_2D_OPERATION_SRCCOPY							0x00000003
+#define   NV50_2D_OPERATION_SRCCOPY_PREMULT						0x00000004
+#define   NV50_2D_OPERATION_BLEND_PREMULT						0x00000005
+#define  NV50_2D_PATTERN_FORMAT								0x000002e8
+#define   NV50_2D_PATTERN_FORMAT_16BPP							0x00000000
+#define   NV50_2D_PATTERN_FORMAT_15BPP							0x00000001
+#define   NV50_2D_PATTERN_FORMAT_32BPP							0x00000002
+#define   NV50_2D_PATTERN_FORMAT_8BPP							0x00000003
+#define  NV50_2D_PATTERN_COLOR(x)							(0x000002f0+((x)*4)) /* indexed offset: x=0 -> 0x2f0, x=1 -> 0x2f4 */
+#define  NV50_2D_PATTERN_COLOR__SIZE							0x00000002 /* entry count: 2 entries * 4 ends exactly where PATTERN_BITMAP starts (0x2f8) */
+#define  NV50_2D_PATTERN_BITMAP(x)							(0x000002f8+((x)*4)) /* indexed offset: x=0 -> 0x2f8, x=1 -> 0x2fc */
+#define  NV50_2D_PATTERN_BITMAP__SIZE							0x00000002
+#define  NV50_2D_DRAW_SHAPE								0x00000580 /* enumerated values 0..4 follow */
+#define   NV50_2D_DRAW_SHAPE_POINTS							0x00000000
+#define   NV50_2D_DRAW_SHAPE_LINES							0x00000001
+#define   NV50_2D_DRAW_SHAPE_LINE_STRIP							0x00000002
+#define   NV50_2D_DRAW_SHAPE_TRIANGLES							0x00000003
+#define   NV50_2D_DRAW_SHAPE_RECTANGLES							0x00000004
+#define  NV50_2D_DRAW_COLOR_FORMAT							0x00000584 /* value list below is numerically identical to NV50_2D_SIFC_FORMAT_* and NV50TCL_RT_FORMAT_* */
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32B32A32_FLOAT					0x000000c0
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32B32A32_SINT					0x000000c1
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32B32A32_UINT					0x000000c2
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32B32X32_FLOAT					0x000000c3
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16B16A16_UNORM					0x000000c6 /* 0xc4 and 0xc5 have no name in this list */
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16B16A16_SNORM					0x000000c7
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16B16A16_SINT					0x000000c8
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16B16A16_UINT					0x000000c9
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16B16A16_FLOAT					0x000000ca
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32_FLOAT					0x000000cb
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32_SINT						0x000000cc
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32G32_UINT						0x000000cd
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16B16X16_FLOAT					0x000000ce
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8R8G8B8_UNORM					0x000000cf
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8R8G8B8_SRGB					0x000000d0
+#define   NV50_2D_DRAW_COLOR_FORMAT_A2B10G10R10_UNORM					0x000000d1
+#define   NV50_2D_DRAW_COLOR_FORMAT_A2B10G10R10_UINT					0x000000d2
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8B8G8R8_UNORM					0x000000d5 /* 0xd3 and 0xd4 have no name in this list */
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8B8G8R8_SRGB					0x000000d6
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8B8G8R8_SNORM					0x000000d7
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8B8G8R8_SINT					0x000000d8
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8B8G8R8_UINT					0x000000d9
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16_UNORM					0x000000da
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16_SNORM					0x000000db
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16_SINT						0x000000dc
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16_UINT						0x000000dd
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16G16_FLOAT					0x000000de
+#define   NV50_2D_DRAW_COLOR_FORMAT_A2R10G10B10_UNORM					0x000000df
+#define   NV50_2D_DRAW_COLOR_FORMAT_B10G11R11_FLOAT					0x000000e0
+#define   NV50_2D_DRAW_COLOR_FORMAT_R32_FLOAT						0x000000e5 /* 0xe1..0xe4 have no name in this list */
+#define   NV50_2D_DRAW_COLOR_FORMAT_X8R8G8B8_UNORM					0x000000e6
+#define   NV50_2D_DRAW_COLOR_FORMAT_X8R8G8B8_SRGB					0x000000e7
+#define   NV50_2D_DRAW_COLOR_FORMAT_R5G6B5_UNORM					0x000000e8
+#define   NV50_2D_DRAW_COLOR_FORMAT_A1R5G5B5_UNORM					0x000000e9
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8G8_UNORM						0x000000ea
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8G8_SNORM						0x000000eb
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8G8_SINT						0x000000ec
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8G8_UINT						0x000000ed
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16_UNORM						0x000000ee
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16_SNORM						0x000000ef
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16_SINT						0x000000f0
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16_UINT						0x000000f1
+#define   NV50_2D_DRAW_COLOR_FORMAT_R16_FLOAT						0x000000f2
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8_UNORM						0x000000f3
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8_SNORM						0x000000f4
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8_SINT						0x000000f5
+#define   NV50_2D_DRAW_COLOR_FORMAT_R8_UINT						0x000000f6
+#define   NV50_2D_DRAW_COLOR_FORMAT_A8_UNORM						0x000000f7
+#define   NV50_2D_DRAW_COLOR_FORMAT_X1R5G5B5_UNORM					0x000000f8
+#define   NV50_2D_DRAW_COLOR_FORMAT_X8B8G8R8_UNORM					0x000000f9
+#define   NV50_2D_DRAW_COLOR_FORMAT_X8B8G8R8_SRGB					0x000000fa
+#define  NV50_2D_DRAW_COLOR								0x00000588
+#define  NV50_2D_DRAW_POINT16								0x000005e0 /* one word carrying two 16-bit fields: X in bits 0..15, Y in bits 16..31 */
+#define   NV50_2D_DRAW_POINT16_X_SHIFT							0
+#define   NV50_2D_DRAW_POINT16_X_MASK							0x0000ffff
+#define   NV50_2D_DRAW_POINT16_Y_SHIFT							16
+#define   NV50_2D_DRAW_POINT16_Y_MASK							0xffff0000
+#define  NV50_2D_DRAW_POINT32_X(x)							(0x00000600+((x)*8)) /* X and Y interleave: entry x uses offsets 0x600+8x (X) and 0x604+8x (Y) */
+#define  NV50_2D_DRAW_POINT32_X__SIZE							0x00000040 /* entry count: 0x40 entries * 8 spans 0x600..0x7ff, ending at SIFC_BITMAP_ENABLE (0x800) */
+#define  NV50_2D_DRAW_POINT32_Y(x)							(0x00000604+((x)*8))
+#define  NV50_2D_DRAW_POINT32_Y__SIZE							0x00000040
+#define  NV50_2D_SIFC_BITMAP_ENABLE							0x00000800
+#define  NV50_2D_SIFC_FORMAT								0x00000804 /* value list below is numerically identical to NV50_2D_DRAW_COLOR_FORMAT_* and NV50TCL_RT_FORMAT_* */
+#define   NV50_2D_SIFC_FORMAT_R32G32B32A32_FLOAT					0x000000c0
+#define   NV50_2D_SIFC_FORMAT_R32G32B32A32_SINT						0x000000c1
+#define   NV50_2D_SIFC_FORMAT_R32G32B32A32_UINT						0x000000c2
+#define   NV50_2D_SIFC_FORMAT_R32G32B32X32_FLOAT					0x000000c3
+#define   NV50_2D_SIFC_FORMAT_R16G16B16A16_UNORM					0x000000c6 /* 0xc4 and 0xc5 have no name in this list */
+#define   NV50_2D_SIFC_FORMAT_R16G16B16A16_SNORM					0x000000c7
+#define   NV50_2D_SIFC_FORMAT_R16G16B16A16_SINT						0x000000c8
+#define   NV50_2D_SIFC_FORMAT_R16G16B16A16_UINT						0x000000c9
+#define   NV50_2D_SIFC_FORMAT_R16G16B16A16_FLOAT					0x000000ca
+#define   NV50_2D_SIFC_FORMAT_R32G32_FLOAT						0x000000cb
+#define   NV50_2D_SIFC_FORMAT_R32G32_SINT						0x000000cc
+#define   NV50_2D_SIFC_FORMAT_R32G32_UINT						0x000000cd
+#define   NV50_2D_SIFC_FORMAT_R16G16B16X16_FLOAT					0x000000ce
+#define   NV50_2D_SIFC_FORMAT_A8R8G8B8_UNORM						0x000000cf
+#define   NV50_2D_SIFC_FORMAT_A8R8G8B8_SRGB						0x000000d0
+#define   NV50_2D_SIFC_FORMAT_A2B10G10R10_UNORM						0x000000d1
+#define   NV50_2D_SIFC_FORMAT_A2B10G10R10_UINT						0x000000d2
+#define   NV50_2D_SIFC_FORMAT_A8B8G8R8_UNORM						0x000000d5 /* 0xd3 and 0xd4 have no name in this list */
+#define   NV50_2D_SIFC_FORMAT_A8B8G8R8_SRGB						0x000000d6
+#define   NV50_2D_SIFC_FORMAT_A8B8G8R8_SNORM						0x000000d7
+#define   NV50_2D_SIFC_FORMAT_A8B8G8R8_SINT						0x000000d8
+#define   NV50_2D_SIFC_FORMAT_A8B8G8R8_UINT						0x000000d9
+#define   NV50_2D_SIFC_FORMAT_R16G16_UNORM						0x000000da
+#define   NV50_2D_SIFC_FORMAT_R16G16_SNORM						0x000000db
+#define   NV50_2D_SIFC_FORMAT_R16G16_SINT						0x000000dc
+#define   NV50_2D_SIFC_FORMAT_R16G16_UINT						0x000000dd
+#define   NV50_2D_SIFC_FORMAT_R16G16_FLOAT						0x000000de
+#define   NV50_2D_SIFC_FORMAT_A2R10G10B10_UNORM						0x000000df
+#define   NV50_2D_SIFC_FORMAT_B10G11R11_FLOAT						0x000000e0
+#define   NV50_2D_SIFC_FORMAT_R32_FLOAT							0x000000e5 /* 0xe1..0xe4 have no name in this list */
+#define   NV50_2D_SIFC_FORMAT_X8R8G8B8_UNORM						0x000000e6
+#define   NV50_2D_SIFC_FORMAT_X8R8G8B8_SRGB						0x000000e7
+#define   NV50_2D_SIFC_FORMAT_R5G6B5_UNORM						0x000000e8
+#define   NV50_2D_SIFC_FORMAT_A1R5G5B5_UNORM						0x000000e9
+#define   NV50_2D_SIFC_FORMAT_R8G8_UNORM						0x000000ea
+#define   NV50_2D_SIFC_FORMAT_R8G8_SNORM						0x000000eb
+#define   NV50_2D_SIFC_FORMAT_R8G8_SINT							0x000000ec
+#define   NV50_2D_SIFC_FORMAT_R8G8_UINT							0x000000ed
+#define   NV50_2D_SIFC_FORMAT_R16_UNORM							0x000000ee
+#define   NV50_2D_SIFC_FORMAT_R16_SNORM							0x000000ef
+#define   NV50_2D_SIFC_FORMAT_R16_SINT							0x000000f0
+#define   NV50_2D_SIFC_FORMAT_R16_UINT							0x000000f1
+#define   NV50_2D_SIFC_FORMAT_R16_FLOAT							0x000000f2
+#define   NV50_2D_SIFC_FORMAT_R8_UNORM							0x000000f3
+#define   NV50_2D_SIFC_FORMAT_R8_SNORM							0x000000f4
+#define   NV50_2D_SIFC_FORMAT_R8_SINT							0x000000f5
+#define   NV50_2D_SIFC_FORMAT_R8_UINT							0x000000f6
+#define   NV50_2D_SIFC_FORMAT_A8_UNORM							0x000000f7
+#define   NV50_2D_SIFC_FORMAT_X1R5G5B5_UNORM						0x000000f8
+#define   NV50_2D_SIFC_FORMAT_X8B8G8R8_UNORM						0x000000f9
+#define   NV50_2D_SIFC_FORMAT_X8B8G8R8_SRGB						0x000000fa
+#define  NV50_2D_SIFC_BITMAP_UNK808							0x00000808 /* name encodes only the offset (0x808); purpose not stated in this header */
+#define  NV50_2D_SIFC_BITMAP_LSB_FIRST							0x0000080c
+#define  NV50_2D_SIFC_BITMAP_LINE_PACK_MODE						0x00000810 /* enumerated values 0..2 follow */
+#define   NV50_2D_SIFC_BITMAP_LINE_PACK_MODE_PACKED					0x00000000
+#define   NV50_2D_SIFC_BITMAP_LINE_PACK_MODE_ALIGN_BYTE					0x00000001
+#define   NV50_2D_SIFC_BITMAP_LINE_PACK_MODE_ALIGN_WORD					0x00000002
+#define  NV50_2D_SIFC_BITMAP_COLOR_BIT0							0x00000814
+#define  NV50_2D_SIFC_BITMAP_COLOR_BIT1							0x00000818
+#define  NV50_2D_SIFC_BITMAP_WRITE_BIT0_ENABLE						0x0000081c
+#define  NV50_2D_SIFC_WIDTH								0x00000838 /* 0x820..0x834 have no name in this header */
+#define  NV50_2D_SIFC_HEIGHT								0x0000083c
+#define  NV50_2D_SIFC_DX_DU_FRACT							0x00000840 /* each quantity below is a _FRACT/_INT pair at adjacent offsets; presumably a fixed-point value split in two -- TODO confirm */
+#define  NV50_2D_SIFC_DX_DU_INT								0x00000844
+#define  NV50_2D_SIFC_DY_DV_FRACT							0x00000848
+#define  NV50_2D_SIFC_DY_DV_INT								0x0000084c
+#define  NV50_2D_SIFC_DST_X_FRACT							0x00000850
+#define  NV50_2D_SIFC_DST_X_INT								0x00000854
+#define  NV50_2D_SIFC_DST_Y_FRACT							0x00000858
+#define  NV50_2D_SIFC_DST_Y_INT								0x0000085c
+#define  NV50_2D_SIFC_DATA								0x00000860
+#define  NV50_2D_BLIT_DST_X								0x000008b0 /* 0x8b0..0x8dc: twelve consecutive method offsets, 4 apart */
+#define  NV50_2D_BLIT_DST_Y								0x000008b4
+#define  NV50_2D_BLIT_DST_W								0x000008b8
+#define  NV50_2D_BLIT_DST_H								0x000008bc
+#define  NV50_2D_BLIT_DU_DX_FRACT							0x000008c0 /* note the DU_DX/DV_DY naming here versus DX_DU/DY_DV in the NV50_2D_SIFC_* defines */
+#define  NV50_2D_BLIT_DU_DX_INT								0x000008c4
+#define  NV50_2D_BLIT_DV_DY_FRACT							0x000008c8
+#define  NV50_2D_BLIT_DV_DY_INT								0x000008cc
+#define  NV50_2D_BLIT_SRC_X_FRACT							0x000008d0
+#define  NV50_2D_BLIT_SRC_X_INT								0x000008d4
+#define  NV50_2D_BLIT_SRC_Y_FRACT							0x000008d8
+#define  NV50_2D_BLIT_SRC_Y_INT								0x000008dc
+
+
+#define NV50TCL										0x00005097 /* presumably the object class id that the NV50TCL_* method offsets below belong to -- TODO confirm */
+
+#define  NV50TCL_NOP									0x00000100
+#define  NV50TCL_NOTIFY									0x00000104
+#define  NV50TCL_SERIALIZE								0x00000110
+#define  NV50TCL_DMA_NOTIFY								0x00000180 /* DMA_* slots run 0x180..0x1ac at consecutive offsets, 4 apart */
+#define  NV50TCL_DMA_ZETA								0x00000184
+#define  NV50TCL_DMA_QUERY								0x00000188
+#define  NV50TCL_DMA_VTXBUF0								0x0000018c
+#define  NV50TCL_DMA_LOCAL								0x00000190
+#define  NV50TCL_DMA_STACK								0x00000194
+#define  NV50TCL_DMA_CODE_CB								0x00000198
+#define  NV50TCL_DMA_TSC								0x0000019c
+#define  NV50TCL_DMA_TIC								0x000001a0
+#define  NV50TCL_DMA_TEXTURE								0x000001a4
+#define  NV50TCL_DMA_STRMOUT								0x000001a8
+#define  NV50TCL_DMA_UNK01AC								0x000001ac /* name encodes only the offset (0x1ac); purpose not stated in this header */
+#define  NV50TCL_DMA_COLOR(x)								(0x000001c0+((x)*4)) /* indexed offset: 0x1c0 + 4*x */
+#define  NV50TCL_DMA_COLOR__SIZE							0x00000008 /* presumably the number of valid x values (0..7 -> 0x1c0..0x1dc) -- TODO confirm */
+#define  NV50TCL_RT_ADDRESS_HIGH(x)							(0x00000200+((x)*32)) /* per-index slot of 32 bytes; named members occupy +0x00..+0x10 of each slot */
+#define  NV50TCL_RT_ADDRESS_HIGH__SIZE							0x00000008 /* entry count: 8 slots * 32 spans 0x200..0x2ff, ending at VTX_ATTR_1F (0x300) */
+#define  NV50TCL_RT_ADDRESS_LOW(x)							(0x00000204+((x)*32))
+#define  NV50TCL_RT_ADDRESS_LOW__SIZE							0x00000008
+#define  NV50TCL_RT_FORMAT(x)								(0x00000208+((x)*32)) /* value list below is numerically identical to NV50_2D_DRAW_COLOR_FORMAT_* and NV50_2D_SIFC_FORMAT_* */
+#define  NV50TCL_RT_FORMAT__SIZE							0x00000008
+#define   NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT						0x000000c0
+#define   NV50TCL_RT_FORMAT_R32G32B32A32_SINT						0x000000c1
+#define   NV50TCL_RT_FORMAT_R32G32B32A32_UINT						0x000000c2
+#define   NV50TCL_RT_FORMAT_R32G32B32X32_FLOAT						0x000000c3
+#define   NV50TCL_RT_FORMAT_R16G16B16A16_UNORM						0x000000c6 /* 0xc4 and 0xc5 have no name in this list */
+#define   NV50TCL_RT_FORMAT_R16G16B16A16_SNORM						0x000000c7
+#define   NV50TCL_RT_FORMAT_R16G16B16A16_SINT						0x000000c8
+#define   NV50TCL_RT_FORMAT_R16G16B16A16_UINT						0x000000c9
+#define   NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT						0x000000ca
+#define   NV50TCL_RT_FORMAT_R32G32_FLOAT						0x000000cb
+#define   NV50TCL_RT_FORMAT_R32G32_SINT							0x000000cc
+#define   NV50TCL_RT_FORMAT_R32G32_UINT							0x000000cd
+#define   NV50TCL_RT_FORMAT_R16G16B16X16_FLOAT						0x000000ce
+#define   NV50TCL_RT_FORMAT_A8R8G8B8_UNORM						0x000000cf
+#define   NV50TCL_RT_FORMAT_A8R8G8B8_SRGB						0x000000d0
+#define   NV50TCL_RT_FORMAT_A2B10G10R10_UNORM						0x000000d1
+#define   NV50TCL_RT_FORMAT_A2B10G10R10_UINT						0x000000d2
+#define   NV50TCL_RT_FORMAT_A8B8G8R8_UNORM						0x000000d5 /* 0xd3 and 0xd4 have no name in this list */
+#define   NV50TCL_RT_FORMAT_A8B8G8R8_SRGB						0x000000d6
+#define   NV50TCL_RT_FORMAT_A8B8G8R8_SNORM						0x000000d7
+#define   NV50TCL_RT_FORMAT_A8B8G8R8_SINT						0x000000d8
+#define   NV50TCL_RT_FORMAT_A8B8G8R8_UINT						0x000000d9
+#define   NV50TCL_RT_FORMAT_R16G16_UNORM						0x000000da
+#define   NV50TCL_RT_FORMAT_R16G16_SNORM						0x000000db
+#define   NV50TCL_RT_FORMAT_R16G16_SINT							0x000000dc
+#define   NV50TCL_RT_FORMAT_R16G16_UINT							0x000000dd
+#define   NV50TCL_RT_FORMAT_R16G16_FLOAT						0x000000de
+#define   NV50TCL_RT_FORMAT_A2R10G10B10_UNORM						0x000000df
+#define   NV50TCL_RT_FORMAT_B10G11R11_FLOAT						0x000000e0
+#define   NV50TCL_RT_FORMAT_R32_FLOAT							0x000000e5 /* 0xe1..0xe4 have no name in this list */
+#define   NV50TCL_RT_FORMAT_X8R8G8B8_UNORM						0x000000e6
+#define   NV50TCL_RT_FORMAT_X8R8G8B8_SRGB						0x000000e7
+#define   NV50TCL_RT_FORMAT_R5G6B5_UNORM						0x000000e8
+#define   NV50TCL_RT_FORMAT_A1R5G5B5_UNORM						0x000000e9
+#define   NV50TCL_RT_FORMAT_R8G8_UNORM							0x000000ea
+#define   NV50TCL_RT_FORMAT_R8G8_SNORM							0x000000eb
+#define   NV50TCL_RT_FORMAT_R8G8_SINT							0x000000ec
+#define   NV50TCL_RT_FORMAT_R8G8_UINT							0x000000ed
+#define   NV50TCL_RT_FORMAT_R16_UNORM							0x000000ee
+#define   NV50TCL_RT_FORMAT_R16_SNORM							0x000000ef
+#define   NV50TCL_RT_FORMAT_R16_SINT							0x000000f0
+#define   NV50TCL_RT_FORMAT_R16_UINT							0x000000f1
+#define   NV50TCL_RT_FORMAT_R16_FLOAT							0x000000f2
+#define   NV50TCL_RT_FORMAT_R8_UNORM							0x000000f3
+#define   NV50TCL_RT_FORMAT_R8_SNORM							0x000000f4
+#define   NV50TCL_RT_FORMAT_R8_SINT							0x000000f5
+#define   NV50TCL_RT_FORMAT_R8_UINT							0x000000f6
+#define   NV50TCL_RT_FORMAT_A8_UNORM							0x000000f7
+#define   NV50TCL_RT_FORMAT_X1R5G5B5_UNORM						0x000000f8
+#define   NV50TCL_RT_FORMAT_X8B8G8R8_UNORM						0x000000f9
+#define   NV50TCL_RT_FORMAT_X8B8G8R8_SRGB						0x000000fa
+#define  NV50TCL_RT_TILE_MODE(x)							(0x0000020c+((x)*32))
+#define  NV50TCL_RT_TILE_MODE__SIZE							0x00000008
+#define  NV50TCL_RT_LAYER_STRIDE(x)							(0x00000210+((x)*32))
+#define  NV50TCL_RT_LAYER_STRIDE__SIZE							0x00000008
+#define  NV50TCL_VTX_ATTR_1F(x)								(0x00000300+((x)*4)) /* one word per index: 0x300 + 4*x */
+#define  NV50TCL_VTX_ATTR_1F__SIZE							0x00000010 /* entry count: 16 entries * 4 ends exactly at VTX_ATTR_2H (0x340) */
+#define  NV50TCL_VTX_ATTR_2H(x)								(0x00000340+((x)*4)) /* one word per index, split into two 16-bit fields X (low) and Y (high) */
+#define  NV50TCL_VTX_ATTR_2H__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_2H_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_2H_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_2H_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_2H_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_2F_X(x)							(0x00000380+((x)*8)) /* 8-byte slot per index: X at +0, Y at +4 */
+#define  NV50TCL_VTX_ATTR_2F_X__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_2F_Y(x)							(0x00000384+((x)*8))
+#define  NV50TCL_VTX_ATTR_2F_Y__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_X(x)							(0x00000400+((x)*16)) /* 16-byte slot per index: X, Y, Z at +0, +4, +8; +0xc has no name */
+#define  NV50TCL_VTX_ATTR_3F_X__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_Y(x)							(0x00000404+((x)*16))
+#define  NV50TCL_VTX_ATTR_3F_Y__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_Z(x)							(0x00000408+((x)*16))
+#define  NV50TCL_VTX_ATTR_3F_Z__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_X(x)							(0x00000500+((x)*16)) /* 16-byte slot per index: X, Y, Z, W at +0, +4, +8, +0xc */
+#define  NV50TCL_VTX_ATTR_4F_X__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_Y(x)							(0x00000504+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_Z(x)							(0x00000508+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_W(x)							(0x0000050c+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4H_0(x)							(0x00000600+((x)*8)) /* 8-byte slot per index: word _0 holds X/Y, word _1 (at +4) holds Z/W, 16 bits each */
+#define  NV50TCL_VTX_ATTR_4H_0__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4H_0_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4H_0_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4H_0_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4H_0_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4H_1(x)							(0x00000604+((x)*8))
+#define  NV50TCL_VTX_ATTR_4H_1__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4H_1_Z_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4H_1_Z_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4H_1_W_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4H_1_W_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_2I(x)								(0x00000680+((x)*4)) /* one word per index: X in bits 0..15, Y in bits 16..31 */
+#define  NV50TCL_VTX_ATTR_2I__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_2I_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_2I_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_2I_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_2I_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_2NI(x)							(0x000006c0+((x)*4)) /* same bit layout as VTX_ATTR_2I; the "N" presumably means normalized -- TODO confirm */
+#define  NV50TCL_VTX_ATTR_2NI__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_2NI_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_2NI_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_2NI_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_2NI_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4I_0(x)							(0x00000700+((x)*8)) /* 8-byte slot per index: word _0 holds X/Y, word _1 (at +4) holds Z/W, 16 bits each */
+#define  NV50TCL_VTX_ATTR_4I_0__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4I_0_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4I_0_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4I_0_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4I_0_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4I_1(x)							(0x00000704+((x)*8))
+#define  NV50TCL_VTX_ATTR_4I_1__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4I_1_Z_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4I_1_Z_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4I_1_W_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4I_1_W_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4NI_0(x)							(0x00000780+((x)*8)) /* same slot and bit layout as VTX_ATTR_4I_0/_1 */
+#define  NV50TCL_VTX_ATTR_4NI_0__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4NI_0_X_SHIFT						0
+#define   NV50TCL_VTX_ATTR_4NI_0_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4NI_0_Y_SHIFT						16
+#define   NV50TCL_VTX_ATTR_4NI_0_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4NI_1(x)							(0x00000784+((x)*8))
+#define  NV50TCL_VTX_ATTR_4NI_1__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4NI_1_Z_SHIFT						0
+#define   NV50TCL_VTX_ATTR_4NI_1_Z_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4NI_1_W_SHIFT						16
+#define   NV50TCL_VTX_ATTR_4NI_1_W_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4UB(x)							(0x00000800+((x)*4)) /* one word per index, four 8-bit fields: X bits 0..7, Y 8..15, Z 16..23, W 24..31 */
+#define  NV50TCL_VTX_ATTR_4UB__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4UB_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4UB_X_MASK							0x000000ff
+#define   NV50TCL_VTX_ATTR_4UB_Y_SHIFT							8
+#define   NV50TCL_VTX_ATTR_4UB_Y_MASK							0x0000ff00
+#define   NV50TCL_VTX_ATTR_4UB_Z_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4UB_Z_MASK							0x00ff0000
+#define   NV50TCL_VTX_ATTR_4UB_W_SHIFT							24
+#define   NV50TCL_VTX_ATTR_4UB_W_MASK							0xff000000
+#define  NV50TCL_VTX_ATTR_4B(x)								(0x00000840+((x)*4)) /* same bit layout as VTX_ATTR_4UB */
+#define  NV50TCL_VTX_ATTR_4B__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4B_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4B_X_MASK							0x000000ff
+#define   NV50TCL_VTX_ATTR_4B_Y_SHIFT							8
+#define   NV50TCL_VTX_ATTR_4B_Y_MASK							0x0000ff00
+#define   NV50TCL_VTX_ATTR_4B_Z_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4B_Z_MASK							0x00ff0000
+#define   NV50TCL_VTX_ATTR_4B_W_SHIFT							24
+#define   NV50TCL_VTX_ATTR_4B_W_MASK							0xff000000
+#define  NV50TCL_VTX_ATTR_4NUB(x)							(0x00000880+((x)*4)) /* same bit layout as VTX_ATTR_4UB */
+#define  NV50TCL_VTX_ATTR_4NUB__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4NUB_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4NUB_X_MASK							0x000000ff
+#define   NV50TCL_VTX_ATTR_4NUB_Y_SHIFT							8
+#define   NV50TCL_VTX_ATTR_4NUB_Y_MASK							0x0000ff00
+#define   NV50TCL_VTX_ATTR_4NUB_Z_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4NUB_Z_MASK							0x00ff0000
+#define   NV50TCL_VTX_ATTR_4NUB_W_SHIFT							24
+#define   NV50TCL_VTX_ATTR_4NUB_W_MASK							0xff000000
+#define  NV50TCL_VTX_ATTR_4NB(x)							(0x000008c0+((x)*4)) /* same bit layout as VTX_ATTR_4UB; 16 entries end at VERTEX_ARRAY_FORMAT (0x900) */
+#define  NV50TCL_VTX_ATTR_4NB__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4NB_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4NB_X_MASK							0x000000ff
+#define   NV50TCL_VTX_ATTR_4NB_Y_SHIFT							8
+#define   NV50TCL_VTX_ATTR_4NB_Y_MASK							0x0000ff00
+#define   NV50TCL_VTX_ATTR_4NB_Z_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4NB_Z_MASK							0x00ff0000
+#define   NV50TCL_VTX_ATTR_4NB_W_SHIFT							24
+#define   NV50TCL_VTX_ATTR_4NB_W_MASK							0xff000000
+#define  NV50TCL_VERTEX_ARRAY_FORMAT(x)							(0x00000900+((x)*16)) /* 16-byte slot per index: FORMAT at +0, START_HIGH at +4, START_LOW at +8; +0xc has no name */
+#define  NV50TCL_VERTEX_ARRAY_FORMAT__SIZE						0x00000010 /* entry count: 16 slots * 16 spans 0x900..0x9ff, ending at VIEWPORT_SCALE_X (0xa00) */
+#define   NV50TCL_VERTEX_ARRAY_FORMAT_STRIDE_SHIFT					0
+#define   NV50TCL_VERTEX_ARRAY_FORMAT_STRIDE_MASK					0x00000fff /* 12-bit field, bits 0..11 */
+#define   NV50TCL_VERTEX_ARRAY_FORMAT_ENABLE						(1 << 29) /* single-bit flag in the same word as STRIDE */
+#define  NV50TCL_VERTEX_ARRAY_START_HIGH(x)						(0x00000904+((x)*16))
+#define  NV50TCL_VERTEX_ARRAY_START_HIGH__SIZE						0x00000010
+#define  NV50TCL_VERTEX_ARRAY_START_LOW(x)						(0x00000908+((x)*16))
+#define  NV50TCL_VERTEX_ARRAY_START_LOW__SIZE						0x00000010
+#define  NV50TCL_VIEWPORT_SCALE_X(x)							(0x00000a00+((x)*32)) /* 32-byte slot per index: SCALE_X/Y/Z at +0x00..+0x08, TRANSLATE_X/Y/Z at +0x0c..+0x14 */
+#define  NV50TCL_VIEWPORT_SCALE_X__SIZE							0x00000010 /* entry count: 16 slots * 32 spans 0xa00..0xbff, ending at VIEWPORT_HORIZ (0xc00) */
+#define  NV50TCL_VIEWPORT_SCALE_Y(x)							(0x00000a04+((x)*32))
+#define  NV50TCL_VIEWPORT_SCALE_Y__SIZE							0x00000010
+#define  NV50TCL_VIEWPORT_SCALE_Z(x)							(0x00000a08+((x)*32))
+#define  NV50TCL_VIEWPORT_SCALE_Z__SIZE							0x00000010
+#define  NV50TCL_VIEWPORT_TRANSLATE_X(x)						(0x00000a0c+((x)*32))
+#define  NV50TCL_VIEWPORT_TRANSLATE_X__SIZE						0x00000010
+#define  NV50TCL_VIEWPORT_TRANSLATE_Y(x)						(0x00000a10+((x)*32))
+#define  NV50TCL_VIEWPORT_TRANSLATE_Y__SIZE						0x00000010
+#define  NV50TCL_VIEWPORT_TRANSLATE_Z(x)						(0x00000a14+((x)*32))
+#define  NV50TCL_VIEWPORT_TRANSLATE_Z__SIZE						0x00000010
+#define  NV50TCL_VIEWPORT_HORIZ(x)							(0x00000c00+((x)*16)) /* 16-byte slot per index: HORIZ +0, VERT +4, DEPTH_RANGE_NEAR +8, DEPTH_RANGE_FAR +0xc */
+#define  NV50TCL_VIEWPORT_HORIZ__SIZE							0x00000010
+#define   NV50TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NV50TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define   NV50TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NV50TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define  NV50TCL_VIEWPORT_VERT(x)							(0x00000c04+((x)*16))
+#define  NV50TCL_VIEWPORT_VERT__SIZE							0x00000010
+#define   NV50TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NV50TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define   NV50TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NV50TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define  NV50TCL_DEPTH_RANGE_NEAR(x)							(0x00000c08+((x)*16))
+#define  NV50TCL_DEPTH_RANGE_NEAR__SIZE							0x00000010
+#define  NV50TCL_DEPTH_RANGE_FAR(x)							(0x00000c0c+((x)*16))
+#define  NV50TCL_DEPTH_RANGE_FAR__SIZE							0x00000010
+#define  NV50TCL_VIEWPORT_CLIP_HORIZ(x)							(0x00000d00+((x)*8)) /* 8-byte slot per index: HORIZ at +0, VERT at +4 */
+#define  NV50TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008 /* note: 8 here, whereas the other VIEWPORT_* arrays declare 0x10 */
+#define  NV50TCL_VIEWPORT_CLIP_VERT(x)							(0x00000d04+((x)*8))
+#define  NV50TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define  NV50TCL_VERTEX_BUFFER_FIRST							0x00000d74
+#define  NV50TCL_VERTEX_BUFFER_COUNT							0x00000d78
+#define  NV50TCL_CLEAR_COLOR(x)								(0x00000d80+((x)*4)) /* indexed offset: 0x d80 + 4*x */
+#define  NV50TCL_CLEAR_COLOR__SIZE							0x00000004 /* entry count: 4 entries * 4 ends exactly at CLEAR_DEPTH (0xd90) */
+#define  NV50TCL_CLEAR_DEPTH								0x00000d90
+#define  NV50TCL_STACK_ADDRESS_HIGH							0x00000d94
+#define  NV50TCL_STACK_ADDRESS_LOW							0x00000d98
+#define  NV50TCL_STACK_SIZE_LOG								0x00000d9c
+#define  NV50TCL_CLEAR_STENCIL								0x00000da0
+#define  NV50TCL_STRMOUT_PRIMITIVE_COUNT						0x00000da8
+#define  NV50TCL_POLYGON_MODE_FRONT							0x00000dac /* values below equal OpenGL's GL_POINT/GL_LINE/GL_FILL (0x1B00..0x1B02) */
+#define   NV50TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV50TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV50TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV50TCL_POLYGON_MODE_BACK							0x00000db0 /* same three values as POLYGON_MODE_FRONT */
+#define   NV50TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV50TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV50TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV50TCL_POLYGON_SMOOTH_ENABLE							0x00000db4
+#define  NV50TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000dc0
+#define  NV50TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000dc4
+#define  NV50TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000dc8
+#define  NV50TCL_WATCHDOG_TIMER								0x00000de4
+#define  NV50TCL_WINDOW_OFFSET_X							0x00000df8
+#define  NV50TCL_WINDOW_OFFSET_Y							0x00000dfc
+#define  NV50TCL_SCISSOR_ENABLE(x)							(0x00000e00+((x)*16)) /* 16-byte slot per index: ENABLE +0, HORIZ +4, VERT +8; +0xc has no name */
+#define  NV50TCL_SCISSOR_ENABLE__SIZE							0x00000010 /* entry count: 16 slots * 16 spans 0xe00..0xeff, ending at CB_ADDR (0xf00) */
+#define  NV50TCL_SCISSOR_HORIZ(x)							(0x00000e04+((x)*16)) /* MIN in bits 0..15, MAX in bits 16..31 */
+#define  NV50TCL_SCISSOR_HORIZ__SIZE							0x00000010
+#define   NV50TCL_SCISSOR_HORIZ_MIN_SHIFT						0
+#define   NV50TCL_SCISSOR_HORIZ_MIN_MASK						0x0000ffff
+#define   NV50TCL_SCISSOR_HORIZ_MAX_SHIFT						16
+#define   NV50TCL_SCISSOR_HORIZ_MAX_MASK						0xffff0000
+#define  NV50TCL_SCISSOR_VERT(x)							(0x00000e08+((x)*16)) /* MIN in bits 0..15, MAX in bits 16..31 */
+#define  NV50TCL_SCISSOR_VERT__SIZE							0x00000010
+#define   NV50TCL_SCISSOR_VERT_MIN_SHIFT						0
+#define   NV50TCL_SCISSOR_VERT_MIN_MASK							0x0000ffff
+#define   NV50TCL_SCISSOR_VERT_MAX_SHIFT						16
+#define   NV50TCL_SCISSOR_VERT_MAX_MASK							0xffff0000
+#define  NV50TCL_CB_ADDR								0x00000f00 /* packed word: BUFFER in bits 0..6, ID in bits 8..21; bit 7 belongs to neither mask */
+#define   NV50TCL_CB_ADDR_ID_SHIFT							8
+#define   NV50TCL_CB_ADDR_ID_MASK							0x003fff00
+#define   NV50TCL_CB_ADDR_BUFFER_SHIFT							0
+#define   NV50TCL_CB_ADDR_BUFFER_MASK							0x0000007f
+#define  NV50TCL_CB_DATA(x)								(0x00000f04+((x)*4)) /* indexed offset: 0xf04 + 4*x */
+#define  NV50TCL_CB_DATA__SIZE								0x00000010 /* entry count: 16 entries * 4 ends exactly at LOCAL_WARPS_LOG_ALLOC (0xf44) */
+#define  NV50TCL_LOCAL_WARPS_LOG_ALLOC							0x00000f44
+#define  NV50TCL_LOCAL_WARPS_NO_CLAMP							0x00000f48
+#define  NV50TCL_STACK_WARPS_LOG_ALLOC							0x00000f4c
+#define  NV50TCL_STACK_WARPS_NO_CLAMP							0x00000f50
+#define  NV50TCL_STENCIL_BACK_FUNC_REF							0x00000f54
+#define  NV50TCL_STENCIL_BACK_MASK							0x00000f58
+#define  NV50TCL_STENCIL_BACK_FUNC_MASK							0x00000f5c
+#define  NV50TCL_GP_ADDRESS_HIGH							0x00000f70 /* GP/VP/FP each get a HIGH/LOW pair at adjacent offsets; presumably program code addresses -- TODO confirm */
+#define  NV50TCL_GP_ADDRESS_LOW								0x00000f74
+#define  NV50TCL_VP_ADDRESS_HIGH							0x00000f7c
+#define  NV50TCL_VP_ADDRESS_LOW								0x00000f80
+#define  NV50TCL_UNK0F84_ADDRESS_HIGH							0x00000f84 /* name encodes only the offset (0xf84); purpose not stated in this header */
+#define  NV50TCL_UNK0F84_ADDRESS_LOW							0x00000f88
+#define  NV50TCL_DEPTH_BOUNDS(x)							(0x00000f9c+((x)*4)) /* two entries: 0xf9c and 0xfa0, ending at FP_ADDRESS_HIGH (0xfa4) */
+#define  NV50TCL_DEPTH_BOUNDS__SIZE							0x00000002
+#define  NV50TCL_FP_ADDRESS_HIGH							0x00000fa4
+#define  NV50TCL_FP_ADDRESS_LOW								0x00000fa8
+#define  NV50TCL_MSAA_MASK(x)								(0x00000fbc+((x)*4)) /* four entries: 0xfbc..0xfc8 */
+#define  NV50TCL_MSAA_MASK__SIZE							0x00000004
+#define  NV50TCL_ZETA_ADDRESS_HIGH							0x00000fe0 /* ZETA_* methods 0xfe0..0xff0 mirror the member order of an NV50TCL_RT_* slot (address, format, tile mode, layer stride) */
+#define  NV50TCL_ZETA_ADDRESS_LOW							0x00000fe4
+#define  NV50TCL_ZETA_FORMAT								0x00000fe8 /* enumerated values follow; the numbering is sparse (0x0a, 0x13..0x16, 0x19) */
+#define   NV50TCL_ZETA_FORMAT_Z32_FLOAT							0x0000000a
+#define   NV50TCL_ZETA_FORMAT_Z16_UNORM							0x00000013
+#define   NV50TCL_ZETA_FORMAT_Z24S8_UNORM						0x00000014
+#define   NV50TCL_ZETA_FORMAT_X8Z24_UNORM						0x00000015
+#define   NV50TCL_ZETA_FORMAT_S8Z24_UNORM						0x00000016
+#define   NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM					0x00000019
+#define  NV50TCL_ZETA_TILE_MODE								0x00000fec
+#define  NV50TCL_ZETA_LAYER_STRIDE							0x00000ff0
+#define  NV50TCL_SCREEN_SCISSOR_HORIZ							0x00000ff4 /* X in bits 0..15, W in bits 16..31 (listed here high field first) */
+#define   NV50TCL_SCREEN_SCISSOR_HORIZ_W_SHIFT						16
+#define   NV50TCL_SCREEN_SCISSOR_HORIZ_W_MASK						0xffff0000
+#define   NV50TCL_SCREEN_SCISSOR_HORIZ_X_SHIFT						0
+#define   NV50TCL_SCREEN_SCISSOR_HORIZ_X_MASK						0x0000ffff
+#define  NV50TCL_SCREEN_SCISSOR_VERT							0x00000ff8 /* Y in bits 0..15, H in bits 16..31 (listed here high field first) */
+#define   NV50TCL_SCREEN_SCISSOR_VERT_H_SHIFT						16
+#define   NV50TCL_SCREEN_SCISSOR_VERT_H_MASK						0xffff0000
+#define   NV50TCL_SCREEN_SCISSOR_VERT_Y_SHIFT						0
+#define   NV50TCL_SCREEN_SCISSOR_VERT_Y_MASK						0x0000ffff
+#define  NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(x)						(0x00001080+((x)*8)) /* 8-byte slot per index: HIGH at +0, LOW at +4 */
+#define  NV50TCL_VERTEX_ARRAY_LIMIT_HIGH__SIZE						0x00000010 /* 0x10, the same count that NV50TCL_VERTEX_ARRAY_FORMAT__SIZE declares */
+#define  NV50TCL_VERTEX_ARRAY_LIMIT_LOW(x)						(0x00001084+((x)*8))
+#define  NV50TCL_VERTEX_ARRAY_LIMIT_LOW__SIZE						0x00000010
+#define  NV50TCL_RT_CONTROL								0x0000121c /* packed word: COUNT in bits 0..3, then eight 3-bit MAPn fields from bit 4 up to bit 27 */
+#define   NV50TCL_RT_CONTROL_COUNT_SHIFT						0
+#define   NV50TCL_RT_CONTROL_COUNT_MASK							0x0000000f
+#define   NV50TCL_RT_CONTROL_MAP0_SHIFT							4
+#define   NV50TCL_RT_CONTROL_MAP0_MASK							0x00000070
+#define   NV50TCL_RT_CONTROL_MAP1_SHIFT							7
+#define   NV50TCL_RT_CONTROL_MAP1_MASK							0x00000380
+#define   NV50TCL_RT_CONTROL_MAP2_SHIFT							10
+#define   NV50TCL_RT_CONTROL_MAP2_MASK							0x00001c00
+#define   NV50TCL_RT_CONTROL_MAP3_SHIFT							13
+#define   NV50TCL_RT_CONTROL_MAP3_MASK							0x0000e000
+#define   NV50TCL_RT_CONTROL_MAP4_SHIFT							16
+#define   NV50TCL_RT_CONTROL_MAP4_MASK							0x00070000
+#define   NV50TCL_RT_CONTROL_MAP5_SHIFT							19
+#define   NV50TCL_RT_CONTROL_MAP5_MASK							0x00380000
+#define   NV50TCL_RT_CONTROL_MAP6_SHIFT							22
+#define   NV50TCL_RT_CONTROL_MAP6_MASK							0x01c00000
+#define   NV50TCL_RT_CONTROL_MAP7_SHIFT							25
+#define   NV50TCL_RT_CONTROL_MAP7_MASK							0x0e000000
+#define  NV50TCL_RT_ARRAY_MODE								0x00001224 /* LAYERS in bits 0..15 plus a single-bit VOLUME flag at bit 16 */
+#define   NV50TCL_RT_ARRAY_MODE_LAYERS_SHIFT						0
+#define   NV50TCL_RT_ARRAY_MODE_LAYERS_MASK						0x0000ffff
+#define   NV50TCL_RT_ARRAY_MODE_VOLUME							(1 << 16)
+#define  NV50TCL_ZETA_HORIZ								0x00001228
+#define  NV50TCL_ZETA_VERT								0x0000122c
+#define  NV50TCL_ZETA_ARRAY_MODE							0x00001230 /* same bit layout as RT_ARRAY_MODE, but bit 16 is named only _UNK here */
+#define   NV50TCL_ZETA_ARRAY_MODE_LAYERS_SHIFT						0
+#define   NV50TCL_ZETA_ARRAY_MODE_LAYERS_MASK						0x0000ffff
+#define   NV50TCL_ZETA_ARRAY_MODE_UNK							(1 << 16)
+#define  NV50TCL_LINKED_TSC								0x00001234
+#define  NV50TCL_RT_HORIZ(x)								(0x00001240+((x)*8)) /* 8-byte slot per index: HORIZ at +0, VERT at +4 */
+#define  NV50TCL_RT_HORIZ__SIZE								0x00000008 /* entry count: 8 slots * 8 spans 0x1240..0x127f, ending at CB_DEF_ADDRESS_HIGH (0x1280) */
+#define  NV50TCL_RT_VERT(x)								(0x00001244+((x)*8))
+#define  NV50TCL_RT_VERT__SIZE								0x00000008
+#define  NV50TCL_CB_DEF_ADDRESS_HIGH							0x00001280
+#define  NV50TCL_CB_DEF_ADDRESS_LOW							0x00001284
+#define  NV50TCL_CB_DEF_SET								0x00001288 /* packed word: SIZE in bits 0..15, BUFFER in bits 16..22 */
+#define   NV50TCL_CB_DEF_SET_SIZE_SHIFT							0
+#define   NV50TCL_CB_DEF_SET_SIZE_MASK							0x0000ffff
+#define   NV50TCL_CB_DEF_SET_BUFFER_SHIFT						16
+#define   NV50TCL_CB_DEF_SET_BUFFER_MASK						0x007f0000 /* 7 bits wide, the same width as NV50TCL_CB_ADDR_BUFFER_MASK */
+#define  NV50TCL_STRMOUT_BUFFERS_CTRL							0x00001294 /* packed word: INTERLEAVED flag bit 0, SEPARATE bits 4..7, STRIDE bits 8..15; bits 1..3 unnamed */
+#define   NV50TCL_STRMOUT_BUFFERS_CTRL_INTERLEAVED					(1 <<  0)
+#define   NV50TCL_STRMOUT_BUFFERS_CTRL_SEPARATE_SHIFT					4
+#define   NV50TCL_STRMOUT_BUFFERS_CTRL_SEPARATE_MASK					0x000000f0
+#define   NV50TCL_STRMOUT_BUFFERS_CTRL_STRIDE_SHIFT					8
+#define   NV50TCL_STRMOUT_BUFFERS_CTRL_STRIDE_MASK					0x0000ff00
+#define  NV50TCL_FP_RESULT_COUNT							0x00001298
+#define  NV50TCL_DEPTH_TEST_ENABLE							0x000012cc /* run of independent method offsets (0x12cc..0x12ff) */
+#define  NV50TCL_SHADE_MODEL								0x000012d4
+#define   NV50TCL_SHADE_MODEL_FLAT							0x00001d00 /* same value as OpenGL GL_FLAT */
+#define   NV50TCL_SHADE_MODEL_SMOOTH							0x00001d01 /* same value as OpenGL GL_SMOOTH */
+#define  NV50TCL_LOCAL_ADDRESS_HIGH							0x000012d8 /* address split into HIGH/LOW words */
+#define  NV50TCL_LOCAL_ADDRESS_LOW							0x000012dc
+#define  NV50TCL_LOCAL_SIZE_LOG								0x000012e0
+#define  NV50TCL_DEPTH_WRITE_ENABLE							0x000012e8
+#define  NV50TCL_ALPHA_TEST_ENABLE							0x000012ec
+#define  NV50TCL_PM_SET(x)								(0x000012f0+((x)*4)) /* offset of PM_SET element x, one 32-bit word per element */
+#define  NV50TCL_PM_SET__SIZE								0x00000004 /* element count: x = 0..3 (0x12f0..0x12ff) */
+#define  NV50TCL_VB_ELEMENT_U8_SETUP							0x00001300 /* setup word for the 8-bit index stream: OFFSET and COUNT fields */
+#define   NV50TCL_VB_ELEMENT_U8_SETUP_OFFSET_SHIFT					30 /* OFFSET field: bits 31:30 */
+#define   NV50TCL_VB_ELEMENT_U8_SETUP_OFFSET_MASK					0xc0000000
+#define   NV50TCL_VB_ELEMENT_U8_SETUP_COUNT_SHIFT					0 /* COUNT field: bits 29:0 */
+#define   NV50TCL_VB_ELEMENT_U8_SETUP_COUNT_MASK					0x3fffffff
+#define  NV50TCL_VB_ELEMENT_U8								0x00001304 /* data word carrying four 8-bit fields I0..I3, one per byte */
+#define   NV50TCL_VB_ELEMENT_U8_I0_SHIFT						0 /* I0: bits 7:0 */
+#define   NV50TCL_VB_ELEMENT_U8_I0_MASK							0x000000ff
+#define   NV50TCL_VB_ELEMENT_U8_I1_SHIFT						8 /* I1: bits 15:8 */
+#define   NV50TCL_VB_ELEMENT_U8_I1_MASK							0x0000ff00
+#define   NV50TCL_VB_ELEMENT_U8_I2_SHIFT						16 /* I2: bits 23:16 */
+#define   NV50TCL_VB_ELEMENT_U8_I2_MASK							0x00ff0000
+#define   NV50TCL_VB_ELEMENT_U8_I3_SHIFT						24 /* I3: bits 31:24 */
+#define   NV50TCL_VB_ELEMENT_U8_I3_MASK							0xff000000
+#define  NV50TCL_DEPTH_TEST_FUNC							0x0000130c /* depth comparison function selector; values below equal the OpenGL GL_NEVER..GL_ALWAYS enums (0x0200..0x0207) */
+#define   NV50TCL_DEPTH_TEST_FUNC_NEVER							0x00000200
+#define   NV50TCL_DEPTH_TEST_FUNC_LESS							0x00000201
+#define   NV50TCL_DEPTH_TEST_FUNC_EQUAL							0x00000202
+#define   NV50TCL_DEPTH_TEST_FUNC_LEQUAL						0x00000203
+#define   NV50TCL_DEPTH_TEST_FUNC_GREATER						0x00000204
+#define   NV50TCL_DEPTH_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NV50TCL_DEPTH_TEST_FUNC_GEQUAL						0x00000206
+#define   NV50TCL_DEPTH_TEST_FUNC_ALWAYS						0x00000207
+#define  NV50TCL_ALPHA_TEST_REF								0x00001310 /* alpha-test reference value word; the comparison selector follows */
+#define  NV50TCL_ALPHA_TEST_FUNC							0x00001314 /* values below equal the OpenGL GL_NEVER..GL_ALWAYS enums (0x0200..0x0207) */
+#define   NV50TCL_ALPHA_TEST_FUNC_NEVER							0x00000200
+#define   NV50TCL_ALPHA_TEST_FUNC_LESS							0x00000201
+#define   NV50TCL_ALPHA_TEST_FUNC_EQUAL							0x00000202
+#define   NV50TCL_ALPHA_TEST_FUNC_LEQUAL						0x00000203
+#define   NV50TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NV50TCL_ALPHA_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NV50TCL_ALPHA_TEST_FUNC_GEQUAL						0x00000206
+#define   NV50TCL_ALPHA_TEST_FUNC_ALWAYS						0x00000207
+#define  NV50TCL_BLEND_COLOR(x)								(0x0000131c+((x)*4)) /* offset of blend-colour element x, one 32-bit word each */
+#define  NV50TCL_BLEND_COLOR__SIZE							0x00000004 /* element count: x = 0..3 (presumably one word per colour channel) */
+#define  NV50TCL_TIC_FLUSH								0x00001330
+#define  NV50TCL_TSC_FLUSH								0x00001334
+#define  NV50TCL_TEX_CACHE_CTL								0x00001338
+#define  NV50TCL_BLEND_EQUATION_RGB							0x00001340 /* RGB blend equation selector; values equal the OpenGL GL_FUNC_ADD/GL_MIN/GL_MAX/GL_FUNC_SUBTRACT/GL_FUNC_REVERSE_SUBTRACT enums */
+#define   NV50TCL_BLEND_EQUATION_RGB_FUNC_ADD						0x00008006
+#define   NV50TCL_BLEND_EQUATION_RGB_MIN						0x00008007
+#define   NV50TCL_BLEND_EQUATION_RGB_MAX						0x00008008
+#define   NV50TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT					0x0000800a /* note: 0x8009 is skipped, as in the OpenGL enum list */
+#define   NV50TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NV50TCL_BLEND_FUNC_SRC_RGB							0x00001344 /* source RGB blend factor selector; values equal the OpenGL blend-factor enums (GL_ZERO, GL_ONE, GL_SRC_COLOR, ...) */
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300 /* 0x0300..0x0308: the GL_SRC_COLOR..GL_SRC_ALPHA_SATURATE range */
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE					0x00000308
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001 /* 0x8001..0x8004: the GL_CONSTANT_COLOR..GL_ONE_MINUS_CONSTANT_ALPHA range */
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_FUNC_DST_RGB							0x00001348 /* destination RGB blend factor selector; values equal the OpenGL blend-factor enums (GL_ZERO, GL_ONE, GL_SRC_COLOR, ...) */
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300 /* 0x0300..0x0308: the GL_SRC_COLOR..GL_SRC_ALPHA_SATURATE range */
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE					0x00000308
+#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001 /* 0x8001..0x8004: the GL_CONSTANT_COLOR..GL_ONE_MINUS_CONSTANT_ALPHA range */
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_EQUATION_ALPHA							0x0000134c /* alpha blend equation selector; values equal the OpenGL GL_FUNC_ADD/GL_MIN/GL_MAX/GL_FUNC_SUBTRACT/GL_FUNC_REVERSE_SUBTRACT enums */
+#define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_ADD						0x00008006
+#define   NV50TCL_BLEND_EQUATION_ALPHA_MIN						0x00008007
+#define   NV50TCL_BLEND_EQUATION_ALPHA_MAX						0x00008008
+#define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT					0x0000800a /* note: 0x8009 is skipped, as in the OpenGL enum list */
+#define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NV50TCL_BLEND_FUNC_SRC_ALPHA							0x00001350 /* source alpha blend factor selector; values equal the OpenGL blend-factor enums (GL_ZERO, GL_ONE, GL_SRC_COLOR, ...) */
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x00000300 /* 0x0300..0x0308: the GL_SRC_COLOR..GL_SRC_ALPHA_SATURATE range */
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x00000302
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x00000304
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x00000306
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x00000308
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x00008001 /* 0x8001..0x8004: the GL_CONSTANT_COLOR..GL_ONE_MINUS_CONSTANT_ALPHA range */
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_FUNC_DST_ALPHA							0x00001358 /* destination alpha blend factor selector (note: 0x1354 is not defined in this header); values equal the OpenGL blend-factor enums */
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x00000300 /* 0x0300..0x0308: the GL_SRC_COLOR..GL_SRC_ALPHA_SATURATE range */
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x00000302
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x00000304
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x00000306
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x00000308
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x00008001 /* 0x8001..0x8004: the GL_CONSTANT_COLOR..GL_ONE_MINUS_CONSTANT_ALPHA range */
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_ENABLE(x)							(0x00001360+((x)*4)) /* offset of the blend-enable word for index x, one 32-bit word each */
+#define  NV50TCL_BLEND_ENABLE__SIZE							0x00000008 /* element count: x = 0..7 (0x1360..0x137f; presumably one per render target) */
+#define  NV50TCL_STENCIL_FRONT_ENABLE							0x00001380 /* first word of the front-face stencil group (0x1380..0x139c) */
+#define  NV50TCL_STENCIL_FRONT_OP_FAIL							0x00001384 /* stencil-fail operation; values equal the OpenGL stencil-op enums */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000 /* GL_ZERO */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a /* GL_INVERT */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00 /* GL_KEEP */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01 /* GL_REPLACE */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02 /* GL_INCR */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03 /* GL_DECR */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507 /* GL_INCR_WRAP */
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508 /* GL_DECR_WRAP */
+#define  NV50TCL_STENCIL_FRONT_OP_ZFAIL							0x00001388 /* front-face depth-fail stencil operation; values equal the OpenGL stencil-op enums */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000 /* GL_ZERO */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a /* GL_INVERT */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00 /* GL_KEEP */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01 /* GL_REPLACE */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02 /* GL_INCR */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03 /* GL_DECR */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507 /* GL_INCR_WRAP */
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508 /* GL_DECR_WRAP */
+#define  NV50TCL_STENCIL_FRONT_OP_ZPASS							0x0000138c /* front-face depth-pass stencil operation; values equal the OpenGL stencil-op enums */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000 /* GL_ZERO */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a /* GL_INVERT */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00 /* GL_KEEP */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01 /* GL_REPLACE */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02 /* GL_INCR */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03 /* GL_DECR */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507 /* GL_INCR_WRAP */
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508 /* GL_DECR_WRAP */
+#define  NV50TCL_STENCIL_FRONT_FUNC_FUNC						0x00001390 /* front-face stencil comparison selector; values equal the OpenGL GL_NEVER..GL_ALWAYS enums (0x0200..0x0207) */
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NV50TCL_STENCIL_FRONT_FUNC_REF							0x00001394 /* presumably the reference value used by the comparison */
+#define  NV50TCL_STENCIL_FRONT_MASK							0x00001398 /* NOTE(review): MASK vs FUNC_MASK presumably are the write mask and the compare mask; confirm against hardware docs */
+#define  NV50TCL_STENCIL_FRONT_FUNC_MASK						0x0000139c
+#define  NV50TCL_FRAG_COLOR_CLAMP_EN							0x000013a8 /* run of independent method offsets (0x13a8..0x13df) */
+#define  NV50TCL_Y_ORIGIN_BOTTOM							0x000013ac
+#define  NV50TCL_LINE_WIDTH								0x000013b0
+#define  NV50TCL_TEX_LIMITS(x)								(0x000013b4+((x)*4)) /* offset of TEX_LIMITS element x, one 32-bit word each */
+#define  NV50TCL_TEX_LIMITS__SIZE							0x00000003 /* element count: x = 0..2 (presumably one per shader stage) */
+#define   NV50TCL_TEX_LIMITS_SAMPLERS_LOG2_SHIFT					0 /* SAMPLERS_LOG2 field: bits 3:0 */
+#define   NV50TCL_TEX_LIMITS_SAMPLERS_LOG2_MASK						0x0000000f
+#define   NV50TCL_TEX_LIMITS_TEXTURES_LOG2_SHIFT					4 /* TEXTURES_LOG2 field: bits 7:4 */
+#define   NV50TCL_TEX_LIMITS_TEXTURES_LOG2_MASK						0x000000f0
+#define  NV50TCL_POINT_COORD_REPLACE_MAP(x)						(0x000013c0+((x)*4)) /* offset of POINT_COORD_REPLACE_MAP element x, one 32-bit word each */
+#define  NV50TCL_POINT_COORD_REPLACE_MAP__SIZE						0x00000008 /* element count: x = 0..7 (0x13c0..0x13df) */
+#define  NV50TCL_VP_START_ID								0x0000140c /* VP/GP/FP *_START_ID occupy three consecutive words (presumably vertex/geometry/fragment program) */
+#define  NV50TCL_GP_START_ID								0x00001410
+#define  NV50TCL_FP_START_ID								0x00001414
+#define  NV50TCL_GP_VERTEX_OUTPUT_COUNT							0x00001420
+#define  NV50TCL_VB_ELEMENT_BASE							0x00001434
+#define  NV50TCL_CODE_CB_FLUSH								0x00001440
+#define  NV50TCL_BIND_TSC(x)								(0x00001444+((x)*8)) /* offset of BIND_TSC for index x; 8-byte stride because BIND_TIC(x) sits in the following word */
+#define  NV50TCL_BIND_TSC__SIZE								0x00000003 /* element count: x = 0..2 */
+#define   NV50TCL_BIND_TSC_VALID							(1 <<  0) /* single-bit flag, bit 0 */
+#define   NV50TCL_BIND_TSC_SAMPLER_SHIFT						4 /* SAMPLER field: bits 7:4 */
+#define   NV50TCL_BIND_TSC_SAMPLER_MASK							0x000000f0
+#define   NV50TCL_BIND_TSC_TSC_SHIFT							12 /* TSC field: bits 20:12 (9 bits) */
+#define   NV50TCL_BIND_TSC_TSC_MASK							0x001ff000
+#define  NV50TCL_BIND_TIC(x)								(0x00001448+((x)*8)) /* offset of BIND_TIC for index x, 4 bytes after BIND_TSC(x) */
+#define  NV50TCL_BIND_TIC__SIZE								0x00000003 /* element count: x = 0..2 */
+#define   NV50TCL_BIND_TIC_VALID							(1 <<  0) /* single-bit flag, bit 0 */
+#define   NV50TCL_BIND_TIC_TEXTURE_SHIFT						1 /* TEXTURE field: bits 8:1 (8 bits) */
+#define   NV50TCL_BIND_TIC_TEXTURE_MASK							0x000001fe
+#define   NV50TCL_BIND_TIC_TIC_SHIFT							9 /* TIC field: bits 30:9 (22 bits) */
+#define   NV50TCL_BIND_TIC_TIC_MASK							0x7ffffe00
+#define  NV50TCL_STRMOUT_MAP(x)								(0x00001480+((x)*4)) /* offset of STRMOUT_MAP element x, one 32-bit word each */
+#define  NV50TCL_STRMOUT_MAP__SIZE							0x00000020 /* element count: x = 0..31 (0x1480..0x14ff) */
+#define  NV50TCL_VP_CLIP_DISTANCE_ENABLE						0x00001510 /* bitmask word: one enable bit per clip distance 0..7 */
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_0						(1 <<  0)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_1						(1 <<  1)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_2						(1 <<  2)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_3						(1 <<  3)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_4						(1 <<  4)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_5						(1 <<  5)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_6						(1 <<  6)
+#define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_7						(1 <<  7)
+#define  NV50TCL_SAMPLECNT_ENABLE							0x00001514 /* run of independent method offsets (0x1514..0x154f) */
+#define  NV50TCL_POINT_SIZE								0x00001518
+#define  NV50TCL_POINT_SPRITE_ENABLE							0x00001520
+#define  NV50TCL_SAMPLECNT_RESET							0x00001530
+#define  NV50TCL_ZETA_ENABLE								0x00001538
+#define  NV50TCL_MULTISAMPLE_CTRL							0x0000153c /* two single-bit flags follow */
+#define   NV50TCL_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE					(1 <<  0) /* bit 0 */
+#define   NV50TCL_MULTISAMPLE_CTRL_ALPHA_TO_ONE						(1 <<  4) /* bit 4 */
+#define  NV50TCL_NOPERSPECTIVE_BITMAP(x)						(0x00001540+((x)*4)) /* offset of NOPERSPECTIVE_BITMAP word x, one 32-bit word each */
+#define  NV50TCL_NOPERSPECTIVE_BITMAP__SIZE						0x00000004 /* element count: x = 0..3 (0x1540..0x154f) */
+#define  NV50TCL_COND_ADDRESS_HIGH							0x00001550 /* COND group: address split into HIGH/LOW words, followed by the mode selector */
+#define  NV50TCL_COND_ADDRESS_LOW							0x00001554
+#define  NV50TCL_COND_MODE								0x00001558 /* mode selector; plain enumeration 0..4 */
+#define   NV50TCL_COND_MODE_NEVER							0x00000000
+#define   NV50TCL_COND_MODE_ALWAYS							0x00000001
+#define   NV50TCL_COND_MODE_RES								0x00000002
+#define   NV50TCL_COND_MODE_NOT_RES_AND_NOT_ID						0x00000003
+#define   NV50TCL_COND_MODE_RES_OR_ID							0x00000004
+#define  NV50TCL_TSC_ADDRESS_HIGH							0x0000155c /* TSC table: address HIGH/LOW words followed by LIMIT */
+#define  NV50TCL_TSC_ADDRESS_LOW							0x00001560
+#define  NV50TCL_TSC_LIMIT								0x00001564
+#define  NV50TCL_POLYGON_OFFSET_FACTOR							0x0000156c
+#define  NV50TCL_LINE_SMOOTH_ENABLE							0x00001570
+#define  NV50TCL_TIC_ADDRESS_HIGH							0x00001574 /* TIC table: address HIGH/LOW words followed by LIMIT */
+#define  NV50TCL_TIC_ADDRESS_LOW							0x00001578
+#define  NV50TCL_TIC_LIMIT								0x0000157c
+#define  NV50TCL_PM_CONTROL(x)								(0x00001580+((x)*4)) /* offset of PM_CONTROL element x, one 32-bit word each */
+#define  NV50TCL_PM_CONTROL__SIZE							0x00000004 /* element count: x = 0..3 (0x1580..0x158f) */
+#define   NV50TCL_PM_CONTROL_UNK0							(1 <<  0) /* bit 0; meaning not known to the header author (named UNK) */
+#define   NV50TCL_PM_CONTROL_UNK1_SHIFT							4 /* UNK1 field: bits 6:4 (3 bits) */
+#define   NV50TCL_PM_CONTROL_UNK1_MASK							0x00000070
+#define   NV50TCL_PM_CONTROL_UNK2_SHIFT							8 /* UNK2 field: bits 31:8 */
+#define   NV50TCL_PM_CONTROL_UNK2_MASK							0xffffff00
+#define  NV50TCL_STENCIL_BACK_ENABLE							0x00001594 /* first word of the back-face stencil group (0x1594..0x15a4) */
+#define  NV50TCL_STENCIL_BACK_OP_FAIL							0x00001598 /* stencil-fail operation; values equal the OpenGL stencil-op enums */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000 /* GL_ZERO */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a /* GL_INVERT */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00 /* GL_KEEP */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01 /* GL_REPLACE */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02 /* GL_INCR */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03 /* GL_DECR */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507 /* GL_INCR_WRAP */
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508 /* GL_DECR_WRAP */
+#define  NV50TCL_STENCIL_BACK_OP_ZFAIL							0x0000159c /* back-face depth-fail stencil operation; values equal the OpenGL stencil-op enums */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000 /* GL_ZERO */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a /* GL_INVERT */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00 /* GL_KEEP */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01 /* GL_REPLACE */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02 /* GL_INCR */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03 /* GL_DECR */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507 /* GL_INCR_WRAP */
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508 /* GL_DECR_WRAP */
+#define  NV50TCL_STENCIL_BACK_OP_ZPASS							0x000015a0 /* back-face depth-pass stencil operation; values equal the OpenGL stencil-op enums */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000 /* GL_ZERO */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a /* GL_INVERT */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00 /* GL_KEEP */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01 /* GL_REPLACE */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02 /* GL_INCR */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03 /* GL_DECR */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507 /* GL_INCR_WRAP */
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508 /* GL_DECR_WRAP */
+#define  NV50TCL_STENCIL_BACK_FUNC_FUNC							0x000015a4 /* back-face stencil comparison selector; values equal the OpenGL GL_NEVER..GL_ALWAYS enums (0x0200..0x0207) */
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV50TCL_FRAMEBUFFER_SRGB							0x000015b8 /* run of independent method offsets (0x15b8..0x15d0) */
+#define  NV50TCL_POLYGON_OFFSET_UNITS							0x000015bc
+#define  NV50TCL_GP_BUILTIN_RESULT_EN							0x000015cc /* two single-bit flags follow */
+#define   NV50TCL_GP_BUILTIN_RESULT_EN_VPORT_IDX					(1 <<  0) /* bit 0 */
+#define   NV50TCL_GP_BUILTIN_RESULT_EN_LAYER_IDX					(1 << 16) /* bit 16 */
+#define  NV50TCL_MULTISAMPLE_SAMPLES_LOG2						0x000015d0
+#define  NV50TCL_VERTEX_BEGIN								0x000015dc /* primitive-type selector; values equal the OpenGL primitive enums (GL_POINTS = 0 .. GL_TRIANGLE_STRIP_ADJACENCY = 0xd) */
+#define   NV50TCL_VERTEX_BEGIN_POINTS							0x00000000
+#define   NV50TCL_VERTEX_BEGIN_LINES							0x00000001
+#define   NV50TCL_VERTEX_BEGIN_LINE_LOOP						0x00000002
+#define   NV50TCL_VERTEX_BEGIN_LINE_STRIP						0x00000003
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLES						0x00000004
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP						0x00000005
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN						0x00000006
+#define   NV50TCL_VERTEX_BEGIN_QUADS							0x00000007
+#define   NV50TCL_VERTEX_BEGIN_QUAD_STRIP						0x00000008
+#define   NV50TCL_VERTEX_BEGIN_POLYGON							0x00000009
+#define   NV50TCL_VERTEX_BEGIN_LINES_ADJACENCY						0x0000000a
+#define   NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY					0x0000000b
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY					0x0000000c
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY					0x0000000d
+#define  NV50TCL_VERTEX_END								0x000015e0 /* counterpart of VERTEX_BEGIN, in the next word */
+#define  NV50TCL_EDGEFLAG_ENABLE							0x000015e4
+#define  NV50TCL_VB_ELEMENT_U32								0x000015e8 /* presumably one 32-bit index per word (no sub-fields defined) */
+#define  NV50TCL_VB_ELEMENT_U16_SETUP							0x000015ec /* setup word for the 16-bit index stream: OFFSET and COUNT fields */
+#define   NV50TCL_VB_ELEMENT_U16_SETUP_OFFSET_SHIFT					30 /* OFFSET field: bits 31:30 */
+#define   NV50TCL_VB_ELEMENT_U16_SETUP_OFFSET_MASK					0xc0000000
+#define   NV50TCL_VB_ELEMENT_U16_SETUP_COUNT_SHIFT					0 /* COUNT field: bits 29:0 */
+#define   NV50TCL_VB_ELEMENT_U16_SETUP_COUNT_MASK					0x3fffffff
+#define  NV50TCL_VB_ELEMENT_U16								0x000015f0 /* data word carrying two 16-bit fields I0 and I1 */
+#define   NV50TCL_VB_ELEMENT_U16_I0_SHIFT						0 /* I0: bits 15:0 */
+#define   NV50TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NV50TCL_VB_ELEMENT_U16_I1_SHIFT						16 /* I1: bits 31:16 */
+#define   NV50TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV50TCL_VERTEX_DATA								0x00001640 /* run of consecutive method offsets (0x1640..0x164c) */
+#define  NV50TCL_PRIM_RESTART_ENABLE							0x00001644
+#define  NV50TCL_PRIM_RESTART_INDEX							0x00001648
+#define  NV50TCL_VP_GP_BUILTIN_ATTR_EN							0x0000164c /* flag word: one flag every fourth bit (bits 0, 4, 8, 12) */
+#define   NV50TCL_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID					(1 <<  0)
+#define   NV50TCL_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID					(1 <<  4)
+#define   NV50TCL_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID					(1 <<  8)
+#define   NV50TCL_VP_GP_BUILTIN_ATTR_EN_UNK12						(1 << 12) /* meaning not known to the header author (named after its bit position) */
+#define  NV50TCL_VP_ATTR_EN_0								0x00001650 /* word packing eight 4-bit fields (_0 .. _7), one per nibble; presumably per-attribute component enables */
+#define   NV50TCL_VP_ATTR_EN_0_7_SHIFT							28 /* field 7: bits 31:28 */
+#define   NV50TCL_VP_ATTR_EN_0_7_MASK							0xf0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_7_XNNN							0x10000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYNN							0x20000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYNN							0x30000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NNZN							0x40000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNZN							0x50000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYZN							0x60000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYZN							0x70000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NNNW							0x80000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNNW							0x90000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYNW							0xa0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYNW							0xb0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NNZW							0xc0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNZW							0xd0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYZW							0xe0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYZW							0xf0000000
+#define   NV50TCL_VP_ATTR_EN_0_6_SHIFT							24 /* VP_ATTR_EN_0 field 6: 4-bit mask in bits 27:24 */
+#define   NV50TCL_VP_ATTR_EN_0_6_MASK							0x0f000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_6_XNNN							0x01000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYNN							0x02000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYNN							0x03000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NNZN							0x04000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNZN							0x05000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYZN							0x06000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYZN							0x07000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NNNW							0x08000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNNW							0x09000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYNW							0x0a000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYNW							0x0b000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NNZW							0x0c000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNZW							0x0d000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYZW							0x0e000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYZW							0x0f000000
+#define   NV50TCL_VP_ATTR_EN_0_5_SHIFT							20 /* VP_ATTR_EN_0 field 5: 4-bit mask in bits 23:20 */
+#define   NV50TCL_VP_ATTR_EN_0_5_MASK							0x00f00000
+#define    NV50TCL_VP_ATTR_EN_0_5_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_5_XNNN							0x00100000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYNN							0x00200000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYNN							0x00300000
+#define    NV50TCL_VP_ATTR_EN_0_5_NNZN							0x00400000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNZN							0x00500000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYZN							0x00600000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYZN							0x00700000
+#define    NV50TCL_VP_ATTR_EN_0_5_NNNW							0x00800000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNNW							0x00900000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYNW							0x00a00000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYNW							0x00b00000
+#define    NV50TCL_VP_ATTR_EN_0_5_NNZW							0x00c00000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNZW							0x00d00000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYZW							0x00e00000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYZW							0x00f00000
+#define   NV50TCL_VP_ATTR_EN_0_4_SHIFT							16 /* VP_ATTR_EN_0 field 4: 4-bit mask in bits 19:16 */
+#define   NV50TCL_VP_ATTR_EN_0_4_MASK							0x000f0000
+#define    NV50TCL_VP_ATTR_EN_0_4_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_4_XNNN							0x00010000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYNN							0x00020000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYNN							0x00030000
+#define    NV50TCL_VP_ATTR_EN_0_4_NNZN							0x00040000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNZN							0x00050000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYZN							0x00060000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYZN							0x00070000
+#define    NV50TCL_VP_ATTR_EN_0_4_NNNW							0x00080000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNNW							0x00090000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYNW							0x000a0000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYNW							0x000b0000
+#define    NV50TCL_VP_ATTR_EN_0_4_NNZW							0x000c0000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNZW							0x000d0000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYZW							0x000e0000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYZW							0x000f0000
+#define   NV50TCL_VP_ATTR_EN_0_3_SHIFT							12 /* VP_ATTR_EN_0 field 3: 4-bit mask in bits 15:12 */
+#define   NV50TCL_VP_ATTR_EN_0_3_MASK							0x0000f000
+#define    NV50TCL_VP_ATTR_EN_0_3_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_3_XNNN							0x00001000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYNN							0x00002000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYNN							0x00003000
+#define    NV50TCL_VP_ATTR_EN_0_3_NNZN							0x00004000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNZN							0x00005000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYZN							0x00006000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYZN							0x00007000
+#define    NV50TCL_VP_ATTR_EN_0_3_NNNW							0x00008000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNNW							0x00009000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYNW							0x0000a000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYNW							0x0000b000
+#define    NV50TCL_VP_ATTR_EN_0_3_NNZW							0x0000c000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNZW							0x0000d000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYZW							0x0000e000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYZW							0x0000f000
+#define   NV50TCL_VP_ATTR_EN_0_2_SHIFT							8 /* VP_ATTR_EN_0 field 2: 4-bit mask in bits 11:8 */
+#define   NV50TCL_VP_ATTR_EN_0_2_MASK							0x00000f00
+#define    NV50TCL_VP_ATTR_EN_0_2_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_2_XNNN							0x00000100
+#define    NV50TCL_VP_ATTR_EN_0_2_NYNN							0x00000200
+#define    NV50TCL_VP_ATTR_EN_0_2_XYNN							0x00000300
+#define    NV50TCL_VP_ATTR_EN_0_2_NNZN							0x00000400
+#define    NV50TCL_VP_ATTR_EN_0_2_XNZN							0x00000500
+#define    NV50TCL_VP_ATTR_EN_0_2_NYZN							0x00000600
+#define    NV50TCL_VP_ATTR_EN_0_2_XYZN							0x00000700
+#define    NV50TCL_VP_ATTR_EN_0_2_NNNW							0x00000800
+#define    NV50TCL_VP_ATTR_EN_0_2_XNNW							0x00000900
+#define    NV50TCL_VP_ATTR_EN_0_2_NYNW							0x00000a00
+#define    NV50TCL_VP_ATTR_EN_0_2_XYNW							0x00000b00
+#define    NV50TCL_VP_ATTR_EN_0_2_NNZW							0x00000c00
+#define    NV50TCL_VP_ATTR_EN_0_2_XNZW							0x00000d00
+#define    NV50TCL_VP_ATTR_EN_0_2_NYZW							0x00000e00
+#define    NV50TCL_VP_ATTR_EN_0_2_XYZW							0x00000f00
+#define   NV50TCL_VP_ATTR_EN_0_1_SHIFT							4 /* VP_ATTR_EN_0 field 1: 4-bit mask in bits 7:4 */
+#define   NV50TCL_VP_ATTR_EN_0_1_MASK							0x000000f0
+#define    NV50TCL_VP_ATTR_EN_0_1_NONE							0x00000000 /* pre-shifted values; name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_1_XNNN							0x00000010
+#define    NV50TCL_VP_ATTR_EN_0_1_NYNN							0x00000020
+#define    NV50TCL_VP_ATTR_EN_0_1_XYNN							0x00000030
+#define    NV50TCL_VP_ATTR_EN_0_1_NNZN							0x00000040
+#define    NV50TCL_VP_ATTR_EN_0_1_XNZN							0x00000050
+#define    NV50TCL_VP_ATTR_EN_0_1_NYZN							0x00000060
+#define    NV50TCL_VP_ATTR_EN_0_1_XYZN							0x00000070
+#define    NV50TCL_VP_ATTR_EN_0_1_NNNW							0x00000080
+#define    NV50TCL_VP_ATTR_EN_0_1_XNNW							0x00000090
+#define    NV50TCL_VP_ATTR_EN_0_1_NYNW							0x000000a0
+#define    NV50TCL_VP_ATTR_EN_0_1_XYNW							0x000000b0
+#define    NV50TCL_VP_ATTR_EN_0_1_NNZW							0x000000c0
+#define    NV50TCL_VP_ATTR_EN_0_1_XNZW							0x000000d0
+#define    NV50TCL_VP_ATTR_EN_0_1_NYZW							0x000000e0
+#define    NV50TCL_VP_ATTR_EN_0_1_XYZW							0x000000f0
+#define   NV50TCL_VP_ATTR_EN_0_0_SHIFT							0 /* VP_ATTR_EN_0 field 0: 4-bit mask in bits 3:0 */
+#define   NV50TCL_VP_ATTR_EN_0_0_MASK							0x0000000f
+#define    NV50TCL_VP_ATTR_EN_0_0_NONE							0x00000000 /* name letters X/Y/Z/W mark set bits 0/1/2/3 of the nibble, N marks a clear bit */
+#define    NV50TCL_VP_ATTR_EN_0_0_XNNN							0x00000001
+#define    NV50TCL_VP_ATTR_EN_0_0_NYNN							0x00000002
+#define    NV50TCL_VP_ATTR_EN_0_0_XYNN							0x00000003
+#define    NV50TCL_VP_ATTR_EN_0_0_NNZN							0x00000004
+#define    NV50TCL_VP_ATTR_EN_0_0_XNZN							0x00000005
+#define    NV50TCL_VP_ATTR_EN_0_0_NYZN							0x00000006
+#define    NV50TCL_VP_ATTR_EN_0_0_XYZN							0x00000007
+#define    NV50TCL_VP_ATTR_EN_0_0_NNNW							0x00000008
+#define    NV50TCL_VP_ATTR_EN_0_0_XNNW							0x00000009
+#define    NV50TCL_VP_ATTR_EN_0_0_NYNW							0x0000000a
+#define    NV50TCL_VP_ATTR_EN_0_0_XYNW							0x0000000b
+#define    NV50TCL_VP_ATTR_EN_0_0_NNZW							0x0000000c
+#define    NV50TCL_VP_ATTR_EN_0_0_XNZW							0x0000000d
+#define    NV50TCL_VP_ATTR_EN_0_0_NYZW							0x0000000e
+#define    NV50TCL_VP_ATTR_EN_0_0_XYZW							0x0000000f
+#define  NV50TCL_VP_ATTR_EN_1								0x00001654
+#define   NV50TCL_VP_ATTR_EN_1_15_SHIFT							28
+#define   NV50TCL_VP_ATTR_EN_1_15_MASK							0xf0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNNN							0x10000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYNN							0x20000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYNN							0x30000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NNZN							0x40000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNZN							0x50000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYZN							0x60000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYZN							0x70000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NNNW							0x80000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNNW							0x90000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYNW							0xa0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYNW							0xb0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NNZW							0xc0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNZW							0xd0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYZW							0xe0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYZW							0xf0000000
+#define   NV50TCL_VP_ATTR_EN_1_14_SHIFT							24
+#define   NV50TCL_VP_ATTR_EN_1_14_MASK							0x0f000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNNN							0x01000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYNN							0x02000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYNN							0x03000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NNZN							0x04000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNZN							0x05000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYZN							0x06000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYZN							0x07000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NNNW							0x08000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNNW							0x09000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYNW							0x0a000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYNW							0x0b000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NNZW							0x0c000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNZW							0x0d000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYZW							0x0e000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYZW							0x0f000000
+#define   NV50TCL_VP_ATTR_EN_1_13_SHIFT							20
+#define   NV50TCL_VP_ATTR_EN_1_13_MASK							0x00f00000
+#define    NV50TCL_VP_ATTR_EN_1_13_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNNN							0x00100000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYNN							0x00200000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYNN							0x00300000
+#define    NV50TCL_VP_ATTR_EN_1_13_NNZN							0x00400000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNZN							0x00500000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYZN							0x00600000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYZN							0x00700000
+#define    NV50TCL_VP_ATTR_EN_1_13_NNNW							0x00800000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNNW							0x00900000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYNW							0x00a00000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYNW							0x00b00000
+#define    NV50TCL_VP_ATTR_EN_1_13_NNZW							0x00c00000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNZW							0x00d00000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYZW							0x00e00000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYZW							0x00f00000
+#define   NV50TCL_VP_ATTR_EN_1_12_SHIFT							16
+#define   NV50TCL_VP_ATTR_EN_1_12_MASK							0x000f0000
+#define    NV50TCL_VP_ATTR_EN_1_12_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNNN							0x00010000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYNN							0x00020000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYNN							0x00030000
+#define    NV50TCL_VP_ATTR_EN_1_12_NNZN							0x00040000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNZN							0x00050000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYZN							0x00060000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYZN							0x00070000
+#define    NV50TCL_VP_ATTR_EN_1_12_NNNW							0x00080000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNNW							0x00090000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYNW							0x000a0000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYNW							0x000b0000
+#define    NV50TCL_VP_ATTR_EN_1_12_NNZW							0x000c0000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNZW							0x000d0000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYZW							0x000e0000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYZW							0x000f0000
+#define   NV50TCL_VP_ATTR_EN_1_11_SHIFT							12
+#define   NV50TCL_VP_ATTR_EN_1_11_MASK							0x0000f000
+#define    NV50TCL_VP_ATTR_EN_1_11_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNNN							0x00001000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYNN							0x00002000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYNN							0x00003000
+#define    NV50TCL_VP_ATTR_EN_1_11_NNZN							0x00004000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNZN							0x00005000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYZN							0x00006000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYZN							0x00007000
+#define    NV50TCL_VP_ATTR_EN_1_11_NNNW							0x00008000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNNW							0x00009000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYNW							0x0000a000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYNW							0x0000b000
+#define    NV50TCL_VP_ATTR_EN_1_11_NNZW							0x0000c000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNZW							0x0000d000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYZW							0x0000e000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYZW							0x0000f000
+#define   NV50TCL_VP_ATTR_EN_1_10_SHIFT							8
+#define   NV50TCL_VP_ATTR_EN_1_10_MASK							0x00000f00
+#define    NV50TCL_VP_ATTR_EN_1_10_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_10_XNNN							0x00000100
+#define    NV50TCL_VP_ATTR_EN_1_10_NYNN							0x00000200
+#define    NV50TCL_VP_ATTR_EN_1_10_XYNN							0x00000300
+#define    NV50TCL_VP_ATTR_EN_1_10_NNZN							0x00000400
+#define    NV50TCL_VP_ATTR_EN_1_10_XNZN							0x00000500
+#define    NV50TCL_VP_ATTR_EN_1_10_NYZN							0x00000600
+#define    NV50TCL_VP_ATTR_EN_1_10_XYZN							0x00000700
+#define    NV50TCL_VP_ATTR_EN_1_10_NNNW							0x00000800
+#define    NV50TCL_VP_ATTR_EN_1_10_XNNW							0x00000900
+#define    NV50TCL_VP_ATTR_EN_1_10_NYNW							0x00000a00
+#define    NV50TCL_VP_ATTR_EN_1_10_XYNW							0x00000b00
+#define    NV50TCL_VP_ATTR_EN_1_10_NNZW							0x00000c00
+#define    NV50TCL_VP_ATTR_EN_1_10_XNZW							0x00000d00
+#define    NV50TCL_VP_ATTR_EN_1_10_NYZW							0x00000e00
+#define    NV50TCL_VP_ATTR_EN_1_10_XYZW							0x00000f00
+#define   NV50TCL_VP_ATTR_EN_1_9_SHIFT							4
+#define   NV50TCL_VP_ATTR_EN_1_9_MASK							0x000000f0
+#define    NV50TCL_VP_ATTR_EN_1_9_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_9_XNNN							0x00000010
+#define    NV50TCL_VP_ATTR_EN_1_9_NYNN							0x00000020
+#define    NV50TCL_VP_ATTR_EN_1_9_XYNN							0x00000030
+#define    NV50TCL_VP_ATTR_EN_1_9_NNZN							0x00000040
+#define    NV50TCL_VP_ATTR_EN_1_9_XNZN							0x00000050
+#define    NV50TCL_VP_ATTR_EN_1_9_NYZN							0x00000060
+#define    NV50TCL_VP_ATTR_EN_1_9_XYZN							0x00000070
+#define    NV50TCL_VP_ATTR_EN_1_9_NNNW							0x00000080
+#define    NV50TCL_VP_ATTR_EN_1_9_XNNW							0x00000090
+#define    NV50TCL_VP_ATTR_EN_1_9_NYNW							0x000000a0
+#define    NV50TCL_VP_ATTR_EN_1_9_XYNW							0x000000b0
+#define    NV50TCL_VP_ATTR_EN_1_9_NNZW							0x000000c0
+#define    NV50TCL_VP_ATTR_EN_1_9_XNZW							0x000000d0
+#define    NV50TCL_VP_ATTR_EN_1_9_NYZW							0x000000e0
+#define    NV50TCL_VP_ATTR_EN_1_9_XYZW							0x000000f0
+#define   NV50TCL_VP_ATTR_EN_1_8_SHIFT							0
+#define   NV50TCL_VP_ATTR_EN_1_8_MASK							0x0000000f
+#define    NV50TCL_VP_ATTR_EN_1_8_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_8_XNNN							0x00000001
+#define    NV50TCL_VP_ATTR_EN_1_8_NYNN							0x00000002
+#define    NV50TCL_VP_ATTR_EN_1_8_XYNN							0x00000003
+#define    NV50TCL_VP_ATTR_EN_1_8_NNZN							0x00000004
+#define    NV50TCL_VP_ATTR_EN_1_8_XNZN							0x00000005
+#define    NV50TCL_VP_ATTR_EN_1_8_NYZN							0x00000006
+#define    NV50TCL_VP_ATTR_EN_1_8_XYZN							0x00000007
+#define    NV50TCL_VP_ATTR_EN_1_8_NNNW							0x00000008
+#define    NV50TCL_VP_ATTR_EN_1_8_XNNW							0x00000009
+#define    NV50TCL_VP_ATTR_EN_1_8_NYNW							0x0000000a
+#define    NV50TCL_VP_ATTR_EN_1_8_XYNW							0x0000000b
+#define    NV50TCL_VP_ATTR_EN_1_8_NNZW							0x0000000c
+#define    NV50TCL_VP_ATTR_EN_1_8_XNZW							0x0000000d
+#define    NV50TCL_VP_ATTR_EN_1_8_NYZW							0x0000000e
+#define    NV50TCL_VP_ATTR_EN_1_8_XYZW							0x0000000f
+#define  NV50TCL_POINT_SPRITE_CTRL							0x00001660
+#define  NV50TCL_LINE_STIPPLE_ENABLE							0x0000166c
+#define  NV50TCL_LINE_STIPPLE_PATTERN							0x00001680
+#define  NV50TCL_PROVOKING_VERTEX_LAST							0x00001684
+#define  NV50TCL_VERTEX_TWO_SIDE_ENABLE							0x00001688
+#define  NV50TCL_POLYGON_STIPPLE_ENABLE							0x0000168c
+#define  NV50TCL_SET_PROGRAM_CB								0x00001694
+#define   NV50TCL_SET_PROGRAM_CB_PROGRAM_SHIFT						4
+#define   NV50TCL_SET_PROGRAM_CB_PROGRAM_MASK						0x000000f0
+#define    NV50TCL_SET_PROGRAM_CB_PROGRAM_VERTEX					0x00000000
+#define    NV50TCL_SET_PROGRAM_CB_PROGRAM_GEOMETRY					0x00000020
+#define    NV50TCL_SET_PROGRAM_CB_PROGRAM_FRAGMENT					0x00000030
+#define   NV50TCL_SET_PROGRAM_CB_INDEX_SHIFT						8
+#define   NV50TCL_SET_PROGRAM_CB_INDEX_MASK						0x00000f00
+#define   NV50TCL_SET_PROGRAM_CB_BUFFER_SHIFT						12
+#define   NV50TCL_SET_PROGRAM_CB_BUFFER_MASK						0x0007f000
+#define   NV50TCL_SET_PROGRAM_CB_VALID							(1 <<  0)
+#define  NV50TCL_VP_RESULT_MAP_SIZE							0x000016ac
+#define  NV50TCL_VP_REG_ALLOC_TEMP							0x000016b0
+#define  NV50TCL_VP_REG_ALLOC_RESULT							0x000016b8
+#define  NV50TCL_VP_RESULT_MAP(x)							(0x000016bc+((x)*4))
+#define  NV50TCL_VP_RESULT_MAP__SIZE							0x00000010
+#define   NV50TCL_VP_RESULT_MAP_0_SHIFT							0
+#define   NV50TCL_VP_RESULT_MAP_0_MASK							0x000000ff
+#define   NV50TCL_VP_RESULT_MAP_1_SHIFT							8
+#define   NV50TCL_VP_RESULT_MAP_1_MASK							0x0000ff00
+#define   NV50TCL_VP_RESULT_MAP_2_SHIFT							16
+#define   NV50TCL_VP_RESULT_MAP_2_MASK							0x00ff0000
+#define   NV50TCL_VP_RESULT_MAP_3_SHIFT							24
+#define   NV50TCL_VP_RESULT_MAP_3_MASK							0xff000000
+#define  NV50TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001700+((x)*4))
+#define  NV50TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV50TCL_GP_ENABLE								0x00001798
+#define  NV50TCL_GP_REG_ALLOC_TEMP							0x000017a0
+#define  NV50TCL_GP_REG_ALLOC_RESULT							0x000017a8
+#define  NV50TCL_GP_RESULT_MAP_SIZE							0x000017ac
+#define  NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE						0x000017b0
+#define   NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS					0x00000001
+#define   NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP					0x00000002
+#define   NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP				0x00000003
+#define  NV50TCL_RASTERIZE_ENABLE							0x000017b4
+#define  NV50TCL_STRMOUT_ENABLE								0x000017b8
+#define  NV50TCL_GP_RESULT_MAP(x)							(0x000017fc+((x)*4))
+#define  NV50TCL_GP_RESULT_MAP__SIZE							0x00000020
+#define   NV50TCL_GP_RESULT_MAP_0_SHIFT							0
+#define   NV50TCL_GP_RESULT_MAP_0_MASK							0x000000ff
+#define   NV50TCL_GP_RESULT_MAP_1_SHIFT							8
+#define   NV50TCL_GP_RESULT_MAP_1_MASK							0x0000ff00
+#define   NV50TCL_GP_RESULT_MAP_2_SHIFT							16
+#define   NV50TCL_GP_RESULT_MAP_2_MASK							0x00ff0000
+#define   NV50TCL_GP_RESULT_MAP_3_SHIFT							24
+#define   NV50TCL_GP_RESULT_MAP_3_MASK							0xff000000
+#define  NV50TCL_MAP_SEMANTIC_0								0x00001904
+#define   NV50TCL_MAP_SEMANTIC_0_FFC0_ID_SHIFT						0
+#define   NV50TCL_MAP_SEMANTIC_0_FFC0_ID_MASK						0x000000ff
+#define   NV50TCL_MAP_SEMANTIC_0_BFC0_ID_SHIFT						8
+#define   NV50TCL_MAP_SEMANTIC_0_BFC0_ID_MASK						0x0000ff00
+#define   NV50TCL_MAP_SEMANTIC_0_COLR_NR_SHIFT						16
+#define   NV50TCL_MAP_SEMANTIC_0_COLR_NR_MASK						0x00ff0000
+#define   NV50TCL_MAP_SEMANTIC_0_CLMP_EN_SHIFT						24
+#define   NV50TCL_MAP_SEMANTIC_0_CLMP_EN_MASK						0xff000000
+#define  NV50TCL_MAP_SEMANTIC_1								0x00001908
+#define   NV50TCL_MAP_SEMANTIC_1_CLIP_LO_SHIFT						0
+#define   NV50TCL_MAP_SEMANTIC_1_CLIP_LO_MASK						0x000000ff
+#define   NV50TCL_MAP_SEMANTIC_1_CLIP_HI_SHIFT						8
+#define   NV50TCL_MAP_SEMANTIC_1_CLIP_HI_MASK						0x0000ff00
+#define  NV50TCL_MAP_SEMANTIC_2								0x0000190c
+#define   NV50TCL_MAP_SEMANTIC_2_LAYER_ID_SHIFT						0
+#define   NV50TCL_MAP_SEMANTIC_2_LAYER_ID_MASK						0x000000ff
+#define  NV50TCL_MAP_SEMANTIC_3								0x00001910
+#define   NV50TCL_MAP_SEMANTIC_3_PTSZ_EN						(1 <<  0)
+#define   NV50TCL_MAP_SEMANTIC_3_PTSZ_ID_SHIFT						4
+#define   NV50TCL_MAP_SEMANTIC_3_PTSZ_ID_MASK						0x00000ff0
+#define  NV50TCL_MAP_SEMANTIC_4								0x00001914
+#define   NV50TCL_MAP_SEMANTIC_4_PRIM_ID_SHIFT						0
+#define   NV50TCL_MAP_SEMANTIC_4_PRIM_ID_MASK						0x000000ff
+#define  NV50TCL_CULL_FACE_ENABLE							0x00001918
+#define  NV50TCL_FRONT_FACE								0x0000191c
+#define   NV50TCL_FRONT_FACE_CW								0x00000900
+#define   NV50TCL_FRONT_FACE_CCW							0x00000901
+#define  NV50TCL_CULL_FACE								0x00001920
+#define   NV50TCL_CULL_FACE_FRONT							0x00000404
+#define   NV50TCL_CULL_FACE_BACK							0x00000405
+#define   NV50TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV50TCL_VIEWPORT_TRANSFORM_EN							0x0000192c
+#define  NV50TCL_VIEW_VOLUME_CLIP_CTRL							0x0000193c
+#define  NV50TCL_VIEWPORT_CLIP_RECTS_EN							0x0000194c
+#define  NV50TCL_FP_CTRL_UNK196C							0x0000196c
+#define  NV50TCL_FP_INTERPOLANT_CTRL							0x00001988
+#define   NV50TCL_FP_INTERPOLANT_CTRL_UMASK_SHIFT					24
+#define   NV50TCL_FP_INTERPOLANT_CTRL_UMASK_MASK					0xff000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NONE					0x00000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XNNN					0x01000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NYNN					0x02000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XYNN					0x03000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NNZN					0x04000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XNZN					0x05000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NYZN					0x06000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XYZN					0x07000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NNNW					0x08000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XNNW					0x09000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NYNW					0x0a000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XYNW					0x0b000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NNZW					0x0c000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XNZW					0x0d000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_NYZW					0x0e000000
+#define    NV50TCL_FP_INTERPOLANT_CTRL_UMASK_XYZW					0x0f000000
+#define   NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT				16
+#define   NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_MASK				0x00ff0000
+#define   NV50TCL_FP_INTERPOLANT_CTRL_OFFSET_SHIFT					8
+#define   NV50TCL_FP_INTERPOLANT_CTRL_OFFSET_MASK					0x0000ff00
+#define   NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT					0
+#define   NV50TCL_FP_INTERPOLANT_CTRL_COUNT_MASK					0x000000ff
+#define  NV50TCL_FP_REG_ALLOC_TEMP							0x0000198c
+#define  NV50TCL_REG_MODE								0x000019a0
+#define   NV50TCL_REG_MODE_PACKED							0x00000001
+#define   NV50TCL_REG_MODE_STRIPED							0x00000002
+#define  NV50TCL_FP_CONTROL								0x000019a8
+#define   NV50TCL_FP_CONTROL_MULTIPLE_RESULTS						(1 <<  0)
+#define   NV50TCL_FP_CONTROL_EXPORTS_Z							(1 <<  8)
+#define   NV50TCL_FP_CONTROL_USES_KIL							(1 << 20)
+#define  NV50TCL_DEPTH_BOUNDS_EN							0x000019bc
+#define  NV50TCL_LOGIC_OP_ENABLE							0x000019c4
+#define  NV50TCL_LOGIC_OP								0x000019c8
+#define   NV50TCL_LOGIC_OP_CLEAR							0x00001500
+#define   NV50TCL_LOGIC_OP_AND								0x00001501
+#define   NV50TCL_LOGIC_OP_AND_REVERSE							0x00001502
+#define   NV50TCL_LOGIC_OP_COPY								0x00001503
+#define   NV50TCL_LOGIC_OP_AND_INVERTED							0x00001504
+#define   NV50TCL_LOGIC_OP_NOOP								0x00001505
+#define   NV50TCL_LOGIC_OP_XOR								0x00001506
+#define   NV50TCL_LOGIC_OP_OR								0x00001507
+#define   NV50TCL_LOGIC_OP_NOR								0x00001508
+#define   NV50TCL_LOGIC_OP_EQUIV							0x00001509
+#define   NV50TCL_LOGIC_OP_INVERT							0x0000150a
+#define   NV50TCL_LOGIC_OP_OR_REVERSE							0x0000150b
+#define   NV50TCL_LOGIC_OP_COPY_INVERTED						0x0000150c
+#define   NV50TCL_LOGIC_OP_OR_INVERTED							0x0000150d
+#define   NV50TCL_LOGIC_OP_NAND								0x0000150e
+#define   NV50TCL_LOGIC_OP_SET								0x0000150f
+#define  NV50TCL_CLEAR_BUFFERS								0x000019d0
+#define   NV50TCL_CLEAR_BUFFERS_Z							(1 <<  0)
+#define   NV50TCL_CLEAR_BUFFERS_S							(1 <<  1)
+#define   NV50TCL_CLEAR_BUFFERS_R							(1 <<  2)
+#define   NV50TCL_CLEAR_BUFFERS_G							(1 <<  3)
+#define   NV50TCL_CLEAR_BUFFERS_B							(1 <<  4)
+#define   NV50TCL_CLEAR_BUFFERS_A							(1 <<  5)
+#define   NV50TCL_CLEAR_BUFFERS_RT_SHIFT						6
+#define   NV50TCL_CLEAR_BUFFERS_RT_MASK							0x000003c0
+#define   NV50TCL_CLEAR_BUFFERS_LAYER_SHIFT						10
+#define   NV50TCL_CLEAR_BUFFERS_LAYER_MASK						0x0007fc00
+#define  NV50TCL_COLOR_MASK(x)								(0x00001a00+((x)*4))
+#define  NV50TCL_COLOR_MASK__SIZE							0x00000008
+#define   NV50TCL_COLOR_MASK_R_SHIFT							0
+#define   NV50TCL_COLOR_MASK_R_MASK							0x0000000f
+#define   NV50TCL_COLOR_MASK_G_SHIFT							4
+#define   NV50TCL_COLOR_MASK_G_MASK							0x000000f0
+#define   NV50TCL_COLOR_MASK_B_SHIFT							8
+#define   NV50TCL_COLOR_MASK_B_MASK							0x00000f00
+#define   NV50TCL_COLOR_MASK_A_SHIFT							12
+#define   NV50TCL_COLOR_MASK_A_MASK							0x0000f000
+#define  NV50TCL_STRMOUT_ADDRESS_HIGH(x)						(0x00001a80+((x)*16))
+#define  NV50TCL_STRMOUT_ADDRESS_HIGH__SIZE						0x00000004
+#define  NV50TCL_STRMOUT_ADDRESS_LOW(x)							(0x00001a84+((x)*16))
+#define  NV50TCL_STRMOUT_ADDRESS_LOW__SIZE						0x00000004
+#define  NV50TCL_STRMOUT_NUM_ATTRIBS(x)							(0x00001a88+((x)*16))
+#define  NV50TCL_STRMOUT_NUM_ATTRIBS__SIZE						0x00000004
+#define  NV50TCL_VERTEX_ARRAY_ATTRIB(x)							(0x00001ac0+((x)*4))
+#define  NV50TCL_VERTEX_ARRAY_ATTRIB__SIZE						0x00000010
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_BUFFER_SHIFT					0
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_BUFFER_MASK					0x0000000f
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_CONST						(1 <<  4)
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_OFFSET_SHIFT					5
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_OFFSET_MASK					0x0007ffe0
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_SHIFT					19
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_MASK					0x01f80000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32				0x00080000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32					0x00100000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16				0x00180000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32					0x00200000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16					0x00280000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8					0x00500000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16					0x00780000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32					0x00900000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8					0x00980000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8					0x00c00000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16					0x00d80000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8						0x00e80000
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SHIFT					25
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK						0x7e000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT					0x7e000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM					0x24000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM					0x12000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED					0x5a000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED					0x6c000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT					0x48000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT					0x36000000
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_BGRA						(1U << 31)	/* unsigned: 1 << 31 does not fit in a signed int */
+#define  NV50TCL_QUERY_ADDRESS_HIGH							0x00001b00
+#define  NV50TCL_QUERY_ADDRESS_LOW							0x00001b04
+#define  NV50TCL_QUERY_COUNTER								0x00001b08
+#define  NV50TCL_QUERY_GET								0x00001b0c
+
+
+#define NV84TCL										0x00008297
+
+
+
+#define NVA0TCL										0x00008397
+
+
+
+#define NVA8TCL										0x00008597
+
+
+
+#define NV50_COMPUTE									0x000050c0
+
+#define  NV50_COMPUTE_NOP								0x00000100
+#define  NV50_COMPUTE_NOTIFY								0x00000104
+#define  NV50_COMPUTE_SERIALIZE								0x00000110
+#define  NV50_COMPUTE_DMA_NOTIFY							0x00000180
+#define  NV50_COMPUTE_DMA_GLOBAL							0x000001a0
+#define  NV50_COMPUTE_DMA_QUERY								0x000001a4
+#define  NV50_COMPUTE_DMA_LOCAL								0x000001b8
+#define  NV50_COMPUTE_DMA_STACK								0x000001bc
+#define  NV50_COMPUTE_DMA_CODE_CB							0x000001c0
+#define  NV50_COMPUTE_DMA_TSC								0x000001c4
+#define  NV50_COMPUTE_DMA_TIC								0x000001c8
+#define  NV50_COMPUTE_DMA_TEXTURE							0x000001cc
+#define  NV50_COMPUTE_CP_ADDRESS_HIGH							0x00000210
+#define  NV50_COMPUTE_CP_ADDRESS_LOW							0x00000214
+#define  NV50_COMPUTE_STACK_ADDRESS_HIGH						0x00000218
+#define  NV50_COMPUTE_STACK_ADDRESS_LOW							0x0000021c
+#define  NV50_COMPUTE_STACK_SIZE_LOG							0x00000220
+#define  NV50_COMPUTE_TSC_ADDRESS_HIGH							0x0000022c
+#define  NV50_COMPUTE_TSC_ADDRESS_LOW							0x00000230
+#define  NV50_COMPUTE_TSC_LIMIT								0x00000234
+#define  NV50_COMPUTE_CB_ADDR								0x00000238
+#define   NV50_COMPUTE_CB_ADDR_ID_SHIFT							8
+#define   NV50_COMPUTE_CB_ADDR_ID_MASK							0x003fff00
+#define   NV50_COMPUTE_CB_ADDR_BUFFER_SHIFT						0
+#define   NV50_COMPUTE_CB_ADDR_BUFFER_MASK						0x0000007f
+#define  NV50_COMPUTE_CB_DATA(x)							(0x0000023c+((x)*4))
+#define  NV50_COMPUTE_CB_DATA__SIZE							0x00000010
+#define  NV50_COMPUTE_DELAY1								0x00000284
+#define  NV50_COMPUTE_WATCHDOG_TIMER							0x00000288
+#define  NV50_COMPUTE_DELAY2								0x0000028c
+#define  NV50_COMPUTE_LOCAL_ADDRESS_HIGH						0x00000294
+#define  NV50_COMPUTE_LOCAL_ADDRESS_LOW							0x00000298
+#define  NV50_COMPUTE_LOCAL_SIZE_LOG							0x0000029c
+#define  NV50_COMPUTE_CB_DEF_ADDRESS_HIGH						0x000002a4
+#define  NV50_COMPUTE_CB_DEF_ADDRESS_LOW						0x000002a8
+#define  NV50_COMPUTE_CB_DEF_SET							0x000002ac
+#define   NV50_COMPUTE_CB_DEF_SET_SIZE_SHIFT						0
+#define   NV50_COMPUTE_CB_DEF_SET_SIZE_MASK						0x0000ffff
+#define   NV50_COMPUTE_CB_DEF_SET_BUFFER_SHIFT						16
+#define   NV50_COMPUTE_CB_DEF_SET_BUFFER_MASK						0x007f0000
+#define  NV50_COMPUTE_BLOCK_ALLOC							0x000002b4
+#define   NV50_COMPUTE_BLOCK_ALLOC_THREADS_SHIFT					0
+#define   NV50_COMPUTE_BLOCK_ALLOC_THREADS_MASK						0x0000ffff
+#define   NV50_COMPUTE_BLOCK_ALLOC_BARRIERS_SHIFT					16
+#define   NV50_COMPUTE_BLOCK_ALLOC_BARRIERS_MASK					0xffff0000
+#define  NV50_COMPUTE_LANES32_ENABLE							0x000002b8
+#define  NV50_COMPUTE_CP_REG_ALLOC_TEMP							0x000002c0
+#define  NV50_COMPUTE_TIC_ADDRESS_HIGH							0x000002c4
+#define  NV50_COMPUTE_TIC_ADDRESS_LOW							0x000002c8
+#define  NV50_COMPUTE_TIC_LIMIT								0x000002cc
+#define  NV50_COMPUTE_PM_SET(x)								(0x000002d0+((x)*4))
+#define  NV50_COMPUTE_PM_SET__SIZE							0x00000004
+#define  NV50_COMPUTE_PM_CONTROL(x)							(0x000002e0+((x)*4))
+#define  NV50_COMPUTE_PM_CONTROL__SIZE							0x00000004
+#define   NV50_COMPUTE_PM_CONTROL_UNK0							(1 <<  0)
+#define   NV50_COMPUTE_PM_CONTROL_UNK1_SHIFT						4
+#define   NV50_COMPUTE_PM_CONTROL_UNK1_MASK						0x00000070
+#define   NV50_COMPUTE_PM_CONTROL_UNK2_SHIFT						8
+#define   NV50_COMPUTE_PM_CONTROL_UNK2_MASK						0xffffff00
+#define  NV50_COMPUTE_LOCAL_WARPS_LOG_ALLOC						0x000002fc
+#define  NV50_COMPUTE_LOCAL_WARPS_NO_CLAMP						0x00000300
+#define  NV50_COMPUTE_STACK_WARPS_LOG_ALLOC						0x00000304
+#define  NV50_COMPUTE_STACK_WARPS_NO_CLAMP						0x00000308
+#define  NV50_COMPUTE_QUERY_ADDRESS_HIGH						0x00000310
+#define  NV50_COMPUTE_QUERY_ADDRESS_LOW							0x00000314
+#define  NV50_COMPUTE_QUERY_COUNTER							0x00000318
+#define  NV50_COMPUTE_QUERY_GET								0x0000031c
+#define  NV50_COMPUTE_COND_ADDRESS_HIGH							0x00000320
+#define  NV50_COMPUTE_COND_ADDRESS_LOW							0x00000324
+#define  NV50_COMPUTE_COND_MODE								0x00000328
+#define   NV50_COMPUTE_COND_MODE_NEVER							0x00000000
+#define   NV50_COMPUTE_COND_MODE_ALWAYS							0x00000001
+#define   NV50_COMPUTE_COND_MODE_RES							0x00000002
+#define   NV50_COMPUTE_COND_MODE_NOT_RES_AND_NOT_ID					0x00000003
+#define   NV50_COMPUTE_COND_MODE_RES_OR_ID						0x00000004
+#define  NV50_COMPUTE_LAUNCH								0x00000368
+#define  NV50_COMPUTE_USER_PARAM_COUNT							0x00000374
+#define   NV50_COMPUTE_USER_PARAM_COUNT_COUNT_SHIFT					8
+#define   NV50_COMPUTE_USER_PARAM_COUNT_COUNT_MASK					0x0000ff00
+#define  NV50_COMPUTE_LINKED_TSC							0x00000378
+#define  NV50_COMPUTE_CODE_CB_FLUSH							0x00000380
+#define  NV50_COMPUTE_GRIDDIM								0x000003a4
+#define   NV50_COMPUTE_GRIDDIM_X_SHIFT							0
+#define   NV50_COMPUTE_GRIDDIM_X_MASK							0x0000ffff
+#define   NV50_COMPUTE_GRIDDIM_Y_SHIFT							16
+#define   NV50_COMPUTE_GRIDDIM_Y_MASK							0xffff0000
+#define  NV50_COMPUTE_SHARED_SIZE							0x000003a8
+#define  NV50_COMPUTE_BLOCKDIM_YX							0x000003ac
+#define   NV50_COMPUTE_BLOCKDIM_YX_X_SHIFT						0
+#define   NV50_COMPUTE_BLOCKDIM_YX_X_MASK						0x0000ffff
+#define   NV50_COMPUTE_BLOCKDIM_YX_Y_SHIFT						16
+#define   NV50_COMPUTE_BLOCKDIM_YX_Y_MASK						0xffff0000
+#define  NV50_COMPUTE_BLOCKDIM_Z							0x000003b0
+#define  NV50_COMPUTE_CP_START_ID							0x000003b4
+#define  NV50_COMPUTE_REG_MODE								0x000003b8
+#define   NV50_COMPUTE_REG_MODE_PACKED							0x00000001
+#define   NV50_COMPUTE_REG_MODE_STRIPED							0x00000002
+#define  NV50_COMPUTE_TEX_LIMITS							0x000003bc
+#define   NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2_SHIFT					0
+#define   NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2_MASK					0x0000000f
+#define   NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2_SHIFT					4
+#define   NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2_MASK					0x000000f0
+#define  NV50_COMPUTE_BIND_TSC								0x000003c0
+#define   NV50_COMPUTE_BIND_TSC_VALID							(1 <<  0)
+#define   NV50_COMPUTE_BIND_TSC_SAMPLER_SHIFT						4
+#define   NV50_COMPUTE_BIND_TSC_SAMPLER_MASK						0x000000f0
+#define   NV50_COMPUTE_BIND_TSC_TSC_SHIFT						12
+#define   NV50_COMPUTE_BIND_TSC_TSC_MASK						0x001ff000
+#define  NV50_COMPUTE_BIND_TIC								0x000003c4
+#define   NV50_COMPUTE_BIND_TIC_VALID							(1 <<  0)
+#define   NV50_COMPUTE_BIND_TIC_TEXTURE_SHIFT						1
+#define   NV50_COMPUTE_BIND_TIC_TEXTURE_MASK						0x000001fe
+#define   NV50_COMPUTE_BIND_TIC_TIC_SHIFT						9
+#define   NV50_COMPUTE_BIND_TIC_TIC_MASK						0x7ffffe00
+#define  NV50_COMPUTE_SET_PROGRAM_CB							0x000003c8
+#define   NV50_COMPUTE_SET_PROGRAM_CB_INDEX_SHIFT					8
+#define   NV50_COMPUTE_SET_PROGRAM_CB_INDEX_MASK					0x00000f00
+#define   NV50_COMPUTE_SET_PROGRAM_CB_BUFFER_SHIFT					12
+#define   NV50_COMPUTE_SET_PROGRAM_CB_BUFFER_MASK					0x0007f000
+#define   NV50_COMPUTE_SET_PROGRAM_CB_VALID						(1 <<  0)
+#define  NV50_COMPUTE_GLOBAL_ADDRESS_HIGH(x)						(0x00000400+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_ADDRESS_HIGH__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_ADDRESS_LOW(x)						(0x00000404+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_ADDRESS_LOW__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_PITCH(x)							(0x00000408+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_PITCH__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_LIMIT(x)							(0x0000040c+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_LIMIT__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_MODE(x)							(0x00000410+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_MODE__SIZE							0x00000010
+#define   NV50_COMPUTE_GLOBAL_MODE_LINEAR						(1 <<  0)
+#define   NV50_COMPUTE_GLOBAL_MODE_TILE_MODE_SHIFT					8
+#define   NV50_COMPUTE_GLOBAL_MODE_TILE_MODE_MASK					0x00000f00
+#define  NV50_COMPUTE_USER_PARAM(x)							(0x00000600+((x)*4))
+#define  NV50_COMPUTE_USER_PARAM__SIZE							0x00000040
+
+
+#endif /* NOUVEAU_REG_H */
diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h
new file mode 100644
index 0000000000..ff97aaa9af
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h
@@ -0,0 +1,196 @@
+#ifndef __NOUVEAU_GLDEFS_H__
+#define __NOUVEAU_GLDEFS_H__
+
+/*
+ * Convert a PIPE_BLENDFACTOR_* value to the numeric blend-factor code
+ * handed back to the caller.
+ *
+ * The codes are numerically the same as the OpenGL blend-factor enums
+ * (e.g. 0x0302 == GL_SRC_ALPHA, 0x8001 == GL_CONSTANT_COLOR).
+ *
+ * factor: PIPE_BLENDFACTOR_* value to translate.
+ * Returns the matching code.  A factor that is not listed below yields
+ * 0x0000, i.e. it is treated like PIPE_BLENDFACTOR_ZERO.
+ */
+static INLINE unsigned
+nvgl_blend_func(unsigned factor)
+{
+	switch (factor) {
+	case PIPE_BLENDFACTOR_ONE: return 0x0001;
+	case PIPE_BLENDFACTOR_SRC_COLOR: return 0x0300;
+	case PIPE_BLENDFACTOR_INV_SRC_COLOR: return 0x0301;
+	case PIPE_BLENDFACTOR_SRC_ALPHA: return 0x0302;
+	case PIPE_BLENDFACTOR_INV_SRC_ALPHA: return 0x0303;
+	case PIPE_BLENDFACTOR_DST_ALPHA: return 0x0304;
+	case PIPE_BLENDFACTOR_INV_DST_ALPHA: return 0x0305;
+	case PIPE_BLENDFACTOR_DST_COLOR: return 0x0306;
+	case PIPE_BLENDFACTOR_INV_DST_COLOR: return 0x0307;
+	case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return 0x0308;
+	case PIPE_BLENDFACTOR_CONST_COLOR: return 0x8001;
+	case PIPE_BLENDFACTOR_INV_CONST_COLOR: return 0x8002;
+	case PIPE_BLENDFACTOR_CONST_ALPHA: return 0x8003;
+	case PIPE_BLENDFACTOR_INV_CONST_ALPHA: return 0x8004;
+	case PIPE_BLENDFACTOR_ZERO:
+	default:
+		/* ZERO and anything unrecognised share the same code. */
+		return 0x0000;
+	}
+}
+
+/*
+ * Convert a PIPE_BLEND_* equation to its numeric code.
+ *
+ * The codes are numerically the same as the OpenGL blend-equation enums
+ * (0x8006 == GL_FUNC_ADD ... 0x800b == GL_FUNC_REVERSE_SUBTRACT).
+ *
+ * func: PIPE_BLEND_* value to translate.
+ * Returns the matching code; an unlisted value falls back to 0x8006, the
+ * code of PIPE_BLEND_ADD.
+ */
+static INLINE unsigned
+nvgl_blend_eqn(unsigned func)
+{
+	switch (func) {
+	case PIPE_BLEND_MIN: return 0x8007;
+	case PIPE_BLEND_MAX: return 0x8008;
+	case PIPE_BLEND_SUBTRACT: return 0x800a;
+	case PIPE_BLEND_REVERSE_SUBTRACT: return 0x800b;
+	case PIPE_BLEND_ADD:
+	default:
+		/* ADD doubles as the fallback. */
+		return 0x8006;
+	}
+}
+
+/*
+ * Convert a PIPE_LOGICOP_* value to its numeric code (0x1500..0x150f).
+ *
+ * The codes are numerically the same as the OpenGL logic-op enums
+ * (0x1500 == GL_CLEAR ... 0x150f == GL_SET).  Cases are listed in
+ * ascending code order.
+ *
+ * func: PIPE_LOGICOP_* value to translate.
+ * Returns the matching code; an unlisted value falls back to 0x1505, the
+ * code of PIPE_LOGICOP_NOOP.
+ */
+static INLINE unsigned
+nvgl_logicop_func(unsigned func)
+{
+	switch (func) {
+	case PIPE_LOGICOP_CLEAR: return 0x1500;
+	case PIPE_LOGICOP_AND: return 0x1501;
+	case PIPE_LOGICOP_AND_REVERSE: return 0x1502;
+	case PIPE_LOGICOP_COPY: return 0x1503;
+	case PIPE_LOGICOP_AND_INVERTED: return 0x1504;
+	case PIPE_LOGICOP_XOR: return 0x1506;
+	case PIPE_LOGICOP_OR: return 0x1507;
+	case PIPE_LOGICOP_NOR: return 0x1508;
+	case PIPE_LOGICOP_EQUIV: return 0x1509;
+	case PIPE_LOGICOP_INVERT: return 0x150a;
+	case PIPE_LOGICOP_OR_REVERSE: return 0x150b;
+	case PIPE_LOGICOP_COPY_INVERTED: return 0x150c;
+	case PIPE_LOGICOP_OR_INVERTED: return 0x150d;
+	case PIPE_LOGICOP_NAND: return 0x150e;
+	case PIPE_LOGICOP_SET: return 0x150f;
+	case PIPE_LOGICOP_NOOP:
+	default:
+		/* NOOP doubles as the fallback. */
+		return 0x1505;
+	}
+}
+
+/*
+ * Convert a PIPE_FUNC_* comparison to its numeric code (0x0200..0x0207).
+ *
+ * The codes are numerically the same as the OpenGL comparison enums
+ * (0x0200 == GL_NEVER ... 0x0207 == GL_ALWAYS).
+ *
+ * op: PIPE_FUNC_* value to translate.
+ * Returns the matching code; an unlisted value falls back to 0x0207, the
+ * code of PIPE_FUNC_ALWAYS.
+ */
+static INLINE unsigned
+nvgl_comparison_op(unsigned op)
+{
+	switch (op) {
+	case PIPE_FUNC_NEVER: return 0x0200;
+	case PIPE_FUNC_LESS: return 0x0201;
+	case PIPE_FUNC_EQUAL: return 0x0202;
+	case PIPE_FUNC_LEQUAL: return 0x0203;
+	case PIPE_FUNC_GREATER: return 0x0204;
+	case PIPE_FUNC_NOTEQUAL: return 0x0205;
+	case PIPE_FUNC_GEQUAL: return 0x0206;
+	case PIPE_FUNC_ALWAYS:
+	default:
+		/* ALWAYS doubles as the fallback. */
+		return 0x0207;
+	}
+}
+
+/*
+ * Convert a PIPE_POLYGON_MODE_* value to its numeric code.
+ *
+ * The codes are numerically the same as the OpenGL polygon-mode enums
+ * (0x1b00 == GL_POINT, 0x1b01 == GL_LINE, 0x1b02 == GL_FILL).
+ *
+ * mode: PIPE_POLYGON_MODE_* value to translate.
+ * Returns 0x1b00 for POINT, 0x1b01 for LINE and 0x1b02 for FILL as well
+ * as for every other value.
+ */
+static INLINE unsigned
+nvgl_polygon_mode(unsigned mode)
+{
+	if (mode == PIPE_POLYGON_MODE_POINT)
+		return 0x1b00;
+	if (mode == PIPE_POLYGON_MODE_LINE)
+		return 0x1b01;
+
+	/* PIPE_POLYGON_MODE_FILL and every unrecognised mode. */
+	return 0x1b02;
+}
+
+/*
+ * Convert a PIPE_STENCIL_OP_* value to its numeric code.
+ *
+ * The codes are numerically the same as the OpenGL stencil-op enums
+ * (e.g. 0x1e00 == GL_KEEP, 0x8507 == GL_INCR_WRAP).  INVERT uses 0x150a,
+ * the same number nvgl_logicop_func() returns for PIPE_LOGICOP_INVERT.
+ *
+ * op: PIPE_STENCIL_OP_* value to translate.
+ * Returns the matching code; an unlisted value falls back to 0x1e00, the
+ * code of PIPE_STENCIL_OP_KEEP.
+ */
+static INLINE unsigned
+nvgl_stencil_op(unsigned op)
+{
+	switch (op) {
+	case PIPE_STENCIL_OP_ZERO: return 0x0000;
+	case PIPE_STENCIL_OP_INVERT: return 0x150a;
+	case PIPE_STENCIL_OP_REPLACE: return 0x1e01;
+	case PIPE_STENCIL_OP_INCR: return 0x1e02;
+	case PIPE_STENCIL_OP_DECR: return 0x1e03;
+	case PIPE_STENCIL_OP_INCR_WRAP: return 0x8507;
+	case PIPE_STENCIL_OP_DECR_WRAP: return 0x8508;
+	case PIPE_STENCIL_OP_KEEP:
+	default:
+		/* KEEP doubles as the fallback. */
+		return 0x1e00;
+	}
+}
+
+/*
+ * Convert a PIPE_PRIM_* primitive type to its numeric code.
+ *
+ * The ten listed primitives map to 0x0001 (POINTS) through 0x000a
+ * (POLYGON).
+ *
+ * prim: PIPE_PRIM_* value to translate.
+ * Returns the matching code, or 0 for any value not listed below.
+ *
+ * NOTE(review): 0 is not a code produced for any listed primitive, so it
+ * looks like an "unsupported" marker -- confirm how callers treat it.
+ */
+static INLINE unsigned
+nvgl_primitive(unsigned prim)
+{
+	switch (prim) {
+	case PIPE_PRIM_POINTS: return 0x0001;
+	case PIPE_PRIM_LINES: return 0x0002;
+	case PIPE_PRIM_LINE_LOOP: return 0x0003;
+	case PIPE_PRIM_LINE_STRIP: return 0x0004;
+	case PIPE_PRIM_TRIANGLES: return 0x0005;
+	case PIPE_PRIM_TRIANGLE_STRIP: return 0x0006;
+	case PIPE_PRIM_TRIANGLE_FAN: return 0x0007;
+	case PIPE_PRIM_QUADS: return 0x0008;
+	case PIPE_PRIM_QUAD_STRIP: return 0x0009;
+	case PIPE_PRIM_POLYGON: return 0x000a;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
new file mode 100644
index 0000000000..60bdd7276a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -0,0 +1,262 @@
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "nouveau/nouveau_bo.h"
+#include "nouveau_winsys.h"
+#include "nouveau_screen.h"
+
+/* XXX this should go away */
+#include "state_tracker/drm_api.h"
+#include "util/u_simple_screen.h"
+
+/*
+ * get_name hook installed by nouveau_screen_init(): reports the chipset
+ * as "NV" followed by its number in upper-case hex, at least two digits.
+ *
+ * The text is formatted into a function-local static buffer, so the
+ * returned pointer stays valid after the call but its contents are
+ * rewritten by every subsequent call.
+ */
+static const char *
+nouveau_screen_get_name(struct pipe_screen *pscreen)
+{
+	static char name[128];
+	struct nouveau_screen *screen = nouveau_screen(pscreen);
+
+	snprintf(name, sizeof(name), "NV%02X", screen->device->chipset);
+	return name;
+}
+
+/*
+ * get_vendor hook installed by nouveau_screen_init(): always returns the
+ * constant string "nouveau"; pscreen is not used.
+ */
+static const char *
+nouveau_screen_get_vendor(struct pipe_screen *pscreen)
+{
+	return "nouveau";
+}
+
+
+
+/*
+ * Allocate a buffer object on the screen's device.
+ *
+ * The placement flags always contain NOUVEAU_BO_MAP.  In addition:
+ *  - a vertex-buffer bind adds the screen's vertex_buffer_flags; failing
+ *    that, an index-buffer bind adds index_buffer_flags (vertex wins when
+ *    both bits are set);
+ *  - a render-target, depth/stencil, scanout, display-target or
+ *    sampler-view bind adds NOUVEAU_BO_GART unless scanout is requested,
+ *    and NOUVEAU_BO_VRAM unless usage is PIPE_USAGE_DYNAMIC.  On chipset
+ *    0x50 and on chipsets >= 0x80 such a buffer also gets tile flags:
+ *    0x2800 for depth/stencil, 0x7000 otherwise.
+ * The tile mode handed to nouveau_bo_new_tile() is always 0.
+ *
+ * Returns the new buffer object, or NULL when nouveau_bo_new_tile()
+ * reports an error.
+ */
+struct nouveau_bo *
+nouveau_screen_bo_new(struct pipe_screen *pscreen, unsigned alignment,
+		      unsigned usage, unsigned bind, unsigned size)
+{
+	const unsigned surface_binds = PIPE_BIND_RENDER_TARGET |
+				       PIPE_BIND_DEPTH_STENCIL |
+				       PIPE_BIND_SCANOUT |
+				       PIPE_BIND_DISPLAY_TARGET |
+				       PIPE_BIND_SAMPLER_VIEW;
+	struct nouveau_screen *screen = nouveau_screen(pscreen);
+	struct nouveau_device *dev = screen->device;
+	struct nouveau_bo *bo = NULL;
+	uint32_t flags = NOUVEAU_BO_MAP;
+	uint32_t tile_mode = 0;
+	uint32_t tile_flags = 0;
+
+	if (bind & PIPE_BIND_VERTEX_BUFFER)
+		flags |= screen->vertex_buffer_flags;
+	else if (bind & PIPE_BIND_INDEX_BUFFER)
+		flags |= screen->index_buffer_flags;
+
+	if (bind & surface_binds) {
+		/* TODO: this may be incorrect or suboptimal */
+		if (!(bind & PIPE_BIND_SCANOUT))
+			flags |= NOUVEAU_BO_GART;
+		if (usage != PIPE_USAGE_DYNAMIC)
+			flags |= NOUVEAU_BO_VRAM;
+
+		if (dev->chipset == 0x50 || dev->chipset >= 0x80)
+			tile_flags = (bind & PIPE_BIND_DEPTH_STENCIL) ?
+				     0x2800 : 0x7000;
+	}
+
+	if (nouveau_bo_new_tile(dev, flags, alignment, size,
+				tile_mode, tile_flags, &bo))
+		return NULL;
+
+	return bo;
+}
+
+/*
+ * Create a buffer object for the caller-supplied memory at ptr (bytes
+ * long) by calling nouveau_bo_user() on the screen's device.
+ *
+ * Returns the buffer object, or NULL when nouveau_bo_user() reports an
+ * error.
+ */
+struct nouveau_bo *
+nouveau_screen_bo_user(struct pipe_screen *pscreen, void *ptr, unsigned bytes)
+{
+	struct nouveau_bo *bo = NULL;
+
+	if (nouveau_bo_user(nouveau_screen(pscreen)->device, ptr, bytes, &bo))
+		return NULL;
+
+	return bo;
+}
+
+/*
+ * Map a buffer object with nouveau_bo_map(), passing map_flags through
+ * unchanged.  pscreen is not used.
+ *
+ * Returns bo->map on success.  On failure the error code is reported via
+ * debug_printf() and NULL is returned.
+ */
+void *
+nouveau_screen_bo_map(struct pipe_screen *pscreen,
+		      struct nouveau_bo *bo,
+		      unsigned map_flags)
+{
+	int err = nouveau_bo_map(bo, map_flags);
+
+	if (!err)
+		return bo->map;
+
+	debug_printf("map failed: %d\n", err);
+	return NULL;
+}
+
+/*
+ * Map part of a buffer object with nouveau_bo_map_range().  pscreen is
+ * not used.
+ *
+ * On success the result is bo->map minus offset, i.e. adding offset to
+ * the returned pointer gives bo->map again (presumably callers index the
+ * result with absolute buffer offsets -- confirm against the callers).
+ *
+ * On failure nouveau_bo_unmap() is called and NULL is returned.  The
+ * failure is logged with debug_printf() except when NOUVEAU_BO_NOWAIT was
+ * requested and the error is -EBUSY.
+ */
+void *
+nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct nouveau_bo *bo,
+			    unsigned offset, unsigned length, unsigned flags)
+{
+	int err = nouveau_bo_map_range(bo, offset, length, flags);
+
+	if (!err)
+		return (char *)bo->map - offset; /* why gallium? why? */
+
+	nouveau_bo_unmap(bo);
+
+	/* -EBUSY on a NOWAIT request is the one failure that is not logged. */
+	if (err != -EBUSY || !(flags & NOUVEAU_BO_NOWAIT))
+		debug_printf("map_range failed: %d\n", err);
+
+	return NULL;
+}
+
+/*
+ * Thin wrapper: forwards bo, offset and length to nouveau_bo_map_flush().
+ * pscreen is not used.
+ */
+void
+nouveau_screen_bo_map_flush_range(struct pipe_screen *pscreen, struct nouveau_bo *bo,
+				  unsigned offset, unsigned length)
+{
+	nouveau_bo_map_flush(bo, offset, length);
+}
+
+/*
+ * Thin wrapper: calls nouveau_bo_unmap() on bo.  pscreen is not used.
+ */
+void
+nouveau_screen_bo_unmap(struct pipe_screen *pscreen, struct nouveau_bo *bo)
+{
+	nouveau_bo_unmap(bo);
+}
+
+/*
+ * Release bo by calling nouveau_bo_ref(NULL, &bo) (presumably this drops
+ * one reference -- confirm against nouveau_bo_ref()).  pscreen is not
+ * used.
+ *
+ * bo is passed by value, so only this function's local copy of the
+ * pointer is handed to nouveau_bo_ref(); the caller's own pointer
+ * variable is left untouched.
+ */
+void
+nouveau_screen_bo_release(struct pipe_screen *pscreen, struct nouveau_bo *bo)
+{
+	nouveau_bo_ref(NULL, &bo);
+}
+
+/*
+ * fence_reference hook installed by nouveau_screen_init().
+ *
+ * Only stores pfence into *ptr; nothing is counted or released here.
+ * pscreen is not used.
+ */
+static void
+nouveau_screen_fence_ref(struct pipe_screen *pscreen,
+			 struct pipe_fence_handle **ptr,
+			 struct pipe_fence_handle *pfence)
+{
+	*ptr = pfence;
+}
+
+/*
+ * fence_signalled hook installed by nouveau_screen_init().
+ *
+ * Stub: ignores all of its arguments and always returns 0.
+ * NOTE(review): no fence state is inspected here -- confirm that a
+ * constant 0 is what callers of this hook expect.
+ */
+static int
+nouveau_screen_fence_signalled(struct pipe_screen *screen,
+			       struct pipe_fence_handle *pfence,
+			       unsigned flags)
+{
+	return 0;
+}
+
+/*
+ * fence_finish hook installed by nouveau_screen_init().
+ *
+ * Stub: ignores all of its arguments, does not wait for anything and
+ * always returns 0.
+ */
+static int
+nouveau_screen_fence_finish(struct pipe_screen *screen,
+			    struct pipe_fence_handle *pfence,
+			    unsigned flags)
+{
+	return 0;
+}
+
+
+/*
+ * Look up the buffer object named by whandle->handle on the screen's
+ * device using nouveau_bo_handle_ref().
+ *
+ * On success *out_stride is set to whandle->stride and the buffer object
+ * is returned.  On failure the handle and error code are reported via
+ * debug_printf(), *out_stride is left untouched and NULL is returned.
+ */
+struct nouveau_bo *
+nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
+			      struct winsys_handle *whandle,
+			      unsigned *out_stride)
+{
+	struct nouveau_bo *bo = NULL;
+	int err = nouveau_bo_handle_ref(nouveau_screen(pscreen)->device,
+					whandle->handle, &bo);
+
+	if (err) {
+		debug_printf("%s: ref name 0x%08x failed with %d\n",
+			     __func__, whandle->handle, err);
+		return NULL;
+	}
+
+	*out_stride = whandle->stride;
+	return bo;
+}
+
+
+/*
+ * Fill in whandle for bo.  pscreen is not used.
+ *
+ * whandle->stride is always set to stride.  The handle depends on the
+ * type the caller put in whandle->type:
+ *  - DRM_API_HANDLE_TYPE_SHARED: nouveau_bo_handle_get() stores the
+ *    handle; the result is TRUE only if that call returns 0;
+ *  - DRM_API_HANDLE_TYPE_KMS: bo->handle is copied, result is TRUE;
+ *  - any other type: whandle->handle is left alone, result is FALSE.
+ */
+boolean
+nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
+			     struct nouveau_bo *bo,
+			     unsigned stride,
+			     struct winsys_handle *whandle)
+{
+	whandle->stride = stride;
+
+	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
+		return nouveau_bo_handle_get(bo, &whandle->handle) == 0;
+
+	if (whandle->type != DRM_API_HANDLE_TYPE_KMS)
+		return FALSE;
+
+	whandle->handle = bo->handle;
+	return TRUE;
+}
+
+
+/*
+ * Translate the result of nouveau_bo_pending() for bo into
+ * PIPE_REFERENCED_FOR_* bits.
+ *
+ * Returns PIPE_REFERENCED_FOR_READ if NOUVEAU_BO_RD is set in the pending
+ * flags, PIPE_REFERENCED_FOR_WRITE if NOUVEAU_BO_WR is set, both ORed
+ * together if both are set, and 0 if neither is.
+ */
+unsigned int
+nouveau_reference_flags(struct nouveau_bo *bo)
+{
+	const uint32_t pending = nouveau_bo_pending(bo);
+	int flags = 0;
+
+	flags |= (pending & NOUVEAU_BO_RD) ? PIPE_REFERENCED_FOR_READ : 0;
+	flags |= (pending & NOUVEAU_BO_WR) ? PIPE_REFERENCED_FOR_WRITE : 0;
+
+	return flags;
+}
+
+
+
+
+
+/*
+ * Set up the common part of a nouveau screen.
+ *
+ * Allocates a channel on dev, records dev in the screen, installs the
+ * get_name, get_vendor and fence hooks on the embedded pipe_screen and
+ * calls util_format_s3tc_init().
+ *
+ * Returns 0 on success.  If nouveau_channel_alloc() fails its error code
+ * is returned immediately; in that case screen->device is not set and no
+ * hooks are installed.
+ */
+int
+nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
+{
+	struct pipe_screen *pscreen = &screen->base;
+	int ret;
+
+	/* NOTE(review): the two 0xbeef02xx arguments look like object
+	 * handles chosen by the driver -- confirm against
+	 * nouveau_channel_alloc(). */
+	ret = nouveau_channel_alloc(dev, 0xbeef0201, 0xbeef0202,
+				    &screen->channel);
+	if (ret)
+		return ret;
+	screen->device = dev;
+
+	pscreen->get_name = nouveau_screen_get_name;
+	pscreen->get_vendor = nouveau_screen_get_vendor;
+
+	/* The fence hooks installed here are the stubs defined above. */
+	pscreen->fence_reference = nouveau_screen_fence_ref;
+	pscreen->fence_signalled = nouveau_screen_fence_signalled;
+	pscreen->fence_finish = nouveau_screen_fence_finish;
+
+	util_format_s3tc_init();
+
+	return 0;
+}
+
+/*
+ * Tear down what nouveau_screen_init() set up: frees the screen's channel
+ * and then calls the destroy hook of the winsys stored in
+ * screen->base.winsys.
+ *
+ * The winsys pointer is dereferenced unconditionally, so it must be
+ * non-NULL and have a destroy hook.
+ */
+void
+nouveau_screen_fini(struct nouveau_screen *screen)
+{
+	/* Read the winsys pointer up front, before anything is torn down. */
+	struct pipe_winsys *ws = screen->base.winsys;
+	nouveau_channel_free(&screen->channel);
+	ws->destroy(ws);
+}
+
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
new file mode 100644
index 0000000000..8eacdff035
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -0,0 +1,82 @@
+#ifndef __NOUVEAU_SCREEN_H__
+#define __NOUVEAU_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nouveau_screen {
+	struct pipe_screen base; /* must stay first: nouveau_screen() casts a pipe_screen pointer to this type */
+	struct nouveau_device *device; /* set by nouveau_screen_init() */
+	struct nouveau_channel *channel; /* allocated by nouveau_screen_init(), freed by nouveau_screen_fini() */
+
+	/* note that OpenGL doesn't distinguish between these, so
+	 * these almost always should be set to the same value */
+	unsigned vertex_buffer_flags;
+	unsigned index_buffer_flags;
+};
+
+static inline struct nouveau_screen *
+nouveau_screen(struct pipe_screen *pscreen)
+{
+	return (struct nouveau_screen *)pscreen; /* downcast; relies on base being the first member */
+}
+
+
+
+/* Not really sure if this is needed, or whether the individual
+ * drivers are happy to talk to the bo functions themselves.  In a way
+ * this is what we'd expect from a regular winsys interface.
+ */
+struct nouveau_bo *
+nouveau_screen_bo_new(struct pipe_screen *pscreen, unsigned alignment,
+		      unsigned usage, unsigned bind, unsigned size);
+struct nouveau_bo *
+nouveau_screen_bo_user(struct pipe_screen *pscreen, void *ptr, unsigned bytes);
+void *
+nouveau_screen_bo_map(struct pipe_screen *pscreen,
+		      struct nouveau_bo *pb,
+		      unsigned usage);
+void *
+nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct nouveau_bo *bo,
+			    unsigned offset, unsigned length, unsigned usage);
+void
+nouveau_screen_bo_map_flush_range(struct pipe_screen *pscreen, struct nouveau_bo *bo,
+				  unsigned offset, unsigned length);
+void
+nouveau_screen_bo_unmap(struct pipe_screen *pscreen, struct nouveau_bo *bo);
+void
+nouveau_screen_bo_release(struct pipe_screen *pscreen, struct nouveau_bo *bo);
+
+boolean
+nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
+			     struct nouveau_bo *bo,
+			     unsigned stride,
+			     struct winsys_handle *whandle);
+struct nouveau_bo *
+nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
+			      struct winsys_handle *whandle,
+			      unsigned *out_stride);
+
+unsigned int
+nouveau_reference_flags(struct nouveau_bo *bo);
+
+
+
+int nouveau_screen_init(struct nouveau_screen *, struct nouveau_device *);
+void nouveau_screen_fini(struct nouveau_screen *);
+
+
+
+/* Method header word: mthd, size in bits 18 and up, value 7 in the bit-13 field. */
+static __inline__ unsigned
+RING_3D(unsigned mthd, unsigned size)
+{
+	return mthd | (size << 18) | (7 << 13);
+}
+
+static __inline__ unsigned
+RING_3D_NI(unsigned mthd, unsigned size)
+{
+	return mthd | (size << 18) | (7 << 13) | 0x40000000; /* RING_3D() layout plus bit 30 */
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_statebuf.h b/src/gallium/drivers/nouveau/nouveau_statebuf.h
new file mode 100644
index 0000000000..dcffdd9115
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_statebuf.h
@@ -0,0 +1,27 @@
+#ifndef __NOUVEAU_STATEBUF_H__
+#define __NOUVEAU_STATEBUF_H__
+
+/* state buffers: lightweight state objects interface */
+/* relocations are not supported, but Gallium CSOs don't require them */
+
+struct nouveau_statebuf_builder
+{
+	uint32_t* p; /* write cursor: sb_data() stores through it and advances it */
+#ifdef DEBUG
+	uint32_t* pend; /* one past the end of the array given to sb_init(); asserted in sb_data() */
+#endif
+};
+
+#ifdef DEBUG
+#define sb_init(var) {var, var + sizeof(var) / sizeof((var)[0])}
+#define sb_data(sb, v) do {assert((sb).p != (sb).pend);  *(sb).p++ = (v);} while(0)
+#else
+#define sb_init(var) {var}
+#define sb_data(sb, v) do {*(sb).p++ = (v);} while(0)
+#endif
+/* Statement-like macros carry no trailing ';' so they stay safe inside if/else. */
+#define sb_method(sb, v, n)  sb_data(sb, RING_3D(v, n))
+/* sb_len: words written since var; sb_emit: reserve ring space, then copy len words. */
+#define sb_len(sb, var) ((sb).p - (var))
+#define sb_emit(chan, buf, len) do {WAIT_RING((chan), (len)); OUT_RINGp((chan), (buf), (len)); } while(0)
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
new file mode 100644
index 0000000000..f5c1c5ca2c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -0,0 +1,318 @@
+#ifndef __NOUVEAU_STATEOBJ_H__
+#define __NOUVEAU_STATEOBJ_H__
+
+#include "util/u_debug.h"
+
+#ifdef DEBUG
+#define DEBUG_NOUVEAU_STATEOBJ
+#endif /* DEBUG */
+
+struct nouveau_stateobj_reloc {
+	struct nouveau_bo *bo; /* referenced in so_reloc(), released when so_ref() frees the object */
+
+	struct nouveau_grobj *gr; /* object of the method the relocated word belongs to */
+	uint32_t push_offset; /* so->total + so->cur at the time so_reloc() ran */
+	uint32_t mthd; /* method of the relocated word: method base + (cur << 2) */
+
+	uint32_t data;
+	unsigned flags;
+	unsigned vor;
+	unsigned tor;
+};
+
+struct nouveau_stateobj_start {
+	struct nouveau_grobj *gr;
+	uint32_t mthd;
+	uint32_t size; /* number of data words that follow the method header */
+	unsigned offset; /* index of the first data word in nouveau_stateobj::pool */
+};
+
+struct nouveau_stateobj {
+	struct pipe_reference reference;
+
+	struct nouveau_stateobj_start *start; /* one entry per so_method() call */
+	struct nouveau_stateobj_reloc *reloc; /* one entry per so_reloc() call */
+
+	/* Common memory pool for data. */
+	uint32_t *pool;
+	unsigned pool_cur; /* next unused word in pool */
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	unsigned start_alloc;
+	unsigned reloc_alloc;
+	unsigned pool_alloc;
+#endif  /* DEBUG_NOUVEAU_STATEOBJ */
+
+	unsigned total; /* includes begin_ring */
+	unsigned cur; /* excludes begin_ring, offset from "cur_start" */
+	unsigned cur_start;
+	unsigned cur_reloc;
+};
+
+static INLINE void
+so_dump(struct nouveau_stateobj *so)
+{
+	unsigned m, w, pos = 0;
+
+	for (m = 0; m < so->cur_start; m++) {
+		const struct nouveau_stateobj_start *s = &so->start[m];
+		/* Header word; the subchannel bits are added only if subc > -1. */
+		uint32_t hdr = (s->size << 18) | s->mthd;
+
+		if (s->gr->subc > -1)
+			hdr |= s->gr->subc << 13;
+		debug_printf("+0x%04x: 0x%08x\n", pos++, hdr);
+		for (w = 0; w < s->size; w++, pos++)
+			debug_printf("+0x%04x: 0x%08x\n", pos,
+				so->pool[s->offset + w]);
+	}
+}
+
+static INLINE struct nouveau_stateobj *
+so_new(unsigned start, unsigned push, unsigned reloc)
+{
+	struct nouveau_stateobj *so;
+	/* Capacities: start methods, push data words, reloc relocations. */
+	so = MALLOC(sizeof(struct nouveau_stateobj));
+	if (!so)
+		return NULL;
+	pipe_reference_init(&so->reference, 1);
+	so->total = so->cur = so->cur_start = so->cur_reloc = 0;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	so->start_alloc = start;
+	so->reloc_alloc = reloc;
+	so->pool_alloc = push;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+	so->start = MALLOC(start * sizeof(struct nouveau_stateobj_start));
+	so->reloc = MALLOC(reloc * sizeof(struct nouveau_stateobj_reloc));
+	so->pool = MALLOC(push * sizeof(uint32_t));
+	so->pool_cur = 0;
+
+	if (!so->start || !so->reloc || !so->pool) {
+		debug_printf("malloc failed\n");
+		assert(0);
+	}
+
+	return so;
+}
+
+static INLINE void
+so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
+{
+	struct nouveau_stateobj *old = *pso;
+	unsigned n;
+	/* A true result from pipe_reference() means the old object must be destroyed. */
+	if (pipe_reference(&old->reference, &ref->reference)) {
+		for (n = 0; n < old->cur_reloc; n++)
+			nouveau_bo_ref(NULL, &old->reloc[n].bo);
+		FREE(old->reloc);
+		FREE(old->start);
+		FREE(old->pool);
+		FREE(old);
+	}
+	*pso = ref;
+}
+
+static INLINE void
+so_data(struct nouveau_stateobj *so, uint32_t data)
+{
+	struct nouveau_stateobj_start *cur = &so->start[so->cur_start - 1]; /* latest method */
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->cur >= cur->size) {
+		debug_printf("exceeding specified size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+	so->pool[cur->offset + so->cur++] = data;
+}
+
+static INLINE void
+so_datap(struct nouveau_stateobj *so, uint32_t *data, unsigned size)
+{
+	struct nouveau_stateobj_start *cur = &so->start[so->cur_start - 1];
+	unsigned n;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if ((so->cur + size) > cur->size) {
+		debug_printf("exceeding specified size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+	for (n = 0; n < size; n++) /* append to the latest method's pool slice */
+		so->pool[cur->offset + so->cur++] = data[n];
+}
+
+static INLINE void
+so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
+	  unsigned mthd, unsigned size)
+{
+	/* Begin a new method of `size` data words on object gr.
+	 *
+	 * start is assigned unconditionally so that it is valid whether or
+	 * not the DEBUG-only checks below are compiled in.
+	 */
+	struct nouveau_stateobj_start *start = so->start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->start_alloc <= so->cur_start) {
+		debug_printf("exceeding num_start size\n");
+		assert(0);
+	}
+
+	if (so->cur_start > 0 && start[so->cur_start - 1].size > so->cur) {
+		debug_printf("previous so_method was not filled\n");
+		assert(0);
+	}
+	if (so->pool_alloc < (size + so->pool_cur)) {
+		debug_printf("exceeding num_pool size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	/* The data words of this method start at the current end of the pool. */
+	start[so->cur_start].gr = gr;
+	start[so->cur_start].mthd = mthd;
+	start[so->cur_start].size = size;
+	start[so->cur_start].offset = so->pool_cur;
+	so->pool_cur += size;
+
+	so->cur_start++;
+	/* Fold the previous method's words into the running total.
+	 * The 1 is for *this* begin_ring.
+	 */
+	so->total += so->cur + 1;
+	so->cur = 0;
+}
+
+static INLINE void
+so_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo,
+	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
+{
+	/* Record a relocation for the next data word of the current method. */
+	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc];
+	struct nouveau_stateobj_start *cur = &so->start[so->cur_start - 1];
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->reloc_alloc <= so->cur_reloc) {
+		debug_printf("exceeding num_reloc size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	r->bo = NULL;
+	nouveau_bo_ref(bo, &r->bo);
+	r->gr = cur->gr;
+	/* Word offset of the relocated data from the start of the object. */
+	r->push_offset = so->total + so->cur;
+	r->data = data;
+	r->flags = flags;
+	r->mthd = cur->mthd + (so->cur << 2);
+	r->vor = vor;
+	r->tor = tor;
+
+	so_data(so, data);
+	so->cur_reloc++;
+}
+
+/* Report whether any relocation recorded in this state object targets bo. */
+static INLINE boolean
+so_bo_is_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo)
+{
+	unsigned n = so->cur_reloc;
+
+	while (n--) {
+		if (so->reloc[n].bo == bo)
+			return true;
+	}
+	return false;
+}
+
+static INLINE void
+so_emit(struct nouveau_channel *chan, struct nouveau_stateobj *so)
+{
+	unsigned nr, i;
+	int ret = 0;
+	/* Write every recorded method to chan, then register the relocations. */
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->cur_start > 0 && so->start[so->cur_start - 1].size > so->cur) {
+		debug_printf("emit: previous so_method was not filled\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	/* We cannot update total in case we so_emit again. */
+	nr = so->total + so->cur;
+
+	/* This will flush if we need space.
+	 * We don't actually need the marker.
+	 */
+	if ((ret = nouveau_pushbuf_marker_emit(chan, nr, so->cur_reloc))) {
+		debug_printf("so_emit failed marker emit with error %d\n", ret);
+		assert(0);
+	}
+
+	/* Submit data. This will ensure proper binding of objects. */
+	for (i = 0; i < so->cur_start; i++) {
+		BEGIN_RING(chan, so->start[i].gr, so->start[i].mthd, so->start[i].size);
+		OUT_RINGp(chan, &(so->pool[so->start[i].offset]), so->start[i].size);
+	}
+	/* chan->cur - nr is where this object starts in the push buffer. */
+	for (i = 0; i < so->cur_reloc; i++) {
+		struct nouveau_stateobj_reloc *r = &so->reloc[i];
+
+		if ((ret = nouveau_pushbuf_emit_reloc(chan, chan->cur - nr +
+						r->push_offset, r->bo, r->data,
+						0, r->flags, r->vor, r->tor))) {
+			debug_printf("so_emit failed reloc with error %d\n", ret);
+			assert(0);
+		}
+	}
+}
+
+static INLINE void
+so_emit_reloc_markers(struct nouveau_channel *chan, struct nouveau_stateobj *so)
+{
+	unsigned i;
+	int ret = 0;
+	/* For each reloc emit a method header and its data, both flagged NOUVEAU_BO_DUMMY. */
+	if (!so)
+		return;
+
+	/* If we need to flush in flush notify, then we have a problem anyway. */
+	for (i = 0; i < so->cur_reloc; i++) {
+		struct nouveau_stateobj_reloc *r = &so->reloc[i];
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+		if (r->mthd & 0x40000000) {
+			debug_printf("error: NI mthd 0x%08X\n", r->mthd);
+			continue;
+		}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+		/* We don't need to autobind, since there are enough subchannels
+		 * for all objects we use. If this is changed, account for the extra
+		 * space in callers of this function.
+		 */
+		assert(r->gr->bound != NOUVEAU_GROBJ_UNBOUND);
+
+		/* Some relocs really don't like to be hammered,
+		 * NOUVEAU_BO_DUMMY makes sure it only
+		 * happens when needed.
+		 */
+		ret = OUT_RELOC(chan, r->bo, (r->gr->subc << 13) | (1<< 18) |
+			r->mthd, (r->flags & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART
+				| NOUVEAU_BO_RDWR)) | NOUVEAU_BO_DUMMY, 0, 0);
+		if (ret) {
+			debug_printf("OUT_RELOC failed %d\n", ret);
+			assert(0);
+		}
+		/* Second word: the relocated data itself, with the original vor/tor. */
+		ret = OUT_RELOC(chan, r->bo, r->data, r->flags |
+			NOUVEAU_BO_DUMMY, r->vor, r->tor);
+		if (ret) {
+			debug_printf("OUT_RELOC failed %d\n", ret);
+			assert(0);
+		}
+	}
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h
new file mode 100644
index 0000000000..ed6e643785
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_util.h
@@ -0,0 +1,191 @@
+#ifndef __NOUVEAU_UTIL_H__
+#define __NOUVEAU_UTIL_H__
+
+/* Determine how many vertices can be pushed into the command stream.
+ * Where the remaining space isn't large enough to represent all vertices,
+ * split the buffer at primitive boundaries.
+ *
+ * Returns a count of vertices that can be rendered, and an index to
+ * restart drawing at after a flush (*restart is only written on a split).
+ */
+static INLINE unsigned
+nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp,
+		   unsigned mode, unsigned start, unsigned count,
+		   unsigned *restart)
+{
+	int max, adj = 0;
+
+	max  = remaining - overhead;
+	if (max < 0)
+		return 0;
+
+	max *= vpp;
+	if (max >= count)
+		return count;
+
+	switch (mode) {
+	case PIPE_PRIM_POINTS:
+		break;
+	case PIPE_PRIM_LINES:
+		max = max & ~1; /* whole lines only: round down to an even count */
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		max = max - (max % 3);
+		break;
+	case PIPE_PRIM_QUADS:
+		max = max & ~3;
+		break;
+	case PIPE_PRIM_LINE_LOOP:
+	case PIPE_PRIM_LINE_STRIP:
+		if (max < 2)
+			max = 0;
+		adj = 1; /* strips restart adj vertices early to keep them connected */
+		break;
+	case PIPE_PRIM_POLYGON:
+	case PIPE_PRIM_TRIANGLE_STRIP:
+	case PIPE_PRIM_TRIANGLE_FAN:
+		if (max < 3)
+			max = 0;
+		adj = 2;
+		break;
+	case PIPE_PRIM_QUAD_STRIP:
+		if (max < 4)
+			max = 0;
+		adj = 3;
+		break;
+	default:
+		assert(0);
+	}
+
+	*restart = start + max - adj;
+	return max;
+}
+
+/*
+ * Integer base-2 logarithm, rounded towards zero.
+ *
+ * Yields the index of the highest set bit of i, for example
+ * log2i(1) == 0, log2i(3) == 1 and log2i(0x80000000) == 31.
+ * There is no error value: log2i(0) is 0 as well.
+ *
+ * Assumes unsigned is 32 bits wide.
+ */
+static INLINE unsigned log2i(unsigned i)
+{
+	unsigned shift, r = 0;
+
+	/* Binary search: test the upper half of the remaining window and,
+	 * when it is non-empty, move it down and account for its width.
+	 */
+	for (shift = 16; shift; shift >>= 1) {
+		if (i >> shift) {
+			i >>= shift;
+			r += shift;
+		}
+	}
+
+	/* r now holds the sum of all shifts that were applied. */
+	return r;
+}
+
+struct u_split_prim {
+   void *priv; /* passed back as first argument of emit() and edge() */
+   void (*emit)(void *priv, unsigned start, unsigned count);
+   void (*edge)(void *priv, boolean enabled);
+
+   unsigned mode; /* primitive type; u_split_prim_init() stores LINE_LOOP as LINE_STRIP */
+   unsigned start; /* first vertex of the primitive */
+   unsigned p_start; /* first vertex of the next chunk */
+   unsigned p_end; /* start + count, i.e. one past the last vertex */
+
+   uint repeat_first:1; /* emit vertex `start` again at the head of the next chunk */
+   uint close_first:1; /* emit vertex `start` once more after the final chunk */
+   uint edgeflag_off:1; /* edge() was switched off and must be re-enabled */
+};
+
+static inline void
+u_split_prim_init(struct u_split_prim *s,
+                  unsigned mode, unsigned start, unsigned count)
+{
+   /* Line loops are drawn as strips whose first vertex is emitted again last. */
+   const boolean is_loop = (mode == PIPE_PRIM_LINE_LOOP);
+
+   s->mode = is_loop ? PIPE_PRIM_LINE_STRIP : mode;
+   s->close_first = is_loop ? 1 : 0;
+   s->repeat_first = 0;
+   s->edgeflag_off = 0;
+
+   s->start = start;
+   s->p_start = start;
+   s->p_end = start + count;
+   /* priv, emit and edge are left for the caller to fill in. */
+}
+
+static INLINE boolean
+u_split_prim_next(struct u_split_prim *s, unsigned max_verts)
+{
+   int repeat = 0;
+   /* Emit the next chunk of at most max_verts vertices; TRUE once the primitive is complete. */
+   if (s->repeat_first) {
+      s->emit(s->priv, s->start, 1);
+      max_verts--;
+      if (s->edgeflag_off) {
+         s->edge(s->priv, TRUE);
+         s->edgeflag_off = FALSE;
+      }
+   }
+   /* Everything left (plus the closing vertex of a loop) fits: emit it and finish. */
+   if (s->p_start + s->close_first + max_verts >= s->p_end) {
+      s->emit(s->priv, s->p_start, s->p_end - s->p_start);
+      if (s->close_first)
+         s->emit(s->priv, s->start, 1);
+      return TRUE;
+   }
+   /* Otherwise trim max_verts per primitive type; repeat = vertices to send again next time. */
+   switch (s->mode) {
+   case PIPE_PRIM_LINES:
+      max_verts &= ~1;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+      repeat = 1;
+      break;
+   case PIPE_PRIM_POLYGON:
+      max_verts--;
+      s->emit(s->priv, s->p_start, max_verts);
+      s->edge(s->priv, FALSE);
+      s->emit(s->priv, s->p_start + max_verts, 1);
+      s->p_start += max_verts;
+      s->repeat_first = TRUE;
+      s->edgeflag_off = TRUE;
+      return FALSE;
+   case PIPE_PRIM_TRIANGLES:
+      max_verts = max_verts - (max_verts % 3);
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      /* to ensure winding stays correct, always split
+       * on an even number of generated triangles
+       */
+      max_verts = max_verts & ~1;
+      repeat = 2;
+      break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      s->repeat_first = TRUE;
+      repeat = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      max_verts &= ~3;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      max_verts &= ~1;
+      repeat = 2;
+      break;
+   default:
+      break;
+   }
+
+   s->emit (s->priv, s->p_start, max_verts);
+   s->p_start += (max_verts - repeat);
+   return FALSE;
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
new file mode 100644
index 0000000000..cd7da9977d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -0,0 +1,42 @@
+#ifndef NOUVEAU_WINSYS_H
+#define NOUVEAU_WINSYS_H
+
+#include <stdint.h>
+#include "pipe/p_defines.h"
+
+#include "nouveau/nouveau_bo.h"
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_class.h"
+#include "nouveau/nouveau_device.h"
+#include "nouveau/nouveau_grobj.h"
+#include "nouveau/nouveau_notifier.h"
+#include "nouveau/nouveau_resource.h"
+#include "nouveau/nouveau_pushbuf.h"
+
+static inline uint32_t
+nouveau_screen_transfer_flags(unsigned pipe)
+{
+	uint32_t bo_flags = 0;
+
+	/* Read, write and discard each map onto one bo flag. */
+	bo_flags |= (pipe & PIPE_TRANSFER_READ) ? NOUVEAU_BO_RD : 0;
+	bo_flags |= (pipe & PIPE_TRANSFER_WRITE) ? NOUVEAU_BO_WR : 0;
+	bo_flags |= (pipe & PIPE_TRANSFER_DISCARD) ? NOUVEAU_BO_INVAL : 0;
+
+	/* DONTBLOCK takes precedence: NOSYNC is only considered without it. */
+	if (pipe & PIPE_TRANSFER_DONTBLOCK) {
+		bo_flags |= NOUVEAU_BO_NOWAIT;
+	} else if (pipe & PIPE_TRANSFER_UNSYNCHRONIZED) {
+		bo_flags |= NOUVEAU_BO_NOSYNC;
+	}
+
+	return bo_flags;
+}
+
+extern struct pipe_screen *
+nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
+
+extern struct pipe_screen *
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
+
+#endif
diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile
new file mode 100644
index 0000000000..e31e6f8662
--- /dev/null
+++ b/src/gallium/drivers/nv50/Makefile
@@ -0,0 +1,24 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+# LIBNAME and C_SOURCES are presumably consumed by ../../Makefile.template below.
+LIBNAME = nv50
+# Source files of the nv50 driver; the SConscript in this directory lists the same set.
+C_SOURCES = \
+	nv50_buffer.c \
+	nv50_clear.c \
+	nv50_context.c \
+	nv50_draw.c \
+	nv50_miptree.c \
+	nv50_query.c \
+	nv50_program.c \
+	nv50_resource.c \
+	nv50_screen.c \
+	nv50_state.c \
+	nv50_state_validate.c \
+	nv50_surface.c \
+	nv50_tex.c \
+	nv50_transfer.c \
+	nv50_vbo.c \
+	nv50_push.c
+# The build rules themselves are pulled in from the template two directories up.
+include ../../Makefile.template
diff --git a/src/gallium/drivers/nv50/SConscript b/src/gallium/drivers/nv50/SConscript
new file mode 100644
index 0000000000..8625f92622
--- /dev/null
+++ b/src/gallium/drivers/nv50/SConscript
@@ -0,0 +1,26 @@
+Import('*')
+# Work on a private copy of the imported construction environment.
+env = env.Clone()
+# Source list mirrors C_SOURCES in the Makefile of this directory.
+nv50 = env.ConvenienceLibrary(
+    target = 'nv50',
+    source = [
+        'nv50_buffer.c',
+        'nv50_clear.c',
+        'nv50_context.c',
+        'nv50_draw.c',
+        'nv50_miptree.c',
+        'nv50_query.c',
+        'nv50_program.c',
+        'nv50_resource.c',
+        'nv50_screen.c',
+        'nv50_state.c',
+        'nv50_state_validate.c',
+        'nv50_surface.c',
+        'nv50_tex.c',
+        'nv50_transfer.c',
+        'nv50_vbo.c',
+        'nv50_push.c',
+    ])
+# Make the library node available to the scripts that import it.
+Export('nv50')
diff --git a/src/gallium/drivers/nv50/nv50_buffer.c b/src/gallium/drivers/nv50/nv50_buffer.c
new file mode 100644
index 0000000000..dacfee9799
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_buffer.c
@@ -0,0 +1,150 @@
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "nouveau/nouveau_screen.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nv50_resource.h"
+
+
+/* resource_destroy entry of nv50_buffer_vtbl: release the bo, then free the wrapper. */
+static void nv50_buffer_destroy(struct pipe_screen *pscreen,
+				struct pipe_resource *presource)
+{
+	struct nv50_resource *buffer = nv50_resource(presource);
+
+	nouveau_screen_bo_release(pscreen, buffer->bo);
+	FREE(buffer);
+}
+
+
+
+
+/* Map the byte range box.x .. box.x + box.width of a buffer resource.
+ * Transfer creation and destruction are handled by the generic helpers in
+ * the vtbl, so the bo is only touched from here on.
+ */
+static void *
+nv50_buffer_transfer_map( struct pipe_context *pipe,
+			  struct pipe_transfer *transfer )
+{
+	struct nv50_resource *buffer = nv50_resource(transfer->resource);
+	const uint32_t bo_flags = nouveau_screen_transfer_flags(transfer->usage);
+	uint8_t *base;
+
+	base = nouveau_screen_bo_map_range(pipe->screen, buffer->bo,
+					   transfer->box.x, transfer->box.width,
+					   bo_flags);
+	if (!base)
+		return NULL;
+	/* The pointer handed back is advanced by the start of the box. */
+	return base + transfer->box.x;
+}
+
+
+/* Flush box, whose x is taken relative to the transfer's own box, on the bo. */
+static void nv50_buffer_transfer_flush_region( struct pipe_context *pipe,
+					       struct pipe_transfer *transfer,
+					       const struct pipe_box *box)
+{
+	struct nv50_resource *buffer = nv50_resource(transfer->resource);
+
+	nouveau_screen_bo_map_flush_range(pipe->screen,
+					  buffer->bo,
+					  transfer->box.x + box->x,
+					  box->width);
+}
+
+static void nv50_buffer_transfer_unmap( struct pipe_context *pipe,
+					struct pipe_transfer *transfer )
+{
+	struct nv50_resource *buffer = nv50_resource(transfer->resource);
+
+	nouveau_screen_bo_unmap(pipe->screen, buffer->bo); /* the transfer box is not consulted */
+}
+
+
+
+/* Resource vtbl for nv50 buffers; the u_default_* entries are generic helpers. */
+const struct u_resource_vtbl nv50_buffer_vtbl =
+{
+	u_default_resource_get_handle,      /* get_handle */
+	nv50_buffer_destroy,		    /* resource_destroy */
+	NULL,			            /* is_resource_referenced */
+	u_default_get_transfer,		    /* get_transfer */
+	u_default_transfer_destroy,	    /* transfer_destroy */
+	nv50_buffer_transfer_map,	    /* transfer_map */
+	nv50_buffer_transfer_flush_region,  /* transfer_flush_region */
+	nv50_buffer_transfer_unmap,	    /* transfer_unmap */
+	u_default_transfer_inline_write	    /* transfer_inline_write */
+};
+
+
+
+
+/* Create a buffer resource from template, backed by a new bo. Returns NULL
+ * if either the wrapper or the bo cannot be allocated. */
+struct pipe_resource *
+nv50_buffer_create(struct pipe_screen *pscreen,
+		   const struct pipe_resource *template)
+{
+	struct nv50_resource *res = CALLOC_STRUCT(nv50_resource);
+
+	if (!res)
+		return NULL;
+
+	/* Start from the caller's template, then set the fields that
+	 * belong to the new resource itself.
+	 */
+	res->base = *template;
+	res->base.screen = pscreen;
+	pipe_reference_init(&res->base.reference, 1);
+	res->vtbl = &nv50_buffer_vtbl;
+
+	/* Backing storage: size width0, alignment 16. */
+	res->bo = nouveau_screen_bo_new(pscreen, 16, res->base.usage,
+					res->base.bind, res->base.width0);
+	if (!res->bo) {
+		/* Only the wrapper exists at this point. */
+		FREE(res);
+		return NULL;
+	}
+
+	return &res->base;
+}
+
+
+/* Wrap the memory at ptr (bytes long) in a buffer resource. Returns NULL
+ * if the wrapper or the user bo cannot be created. */
+struct pipe_resource *
+nv50_user_buffer_create(struct pipe_screen *pscreen,
+			void *ptr,
+			unsigned bytes,
+			unsigned bind)
+{
+	struct nv50_resource *res = CALLOC_STRUCT(nv50_resource);
+
+	if (!res)
+		return NULL;
+
+	res->vtbl = &nv50_buffer_vtbl;
+	pipe_reference_init(&res->base.reference, 1);
+
+	/* Describe the memory as a one-dimensional run of bytes. */
+	res->base.screen = pscreen;
+	res->base.bind = bind;
+	res->base.usage = PIPE_USAGE_IMMUTABLE;
+	res->base.format = PIPE_FORMAT_R8_UNORM;
+	res->base.width0 = bytes;
+	res->base.height0 = 1;
+	res->base.depth0 = 1;
+
+	res->bo = nouveau_screen_bo_user(pscreen, ptr, bytes);
+	if (!res->bo) {
+		FREE(res);
+		return NULL;
+	}
+	return &res->base;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c
new file mode 100644
index 0000000000..ee7cf281f4
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_clear.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv50_context.h"
+
+void
+nv50_clear(struct pipe_context *pipe, unsigned buffers,
+	   const float *rgba, double depth, unsigned stencil)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+	unsigned mode = 0, i;
+	const unsigned dirty = nv50->dirty;
+	/* Clear the PIPE_CLEAR_* selected buffers of the bound framebuffer. */
+	/* don't need NEW_BLEND, NV50TCL_COLOR_MASK doesn't affect CLEAR_BUFFERS */
+	nv50->dirty &= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+	if (!nv50_state_validate(nv50, 64))
+		goto out;
+
+	if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
+		BEGIN_RING(chan, tesla, NV50TCL_CLEAR_COLOR(0), 4);
+		OUT_RING  (chan, fui(rgba[0]));
+		OUT_RING  (chan, fui(rgba[1]));
+		OUT_RING  (chan, fui(rgba[2]));
+		OUT_RING  (chan, fui(rgba[3]));
+		mode |= 0x3c;
+	}
+	if (buffers & PIPE_CLEAR_DEPTH) {
+		BEGIN_RING(chan, tesla, NV50TCL_CLEAR_DEPTH, 1);
+		OUT_RING  (chan, fui(depth));
+		mode |= NV50TCL_CLEAR_BUFFERS_Z;
+	}
+	if (buffers & PIPE_CLEAR_STENCIL) {
+		BEGIN_RING(chan, tesla, NV50TCL_CLEAR_STENCIL, 1);
+		OUT_RING  (chan, stencil & 0xff);
+		mode |= NV50TCL_CLEAR_BUFFERS_S;
+	}
+	BEGIN_RING(chan, tesla, NV50TCL_CLEAR_BUFFERS, 1);
+	OUT_RING  (chan, mode);
+	/* Further colour buffers are cleared only when a colour clear was requested. */
+	for (i = 1; (buffers & PIPE_CLEAR_COLOR) && i < fb->nr_cbufs; i++) {
+		BEGIN_RING(chan, tesla, NV50TCL_CLEAR_BUFFERS, 1);
+		OUT_RING  (chan, (i << 6) | 0x3c);
+	}
+out:
+	/* The bits masked off above must survive, also when validation failed. */
+	nv50->dirty = dirty;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
new file mode 100644
index 0000000000..915a925402
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+
+#include "nv50_context.h"
+#include "nv50_screen.h"
+#include "nv50_resource.h"
+
+static void
+nv50_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	/* NOTE(review): fence is never written here - confirm callers do not expect one. */
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+		BEGIN_RING(chan, nv50->screen->tesla, 0x1338, 1);
+		OUT_RING  (chan, 0x20);
+	}
+	/* FIRE_RING is issued only for PIPE_FLUSH_FRAME. */
+	if (flags & PIPE_FLUSH_FRAME)
+		FIRE_RING(chan);
+}
+
+static void
+nv50_destroy(struct pipe_context *pipe)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	int i;
+	/* Drop the references held in state.hw; 64 is presumably the array size - TODO confirm. */
+	for (i = 0; i < 64; i++) {
+		if (!nv50->state.hw[i])
+			continue;
+		so_ref(NULL, &nv50->state.hw[i]);
+	}
+
+	draw_destroy(nv50->draw);
+	/* Do not leave the screen pointing at the context that is about to be freed. */
+	if (nv50->screen->cur_ctx == nv50)
+		nv50->screen->cur_ctx = NULL;
+
+	FREE(nv50);
+}
+
+/* Create an nv50 pipe_context on pscreen; returns NULL if the context cannot be allocated. */
+struct pipe_context *
+nv50_create(struct pipe_screen *pscreen, void *priv)
+{
+	struct pipe_winsys *pipe_winsys = pscreen->winsys;
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	struct nv50_context *nv50;
+
+	nv50 = CALLOC_STRUCT(nv50_context);
+	if (!nv50)
+		return NULL;
+	nv50->screen = screen;
+
+	nv50->pipe.winsys = pipe_winsys;
+	nv50->pipe.screen = pscreen;
+	nv50->pipe.priv = priv;
+
+	nv50->pipe.destroy = nv50_destroy;
+
+	nv50->pipe.draw_arrays = nv50_draw_arrays;
+	nv50->pipe.draw_arrays_instanced = nv50_draw_arrays_instanced;
+	nv50->pipe.draw_elements = nv50_draw_elements;
+	nv50->pipe.draw_elements_instanced = nv50_draw_elements_instanced;
+	nv50->pipe.clear = nv50_clear;
+
+	nv50->pipe.flush = nv50_flush;
+	/* NOTE(review): every new context overwrites the channel's user_private. */
+	screen->base.channel->user_private = nv50;
+
+	nv50_init_surface_functions(nv50);
+	nv50_init_state_functions(nv50);
+	nv50_init_query_functions(nv50);
+	nv50_init_resource_functions(&nv50->pipe);
+	/* NOTE(review): a failed draw_create() is only caught by the assert below. */
+	nv50->draw = draw_create(&nv50->pipe);
+	assert(nv50->draw);
+	draw_set_rasterize_stage(nv50->draw, nv50_draw_render_stage(nv50));
+
+	return &nv50->pipe;
+}
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
new file mode 100644
index 0000000000..61807dd999
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -0,0 +1,276 @@
+#ifndef __NV50_CONTEXT_H__
+#define __NV50_CONTEXT_H__
+
+#include <stdio.h>
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_stateobj.h"
+
+#include "nv50_screen.h"
+#include "nv50_program.h"
+
+/* Diagnostic helpers that print to stderr.  NOUVEAU_ERR prefixes the message
+ * with the calling function name and line number, NOUVEAU_MSG with
+ * "nouveau: ".  Both rely on the GNU named-variadic-macro extension.
+ *
+ * NOTE(review): both expansions already end in ';', so "NOUVEAU_ERR(...);"
+ * produces an extra empty statement and the macros cannot be used as the
+ * unbraced body of an if that is followed by an else.
+ */
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+/* Constant buffer assignment */
+#define NV50_CB_PMISC		0
+#define NV50_CB_PVP		1
+#define NV50_CB_PFP		2
+#define NV50_CB_PGP		3
+#define NV50_CB_AUX		4
+
+#define NV50_NEW_BLEND		(1 << 0)
+#define NV50_NEW_ZSA		(1 << 1)
+#define NV50_NEW_BLEND_COLOUR	(1 << 2)
+#define NV50_NEW_STIPPLE	(1 << 3)
+#define NV50_NEW_SCISSOR	(1 << 4)
+#define NV50_NEW_VIEWPORT	(1 << 5)
+#define NV50_NEW_RASTERIZER	(1 << 6)
+#define NV50_NEW_FRAMEBUFFER	(1 << 7)
+#define NV50_NEW_VERTPROG	(1 << 8)
+#define NV50_NEW_VERTPROG_CB	(1 << 9)
+#define NV50_NEW_FRAGPROG	(1 << 10)
+#define NV50_NEW_FRAGPROG_CB	(1 << 11)
+#define NV50_NEW_GEOMPROG	(1 << 12)
+#define NV50_NEW_GEOMPROG_CB	(1 << 13)
+#define NV50_NEW_ARRAYS		(1 << 14)
+#define NV50_NEW_SAMPLER	(1 << 15)
+#define NV50_NEW_TEXTURE	(1 << 16)
+#define NV50_NEW_STENCIL_REF	(1 << 17)
+
+/* Gallium blend state paired with a nouveau state object. */
+struct nv50_blend_stateobj {
+	struct pipe_blend_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+/* Gallium depth/stencil/alpha state paired with a nouveau state object. */
+struct nv50_zsa_stateobj {
+	struct pipe_depth_stencil_alpha_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+/* Gallium rasterizer state paired with a nouveau state object. */
+struct nv50_rasterizer_stateobj {
+	struct pipe_rasterizer_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+/* Sampler state: eight words in tsc[] plus a normalized flag.
+ * NOTE(review): tsc[] presumably holds the hardware sampler descriptor -
+ * confirm against the code that fills it.
+ */
+struct nv50_sampler_stateobj {
+	boolean normalized;
+	unsigned tsc[8];
+};
+
+/* Sampler view wrapper.  The pipe_sampler_view must stay the first member
+ * because nv50_sampler_view() casts between the two types.
+ */
+struct nv50_sampler_view {
+	struct pipe_sampler_view pipe;
+	uint32_t tic[8];
+};
+
+/* Vertex element state: up to 16 gallium vertex elements, how many of them
+ * are in use, and one 32-bit word per element in hw[].
+ */
+struct nv50_vtxelt_stateobj {
+	struct pipe_vertex_element pipe[16];
+	unsigned num_elements;
+	uint32_t hw[16];
+};
+
+/* Downcast a gallium sampler view to the nv50 wrapper that embeds it as
+ * its first member.
+ */
+static INLINE struct nv50_sampler_view *
+nv50_sampler_view(struct pipe_sampler_view *view)
+{
+	struct nv50_sampler_view *wrapped = (struct nv50_sampler_view *)view;
+
+	return wrapped;
+}
+
+/* Tile height encoded in the low nibble of tile_mode: 1 << (nibble + 2). */
+static INLINE unsigned
+get_tile_height(uint32_t tile_mode)
+{
+	const uint32_t shift = (tile_mode & 0xf) + 2;
+
+	return 1 << shift;
+}
+
+/* Tile depth encoded in the bits above the low nibble of tile_mode:
+ * 1 << (tile_mode >> 4).
+ */
+static INLINE unsigned
+get_tile_depth(uint32_t tile_mode)
+{
+	const uint32_t shift = tile_mode >> 4;
+
+	return 1 << shift;
+}
+
+
+/* nv50 surface: currently nothing but the gallium pipe_surface, which must
+ * remain the first member because nv50_surface() casts between the types.
+ */
+struct nv50_surface {
+	struct pipe_surface base;
+};
+
+/* Downcast a gallium surface to the nv50 surface wrapping it. */
+static INLINE struct nv50_surface *
+nv50_surface(struct pipe_surface *pt)
+{
+	struct nv50_surface *wrapped = (struct nv50_surface *)pt;
+
+	return wrapped;
+}
+
+/* Hardware-side state tracked per context (embedded in nv50_context). */
+struct nv50_state {
+	/* state objects; nv50_destroy() unreferences every non-NULL slot */
+	struct nouveau_stateobj *hw[64];
+	/* NOTE(review): presumably one dirty bit per hw[] slot - confirm */
+	uint64_t hw_dirty;
+
+	unsigned sampler_view_nr[3];
+	struct nouveau_stateobj *vtxbuf;
+	struct nouveau_stateobj *vtxattr;
+	unsigned vtxelt_nr;
+};
+
+/* Per-context driver state.  The pipe_context must stay the first member
+ * because nv50_context() casts a pipe_context pointer to this type.
+ */
+struct nv50_context {
+	struct pipe_context pipe;
+
+	/* owning screen, set in nv50_create() */
+	struct nv50_screen *screen;
+
+	/* draw module created in nv50_create(), destroyed in nv50_destroy() */
+	struct draw_context *draw;
+
+	struct nv50_state state;
+
+	/* NOTE(review): presumably a mask of NV50_NEW_* bits - confirm */
+	unsigned dirty;
+	struct nv50_blend_stateobj *blend;
+	struct nv50_zsa_stateobj *zsa;
+	struct nv50_rasterizer_stateobj *rasterizer;
+	struct pipe_blend_color blend_colour;
+	struct pipe_stencil_ref stencil_ref;
+	struct pipe_poly_stipple stipple;
+	struct pipe_scissor_state scissor;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct nv50_program *vertprog;
+	struct nv50_program *fragprog;
+	struct nv50_program *geomprog;
+	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct nv50_vtxelt_stateobj *vtxelt;
+	/* first index has 3 entries - presumably one per shader stage
+	 * (vertex, fragment, geometry); TODO confirm the ordering */
+	struct nv50_sampler_stateobj *sampler[3][PIPE_MAX_SAMPLERS];
+	unsigned sampler_nr[3];
+	struct pipe_sampler_view *sampler_views[3][PIPE_MAX_SAMPLERS];
+	unsigned sampler_view_nr[3];
+
+	unsigned vbo_fifo;
+};
+
+/* Downcast a gallium context to the nv50 context that embeds it. */
+static INLINE struct nv50_context *
+nv50_context(struct pipe_context *pipe)
+{
+	struct nv50_context *wrapped = (struct nv50_context *)pipe;
+
+	return wrapped;
+}
+
+extern void nv50_init_surface_functions(struct nv50_context *nv50);
+extern void nv50_init_state_functions(struct nv50_context *nv50);
+extern void nv50_init_query_functions(struct nv50_context *nv50);
+extern void nv50_init_transfer_functions(struct nv50_context *nv50);
+
+extern void nv50_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+extern int
+nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
+		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		     int w, int h);
+
+/* nv50_draw.c */
+extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50);
+
+/* nv50_vbo.c */
+extern void nv50_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern void nv50_draw_arrays_instanced(struct pipe_context *, unsigned mode,
+					unsigned start, unsigned count,
+					unsigned startInstance,
+					unsigned instanceCount);
+extern void nv50_draw_elements(struct pipe_context *pipe,
+				  struct pipe_resource *indexBuffer,
+				  unsigned indexSize, int indexBias,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+extern void nv50_draw_elements_instanced(struct pipe_context *pipe,
+					 struct pipe_resource *indexBuffer,
+					 unsigned indexSize, int indexBias,
+					 unsigned mode, unsigned start,
+					 unsigned count,
+					 unsigned startInstance,
+					 unsigned instanceCount);
+extern void nv50_vtxelt_construct(struct nv50_vtxelt_stateobj *cso);
+extern struct nouveau_stateobj *nv50_vbo_validate(struct nv50_context *nv50);
+
+/* nv50_push.c */
+extern void
+nv50_push_elements_instanced(struct pipe_context *, struct pipe_resource *,
+			     unsigned idxsize, int idxbias,
+                             unsigned mode, unsigned start,
+			     unsigned count, unsigned i_start,
+			     unsigned i_count);
+
+/* nv50_clear.c */
+extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
+		       const float *rgba, double depth, unsigned stencil);
+
+/* nv50_program.c */
+extern struct nouveau_stateobj *
+nv50_vertprog_validate(struct nv50_context *nv50);
+extern struct nouveau_stateobj *
+nv50_fragprog_validate(struct nv50_context *nv50);
+extern struct nouveau_stateobj *
+nv50_geomprog_validate(struct nv50_context *nv50);
+extern struct nouveau_stateobj *
+nv50_fp_linkage_validate(struct nv50_context *nv50);
+extern struct nouveau_stateobj *
+nv50_gp_linkage_validate(struct nv50_context *nv50);
+extern void nv50_program_destroy(struct nv50_context *nv50,
+				 struct nv50_program *p);
+
+/* nv50_state_validate.c */
+extern boolean nv50_state_validate(struct nv50_context *nv50, unsigned dwords);
+
+extern void nv50_so_init_sifc(struct nv50_context *nv50,
+			      struct nouveau_stateobj *so,
+			      struct nouveau_bo *bo, unsigned reloc,
+			      unsigned offset, unsigned size);
+
+/* nv50_tex.c */
+extern boolean nv50_tex_construct(struct nv50_sampler_view *view);
+extern void nv50_tex_relocs(struct nv50_context *);
+extern struct nouveau_stateobj *nv50_tex_validate(struct nv50_context *);
+
+
+/* nv50_context.c */
+struct pipe_context *
+nv50_create(struct pipe_screen *pscreen, void *priv);
+
+/* Translate a gallium PIPE_PRIM_* primitive type into the matching
+ * NV50TCL_VERTEX_BEGIN_* value.  Unknown types are reported through
+ * NOUVEAU_ERR and mapped to points.
+ */
+static INLINE unsigned
+nv50_prim(unsigned mode)
+{
+	unsigned hw_prim;
+
+	switch (mode) {
+	case PIPE_PRIM_POINTS:
+		hw_prim = NV50TCL_VERTEX_BEGIN_POINTS;
+		break;
+	case PIPE_PRIM_LINES:
+		hw_prim = NV50TCL_VERTEX_BEGIN_LINES;
+		break;
+	case PIPE_PRIM_LINE_LOOP:
+		hw_prim = NV50TCL_VERTEX_BEGIN_LINE_LOOP;
+		break;
+	case PIPE_PRIM_LINE_STRIP:
+		hw_prim = NV50TCL_VERTEX_BEGIN_LINE_STRIP;
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		hw_prim = NV50TCL_VERTEX_BEGIN_TRIANGLES;
+		break;
+	case PIPE_PRIM_TRIANGLE_STRIP:
+		hw_prim = NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP;
+		break;
+	case PIPE_PRIM_TRIANGLE_FAN:
+		hw_prim = NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN;
+		break;
+	case PIPE_PRIM_QUADS:
+		hw_prim = NV50TCL_VERTEX_BEGIN_QUADS;
+		break;
+	case PIPE_PRIM_QUAD_STRIP:
+		hw_prim = NV50TCL_VERTEX_BEGIN_QUAD_STRIP;
+		break;
+	case PIPE_PRIM_POLYGON:
+		hw_prim = NV50TCL_VERTEX_BEGIN_POLYGON;
+		break;
+	case PIPE_PRIM_LINES_ADJACENCY:
+		hw_prim = NV50TCL_VERTEX_BEGIN_LINES_ADJACENCY;
+		break;
+	case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+		hw_prim = NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY;
+		break;
+	case PIPE_PRIM_TRIANGLES_ADJACENCY:
+		hw_prim = NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY;
+		break;
+	case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+		hw_prim = NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY;
+		break;
+	default:
+		/* complain, then fall back to points */
+		NOUVEAU_ERR("invalid primitive type %d\n", mode);
+		hw_prim = NV50TCL_VERTEX_BEGIN_POINTS;
+		break;
+	}
+
+	return hw_prim;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_draw.c b/src/gallium/drivers/nv50/nv50_draw.c
new file mode 100644
index 0000000000..2f6f607261
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_draw.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "draw/draw_pipe.h"
+
+#include "nv50_context.h"
+
+/* Draw-module stage plus a back pointer to the owning nv50 context.  The
+ * draw_stage must stay the first member because nv50_render_stage() casts
+ * between the two types.
+ */
+struct nv50_render_stage {
+	struct draw_stage stage;
+	struct nv50_context *nv50;
+};
+
+/* Downcast a draw stage to the nv50 render stage that embeds it. */
+static INLINE struct nv50_render_stage *
+nv50_render_stage(struct draw_stage *stage)
+{
+	struct nv50_render_stage *wrapped = (struct nv50_render_stage *)stage;
+
+	return wrapped;
+}
+
+/* Point callback of the render stage: not implemented, it only logs the
+ * calling function and line through NOUVEAU_ERR.
+ */
+static void
+nv50_render_point(struct draw_stage *stage, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Line callback of the render stage: not implemented, only logs. */
+static void
+nv50_render_line(struct draw_stage *stage, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Triangle callback of the render stage: not implemented, only logs. */
+static void
+nv50_render_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Flush callback of the render stage: intentionally does nothing. */
+static void
+nv50_render_flush(struct draw_stage *stage, unsigned flags)
+{
+}
+
+/* Stipple-counter reset callback: not implemented, only logs. */
+static void
+nv50_render_reset_stipple_counter(struct draw_stage *stage)
+{
+	NOUVEAU_ERR("\n");
+}
+
+/* Destroy callback: frees the stage allocated by nv50_draw_render_stage(). */
+static void
+nv50_render_destroy(struct draw_stage *stage)
+{
+	FREE(stage);
+}
+
+/* Allocate the nv50 render stage for nv50's draw module and hook up its
+ * callbacks.
+ *
+ * Returns the embedded draw_stage, or NULL if the allocation fails (the
+ * original dereferenced the CALLOC_STRUCT result unconditionally).
+ */
+struct draw_stage *
+nv50_draw_render_stage(struct nv50_context *nv50)
+{
+	struct nv50_render_stage *rs = CALLOC_STRUCT(nv50_render_stage);
+
+	if (!rs)
+		return NULL;
+
+	rs->nv50 = nv50;
+	rs->stage.draw = nv50->draw;
+	rs->stage.destroy = nv50_render_destroy;
+	rs->stage.point = nv50_render_point;
+	rs->stage.line = nv50_render_line;
+	rs->stage.tri = nv50_render_tri;
+	rs->stage.flush = nv50_render_flush;
+	rs->stage.reset_stipple_counter = nv50_render_reset_stipple_counter;
+
+	return &rs->stage;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
new file mode 100644
index 0000000000..b7cd92158f
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nv50_context.h"
+#include "nv50_resource.h"
+#include "nv50_transfer.h"
+
+/* Choose a tile mode for a level with ny block rows and depth d.
+ *
+ * The low nibble encodes the tile height (0x00..0x04), the next nibble the
+ * tile depth (0x10..0x50).  For d == 1 only the height part is returned;
+ * otherwise the height part is capped at 0x02 before the depth part is
+ * added.  These restrictions probably aren't necessary.
+ */
+static INLINE uint32_t
+get_tile_mode(unsigned ny, unsigned d)
+{
+	uint32_t height_bits;
+	uint32_t depth_bits;
+
+	if (ny > 32)
+		height_bits = 0x04; /* height 64 tiles */
+	else if (ny > 16)
+		height_bits = 0x03; /* height 32 tiles */
+	else if (ny > 8)
+		height_bits = 0x02; /* height 16 tiles */
+	else if (ny > 4)
+		height_bits = 0x01; /* height 8 tiles */
+	else
+		height_bits = 0x00;
+
+	if (d == 1)
+		return height_bits;
+
+	if (height_bits > 0x02)
+		height_bits = 0x02;
+
+	if (d > 16 && height_bits < 0x02)
+		depth_bits = 0x50; /* depth 32 tiles */
+	else if (d > 8)
+		depth_bits = 0x40; /* depth 16 tiles */
+	else if (d > 4)
+		depth_bits = 0x30; /* depth 8 tiles */
+	else if (d > 2)
+		depth_bits = 0x20; /* depth 4 tiles */
+	else
+		depth_bits = 0x10;
+
+	return height_bits | depth_bits;
+}
+
+/* Byte offset of depth slice z inside a level that uses tile_mode.
+ *
+ * pitch is the level's row pitch and nb_h its height in block rows.
+ * Slices are grouped into volume tiles of tile_d slices each.
+ */
+static INLINE unsigned
+get_zslice_offset(unsigned tile_mode, unsigned z, unsigned pitch, unsigned nb_h)
+{
+	const unsigned tile_h = get_tile_height(tile_mode);
+	const unsigned tile_d = get_tile_depth(tile_mode);
+
+	/* distance to the next slice within the same volume tile */
+	const unsigned slice_stride = tile_h * 64;
+	/* size in bytes of one complete volume tile */
+	const unsigned tile_stride = tile_d * align(nb_h, tile_h) * pitch;
+
+	const unsigned slice_in_tile = z % tile_d;
+	const unsigned tile_index = z / tile_d;
+
+	return slice_in_tile * slice_stride + tile_index * tile_stride;
+}
+
+
+
+
+/* resource_destroy hook: frees each level's image offset table, releases
+ * the buffer object and frees the miptree itself.
+ */
+static void
+nv50_miptree_destroy(struct pipe_screen *pscreen,
+		     struct pipe_resource *pt)
+{
+	struct nv50_miptree *mt = nv50_miptree(pt);
+	unsigned level = 0;
+
+	while (level <= pt->last_level) {
+		FREE(mt->level[level].image_offset);
+		level++;
+	}
+
+	nouveau_screen_bo_release(pscreen, mt->base.bo);
+	FREE(mt);
+}
+
+/* get_handle hook: export the miptree's buffer object through whandle.
+ *
+ * The stride handed to the winsys is derived from the resource format and
+ * width0.  Returns FALSE when there is no miptree or no buffer object,
+ * otherwise whatever nouveau_screen_bo_get_handle() returns.
+ */
+static boolean
+nv50_miptree_get_handle(struct pipe_screen *pscreen,
+			struct pipe_resource *pt,
+			struct winsys_handle *whandle)
+{
+	struct nv50_miptree *mt = nv50_miptree(pt);
+
+	if (mt == NULL || mt->base.bo == NULL)
+		return FALSE;
+
+	return nouveau_screen_bo_get_handle(pscreen, mt->base.bo,
+			util_format_get_stride(mt->base.base.format,
+					       mt->base.base.width0),
+			whandle);
+}
+
+
+const struct u_resource_vtbl nv50_miptree_vtbl =
+{
+   nv50_miptree_get_handle,	      /* get_handle */
+   nv50_miptree_destroy,	      /* resource_destroy */
+   NULL,			      /* is_resource_referenced */
+   nv50_miptree_transfer_new,	      /* get_transfer */
+   nv50_miptree_transfer_del,     /* transfer_destroy */
+   nv50_miptree_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   nv50_miptree_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+
+
+
+/* Create a tiled VRAM miptree described by the template tmp.
+ *
+ * For every mip level the pitch (64-byte aligned) and tile mode are
+ * computed, then the images (6 for cube maps, 1 otherwise) are laid out
+ * one after another, each containing all its levels, and a single buffer
+ * object of the total size is allocated.
+ *
+ * Returns the new resource, or NULL if any allocation fails; everything
+ * allocated up to that point is released.
+ */
+struct pipe_resource *
+nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *tmp)
+{
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
+	struct pipe_resource *pt;
+	unsigned width = tmp->width0, height = tmp->height0;
+	unsigned depth = tmp->depth0, image_alignment;
+	uint32_t tile_flags;
+	int ret, i, l;
+
+	if (!mt)
+		return NULL;
+
+	/* only form the member address once mt is known to be valid */
+	pt = &mt->base.base;
+
+	*pt = *tmp;
+	mt->base.vtbl = &nv50_miptree_vtbl;
+	pipe_reference_init(&pt->reference, 1);
+	pt->screen = pscreen;
+
+	/* memory-type flags passed to nouveau_bo_new_tile, per format */
+	switch (pt->format) {
+	case PIPE_FORMAT_Z32_FLOAT:
+		tile_flags = 0x4800;
+		break;
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		tile_flags = 0x1800;
+		break;
+	case PIPE_FORMAT_Z16_UNORM:
+		tile_flags = 0x6c00;
+		break;
+	case PIPE_FORMAT_Z24X8_UNORM:
+	case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+		tile_flags = 0x2800;
+		break;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+		tile_flags = 0x7400;
+		break;
+	default:
+		if ((pt->bind & PIPE_BIND_SCANOUT) &&
+		    util_format_get_blocksizebits(pt->format) == 32)
+			tile_flags = 0x7a00;
+		else
+			tile_flags = 0x7000;
+		break;
+	}
+
+	/* XXX: texture arrays */
+	mt->image_nr = (pt->target == PIPE_TEXTURE_CUBE) ? 6 : 1;
+
+	for (l = 0; l <= pt->last_level; l++) {
+		struct nv50_miptree_level *lvl = &mt->level[l];
+		unsigned nblocksy = util_format_get_nblocksy(pt->format, height);
+
+		lvl->image_offset = CALLOC(mt->image_nr,
+					   sizeof(*lvl->image_offset));
+		/* the table is indexed below, so it must exist */
+		if (!lvl->image_offset)
+			goto fail;
+		lvl->pitch = align(util_format_get_stride(pt->format, width), 64);
+		lvl->tile_mode = get_tile_mode(nblocksy, depth);
+
+		width = u_minify(width, 1);
+		height = u_minify(height, 1);
+		depth = u_minify(depth, 1);
+	}
+
+	image_alignment  = get_tile_height(mt->level[0].tile_mode) * 64;
+	image_alignment *= get_tile_depth(mt->level[0].tile_mode);
+
+	/* NOTE the distinction between arrays of mip-mapped 2D textures and
+	 * mip-mapped 3D textures. We can't use image_nr == depth for 3D mip.
+	 */
+	for (i = 0; i < mt->image_nr; i++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			struct nv50_miptree_level *lvl = &mt->level[l];
+			int size;
+			unsigned tile_h = get_tile_height(lvl->tile_mode);
+			unsigned tile_d = get_tile_depth(lvl->tile_mode);
+
+			size  = lvl->pitch;
+			size *= align(util_format_get_nblocksy(pt->format, u_minify(pt->height0, l)), tile_h);
+			size *= align(u_minify(pt->depth0, l), tile_d);
+
+			lvl->image_offset[i] = mt->total_size;
+
+			mt->total_size += size;
+		}
+		mt->total_size = align(mt->total_size, image_alignment);
+	}
+
+	ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size,
+				  mt->level[0].tile_mode, tile_flags,
+				  &mt->base.bo);
+	if (ret)
+		goto fail;
+
+	return pt;
+
+fail:
+	/* mt came from CALLOC_STRUCT, so levels that never received an
+	 * offset table still hold NULL and are skipped */
+	for (l = 0; l <= pt->last_level; ++l) {
+		if (mt->level[l].image_offset)
+			FREE(mt->level[l].image_offset);
+	}
+	FREE(mt);
+	return NULL;
+}
+
+
+/* Wrap a buffer object imported through whandle in a miptree.
+ *
+ * Only 2D, non-mipmapped, depth-1 templates are supported for the moment.
+ * The level-0 pitch is the stride reported by the winsys and its tile mode
+ * is taken from the buffer object.
+ *
+ * Returns the new resource, or NULL for an unsupported template, a failed
+ * import or an allocation failure.
+ */
+struct pipe_resource *
+nv50_miptree_from_handle(struct pipe_screen *pscreen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle)
+{
+	struct nv50_miptree *mt;
+	unsigned stride;
+
+	/* Only supports 2D, non-mipmapped textures for the moment */
+	if (template->target != PIPE_TEXTURE_2D ||
+	    template->last_level != 0 ||
+	    template->depth0 != 1)
+		return NULL;
+
+	mt = CALLOC_STRUCT(nv50_miptree);
+	if (!mt)
+		return NULL;
+
+	mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
+	if (mt->base.bo == NULL) {
+		FREE(mt);
+		return NULL;
+	}
+
+	/* The offset table is indexed by nv50_miptree_surface_new(), so a
+	 * failed allocation must not be handed out; undo the import.
+	 */
+	mt->level[0].image_offset = CALLOC(1, sizeof(*mt->level[0].image_offset));
+	if (!mt->level[0].image_offset) {
+		nouveau_screen_bo_release(pscreen, mt->base.bo);
+		FREE(mt);
+		return NULL;
+	}
+
+	mt->base.base = *template;
+	mt->base.vtbl = &nv50_miptree_vtbl;
+	pipe_reference_init(&mt->base.base.reference, 1);
+	mt->base.base.screen = pscreen;
+	mt->image_nr = 1;
+	mt->level[0].pitch = stride;
+	mt->level[0].tile_mode = mt->base.bo->tile_mode;
+
+	/* XXX: Need to adjust bo refcount??
+	 */
+	/* nouveau_bo_ref(bo, &mt->base.bo); */
+	return &mt->base.base;
+}
+
+
+
+/* Surface functions
+ */
+
+/* Create a surface for one face/level/zslice of the miptree pt.
+ *
+ * The surface holds a reference on pt.  Its offset is the image offset of
+ * the level (indexed by face for cube maps, image 0 otherwise), plus the
+ * z-slice offset for 3D textures.
+ *
+ * The object is allocated as a struct nv50_surface because
+ * nv50_miptree_surface_del() frees it through that type; allocating a bare
+ * pipe_surface only works while the two structs have the same size.
+ *
+ * Returns NULL if the allocation fails.
+ */
+struct pipe_surface *
+nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nv50_miptree *mt = nv50_miptree(pt);
+	struct nv50_miptree_level *lvl = &mt->level[level];
+	struct nv50_surface *ns;
+	struct pipe_surface *ps;
+	unsigned img = 0;
+
+	if (pt->target == PIPE_TEXTURE_CUBE)
+		img = face;
+
+	ns = CALLOC_STRUCT(nv50_surface);
+	if (!ns)
+		return NULL;
+	ps = &ns->base;
+
+	pipe_resource_reference(&ps->texture, pt);
+	ps->format = pt->format;
+	ps->width = u_minify(pt->width0, level);
+	ps->height = u_minify(pt->height0, level);
+	ps->usage = flags;
+	pipe_reference_init(&ps->reference, 1);
+	ps->face = face;
+	ps->level = level;
+	ps->zslice = zslice;
+	ps->offset = lvl->image_offset[img];
+
+	if (pt->target == PIPE_TEXTURE_3D) {
+		unsigned nb_h = util_format_get_nblocksy(pt->format, ps->height);
+		ps->offset += get_zslice_offset(lvl->tile_mode, zslice,
+						lvl->pitch, nb_h);
+	}
+
+	return ps;
+}
+
+/* Destroy a surface created by nv50_miptree_surface_new(): drop its
+ * reference on the texture, then free the surface.
+ */
+void
+nv50_miptree_surface_del(struct pipe_surface *ps)
+{
+	pipe_resource_reference(&ps->texture, NULL);
+	FREE(nv50_surface(ps));
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
new file mode 100644
index 0000000000..8cb1639013
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -0,0 +1,4693 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv50_context.h"
+#include "nv50_transfer.h"
+
+#define NV50_SU_MAX_TEMP 127
+#define NV50_SU_MAX_ADDR 4
+//#define NV50_PROGRAM_DUMP
+
+/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
+
+/* ARL - gallium craps itself on progs/vp/arl.txt
+ *
+ * MSB - Like MAD, but MUL+SUB
+ * 	- Drop it; instead introduce a way to negate args for ops that
+ * 	  support it.
+ *
+ * Look into inlining IMMD for ops other than MOV (make it general?)
+ * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
+ * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
+ *
+ * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
+ * case, if the emit_src() causes the inst to suddenly become long.
+ *
+ * Verify half-insns work where expected - and force disable them where they
+ * don't work - MUL has it forcibly disabled atm as it fixes POW..
+ *
+ * WARNING: watch out for dst==src vectors, which can overwrite components
+ * 	that are still needed, e.g. SUB R0, R0.yzxw, R0
+ *
+ * Things to check with renouveau:
+ * 	FP attr/result assignment - how?
+ * 		attrib
+ * 			- 0x16bc maps vp output onto fp hpos
+ * 			- 0x16c0 maps vp output onto fp col0
+ * 		result
+ * 			- colr always 0-3
+ * 			- depr always 4
+ * 0x16bc->0x16e8 --> some binding between vp/fp regs
+ * 0x16b8 --> VP output count
+ *
+ * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
+ * 	      "MOV rcol.x, fcol.y" = 0x00000004
+ * 0x19a8 --> as above but 0x00000100 and 0x00000000
+ * 	- 0x00100000 used when KIL used
+ * 0x196c --> as above but 0x00000011 and 0x00000000
+ *
+ * 0x1988 --> 0xXXNNNNNN
+ * 	- XX == FP high something
+ */
+/* One register operand as seen by the nv50 shader code generator.
+ * Instances are initialised by ctor_reg().
+ */
+struct nv50_reg {
+	/* register file the operand lives in */
+	enum {
+		P_TEMP,
+		P_ATTR,
+		P_RESULT,
+		P_CONST,
+		P_IMMD,
+		P_ADDR
+	} type;
+	/* -1 marks allocator-owned temporaries, which are heap allocated
+	 * and freed by free_temp() / release_hw() */
+	int index;
+
+	/* hardware register number; -1 while a P_TEMP is unallocated */
+	int hw;
+	/* mask of NV50_MOD_* modifier bits */
+	int mod;
+
+	int rhw; /* result hw for FP outputs, or interpolant index */
+	int acc; /* instruction where this reg is last read (first insn == 1) */
+
+	int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */
+	int indirect[2]; /* index into pc->addr, or -1 */
+
+	ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */
+};
+
+#define NV50_MOD_NEG 1
+#define NV50_MOD_ABS 2
+#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS)
+#define NV50_MOD_SAT 4
+#define NV50_MOD_I32 8
+
+/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */
+
+/* STACK: Conditionals and loops have to use the (per warp) stack.
+ * Stack entries consist of an entry type (divergent path, join at),
+ * a mask indicating the active threads of the warp, and an address.
+ * MPs can store 12 stack entries internally, if we need more (and
+ * we probably do), we have to create a stack buffer in VRAM.
+ */
+/* impose low limits for now */
+#define NV50_MAX_COND_NESTING 4
+#define NV50_MAX_LOOP_NESTING 3
+
+/* Evaluate e, then set bit 1 of inst[1] on the most recently emitted
+ * instruction (pc->p->exec_tail); is_join() checks (inst[1] & 3) == 2.
+ * Requires a variable named pc in the calling scope.
+ * NOTE(review): expands to two statements without a do { } while (0)
+ * wrapper, so it must not be used as an unbraced if/else/loop body.
+ */
+#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2
+
+/* Compilation context for one nv50_program: hardware register bookkeeping,
+ * the register arrays for the TGSI register files, scratch temporaries and
+ * control-flow tracking.
+ */
+struct nv50_pc {
+	/* program being generated; emit() appends instructions to it */
+	struct nv50_program *p;
+
+	/* hw resources */
+	/* owner of each hardware temporary, NULL when the slot is free */
+	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
+	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];
+
+	/* tgsi resources */
+	struct nv50_reg *temp;
+	int temp_nr;
+	struct nv50_reg *attr;
+	int attr_nr;
+	struct nv50_reg *result;
+	int result_nr;
+	struct nv50_reg *param;
+	int param_nr;
+	struct nv50_reg *immd;
+	/* immediate values, four 32-bit words per entry (see ctor_immd_4u32) */
+	uint32_t *immd_buf;
+	int immd_nr;
+	struct nv50_reg **addr;
+	int addr_nr;
+	struct nv50_reg *sysval;
+	int sysval_nr;
+
+	/* scratch temporaries handed out by temp_temp(), each tagged with
+	 * the instruction whose emit() releases it (see kill_temp_temp) */
+	struct nv50_reg *temp_temp[16];
+	struct nv50_program_exec *temp_temp_exec[16];
+	unsigned temp_temp_nr;
+
+	/* broadcast and destination replacement regs */
+	struct nv50_reg *r_brdc;
+	struct nv50_reg *r_dst[4];
+
+	/* pool of register copies handed out by reg_instance() */
+	struct nv50_reg reg_instances[16];
+	unsigned reg_instance_nr;
+
+	unsigned interp_mode[32];
+	/* perspective interpolation registers */
+	struct nv50_reg *iv_p;
+	struct nv50_reg *iv_c;
+
+	struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING];
+	struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING];
+	struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING];
+	int if_lvl, loop_lvl;
+	unsigned loop_pos[NV50_MAX_LOOP_NESTING];
+
+	unsigned *insn_pos; /* actual program offset of each TGSI insn */
+	boolean in_subroutine;
+
+	/* current instruction and total number of insns */
+	unsigned insn_cur;
+	unsigned insn_nr;
+
+	boolean allow32;
+
+	uint8_t edgeflag_out;
+};
+
+static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *);
+
+/* Initialise reg with the given type, TGSI index and hardware register.
+ * All other fields get their neutral values: no modifiers, no result or
+ * vertex binding, no indirect addressing, never accessed.  Constants
+ * default to buffer index 1, every other type to 0.
+ */
+static INLINE void
+ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
+{
+	reg->type = type;
+	reg->index = index;
+	reg->hw = hw;
+
+	reg->mod = 0;
+	reg->acc = 0;
+	reg->rhw = -1;
+	reg->vtx = -1;
+	reg->indirect[0] = -1;
+	reg->indirect[1] = -1;
+
+	if (type == P_CONST)
+		reg->buf_index = 1;
+	else
+		reg->buf_index = 0;
+}
+
+/* Number of set bits in the low four bits of val; higher bits are ignored. */
+static INLINE unsigned
+popcnt4(uint32_t val)
+{
+	const uint32_t nibble = val & 0xf;
+
+	return (nibble & 1) + ((nibble >> 1) & 1) +
+	       ((nibble >> 2) & 1) + (nibble >> 3);
+}
+
+/* Reset the acc field of every address register whose index is negative,
+ * i.e. remove the records of temporary address register values.
+ */
+static void
+terminate_mbb(struct nv50_pc *pc)
+{
+	struct nv50_reg *a;
+	struct nv50_reg *const end = pc->r_addr + NV50_SU_MAX_ADDR;
+
+	for (a = pc->r_addr; a != end; ++a) {
+		if (a->index < 0)
+			a->acc = 0;
+	}
+}
+
+/* Make sure reg is backed by a hardware register and keep the program's
+ * high-water marks (cfg.high_result / cfg.high_temp) up to date.
+ *
+ * Only P_TEMP registers are actually allocated here: for P_RESULT just the
+ * high_result mark is raised, and every other type returns untouched.
+ * Aborts the process when no free temporary is left.
+ */
+static void
+alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	int i = 0;
+
+	if (reg->type == P_RESULT) {
+		if (pc->p->cfg.high_result < (reg->hw + 1))
+			pc->p->cfg.high_result = reg->hw + 1;
+	}
+
+	if (reg->type != P_TEMP)
+		return;
+
+	/* already has a hardware register: only update the high-water mark */
+	if (reg->hw >= 0) {
+		/*XXX: do this here too to catch FP temp-as-attr usage..
+		 *     not clean, but works */
+		if (pc->p->cfg.high_temp < (reg->hw + 1))
+			pc->p->cfg.high_temp = reg->hw + 1;
+		return;
+	}
+
+	if (reg->rhw != -1) {
+		/* try to allocate temporary with index rhw first */
+		if (!(pc->r_temp[reg->rhw])) {
+			pc->r_temp[reg->rhw] = reg;
+			reg->hw = reg->rhw;
+			if (pc->p->cfg.high_temp < (reg->rhw + 1))
+				pc->p->cfg.high_temp = reg->rhw + 1;
+			return;
+		}
+		/* make sure we don't get things like $r0 needs to go
+		 * in $r1 and $r1 in $r0
+		 */
+		i = pc->result_nr * 4;
+	}
+
+	/* first free slot wins; the search starts at 0, or past the first
+	 * result_nr * 4 slots when the preferred rhw slot was taken */
+	for (; i < NV50_SU_MAX_TEMP; i++) {
+		if (!(pc->r_temp[i])) {
+			pc->r_temp[i] = reg;
+			reg->hw = i;
+			if (pc->p->cfg.high_temp < (i + 1))
+				pc->p->cfg.high_temp = i + 1;
+			return;
+		}
+	}
+
+	NOUVEAU_ERR("out of registers\n");
+	abort();
+}
+
+/* Take the next slot from pc->reg_instances and return it.
+ *
+ * With a non-NULL reg, reg is first given a hardware register, then copied
+ * into the slot, and finally reg's own modifier and indirect fields are
+ * reset.  With a NULL reg the slot is returned as it is.
+ */
+static INLINE struct nv50_reg *
+reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	struct nv50_reg *slot;
+
+	assert(pc->reg_instance_nr < 16);
+	slot = &pc->reg_instances[pc->reg_instance_nr];
+	pc->reg_instance_nr++;
+
+	if (!reg)
+		return slot;
+
+	alloc_reg(pc, reg);
+	*slot = *reg;
+
+	/* the copy keeps mod/indirect, the source register is reset */
+	reg->mod = 0;
+	reg->indirect[0] = -1;
+	reg->indirect[1] = -1;
+
+	return slot;
+}
+
+/* Return a temporary register to write to.
+ *
+ * If dst is a P_TEMP that has no hardware register yet, dst itself is
+ * returned.  Otherwise a new nv50_reg (index -1) is allocated and bound to
+ * the lowest free hardware temporary.  Aborts when none is free.
+ *
+ * XXX: For shaders that aren't executed linearly (e.g. shaders that
+ * contain loops), we need to assign all hw regs to TGSI TEMPs early,
+ * lest we risk temp_temps overwriting regs alloc'd "later".
+ */
+static struct nv50_reg *
+alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
+{
+	int hw;
+
+	if (dst != NULL && dst->type == P_TEMP && dst->hw == -1)
+		return dst;
+
+	for (hw = 0; hw < NV50_SU_MAX_TEMP; ++hw) {
+		struct nv50_reg *fresh;
+
+		if (pc->r_temp[hw])
+			continue;
+
+		fresh = MALLOC_STRUCT(nv50_reg);
+		ctor_reg(fresh, P_TEMP, -1, hw);
+		pc->r_temp[hw] = fresh;
+		return fresh;
+	}
+
+	NOUVEAU_ERR("out of registers\n");
+	abort();
+	return NULL;
+}
+
+/* Give back the hardware temporary held by r, if it holds one.  The slot
+ * in pc->r_temp is cleared and r->acc reset; registers with index -1 are
+ * freed as well.
+ */
+static void
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+{
+	assert(r->type == P_TEMP);
+
+	if (r->hw != -1) {
+		assert(pc->r_temp[r->hw] == r);
+		pc->r_temp[r->hw] = NULL;
+
+		r->acc = 0;
+		if (r->index == -1)
+			FREE(r);
+	}
+}
+
+/* Free a temporary with index -1: the register stored in pc->r_temp at
+ * r->hw is freed and the slot cleared.  Registers with any other index
+ * are left untouched.
+ */
+static void
+free_temp(struct nv50_pc *pc, struct nv50_reg *r)
+{
+	unsigned slot;
+
+	if (r->index != -1)
+		return;
+
+	slot = r->hw;
+	FREE(pc->r_temp[slot]);
+	pc->r_temp[slot] = NULL;
+}
+
+/* Allocate four consecutive hardware temporaries into dst[0..3].
+ *
+ * Candidate groups start at idx and advance in steps of four until a
+ * completely free group is found.  Returns 0 on success, or 1 when the
+ * search runs off the end of the register file.
+ */
+static int
+alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
+{
+	int base, c;
+
+	for (base = idx; (base + 4) < NV50_SU_MAX_TEMP; base += 4) {
+		if (pc->r_temp[base] || pc->r_temp[base + 1] ||
+		    pc->r_temp[base + 2] || pc->r_temp[base + 3])
+			continue;
+
+		for (c = 0; c < 4; c++) {
+			struct nv50_reg *fresh = MALLOC_STRUCT(nv50_reg);
+
+			ctor_reg(fresh, P_TEMP, -1, base + c);
+			dst[c] = fresh;
+			pc->r_temp[base + c] = fresh;
+		}
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Run free_temp() on each of the four registers in reg[]. */
+static void
+free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
+{
+	struct nv50_reg **cur;
+
+	for (cur = reg; cur != reg + 4; ++cur)
+		free_temp(pc, *cur);
+}
+
+/* Allocate a scratch temporary tied to instruction e.
+ *
+ * The register is recorded in pc->temp_temp[] together with e, so that
+ * kill_temp_temp(pc, e) can release it later.
+ *
+ * Running out of slots is fatal.  The original guarded this with assert(0)
+ * only, which compiles away under NDEBUG and then writes past the end of
+ * the arrays; abort explicitly instead, like the register allocators do.
+ */
+static struct nv50_reg *
+temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	const unsigned slot = pc->temp_temp_nr;
+
+	if (slot >= sizeof(pc->temp_temp) / sizeof(pc->temp_temp[0])) {
+		NOUVEAU_ERR("out of temp_temp slots\n");
+		abort();
+	}
+
+	pc->temp_temp[slot] = alloc_temp(pc, NULL);
+	pc->temp_temp_exec[slot] = e;
+	pc->temp_temp_nr = slot + 1;
+
+	return pc->temp_temp[slot];
+}
+
+/* Release every scratch temporary that was registered for instruction e.
+ *
+ * This *must* be called for all nv50_program_exec that have been given as
+ * argument to temp_temp, or the temps will be leaked!  Only a call with
+ * e == NULL resets the slot counter.
+ */
+static void
+kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	unsigned slot;
+
+	for (slot = 0; slot < pc->temp_temp_nr; ++slot) {
+		if (pc->temp_temp_exec[slot] != e)
+			continue;
+		free_temp(pc, pc->temp_temp[slot]);
+	}
+
+	if (e == NULL)
+		pc->temp_temp_nr = 0;
+}
+
+/* Append one immediate entry of four 32-bit words to pc->immd_buf.
+ *
+ * Returns the index of the new entry; its first word is at
+ * immd_buf[index * 4].
+ *
+ * The buffer grows by one entry per call.  The result of REALLOC is checked
+ * before it replaces pc->immd_buf: the original overwrote the pointer
+ * unconditionally, losing the old buffer and then dereferencing NULL on
+ * failure.  Out of memory is treated as fatal, like running out of
+ * registers.
+ */
+static int
+ctor_immd_4u32(struct nv50_pc *pc,
+	       uint32_t x, uint32_t y, uint32_t z, uint32_t w)
+{
+	unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
+	uint32_t *grown;
+	uint32_t *entry;
+
+	grown = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
+	if (!grown) {
+		NOUVEAU_ERR("out of memory\n");
+		abort();
+	}
+	pc->immd_buf = grown;
+
+	entry = &grown[pc->immd_nr * 4];
+	entry[0] = x;
+	entry[1] = y;
+	entry[2] = z;
+	entry[3] = w;
+
+	return pc->immd_nr++;
+}
+
+/* Float flavour of ctor_immd_4u32(): each value is converted with fui()
+ * before being stored.
+ */
+static INLINE int
+ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
+{
+	const uint32_t ux = fui(x);
+	const uint32_t uy = fui(y);
+	const uint32_t uz = fui(z);
+	const uint32_t uw = fui(w);
+
+	return ctor_immd_4u32(pc, ux, uy, uz, uw);
+}
+
+/* Return a newly allocated P_IMMD register holding the float f.
+ *
+ * The immediate buffer is searched word by word for fui(f) first.  When
+ * the value is not present, a new entry (f, -f, 0.5 * f, 0) is appended and
+ * its first word is used.
+ */
+static struct nv50_reg *
+alloc_immd(struct nv50_pc *pc, float f)
+{
+	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
+	const uint32_t bits = fui(f);
+	const unsigned nr_words = pc->immd_nr * 4;
+	unsigned hw = 0;
+
+	while (hw < nr_words && pc->immd_buf[hw] != bits)
+		++hw;
+
+	/* not found: append a new entry */
+	if (hw == nr_words)
+		hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
+
+	ctor_reg(r, P_IMMD, -1, hw);
+	return r;
+}
+
+/* Allocate a zeroed instruction record with param.index set to -1.
+ * pc is not used.
+ */
+static struct nv50_program_exec *
+exec(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *insn;
+
+	insn = CALLOC_STRUCT(nv50_program_exec);
+	insn->param.index = -1;
+
+	return insn;
+}
+
+/* Append instruction e to the program's instruction list.
+ *
+ * exec_size grows by 2 when bit 0 of inst[0] is set and by 1 otherwise.
+ * Scratch temporaries registered for e are released afterwards.
+ */
+static void
+emit(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	struct nv50_program *prog = pc->p;
+	const unsigned words = (e->inst[0] & 1) ? 2 : 1;
+
+	if (!prog->exec_head)
+		prog->exec_head = e;
+	if (prog->exec_tail)
+		prog->exec_tail->next = e;
+	prog->exec_tail = e;
+
+	prog->exec_size += words;
+
+	kill_temp_temp(pc, e);
+}
+
+static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
+
+/* TRUE when bit 0 of inst[0] is set, i.e. e uses the long encoding. */
+static boolean
+is_long(struct nv50_program_exec *e)
+{
+	return (e->inst[0] & 1) ? TRUE : FALSE;
+}
+
+/* TRUE when e is long and the low two bits of inst[1] are both set
+ * (the marker set_immd() writes).
+ */
+static boolean
+is_immd(struct nv50_program_exec *e)
+{
+	return (is_long(e) && (e->inst[1] & 3) == 3) ? TRUE : FALSE;
+}
+
+/* TRUE when e is long and the low two bits of inst[1] equal 2. */
+static boolean
+is_join(struct nv50_program_exec *e)
+{
+	return (is_long(e) && (e->inst[1] & 3) == 2) ? TRUE : FALSE;
+}
+
+/* Non-zero when bit 1 of inst[0] is set (emit_control_flow() sets it).
+ * Note the raw masked value is returned, not a normalised TRUE/FALSE.
+ */
+static INLINE boolean
+is_control_flow(struct nv50_program_exec *e)
+{
+	const uint32_t word0 = e->inst[0];
+
+	return (word0 & 2);
+}
+
+/* Make e conditional: write pred into bits 7..11 and the predicate
+ * register index idx into bits 12..13 of inst[1].  Forces the long
+ * encoding; must not be used on an immediate-form instruction.
+ */
+static INLINE void
+set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
+	 struct nv50_program_exec *e)
+{
+	const unsigned fields = (0x1f << 7) | (0x3 << 12);
+
+	assert(!is_immd(e));
+	set_long(pc, e);
+
+	/* clear both fields, then fill them in */
+	e->inst[1] = (e->inst[1] & ~fields) | (pred << 7) | (idx << 12);
+}
+
+/* Control the predicate write of e: idx goes to bits 4..5 and the
+ * enable flag on to bit 6 of inst[1].  Forces the long encoding.
+ */
+static INLINE void
+set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
+	    struct nv50_program_exec *e)
+{
+	const unsigned fields = (0x3 << 4) | (1 << 6);
+
+	set_long(pc, e);
+
+	e->inst[1] = (e->inst[1] & ~fields) | (on << 6) | (idx << 4);
+}
+
+/* Switch e to the long encoding if it is not already long, and give
+ * the new word its defaults: pred field 0xf on predicate 0, predicate
+ * write disabled.
+ */
+static INLINE void
+set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	if (!is_long(e)) {
+		/* set the flag first: set_pred/set_pred_wr call back into
+		 * set_long and must see the instruction as already long */
+		e->inst[0] |= 1;
+		set_pred(pc, 0xf, 0, e);
+		set_pred_wr(pc, 0, 0, e);
+	}
+}
+
+/* Encode dst as the destination of e (hw index at bit 2 of inst[0]).
+ * P_RESULT destinations and hw indices above 63 require the long form;
+ * P_RESULT additionally sets bit 3 of inst[1].
+ */
+static INLINE void
+set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
+{
+	const boolean is_result = (dst->type == P_RESULT);
+
+	if (is_result) {
+		set_long(pc, e);
+		e->inst[1] |= 0x00000008;
+	}
+
+	/* make sure dst has a hardware register before reading dst->hw */
+	alloc_reg(pc, dst);
+
+	if (dst->hw > 63)
+		set_long(pc, e);
+
+	e->inst[0] |= (dst->hw << 2);
+}
+
+/* Embed the 32-bit immediate referenced by imm directly into e:
+ * the low 6 bits go to bits 16..21 of inst[0], the remaining bits to
+ * inst[1] starting at bit 2, and the low two bits of inst[1] are set
+ * as the immediate marker.
+ */
+static INLINE void
+set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+
+	/* XXX: can't be predicated - bits overlap; cases where both
+	 * are required should be avoided by using pc->allow32 */
+	set_pred(pc, 0, 0, e);
+	set_pred_wr(pc, 0, 0, e);
+
+	e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16;
+	e->inst[1] |= 0x00000002 | 0x00000001 |
+		      ((pc->immd_buf[imm->hw] >> 6) << 2);
+}
+
+/* Select address register a for e.  The low two bits of a->hw go to
+ * bits 26..27 of inst[0], bit 2 of a->hw goes to bit 2 of inst[1].
+ * Asserts that a is a P_ADDR register and that e has no address
+ * register encoded yet.
+ */
+static INLINE void
+set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
+{
+	assert(a->type == P_ADDR);
+
+	assert(!(e->inst[0] & 0x0c000000));
+	assert(!(e->inst[1] & 0x00000004));
+
+	e->inst[1] |= a->hw & 4;
+	e->inst[0] |= (a->hw & 3) << 26;
+}
+
+static void
+emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t);
+
+static void
+emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int);
+
+/* Emit an instruction that copies address register src into dst. */
+static void
+emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst,
+		   struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[1] = 0x40000000;
+	set_long(pc, insn);
+
+	set_dst(pc, dst, insn);
+	set_addr(insn, src);
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src0 + src1_val, where dst and src0 are address registers
+ * and src1_val is an immediate placed at bit 9 of inst[0].
+ * src0 may be NULL, in which case no source address register is encoded.
+ */
+static void
+emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
+		  struct nv50_reg *src0, uint16_t src1_val)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0xd0000000 | (src1_val << 9);
+	insn->inst[1] = 0x20000000;
+	set_long(pc, insn);
+
+	insn->inst[0] |= dst->hw << 2;
+
+	/* without src0 the add uses $a0, which is always 0 */
+	if (src0)
+		set_addr(insn, src0);
+
+	emit(pc, insn);
+}
+
+#define INTERP_LINEAR		0
+#define INTERP_FLAT		1
+#define INTERP_PERSPECTIVE	2
+#define INTERP_CENTROID		4
+
+/* Emit an interpolation instruction writing dst.
+ * The interpolant index must already be stored in dst->rhw.
+ * mode is a combination of the INTERP_* flags; iv is only used (and
+ * allocated) for perspective interpolation.  INTERP_FLAT overrides the
+ * perspective and centroid flags.
+ */
+static void
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
+		unsigned mode)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	assert(dst->rhw != -1);
+
+	insn->inst[0] |= 0x80000000;
+	set_dst(pc, dst, insn);
+	insn->inst[0] |= (dst->rhw << 16);
+
+	if (!(mode & INTERP_FLAT)) {
+		if (mode & INTERP_PERSPECTIVE) {
+			alloc_reg(pc, iv);
+			insn->inst[0] |= (1 << 25) | (iv->hw << 9);
+		}
+
+		if (mode & INTERP_CENTROID)
+			insn->inst[0] |= (1 << 24);
+	} else {
+		insn->inst[0] |= (1 << 8);
+	}
+
+	emit(pc, insn);
+}
+
+/* Make e read its operand from the data buffer src lives in.
+ *
+ * Fills e->param (index, shift, mask), forces the long encoding, attaches
+ * an address register when one is needed and writes src->buf_index at
+ * bit 22 of inst[1].
+ *
+ *  src: operand; src->hw is the index into the buffer
+ *  m:   field mask, stored in e->param.mask shifted left by (s % 32)
+ *  s:   bit position of the field, stored in e->param.shift
+ *       (callers in this file pass values >= 32 for a field in inst[1])
+ */
+static void
+set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
+	 struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+
+	/* only the low 7 bits of the index are recorded here */
+	e->param.index = src->hw & 127;
+	e->param.shift = s;
+	e->param.mask = m << (s % 32);
+
+	if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */
+		set_addr(e, get_address_reg(pc, src));
+	else
+	if (src->acc < 0) {
+		/* index fits, but src is addressed indirectly: use the
+		 * address register recorded in pc->addr[] directly */
+		assert(src->type == P_CONST);
+		set_addr(e, pc->addr[src->indirect[0]]);
+	}
+
+	e->inst[1] |= (src->buf_index << 22);
+}
+
+/* Emit dst = src.
+ *
+ * The source may be a temp, an attribute (P_ATTR), a constant (P_CONST)
+ * or an immediate (P_IMMD).  The short encoding is only used when
+ * pc->allow32 is set and nothing below forces the long form.
+ *
+ * Never apply nv50_reg::mod in emit_mov, or carefully check the code !!!
+ */
+static void
+emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x10000000;
+	if (!pc->allow32)
+		set_long(pc, e);
+
+	/* may itself switch e to long (P_RESULT or dst->hw > 63) */
+	set_dst(pc, dst, e);
+
+	/* still short and the source is an immediate: embed the value */
+	if (!is_long(e) && src->type == P_IMMD) {
+		set_immd(pc, src, e);
+		/*XXX: 32-bit, but steals part of "half" reg space - need to
+		 *     catch and handle this case if/when we do half-regs
+		 */
+	} else
+	if (src->type == P_IMMD || src->type == P_CONST) {
+		/* long form: load the value through set_data() */
+		set_long(pc, e);
+		set_data(pc, src, 0x7f, 9, e);
+		e->inst[1] |= 0x20000000; /* mov from c[] */
+	} else {
+		if (src->type == P_ATTR) {
+			set_long(pc, e);
+			e->inst[1] |= 0x00200000;
+
+			if (src->vtx >= 0) {
+				/* indirect (vertex base + c) load from p[] */
+				e->inst[0] |= 0x01800000;
+				set_addr(e, get_address_reg(pc, src));
+			}
+		}
+
+		/* register source: hw index goes to bit 9 of inst[0] */
+		alloc_reg(pc, src);
+		if (src->hw > 63)
+			set_long(pc, e);
+		e->inst[0] |= (src->hw << 9);
+	}
+
+	/* final flags depend on which encoding we ended up with */
+	if (is_long(e) && !is_immd(e)) {
+		e->inst[1] |= 0x04000000; /* 32-bit */
+		e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
+		if (!(e->inst[1] & 0x20000000))
+			e->inst[1] |= 0x00030000; /* lane mask 2:3 */
+	} else
+		e->inst[0] |= 0x00008000;
+
+	emit(pc, e);
+}
+
+/* Emit dst = f, using a temporary immediate register for f that is
+ * freed again before returning.
+ */
+static INLINE void
+emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
+{
+	struct nv50_reg *value;
+
+	value = alloc_immd(pc, f);
+	emit_mov(pc, dst, value);
+
+	FREE(value);
+}
+
+/* Let the tgsi register dst take over the hw register of the discarded
+ * temporary src, and free src.
+ *
+ * When taking over the register is not possible (inside IF/loop
+ * nesting, dst not a P_TEMP, src->hw below result_nr * 4 in a fragment
+ * program, or the program contains CAL or BRA opcodes) a plain mov is
+ * emitted instead and src is released through free_temp().
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	boolean must_copy;
+
+	assert(src->index == -1 && src->hw != -1);
+
+	must_copy = pc->if_lvl || pc->loop_lvl ||
+		    (dst->type != P_TEMP) ||
+		    (src->hw < pc->result_nr * 4 &&
+		     pc->p->type == PIPE_SHADER_FRAGMENT) ||
+		    pc->p->info.opcode_count[TGSI_OPCODE_CAL] ||
+		    pc->p->info.opcode_count[TGSI_OPCODE_BRA];
+
+	if (!must_copy) {
+		/* drop dst's old hw slot, then rebind src's slot to dst */
+		if (dst->hw != -1)
+			pc->r_temp[dst->hw] = NULL;
+		pc->r_temp[src->hw] = dst;
+		dst->hw = src->hw;
+
+		FREE(src);
+		return;
+	}
+
+	emit_mov(pc, dst, src);
+	free_temp(pc, src);
+}
+
+/* Emit a long-form instruction with the fixed words
+ * 0xf0000001 / 0xe0000000 (a no-op, going by the function name).
+ */
+static void
+emit_nop(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0xf0000000;
+	set_long(pc, insn);
+
+	/* overwrite whatever defaults set_long() put into the second word */
+	insn->inst[1] = 0xe0000000;
+
+	emit(pc, insn);
+}
+
+/* Reorder two commutative operands so the encoders can handle them:
+ *  - a P_CONST in slot 0 is exchanged with a non-P_CONST in slot 1
+ *  - otherwise a P_ATTR in slot 1 is exchanged with a non-P_ATTR in slot 0
+ * Returns TRUE when *s0 and *s1 were exchanged, FALSE otherwise.
+ * pc is not used.
+ */
+static boolean
+check_swap_src_0_1(struct nv50_pc *pc,
+		   struct nv50_reg **s0, struct nv50_reg **s1)
+{
+	struct nv50_reg *first = *s0;
+	struct nv50_reg *second = *s1;
+	boolean swap;
+
+	if (first->type == P_CONST)
+		swap = (second->type != P_CONST);
+	else
+		swap = (second->type == P_ATTR && first->type != P_ATTR);
+
+	if (!swap)
+		return FALSE;
+
+	*s0 = second;
+	*s1 = first;
+	return TRUE;
+}
+
+/* Encode src as source 0 of e for instructions that only accept a temp
+ * there: anything that is not a P_TEMP is first copied into a scratch
+ * temp bound to e.  The hw index is written at bit 9 of inst[0].
+ */
+static void
+set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
+		     struct nv50_program_exec *e)
+{
+	struct nv50_reg *reg = src;
+
+	if (reg->type != P_TEMP) {
+		reg = temp_temp(pc, e);
+		emit_mov(pc, reg, src);
+	}
+
+	alloc_reg(pc, reg);
+
+	/* indices above 63 need the long encoding */
+	if (reg->hw > 63)
+		set_long(pc, e);
+
+	e->inst[0] |= (reg->hw << 9);
+}
+
+/* Encode src as source 0 of e (hw index at bit 9 of inst[0]).
+ * Attributes are read directly (long form, with an address register
+ * when src->vtx >= 0); constants and immediates are first moved into a
+ * scratch temp bound to e.
+ */
+static void
+set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+	struct nv50_reg *reg = src;
+
+	if (reg->type == P_CONST || reg->type == P_IMMD) {
+		reg = temp_temp(pc, e);
+		emit_mov(pc, reg, src);
+	} else
+	if (reg->type == P_ATTR) {
+		set_long(pc, e);
+		e->inst[1] |= 0x00200000;
+
+		if (reg->vtx >= 0) {
+			e->inst[0] |= 0x01800000; /* src from p[] */
+			set_addr(e, get_address_reg(pc, reg));
+		}
+	}
+
+	alloc_reg(pc, reg);
+	if (reg->hw > 63)
+		set_long(pc, e);
+	e->inst[0] |= (reg->hw << 9);
+}
+
+/* Encode src as source 1 of e (low 7 bits of the hw index at bit 16 of
+ * inst[0]).
+ *
+ * Attributes are always copied to a scratch temp first.  Constants and
+ * immediates are read through set_data() unless bits 0x01800000 of
+ * inst[0] are already in use, in which case they are copied to a
+ * scratch temp as well.
+ */
+static void
+set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+	if (src->type == P_ATTR) {
+		struct nv50_reg *temp = temp_temp(pc, e);
+
+		emit_mov(pc, temp, src);
+		src = temp;
+	} else
+	if (src->type == P_CONST || src->type == P_IMMD) {
+		/* either of these bits set: fall back to a register copy */
+		if (e->inst[0] & 0x01800000) {
+			struct nv50_reg *temp = temp_temp(pc, e);
+
+			emit_mov(pc, temp, src);
+			src = temp;
+		} else {
+			assert(!(e->inst[0] & 0x00800000));
+			set_data(pc, src, 0x7f, 16, e);
+			/* flag source 1 as coming from the data buffer */
+			e->inst[0] |= 0x00800000;
+		}
+	}
+
+	/* runs for every path, including the set_data() one */
+	alloc_reg(pc, src);
+	if (src->hw > 63)
+		set_long(pc, e);
+	e->inst[0] |= ((src->hw & 127) << 16);
+}
+
+/* Encode src as source 2 of e (low 7 bits of the hw index at bit 14 of
+ * inst[1]); always forces the long encoding.
+ *
+ * Attributes are copied to a scratch temp first.  Constants and
+ * immediates are read through set_data() unless bits 0x01800000 of
+ * inst[0] are already in use, in which case they are copied to a
+ * scratch temp as well.
+ */
+static void
+set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+
+	if (src->type == P_ATTR) {
+		struct nv50_reg *temp = temp_temp(pc, e);
+
+		emit_mov(pc, temp, src);
+		src = temp;
+	} else
+	if (src->type == P_CONST || src->type == P_IMMD) {
+		/* either of these bits set: fall back to a register copy */
+		if (e->inst[0] & 0x01800000) {
+			struct nv50_reg *temp = temp_temp(pc, e);
+
+			emit_mov(pc, temp, src);
+			src = temp;
+		} else {
+			assert(!(e->inst[0] & 0x01000000));
+			/* shift 32+14: the field lives at bit 14 of inst[1] */
+			set_data(pc, src, 0x7f, 32+14, e);
+			/* flag source 2 as coming from the data buffer */
+			e->inst[0] |= 0x01000000;
+		}
+	}
+
+	/* runs for every path, including the set_data() one */
+	alloc_reg(pc, src);
+	e->inst[1] |= ((src->hw & 127) << 14);
+}
+
+/* Encode one 16-bit half of a register as a source operand of e.
+ *
+ *  src: source register; anything that is not a P_TEMP is first copied
+ *       into a scratch temp bound to e
+ *  lh:  which half to select, added to twice the hw index
+ *  pos: bit position of the operand field, counted across inst[0]
+ *       (0..31) and inst[1] (32..63)
+ *
+ * Aborts if the register that ends up being read has an hw index above
+ * NV50_SU_MAX_TEMP / 2.
+ */
+static void
+set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh,
+	     struct nv50_program_exec *e, int pos)
+{
+	struct nv50_reg *r = src;
+
+	alloc_reg(pc, r);
+	if (r->type != P_TEMP) {
+		r = temp_temp(pc, e);
+		emit_mov(pc, r, src);
+	}
+
+	if (r->hw > (NV50_SU_MAX_TEMP / 2)) {
+		NOUVEAU_ERR("out of low GPRs\n");
+		abort();
+	}
+
+	/* Encode the register actually read (r), not the original src:
+	 * when src was copied into a scratch temp, src->hw is an index in
+	 * a different register file and must not be used here.
+	 */
+	e->inst[pos / 32] |= ((r->hw * 2) + lh) << (pos % 32);
+}
+
+/* Emit an instruction that copies predicate register pred into the
+ * temp dst.  pred is encoded at bit 12 of inst[1].
+ */
+static void
+emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	assert(dst->type == P_TEMP);
+
+	insn->inst[1] = 0x20000000 | (pred << 12);
+	set_long(pc, insn);
+	set_dst(pc, dst, insn);
+
+	emit(pc, insn);
+}
+
+/* Emit an instruction that writes predicate register pred from src.
+ * The destination field is preset to 0x1fc with bit 3 of inst[1] set,
+ * the same pattern other emitters here use when they have no dst.
+ */
+static void
+emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0x000001fc;
+	insn->inst[1] = 0xa0000008;
+	set_long(pc, insn);
+
+	set_pred_wr(pc, 1, pred, insn);
+	set_src_0_restricted(pc, src, insn);
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src0 * src1.
+ *
+ * The operands may be exchanged by check_swap_src_0_1().  When the
+ * instruction is still short and src1 is an immediate, the value is
+ * embedded via set_immd(); otherwise src1 goes through set_src_1().
+ * The result is negated when exactly one of the two sources carries
+ * NV50_MOD_NEG.
+ */
+static void
+emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xc0000000;
+
+	if (!pc->allow32)
+		set_long(pc, e);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	if (src1->type == P_IMMD && !is_long(e)) {
+		/* Only the NEG modifier may flip the sign; mask it the same
+		 * way the register path below does so that other modifier
+		 * bits cannot set the negate flag by accident.
+		 */
+		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
+			e->inst[0] |= 0x00008000;
+		set_immd(pc, src1, e);
+	} else {
+		set_src_1(pc, src1, e);
+		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
+			/* the negate flag sits in a different place
+			 * depending on the encoding */
+			if (is_long(e))
+				e->inst[1] |= 0x08000000;
+			else
+				e->inst[0] |= 0x00008000;
+		}
+	}
+
+	emit(pc, e);
+}
+
+/* Emit dst = src0 + src1, honouring NV50_MOD_NEG on either source.
+ *
+ * The long encoding is used when short instructions are not allowed,
+ * when any source modifier is set, or when src1->hw > 63.  In the long
+ * form (and for P_CONST / P_ATTR operands) the second operand is
+ * encoded with set_src_2() rather than set_src_1().
+ */
+static void
+emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *src0, struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xb0000000;
+
+	/* src1->hw is inspected below, so it needs a register first */
+	alloc_reg(pc, src1);
+	check_swap_src_0_1(pc, &src0, &src1);
+
+	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
+		set_long(pc, e);
+		/* NOTE(review): shifting the masked value assumes
+		 * NV50_MOD_NEG is bit 0, giving bits 26 / 27 - confirm */
+		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
+			      ((src1->mod & NV50_MOD_NEG) << 27);
+	}
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
+		set_src_2(pc, src1, e);
+	else
+	if (src1->type == P_IMMD)
+		set_immd(pc, src1, e);
+	else
+		set_src_1(pc, src1, e);
+
+	emit(pc, e);
+}
+
+/* Emit a load of address register dst from src, shifted left by s.
+ * dst->hw is encoded at bit 2 and the shift count at bit 16 of inst[0].
+ */
+static void
+emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+	 uint8_t s)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	set_long(pc, insn);
+	insn->inst[1] |= 0xc0000000;
+
+	insn->inst[0] |= (dst->hw << 2) | (s << 16 /* shift left */);
+	set_src_0(pc, src, insn);
+
+	emit(pc, insn);
+}
+
+/* Decide whether address register a, as currently loaded, can be used
+ * to access r.
+ *
+ * Returns FALSE for a NULL r or when the vtx fields differ.  For vertex
+ * accesses (vtx >= 0) the indirect[1] fields must match.  Otherwise
+ * r->hw has to lie within [a->rhw, a->rhw + 128) and the indirection of
+ * r must match a->index (when a->index >= 0) or a->indirect[0].
+ */
+static boolean
+address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r)
+{
+	if (!r || r->vtx != a->vtx)
+		return FALSE;
+
+	if (r->vtx >= 0)
+		return (r->indirect[1] == a->indirect[1]);
+
+	/* r must be reachable from the base held in a */
+	if (!(r->hw >= a->rhw && (r->hw - a->rhw) < 128))
+		return FALSE;
+
+	return (a->index >= 0) ? (a->index == r->indirect[0])
+			       : (a->indirect[0] == r->indirect[0]);
+}
+
+/* Load the base address of vertex dst->vtx into address register dst.
+ *
+ *  a:     optional address register holding an indirect vertex index;
+ *         NULL for a direct access
+ *  shift: adjustment applied to the value of a before it is used
+ *         (negative values go through emit_shl_imm(), which turns a
+ *         negative count into a right shift)
+ */
+static void
+load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst,
+		 struct nv50_reg *a, int shift)
+{
+	struct nv50_reg mem, *temp;
+
+	/* stack-local P_ATTR register with hw = dst->vtx */
+	ctor_reg(&mem, P_ATTR, -1, dst->vtx);
+
+	assert(dst->type == P_ADDR);
+	if (!a) {
+		/* direct case: load dst straight from mem */
+		emit_arl(pc, dst, &mem, 0);
+		return;
+	}
+	temp = alloc_temp(pc, NULL);
+
+	if (shift) {
+		/* bring the index from a into dst with the shift applied */
+		emit_mov_from_addr(pc, temp, a);
+		if (shift < 0)
+			emit_shl_imm(pc, temp, temp, shift);
+		emit_arl(pc, dst, temp, MAX2(shift, 0));
+	}
+	emit_mov(pc, temp, &mem);
+	/* patch the mov just emitted so it is addressed through dst */
+	set_addr(pc->p->exec_tail, dst);
+
+	/* finally move the loaded value into dst */
+	emit_arl(pc, dst, temp, 0);
+	free_temp(pc, temp);
+}
+
+/* Find or set up an address register for accessing ref and return it.
+ *
+ * case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS
+ * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX
+ * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX
+ * case (vtx < 0, acc >= 0): memory address too high to encode
+ * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS
+ *
+ * An already loaded register that address_reg_suitable() accepts is
+ * reused.  Aborts when no register is available.
+ */
+static struct nv50_reg *
+get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref)
+{
+	int i;
+	struct nv50_reg *a_ref, *a = NULL;
+
+	for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
+		if (pc->r_addr[i].acc == 0)
+			a = &pc->r_addr[i]; /* an unused address reg */
+		else
+		if (address_reg_suitable(&pc->r_addr[i], ref)) {
+			/* reuse: mark it as accessed by this instruction */
+			pc->r_addr[i].acc = pc->insn_cur;
+			return &pc->r_addr[i];
+		} else
+		/* fallback candidate: index < 0 and last accessed before
+		 * the current instruction; only taken if nothing was
+		 * picked yet (a later unused reg still replaces it) */
+		if (!a && pc->r_addr[i].index < 0 &&
+		    pc->r_addr[i].acc < pc->insn_cur)
+			a = &pc->r_addr[i];
+	}
+	if (!a) {
+		/* We'll be able to spill address regs when this
+		 * mess is replaced with a proper compiler ...
+		 */
+		NOUVEAU_ERR("out of address regs\n");
+		abort();
+		return NULL;
+	}
+
+	/* initialize and reserve for this TGSI instruction */
+	a->rhw = 0;
+	a->index = a->indirect[0] = a->indirect[1] = -1;
+	a->acc = pc->insn_cur;
+
+	if (!ref) {
+		/* plain allocation: nothing is loaded into the register */
+		a->vtx = -1;
+		return a;
+	}
+	a->vtx = ref->vtx;
+
+	/* now put in the correct value ... */
+
+	if (ref->vtx >= 0) {
+		a->indirect[1] = ref->indirect[1];
+
+		/* For an indirect vertex index, we need to shift address right
+		 * by 2, the address register will contain vtx * 16, we need to
+		 * load from a[vtx * 4].
+		 */
+		load_vertex_base(pc, a, (ref->acc < 0) ?
+				 pc->addr[ref->indirect[1]] : NULL, -2);
+	} else {
+		assert(ref->acc < 0 || ref->indirect[0] < 0);
+
+		/* base = ref->hw with the low 7 bits cleared; the low bits
+		 * are encoded in the instruction itself (see set_data) */
+		a->rhw = ref->hw & ~0x7f;
+		a->indirect[0] = ref->indirect[0];
+		a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL;
+
+		emit_add_addr_imm(pc, a, a_ref, a->rhw * 4);
+	}
+	return a;
+}
+
+#define NV50_MAX_F32 0x880
+#define NV50_MAX_S32 0x08c
+#define NV50_MAX_U32 0x084
+#define NV50_MIN_F32 0x8a0
+#define NV50_MIN_S32 0x0ac
+#define NV50_MIN_U32 0x0a4
+
+/* Emit dst = min/max(src0, src1); sub is one of the NV50_MIN_* /
+ * NV50_MAX_* codes.  Bit 0x800 of sub is moved into the top bit of
+ * inst[0], the rest is placed at bit 24 of inst[1].  NV50_MOD_ABS on
+ * either source is honoured.
+ */
+static void
+emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
+	    struct nv50_reg *src0, struct nv50_reg *src1)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	set_long(pc, insn);
+	insn->inst[0] |= 0x30000000 | ((sub & 0x800) << 20);
+	insn->inst[1] |= (sub << 24);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src0, insn);
+	set_src_1(pc, src1, insn);
+
+	/* ABS flags follow the operands after a possible swap */
+	if (src0->mod & NV50_MOD_ABS)
+		insn->inst[1] |= 0x00100000;
+	if (src1->mod & NV50_MOD_ABS)
+		insn->inst[1] |= 0x00080000;
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src0 - src1, implemented as an add with the NEG modifier
+ * of src1 toggled for the duration of the call.
+ */
+static INLINE void
+emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1)
+{
+	src1->mod ^= NV50_MOD_NEG;
+	emit_add(pc, dst, src0, src1);
+	/* toggle back so the caller's register is left unchanged */
+	src1->mod ^= NV50_MOD_NEG;
+}
+
+/* Emit dst = src0 <op> src1 for op in TGSI_OPCODE_AND / OR / XOR.
+ * Source modifiers are not supported.  An immediate src1 is embedded
+ * in the instruction when src0 is a temp and pc->allow32 is set;
+ * otherwise src1 is encoded as a regular source.
+ */
+static void
+emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	    struct nv50_reg *src1, unsigned op)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0xd0000000;
+	set_long(pc, insn);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src0, insn);
+
+	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
+	    op != TGSI_OPCODE_XOR)
+		assert(!"invalid bit op");
+
+	assert(!(src0->mod | src1->mod));
+
+	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
+		/* immediate form: the op selector lives in inst[0] */
+		set_immd(pc, src1, insn);
+
+		switch (op) {
+		case TGSI_OPCODE_OR:
+			insn->inst[0] |= 0x0100;
+			break;
+		case TGSI_OPCODE_XOR:
+			insn->inst[0] |= 0x8000;
+			break;
+		default:
+			break;
+		}
+	} else {
+		/* register form: the op selector lives in inst[1] */
+		set_src_1(pc, src1, insn);
+		insn->inst[1] |= 0x04000000; /* 32 bit */
+
+		switch (op) {
+		case TGSI_OPCODE_OR:
+			insn->inst[1] |= 0x4000;
+			break;
+		case TGSI_OPCODE_XOR:
+			insn->inst[1] |= 0x8000;
+			break;
+		default:
+			break;
+		}
+	}
+
+	emit(pc, insn);
+}
+
+/* Emit dst = ~src.  Uses the same 0xd0000000 opcode as emit_bitop2()
+ * with fixed sub-op bits in inst[1]; src is encoded as source 1.
+ */
+static void
+emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0xd0000000;
+	insn->inst[1] = 0x0402c000;
+	set_long(pc, insn);
+
+	set_dst(pc, dst, insn);
+	set_src_1(pc, src, insn);
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src0 shifted by src1.
+ * dir is the TGSI opcode: TGSI_OPCODE_SHL shifts left, anything else
+ * sets bit 29 of inst[1], and TGSI_OPCODE_ISHR additionally sets
+ * bit 27.  An immediate src1 is embedded (low 7 bits, at bit 16 of
+ * inst[0]) with bit 20 of inst[1] flagging the immediate form.
+ */
+static void
+emit_shift(struct nv50_pc *pc, struct nv50_reg *dst,
+	   struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0x30000000;
+	insn->inst[1] = 0xc4000000;
+
+	set_long(pc, insn);
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src0, insn);
+
+	if (src1->type != P_IMMD) {
+		set_src_1(pc, src1, insn);
+	} else {
+		insn->inst[1] |= (1 << 20);
+		insn->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16;
+	}
+
+	if (dir != TGSI_OPCODE_SHL)
+		insn->inst[1] |= (1 << 29);
+
+	if (dir == TGSI_OPCODE_ISHR)
+		insn->inst[1] |= (1 << 27);
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src shifted by the constant s.  A negative s sets bit 29
+ * of inst[1] (the flag emit_shift() uses for non-SHL shifts) and
+ * shifts by -s instead.
+ *
+ * NOTE(review): the shift count is written to inst[1] at bit 16 here,
+ * whereas emit_shift() writes its immediate count to inst[0] at
+ * bit 16 - confirm against the hardware encoding which one is right.
+ */
+static void
+emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src, int s)
+{
+	struct nv50_program_exec *insn = exec(pc);
+	int count = s;
+
+	insn->inst[0] = 0x30000000;
+	insn->inst[1] = 0xc4100000;
+
+	if (count < 0) {
+		insn->inst[1] |= 1 << 29;
+		count = -count;
+	}
+	insn->inst[1] |= ((count & 0x7f) << 16);
+
+	set_long(pc, insn);
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src, insn);
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src0 * src1 + src2.
+ * The product is negated when exactly one of src0 / src1 carries
+ * NV50_MOD_NEG; src2 is negated when it carries NV50_MOD_NEG.
+ */
+static void
+emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] |= 0xe0000000;
+
+	check_swap_src_0_1(pc, &src0, &src1);
+
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src0, insn);
+	set_src_1(pc, src1, insn);
+	set_src_2(pc, src2, insn); /* also forces the long encoding */
+
+	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
+		insn->inst[1] |= 0x04000000;
+	if (src2->mod & NV50_MOD_NEG)
+		insn->inst[1] |= 0x08000000;
+
+	emit(pc, insn);
+}
+
+/* Emit dst = src0 * src1 - src2, implemented as a mad with the NEG
+ * modifier of src2 toggled for the duration of the call.
+ */
+static INLINE void
+emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	src2->mod ^= NV50_MOD_NEG;
+	emit_mad(pc, dst, src0, src1, src2);
+	/* toggle back so the caller's register is left unchanged */
+	src2->mod ^= NV50_MOD_NEG;
+}
+
+#define NV50_FLOP_RCP 0
+#define NV50_FLOP_RSQ 2
+#define NV50_FLOP_LG2 3
+#define NV50_FLOP_SIN 4
+#define NV50_FLOP_COS 5
+#define NV50_FLOP_EX2 6
+
+/* Emit the special-function op selected by sub (NV50_FLOP_*):
+ * dst = flop(src).
+ * rcp, rsqrt, lg2 support neg and abs; using a source modifier with
+ * sub >= 4 trips the assert.  The short encoding is only possible for
+ * sub == 0 without modifiers.
+ */
+static void
+emit_flop(struct nv50_pc *pc, unsigned sub,
+	  struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] |= 0x90000000;
+
+	if (sub || src->mod) {
+		set_long(pc, insn);
+		insn->inst[1] |= (sub << 29);
+	}
+
+	set_dst(pc, dst, insn);
+	set_src_0_restricted(pc, src, insn);
+
+	assert(!src->mod || sub < 4);
+
+	if (src->mod & NV50_MOD_NEG)
+		insn->inst[1] |= 0x04000000;
+	if (src->mod & NV50_MOD_ABS)
+		insn->inst[1] |= 0x00100000;
+
+	emit(pc, insn);
+}
+
+/* Emit the pre-processing step applied to src before an EX2 flop
+ * (see emit_pow()).  Differs from emit_precossin() only in the extra
+ * 0x00004000 bit of inst[1].  NEG and ABS on src are honoured.
+ */
+static void
+emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] |= 0xb0000000;
+
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src, insn);
+
+	set_long(pc, insn);
+	insn->inst[1] |= (6 << 29) | 0x00004000;
+
+	if (src->mod & NV50_MOD_NEG)
+		insn->inst[1] |= 0x04000000;
+	if (src->mod & NV50_MOD_ABS)
+		insn->inst[1] |= 0x00100000;
+
+	emit(pc, insn);
+}
+
+/* Emit the pre-processing step for the SIN / COS flops (going by the
+ * function name).  Same encoding as emit_preex2() without the
+ * 0x00004000 bit.  NEG and ABS on src are honoured.
+ */
+static void
+emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] |= 0xb0000000;
+
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src, insn);
+
+	set_long(pc, insn);
+	insn->inst[1] |= (6 << 29);
+
+	if (src->mod & NV50_MOD_NEG)
+		insn->inst[1] |= 0x04000000;
+	if (src->mod & NV50_MOD_ABS)
+		insn->inst[1] |= 0x00100000;
+
+	emit(pc, insn);
+}
+
+#define CVT_RN    (0x00 << 16)
+#define CVT_FLOOR (0x02 << 16)
+#define CVT_CEIL  (0x04 << 16)
+#define CVT_TRUNC (0x06 << 16)
+#define CVT_SAT   (0x08 << 16)
+#define CVT_ABS   (0x10 << 16)
+
+#define CVT_X32_X32 0x04004000
+#define CVT_X32_S32 0x04014000
+#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32)
+#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32)
+#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32)
+#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32)
+#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32)
+#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32)
+#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32)
+#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32)
+
+#define CVT_NEG 0x20000000
+#define CVT_RI  0x08000000
+
+/* Emit a conversion of src according to cvn (a combination of the
+ * CVT_* flags); NEG / ABS modifiers on src are folded into cvn.
+ *
+ *  dst: destination, or NULL to discard the value
+ *  wp:  predicate register to write, or a negative value for none
+ */
+static void
+emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+	 int wp, uint32_t cvn)
+{
+	struct nv50_program_exec *insn = exec(pc);
+	uint32_t flags = cvn;
+
+	if (src->mod & NV50_MOD_NEG)
+		flags |= CVT_NEG;
+	if (src->mod & NV50_MOD_ABS)
+		flags |= CVT_ABS;
+
+	insn->inst[0] = 0xa0000000;
+	insn->inst[1] = flags;
+	set_long(pc, insn);
+	set_src_0(pc, src, insn);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, insn);
+
+	if (!dst) {
+		/* no destination: same bit pattern the other emitters
+		 * use when dst is NULL */
+		insn->inst[0] |= 0x000001fc;
+		insn->inst[1] |= 0x00000008;
+	} else {
+		set_dst(pc, dst, insn);
+	}
+
+	emit(pc, insn);
+}
+
+/* Emit a comparison of src0 and src1.
+ *
+ * nv50 Condition codes:
+ *  0x1 = LT
+ *  0x2 = EQ
+ *  0x3 = LE
+ *  0x4 = GT
+ *  0x5 = NE
+ *  0x6 = GE
+ *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
+ *  0x8 = unordered bit (allows NaN)
+ *
+ *  mode = 0x04 (u32), 0x0c (s32), 0x80 (f32)
+ *
+ *  dst: destination register, or NULL to discard the value
+ *  wp:  predicate register to write, or a negative value for none
+ */
+static void
+emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
+	 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode)
+{
+	/* condition code to use when the operands have been exchanged:
+	 * LT <-> GT and LE <-> GE, everything else maps to itself */
+	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
+	struct nv50_program_exec *e = exec(pc);
+	struct nv50_reg *rdst;
+
+	assert(ccode < 16);
+	if (check_swap_src_0_1(pc, &src0, &src1))
+		ccode = cc_swapped[ccode & 7] | (ccode & 8);
+
+	/* the compare itself only writes temps; remember the real dst */
+	rdst = dst;
+	if (dst && dst->type != P_TEMP)
+		dst = alloc_temp(pc, NULL);
+
+	set_long(pc, e);
+	e->inst[0] |= 0x30000000 | (mode << 24);
+	e->inst[1] |= 0x60000000 | (ccode << 14);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+	if (dst)
+		set_dst(pc, dst, e);
+	else {
+		/* no destination requested */
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+
+	emit(pc, e);
+
+	/* NOTE(review): for the integer modes nothing copies the temp
+	 * back when rdst != dst - confirm callers only pass temps there */
+	if (rdst && mode == 0x80) /* convert to float ? */
+		emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32);
+	if (rdst && rdst != dst)
+		free_temp(pc, dst);
+}
+
+/* Translate a TGSI set-on-compare opcode into the condition code (*cc)
+ * and operand type (*ty: 0x80 f32, 0x0c s32, 0x04 u32) expected by
+ * emit_set().  Unknown opcodes assert and leave both outputs untouched.
+ */
+static INLINE void
+map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty)
+{
+	uint8_t code, type;
+
+	switch (op) {
+	/* float compares */
+	case TGSI_OPCODE_SLT: code = 0x1; type = 0x80; break;
+	case TGSI_OPCODE_SGE: code = 0x6; type = 0x80; break;
+	case TGSI_OPCODE_SEQ: code = 0x2; type = 0x80; break;
+	case TGSI_OPCODE_SGT: code = 0x4; type = 0x80; break;
+	case TGSI_OPCODE_SLE: code = 0x3; type = 0x80; break;
+	case TGSI_OPCODE_SNE: code = 0xd; type = 0x80; break;
+
+	/* signed integer compares */
+	case TGSI_OPCODE_ISLT: code = 0x1; type = 0x0c; break;
+	case TGSI_OPCODE_ISGE: code = 0x6; type = 0x0c; break;
+
+	/* unsigned integer compares */
+	case TGSI_OPCODE_USEQ: code = 0x2; type = 0x04; break;
+	case TGSI_OPCODE_USGE: code = 0x6; type = 0x04; break;
+	case TGSI_OPCODE_USLT: code = 0x1; type = 0x04; break;
+	case TGSI_OPCODE_USNE: code = 0x5; type = 0x04; break;
+
+	default:
+		assert(0);
+		return;
+	}
+
+	*cc = code;
+	*ty = type;
+}
+
+/* Emit a 32-bit integer add dst = src0 + rsrc1, honouring
+ * NV50_MOD_NEG on the sources.
+ *
+ * Only one operand can be negated by the instruction itself; when both
+ * sources carry NEG, rsrc1 is first run through a CVT_S32_S32
+ * conversion (which applies its NEG modifier) into a scratch temp.
+ */
+static void
+emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, struct nv50_reg *rsrc1)
+{
+	struct nv50_program_exec *e = exec(pc);
+	struct nv50_reg *src1;
+
+	e->inst[0] = 0x20000000;
+
+	/* src1->hw is inspected below, so it needs a register first */
+	alloc_reg(pc, rsrc1);
+	check_swap_src_0_1(pc, &src0, &rsrc1);
+
+	src1 = rsrc1;
+	if (src0->mod & rsrc1->mod & NV50_MOD_NEG) {
+		/* both negated: pre-negate the second operand */
+		src1 = temp_temp(pc, e);
+		emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32);
+	}
+
+	/* the short form only takes a low temp or an immediate */
+	if (!pc->allow32 || src1->hw > 63 ||
+	    (src1->type != P_TEMP && src1->type != P_IMMD))
+		set_long(pc, e);
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+
+	if (is_long(e)) {
+		e->inst[1] |= 1 << 26;
+		set_src_2(pc, src1, e);
+	} else {
+		e->inst[0] |= 0x8000;
+		if (src1->type == P_IMMD)
+			set_immd(pc, src1, e);
+		else
+			set_src_1(pc, src1, e);
+	}
+
+	/* negate flag for source 0 or, failing that, for source 1 */
+	if (src0->mod & NV50_MOD_NEG)
+		e->inst[0] |= 1 << 28;
+	else
+	if (src1->mod & NV50_MOD_NEG)
+		e->inst[0] |= 1 << 22;
+
+	emit(pc, e);
+}
+
+/* Emit a 16-bit multiply-add: dst = half(src0) * half(src1) + src2.
+ * lh_0 / lh_1 select which half of src0 / src1 is used.
+ * src2 is only encoded explicitly when the instruction is long or
+ * src2 is not the same temp register as dst.
+ */
+static void
+emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1,
+	     struct nv50_reg *src2)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0x60000000;
+	if (!pc->allow32)
+		set_long(pc, insn);
+
+	set_dst(pc, dst, insn);
+	set_half_src(pc, src0, lh_0, insn, 9);
+	set_half_src(pc, src1, lh_1, insn, 16);
+
+	alloc_reg(pc, src2);
+	if (is_long(insn) || (src2->type != P_TEMP) || (src2->hw != dst->hw))
+		set_src_2(pc, src2, insn);
+
+	emit(pc, insn);
+}
+
+/* Emit a 16-bit multiply: dst = half(src0) * half(src1).
+ * lh_0 / lh_1 select which half of src0 / src1 is used.  Always long.
+ */
+static void
+emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0x40000000;
+	set_long(pc, insn);
+
+	set_dst(pc, dst, insn);
+	set_half_src(pc, src0, lh_0, insn, 9);
+	set_half_src(pc, src1, lh_1, insn, 16);
+
+	emit(pc, insn);
+}
+
+/* Emit a sum-of-absolute-differences style op (going by the name):
+ * dst is computed from src0, src1 and the accumulator src2.
+ * src2 is only encoded explicitly when the instruction is long or
+ * src2 is not the same register as dst.
+ */
+static void
+emit_sad(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0x50000000;
+	if (!pc->allow32)
+		set_long(pc, insn);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src0, insn);
+	set_src_1(pc, src1, insn);
+
+	alloc_reg(pc, src2);
+	if (is_long(insn) || (src2->type != dst->type) || (src2->hw != dst->hw))
+		set_src_2(pc, src2, insn);
+
+	/* type bits differ between the two encodings */
+	if (!is_long(insn))
+		insn->inst[0] |= 0x81 << 8;
+	else
+		insn->inst[1] |= 0x0c << 24;
+
+	emit(pc, insn);
+}
+
+/* Emit dst = floor(src) as a float-to-float conversion with the
+ * CVT_FLOOR and CVT_RI flags.
+ */
+static INLINE void
+emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	const uint32_t cvn = CVT_FLOOR | CVT_F32_F32 | CVT_RI;
+
+	emit_cvt(pc, dst, src, -1, cvn);
+}
+
+/* Emit dst = pow(v, e), computed as ex2(e * lg2(v)) through a
+ * temporary register that is released again before returning.
+ */
+static void
+emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *v, struct nv50_reg *e)
+{
+	struct nv50_reg *acc = alloc_temp(pc, NULL);
+
+	emit_flop(pc, NV50_FLOP_LG2, acc, v);	/* acc = lg2(v) */
+	emit_mul(pc, acc, acc, e);		/* acc *= e */
+	emit_preex2(pc, acc, acc);		/* prepare for ex2 */
+	emit_flop(pc, NV50_FLOP_EX2, dst, acc);	/* dst = ex2(acc) */
+
+	free_temp(pc, acc);
+}
+
+/* Emit dst = saturate(src) as a float-to-float conversion with the
+ * CVT_SAT flag.
+ */
+static INLINE void
+emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	const uint32_t cvn = CVT_SAT | CVT_F32_F32;
+
+	emit_cvt(pc, dst, src, -1, cvn);
+}
+
+/* Emit the TGSI LIT instruction for the components selected by mask.
+ *
+ *  dst[0] = 1.0
+ *  dst[1] = max(src[0], 0)
+ *  dst[2] = pow(max(src[1], 0), clamp(src[3], -128, 128)), replaced by
+ *           0 when the predicate written by the first max says LE
+ *  dst[3] = 1.0
+ *
+ * Short encodings are disabled while the y/z part is emitted because
+ * predicates are attached to already emitted instructions.
+ */
+static void
+emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+	 struct nv50_reg **src)
+{
+	struct nv50_reg *one = alloc_immd(pc, 1.0);
+	struct nv50_reg *zero = alloc_immd(pc, 0.0);
+	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
+	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
+	struct nv50_reg *tmp[4] = { 0 };
+	boolean allow32 = pc->allow32;
+
+	pc->allow32 = FALSE;
+
+	/* y or z requested: tmp[0] = max(src.x, 0) */
+	if (mask & (3 << 1)) {
+		tmp[0] = alloc_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero);
+	}
+
+	if (mask & (1 << 2)) {
+		/* make the max just emitted also write predicate 0 */
+		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
+
+		/* scratch temps bound to NULL: not released in this
+		 * function; kill_temp_temp(pc, NULL) frees them */
+		tmp[1] = temp_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);
+
+		/* clamp the exponent to (-128, 128) */
+		tmp[3] = temp_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
+		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);
+
+		emit_pow(pc, dst[2], tmp[1], tmp[3]);
+		/* overwrite with 0, predicated on cc 3 (LE) of predicate 0 */
+		emit_mov(pc, dst[2], zero);
+		set_pred(pc, 3, 0, pc->p->exec_tail);
+	}
+
+	/* hand tmp[0] over to dst.y, or drop it if only z needed it */
+	if (mask & (1 << 1))
+		assimilate_temp(pc, dst[1], tmp[0]);
+	else
+	if (mask & (1 << 2))
+		free_temp(pc, tmp[0]);
+
+	pc->allow32 = allow32;
+
+	/* do this last, in case src[i,j] == dst[0,3] */
+	if (mask & (1 << 0))
+		emit_mov(pc, dst[0], one);
+
+	if (mask & (1 << 3))
+		emit_mov(pc, dst[3], one);
+
+	FREE(pos128);
+	FREE(neg128);
+	FREE(zero);
+	FREE(one);
+}
+
+/* Emit a discard.  With src == NULL the discard is unconditional;
+ * otherwise src is first converted with a predicate write to register 1
+ * and the discard is predicated on that register with cc = LT.
+ */
+static void
+emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
+{
+	const int r_pred = 1;
+	struct nv50_program_exec *discard = exec(pc);
+
+	discard->inst[0] = 0x00000002; /* discard */
+	set_long(pc, discard); /* sets cond code to ALWAYS */
+
+	if (src) {
+		set_pred(pc, 0x1 /* cc = LT */, r_pred, discard);
+		/* write to predicate reg; emitted before the discard */
+		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
+	}
+
+	emit(pc, discard);
+}
+
+/* Emit a control-flow instruction and return it so that the caller can
+ * fill in further fields afterwards.
+ *
+ *  op:   opcode nibble, placed in the top 4 bits of inst[0]
+ *  pred: predicate register to test, or a negative value for none
+ *  cc:   condition code used together with pred
+ */
+static struct nv50_program_exec *
+emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	/* bit 1 marks a control-flow instruction (see is_control_flow) */
+	insn->inst[0] = (op << 28) | 2;
+	set_long(pc, insn);
+
+	if (pred >= 0)
+		set_pred(pc, cc, pred, insn);
+
+	emit(pc, insn);
+
+	return insn;
+}
+
+/* Emit an unconditional control-flow instruction with opcode 0x4
+ * (break address, going by the name) and return it.
+ */
+static INLINE struct nv50_program_exec *
+emit_breakaddr(struct nv50_pc *pc)
+{
+	const unsigned op_breakaddr = 0x4;
+
+	return emit_control_flow(pc, op_breakaddr, -1, 0);
+}
+
+/* Emit a control-flow instruction with opcode 0x5 (break), optionally
+ * predicated on (pred, cc).
+ */
+static INLINE void
+emit_break(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	const unsigned op_break = 0x5;
+
+	emit_control_flow(pc, op_break, pred, cc);
+}
+
+/* Emit an unconditional control-flow instruction with opcode 0xa
+ * (join-at, going by the name) and return it.
+ */
+static INLINE struct nv50_program_exec *
+emit_joinat(struct nv50_pc *pc)
+{
+	const unsigned op_joinat = 0xa;
+
+	return emit_control_flow(pc, op_joinat, -1, 0);
+}
+
+/* Emit a control-flow instruction with opcode 0x1 (branch), optionally
+ * predicated on (pred, cc), and return it.
+ */
+static INLINE struct nv50_program_exec *
+emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	const unsigned op_branch = 0x1;
+
+	return emit_control_flow(pc, op_branch, pred, cc);
+}
+
+/* Emit a control-flow instruction with opcode 0x2 (call), optionally
+ * predicated on (pred, cc), and return it.
+ */
+static INLINE struct nv50_program_exec *
+emit_call(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	const unsigned op_call = 0x2;
+
+	return emit_control_flow(pc, op_call, pred, cc);
+}
+
+/* Emit a control-flow instruction with opcode 0x3 (return), optionally
+ * predicated on (pred, cc).
+ */
+static INLINE void
+emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	const unsigned op_ret = 0x3;
+
+	emit_control_flow(pc, op_ret, pred, cc);
+}
+
+/* Emit a primitive command instruction; cmd is placed at bit 9 of
+ * inst[0].
+ */
+static void
+emit_prim_cmd(struct nv50_pc *pc, unsigned cmd)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0xf0000000 | (cmd << 9);
+	insn->inst[1] = 0xc0000000;
+	set_long(pc, insn);
+
+	emit(pc, insn);
+}
+
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV_SRC1 3
+
+/* For a quad of threads / top left, top right, bottom left, bottom right
+ * pixels, do a different operation, and take src0 from a specific thread.
+ *
+ *  dst:       destination, or NULL to discard the value
+ *  wp:        predicate register to write, or a negative value for none
+ *  lane_src0: which thread of the quad src0 is taken from
+ *  qop:       packed per-pixel operations (QOP_*); the low 2 bits go to
+ *             bit 20 of inst[0], the remaining bits to bit 22 of inst[1]
+ */
+static void
+emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
+	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
+{
+	struct nv50_program_exec *insn = exec(pc);
+
+	insn->inst[0] = 0xc0000000;
+	insn->inst[1] = 0x80000000;
+	set_long(pc, insn);
+
+	insn->inst[0] |= lane_src0 << 16;
+	set_src_0(pc, src0, insn);
+	set_src_2(pc, src1, insn);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, insn);
+
+	if (!dst) {
+		insn->inst[0] |= 0x000001fc;
+		insn->inst[1] |= 0x00000008;
+	} else {
+		set_dst(pc, dst, insn);
+	}
+
+	insn->inst[0] |= (qop & 3) << 20;
+	insn->inst[1] |= (qop >> 2) << 22;
+
+	emit(pc, insn);
+}
+
+/* Prepare cube map texture coordinates in t[0..2]: each of src[0..2]
+ * is divided by max(|src[0]|, |src[1]|, |src[2]|).
+ * With arg == 4, src[3] is additionally copied to t[3].
+ * The source modifiers are restored before returning.
+ */
+static void
+load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
+		     struct nv50_reg **src, unsigned arg, boolean proj)
+{
+	int saved_mod[3];
+	unsigned c;
+
+	/* temporarily force ABS on all three coordinates */
+	for (c = 0; c < 3; ++c)
+		saved_mod[c] = src[c]->mod;
+	for (c = 0; c < 3; ++c)
+		src[c]->mod |= NV50_MOD_ABS;
+
+	/* t[2] = largest magnitude */
+	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
+	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);
+
+	for (c = 0; c < 3; ++c)
+		src[c]->mod = saved_mod[c];
+
+	if (proj && 0 /* looks more correct without this */)
+		emit_mul(pc, t[2], t[2], src[3]);
+	else
+	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
+		emit_mov(pc, t[3], src[3]);
+
+	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);
+
+	/* scale every coordinate; t[2] is overwritten last */
+	for (c = 0; c < 3; ++c)
+		emit_mul(pc, t[c], src[c], t[2]);
+}
+
+/* Prepare projective texture coordinates in t[]: every coordinate is
+ * divided by src[3].
+ *
+ *  dim: number of coordinate components
+ *  arg: number of texture arguments; arg != dim means a depth
+ *       reference value is present, which is taken from src[2] and
+ *       written to t[dim]
+ *
+ * When src[0] is a temp with a valid rhw, the values are re-interpolated
+ * with t[3] = 1 / src[3] as the perspective factor instead of being
+ * multiplied.
+ */
+static void
+load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
+		     struct nv50_reg **src, unsigned dim, unsigned arg)
+{
+	unsigned c, mode;
+
+	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
+		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;
+
+		/* interpolate src[3] without perspective (keeping only
+		 * the centroid flag), then invert it */
+		t[3]->rhw = src[3]->rhw;
+		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);
+
+		/* emit_interp() reads the interpolant index from rhw */
+		for (c = 0; c < dim; ++c) {
+			t[c]->rhw = src[c]->rhw;
+			emit_interp(pc, t[c], t[3], mode);
+		}
+		if (arg != dim) { /* depth reference value */
+			t[dim]->rhw = src[2]->rhw;
+			emit_interp(pc, t[dim], t[3], mode);
+		}
+	} else {
+		/* XXX: for some reason the blob sometimes uses MAD
+		 * (mad f32 $rX $rY $rZ neg $r63)
+		 */
+		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
+		for (c = 0; c < dim; ++c)
+			emit_mul(pc, t[c], src[c], t[3]);
+		if (arg != dim) /* depth reference value */
+			emit_mul(pc, t[dim], src[2], t[3]);
+	}
+}
+
+/* Return, for a TGSI texture target, the number of coordinate components
+ * (*dim) and the number of arguments the lookup consumes (*arg). For shadow
+ * targets arg is dim + 1 because of the depth reference value.
+ *
+ * An unknown target trips the assert in debug builds. When asserts are
+ * compiled out, the outputs fall back to the 2D values so that callers,
+ * which use them as loop bounds and array indices, never read
+ * uninitialized data.
+ */
+static INLINE void
+get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
+{
+	switch (type) {
+	case TGSI_TEXTURE_1D:
+		*arg = *dim = 1;
+		break;
+	case TGSI_TEXTURE_SHADOW1D:
+		*dim = 1;
+		*arg = 2;
+		break;
+	case TGSI_TEXTURE_UNKNOWN:
+	case TGSI_TEXTURE_2D:
+	case TGSI_TEXTURE_RECT:
+		*arg = *dim = 2;
+		break;
+	case TGSI_TEXTURE_SHADOW2D:
+	case TGSI_TEXTURE_SHADOWRECT:
+		*dim = 2;
+		*arg = 3;
+		break;
+	case TGSI_TEXTURE_3D:
+	case TGSI_TEXTURE_CUBE:
+		*dim = *arg = 3;
+		break;
+	default:
+		assert(0);
+		/* same values as TGSI_TEXTURE_UNKNOWN */
+		*arg = *dim = 2;
+		break;
+	}
+}
+
+/* We shouldn't execute TEXLOD if any of the pixels in a quad have
+ * different LOD values, so branch off groups of equal LOD.
+ *
+ * pc   - program context
+ * tlod - register holding the LOD argument of the TEX instruction
+ * src  - original LOD source, copied into tlod again at the branch target
+ * tex  - the prepared texlod instruction; it is emitted by this function
+ *
+ * For anything but fragment programs the instruction is emitted as is.
+ */
+static void
+emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
+		     struct nv50_reg *src, struct nv50_program_exec *tex)
+{
+	struct nv50_program_exec *join_at;
+	/* 9 instructions precede the target: JOINAT + 4 * (QUADOP + BRA);
+	 * the factor 2 presumably is the size of a long instruction in
+	 * exec_size units (allow32 is disabled below) - TODO confirm
+	 */
+	unsigned i, target = pc->p->exec_size + 9 * 2;
+
+	if (pc->p->type != PIPE_SHADER_FRAGMENT) {
+		emit(pc, tex);
+		return;
+	}
+	pc->allow32 = FALSE;
+
+	/* Subtract lod of each pixel from lod of top left pixel, jump
+	 * texlod insn if result is 0, then repeat for 2 other pixels.
+	 */
+	join_at = emit_joinat(pc);
+	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
+	emit_branch(pc, 0, 2)->param.index = target;
+
+	for (i = 1; i < 4; ++i) {
+		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
+		emit_branch(pc, 0, 2)->param.index = target;
+	}
+
+	emit_mov(pc, tlod, src); /* target */
+	emit(pc, tex); /* texlod */
+
+	/* the join point is 2 instructions (MOV + TEX) past the target */
+	join_at->param.index = target + 2 * 2;
+	JOIN_ON(emit_nop(pc)); /* join _after_ tex */
+}
+
+/* Emit a TEX with bias such that lanes of a quad with different bias
+ * values are handled by separate, predicated TEX instructions.
+ *
+ * pc  - program context
+ * t   - the 4 registers holding the TEX arguments; t[arg] is the bias
+ * arg - index of the bias value in t (must be < 4)
+ * tex - the prepared texbias instruction operating on t; copies of it
+ *       with different destination and predicate are emitted as well
+ */
+static void
+emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
+		      struct nv50_program_exec *tex)
+{
+	struct nv50_program_exec *e;
+	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
+	int r_pred = 0;
+	/* cc[i]: condition code passed to set_pred for group i; the meaning
+	 * of the individual values is not visible here
+	 */
+	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
+
+	pc->allow32 = FALSE;
+	/* immediate register for the values 1, 2, 4, 8; the loop below steps
+	 * through them by incrementing .hw
+	 */
+	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
+
+	/* Subtract bias value of thread i from bias values of each thread,
+	 * store result in r_pred, and set bit i in r_bits if result was 0.
+	 */
+	assert(arg < 4);
+	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
+		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
+		emit_mov(pc, r_bits, &imm_1248);
+		/* predicate the MOV that was just emitted */
+		set_pred(pc, 2, r_pred, pc->p->exec_tail);
+	}
+	emit_mov_to_pred(pc, r_pred, r_bits);
+
+	/* The lanes of a quad are now grouped by the bit in r_pred they have
+	 * set. Put the input values for TEX into a new register set for each
+	 * group and execute TEX only for a specific group.
+	 * We cannot use the same register set for each group because we need
+	 * the derivatives, which are implicitly calculated, to be correct.
+	 */
+	for (i = 1; i < 4; ++i) {
+		alloc_temp4(pc, t123[i], 0);
+
+		for (c = 0; c <= arg; ++c)
+			emit_mov(pc, t123[i][c], t[c]);
+
+		/* clone tex, then replace its destination and predicate */
+		*(e = exec(pc)) = *(tex);
+		e->inst[0] &= ~0x01fc;
+		set_dst(pc, t123[i][0], e);
+		set_pred(pc, cc[i], r_pred, e);
+		emit(pc, e);
+	}
+	/* finally TEX on the original regs (where we kept the input) */
+	set_pred(pc, cc[0], r_pred, tex);
+	emit(pc, tex);
+
+	/* put the 3 * n other results into regs for lane 0 */
+	/* n = number of result components, read back from the mask bits in
+	 * the last cloned instruction (bits 25-26 of word 0 and 14-15 of
+	 * word 1, matching how emit_tex encodes the mask)
+	 */
+	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
+	for (i = 1; i < 4; ++i) {
+		for (c = 0; c < n; ++c) {
+			emit_mov(pc, t[c], t123[i][c]);
+			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
+		}
+		free_temp4(pc, t123[i]);
+	}
+
+	emit_nop(pc);
+	free_temp(pc, r_bits);
+}
+
+/* Emit a texture lookup.
+ *
+ * pc       - program context
+ * dst      - destination registers, one per component (only those
+ *            selected by mask are written)
+ * mask     - write mask of the result components
+ * src      - source registers: coordinates, and in src[3] the projection
+ *            divisor, bias or lod value where applicable
+ * unit     - texture unit, encoded as the TIC binding index
+ * type     - TGSI_TEXTURE_* target
+ * proj     - TRUE for a projective lookup
+ * bias_lod - 0: plain lookup, < 0: lookup with bias (fragment programs
+ *            only), > 0: lookup with explicit lod
+ *
+ * The arguments are first gathered in 4 freshly allocated temporaries,
+ * the results are copied from there to dst afterwards.
+ */
+static void
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+	 struct nv50_reg **src, unsigned unit, unsigned type,
+	 boolean proj, int bias_lod)
+{
+	struct nv50_reg *t[4];
+	struct nv50_program_exec *e;
+	unsigned c, dim, arg;
+
+	/* t[i] must be within a single 128 bit super-reg */
+	alloc_temp4(pc, t, 0);
+
+	e = exec(pc);
+	e->inst[0] = 0xf0000000;
+	set_long(pc, e);
+	set_dst(pc, t[0], e);
+
+	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
+	e->inst[0] |= (unit << 9) /* | (unit << 17) */;
+
+	/* live flag (don't set if TEX results affect input to another TEX): */
+	/* e->inst[0] |= 0x00000004; */
+
+	get_tex_dim(type, &dim, &arg);
+
+	/* load the coordinates (and depth reference value) into t[] */
+	if (type == TGSI_TEXTURE_CUBE) {
+		e->inst[0] |= 0x08000000;
+		load_cube_tex_coords(pc, t, src, arg, proj);
+	} else
+	if (proj)
+		load_proj_tex_coords(pc, t, src, dim, arg);
+	else {
+		for (c = 0; c < dim; c++)
+			emit_mov(pc, t[c], src[c]);
+		if (arg != dim) /* depth reference value (always src.z here) */
+			emit_mov(pc, t[dim], src[2]);
+	}
+
+	/* result mask: xy in word 0, zw in word 1 */
+	e->inst[0] |= (mask & 0x3) << 25;
+	e->inst[1] |= (mask & 0xc) << 12;
+
+	/* the field at bit 22 holds the argument count minus one; bias and
+	 * lod add one more argument in t[arg]
+	 */
+	if (!bias_lod) {
+		e->inst[0] |= (arg - 1) << 22;
+		emit(pc, e);
+	} else
+	if (bias_lod < 0) {
+		assert(pc->p->type == PIPE_SHADER_FRAGMENT);
+		e->inst[0] |= arg << 22;
+		e->inst[1] |= 0x20000000; /* texbias */
+		emit_mov(pc, t[arg], src[3]);
+		emit_texbias_sequence(pc, t, arg, e);
+	} else {
+		e->inst[0] |= arg << 22;
+		e->inst[1] |= 0x40000000; /* texlod */
+		emit_mov(pc, t[arg], src[3]);
+		emit_texlod_sequence(pc, t[arg], src[3], e);
+	}
+
+#if 1
+	/* results are packed: the n-th enabled component is in t[n] */
+	c = 0;
+	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
+	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
+	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
+	if (mask & 8) emit_mov(pc, dst[3], t[c]);
+
+	free_temp4(pc, t);
+#else
+	/* XXX: if p.e. MUL is used directly after TEX, it would still use
+	 * the texture coordinates, not the fetched values: latency ? */
+
+	for (c = 0; c < 4; c++) {
+		if (mask & (1 << c))
+			assimilate_temp(pc, dst[c], t[c]);
+		else
+			free_temp(pc, t[c]);
+	}
+#endif
+}
+
+/* Emit the instruction implementing TGSI DDX for one component.
+ * src must be a P_TEMP and is used for both the src 0 and src 2 slots; a
+ * NEG modifier on src selects a different pair of opcode words.
+ */
+static void
+emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+	boolean negate;
+
+	assert(src->type == P_TEMP);
+
+	negate = (src->mod & NV50_MOD_NEG) != 0;
+	if (negate) {
+		insn->inst[0] = 0xc0240000;
+		insn->inst[1] = 0x86400000;
+	} else {
+		insn->inst[0] = 0xc0140000;
+		insn->inst[1] = 0x89800000;
+	}
+
+	set_long(pc, insn);
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src, insn);
+	set_src_2(pc, src, insn);
+
+	emit(pc, insn);
+}
+
+/* Emit the instruction implementing TGSI DDY for one component.
+ * src must be a P_TEMP and is used for both the src 0 and src 2 slots; a
+ * NEG modifier on src selects a different pair of opcode words.
+ */
+static void
+emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *insn = exec(pc);
+	boolean negate;
+
+	assert(src->type == P_TEMP);
+
+	negate = (src->mod & NV50_MOD_NEG) != 0;
+	if (negate) {
+		insn->inst[0] = 0xc0250000;
+		insn->inst[1] = 0x85800000;
+	} else {
+		insn->inst[0] = 0xc0150000;
+		insn->inst[1] = 0x8a400000;
+	}
+
+	set_long(pc, insn);
+	set_dst(pc, dst, insn);
+	set_src_0(pc, src, insn);
+	set_src_2(pc, src, insn);
+
+	emit(pc, insn);
+}
+
+/* Turn the short encoding of instruction e into the long (two word) one.
+ *
+ * Depending on the opcode in the top 4 bits of word 0, some bits have to
+ * move from word 0 to word 1:
+ *   m - mask applied to word 0 (bits to keep)
+ *   q - bits OR'ed into word 1
+ * e must not be long already. pc->p->exec_size is incremented by one to
+ * account for the additional word.
+ */
+static void
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	unsigned q = 0, m = ~0;
+
+	assert(!is_long(e));
+
+	switch (e->inst[0] >> 28) {
+	case 0x1:
+		/* MOV */
+		q = 0x0403c000;
+		m = 0xffff7fff;
+		break;
+	case 0x2:
+	case 0x3:
+		/* ADD, SUB, SUBR b32 */
+		m = ~(0x8000 | (127 << 16));
+		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
+		break;
+	case 0x5:
+		/* SAD */
+		m = ~(0x81 << 8);
+		q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12);
+		break;
+	case 0x6:
+		/* MAD u16 */
+		q = (e->inst[0] & (0x7f << 2)) << 12;
+		break;
+	case 0x8:
+		/* INTERP (move centroid, perspective and flat bits) */
+		m = ~0x03000100;
+		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
+		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
+		break;
+	case 0x9:
+		/* RCP */
+		/* nothing to move, m and q keep their defaults */
+		break;
+	case 0xB:
+		/* ADD */
+		m = ~(127 << 16);
+		q = ((e->inst[0] & (~m)) >> 2);
+		break;
+	case 0xC:
+		/* MUL */
+		m = ~0x00008000;
+		q = ((e->inst[0] & (~m)) << 12);
+		break;
+	case 0xE:
+		/* MAD (if src2 == dst) */
+		q = ((e->inst[0] & 0x1fc) << 12);
+		break;
+	default:
+		/* opcode without a known long form */
+		assert(0);
+		break;
+	}
+
+	set_long(pc, e);
+	pc->p->exec_size++;
+
+	e->inst[0] &= m;
+	e->inst[1] |= q;
+}
+
+/* Report which source modifiers (NV50_MOD_* bits) can be left on the
+ * operands of the code emitted for insn's opcode; 0 if none.
+ * tgsi_src() resolves any modifier not reported here with a separate
+ * conversion. The operand index i is currently not taken into account.
+ */
+static int
+get_supported_mods(const struct tgsi_full_instruction *insn, int i)
+{
+	int mods = 0;
+
+	switch (insn->Instruction.Opcode) {
+	/* negation only */
+	case TGSI_OPCODE_ADD:
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DDX:
+	case TGSI_OPCODE_DDY:
+	case TGSI_OPCODE_DP3:
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_MAD:
+	case TGSI_OPCODE_MUL:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
+	case TGSI_OPCODE_SCS:
+	case TGSI_OPCODE_SIN:
+	case TGSI_OPCODE_SUB:
+		mods = NV50_MOD_NEG;
+		break;
+	/* absolute value only */
+	case TGSI_OPCODE_MAX:
+	case TGSI_OPCODE_MIN:
+	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
+		mods = NV50_MOD_ABS;
+		break;
+	/* negation and absolute value */
+	case TGSI_OPCODE_CEIL:
+	case TGSI_OPCODE_FLR:
+	case TGSI_OPCODE_TRUNC:
+		mods = NV50_MOD_NEG | NV50_MOD_ABS;
+		break;
+	/* conversions: everything */
+	case TGSI_OPCODE_F2I:
+	case TGSI_OPCODE_F2U:
+	case TGSI_OPCODE_I2F:
+	case TGSI_OPCODE_U2F:
+		mods = NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
+		break;
+	case TGSI_OPCODE_UADD:
+		mods = NV50_MOD_NEG | NV50_MOD_I32;
+		break;
+	/* integer operations without sign modifiers */
+	case TGSI_OPCODE_SAD:
+	case TGSI_OPCODE_SHL:
+	case TGSI_OPCODE_IMAX:
+	case TGSI_OPCODE_IMIN:
+	case TGSI_OPCODE_ISHR:
+	case TGSI_OPCODE_NOT:
+	case TGSI_OPCODE_UMAD:
+	case TGSI_OPCODE_UMAX:
+	case TGSI_OPCODE_UMIN:
+	case TGSI_OPCODE_UMUL:
+	case TGSI_OPCODE_USHR:
+		mods = NV50_MOD_I32;
+		break;
+	default:
+		break;
+	}
+
+	return mods;
+}
+
+/* Return a read mask for source registers deduced from opcode & write mask.
+ *
+ * insn - the instruction being translated
+ * c    - index of the source operand (only DST treats operands differently)
+ *
+ * Bit n of the result is set if component n of the source is read. For
+ * opcodes without a special rule, the write mask of Dst[0] is returned.
+ */
+static unsigned
+nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
+{
+	unsigned x, mask = insn->Dst[0].Register.WriteMask;
+
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_SIN:
+		/* w is computed from src.w, all other components from src.x */
+		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+	case TGSI_OPCODE_DP3:
+		return 0x7;
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL: /* WriteMask ignored */
+		return 0xf;
+	case TGSI_OPCODE_DST:
+		return mask & (c ? 0xa : 0x6);
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_EXP:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_LOG:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+		return 0x1;
+	case TGSI_OPCODE_IF:
+		return 0x1;
+	case TGSI_OPCODE_LIT:
+		return 0xb;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXB:
+	case TGSI_OPCODE_TXL:
+	case TGSI_OPCODE_TXP:
+	{
+		const struct tgsi_instruction_texture *tex;
+
+		assert(insn->Instruction.Texture);
+		tex = &insn->Texture;
+
+		mask = 0x7;
+		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
+		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
+			mask |= 0x8; /* bias, lod or proj */
+
+		switch (tex->Texture) {
+		case TGSI_TEXTURE_1D:
+			mask &= 0x9;
+			break;
+		case TGSI_TEXTURE_SHADOW1D:
+			/* x, depth reference value in z, and w if it was
+			 * enabled above: emit_tex reads src[3] for bias,
+			 * lod and projection, so w must not be dropped
+			 */
+			mask &= 0xd;
+			break;
+		case TGSI_TEXTURE_2D:
+			mask &= 0xb;
+			break;
+		default:
+			break;
+		}
+		return mask;
+	}
+	case TGSI_OPCODE_XPD:
+		/* each result component reads the two other components */
+		x = 0;
+		if (mask & 1) x |= 0x6;
+		if (mask & 2) x |= 0x5;
+		if (mask & 4) x |= 0x3;
+		return x;
+	default:
+		break;
+	}
+
+	return mask;
+}
+
+/* Map component c of a TGSI destination operand to the nv50_reg that
+ * stands for it. Address registers are allocated on first use and cached
+ * in pc->addr. Returns NULL for TGSI_FILE_NULL and for register files
+ * that are not handled here.
+ */
+static struct nv50_reg *
+tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
+{
+	const unsigned file = dst->Register.File;
+	const int index = dst->Register.Index;
+
+	if (file == TGSI_FILE_TEMPORARY)
+		return &pc->temp[index * 4 + c];
+
+	if (file == TGSI_FILE_OUTPUT)
+		return &pc->result[index * 4 + c];
+
+	if (file == TGSI_FILE_ADDRESS) {
+		struct nv50_reg *areg = pc->addr[index * 4 + c];
+
+		if (!areg) {
+			/* first write to this address component */
+			areg = get_address_reg(pc, NULL);
+			areg->index = index * 4 + c;
+			pc->addr[areg->index] = areg;
+		}
+		assert(areg);
+		return areg;
+	}
+
+	if (file == TGSI_FILE_SYSTEM_VALUE) {
+		/* only writable system values with a single component */
+		assert(pc->sysval[index].type == P_RESULT);
+		assert(c == 0);
+		return &pc->sysval[index];
+	}
+
+	/* TGSI_FILE_NULL and everything else */
+	return NULL;
+}
+
+/* Translate component chan of a TGSI source operand into an nv50_reg.
+ *
+ * pc   - program context
+ * chan - destination-relative component; the operand's swizzle is applied
+ *        to find the component that is actually read
+ * src  - the TGSI source operand
+ * mod  - NV50_MOD_* bits the consuming instruction accepts on its sources
+ *
+ * The sign mode of the operand is turned into a modifier on the register.
+ * If that modifier is not covered by mod, the value is converted into a
+ * temporary with emit_cvt and the temporary is returned instead.
+ * Returns NULL for sampler operands.
+ */
+static struct nv50_reg *
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
+	 int mod)
+{
+	struct nv50_reg *r = NULL;
+	struct nv50_reg *temp = NULL;
+	unsigned sgn, c, swz, cvn;
+
+	/* indirect addressing is only handled for constants */
+	if (src->Register.File != TGSI_FILE_CONSTANT)
+		assert(!src->Register.Indirect);
+
+	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
+
+	c = tgsi_util_get_full_src_register_swizzle(src, chan);
+	switch (c) {
+	case TGSI_SWIZZLE_X:
+	case TGSI_SWIZZLE_Y:
+	case TGSI_SWIZZLE_Z:
+	case TGSI_SWIZZLE_W:
+		switch (src->Register.File) {
+		case TGSI_FILE_INPUT:
+			r = &pc->attr[src->Register.Index * 4 + c];
+
+			if (!src->Dimension.Dimension)
+				break;
+			/* two-dimensional input: work on a copy that
+			 * carries the vertex index
+			 */
+			r = reg_instance(pc, r);
+			r->vtx = src->Dimension.Index;
+
+			if (!src->Dimension.Indirect)
+				break;
+			swz = tgsi_util_get_src_register_swizzle(
+				&src->DimIndirect, 0);
+			r->acc = -1;
+			r->indirect[1] = src->DimIndirect.Index * 4 + swz;
+			break;
+		case TGSI_FILE_TEMPORARY:
+			r = &pc->temp[src->Register.Index * 4 + c];
+			break;
+		case TGSI_FILE_CONSTANT:
+			if (!src->Register.Indirect) {
+				r = &pc->param[src->Register.Index * 4 + c];
+				break;
+			}
+			/* Indicate indirection by setting r->acc < 0 and
+			 * use the index field to select the address reg.
+			 */
+			r = reg_instance(pc, NULL);
+			ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c);
+
+			swz = tgsi_util_get_src_register_swizzle(
+				&src->Indirect, 0);
+			r->acc = -1;
+			r->indirect[0] = src->Indirect.Index * 4 + swz;
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			r = &pc->immd[src->Register.Index * 4 + c];
+			break;
+		case TGSI_FILE_SAMPLER:
+			return NULL;
+		case TGSI_FILE_ADDRESS:
+			/* must have been written (and thus allocated) before */
+			r = pc->addr[src->Register.Index * 4 + c];
+			assert(r);
+			break;
+		case TGSI_FILE_SYSTEM_VALUE:
+			assert(c == 0);
+			r = &pc->sysval[src->Register.Index];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	/* conversion used below if the modifier has to be resolved */
+	cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32;
+
+	/* NOTE(review): r is dereferenced from here on; it is still NULL if
+	 * one of the default cases above was reached with asserts disabled
+	 */
+	switch (sgn) {
+	case TGSI_UTIL_SIGN_CLEAR:
+		r->mod = NV50_MOD_ABS;
+		break;
+	case TGSI_UTIL_SIGN_SET:
+		r->mod = NV50_MOD_NEG_ABS;
+		break;
+	case TGSI_UTIL_SIGN_TOGGLE:
+		r->mod = NV50_MOD_NEG;
+		break;
+	default:
+		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
+		break;
+	}
+
+	if ((r->mod & mod) != r->mod) {
+		/* the consumer can't apply this modifier: apply it with a
+		 * conversion into a temporary and hand out that one
+		 */
+		temp = temp_temp(pc, NULL);
+		emit_cvt(pc, temp, r, -1, cvn);
+		r->mod = 0;
+		r = temp;
+	} else
+		r->mod |= mod & NV50_MOD_I32;
+
+	assert(r);
+	if (r->acc >= 0 && r->vtx < 0 && r != temp)
+		return reg_instance(pc, r); /* will clear r->mod */
+	return r;
+}
+
+/* Tell whether TGSI opcode op computes just one value (which then goes to
+ * all enabled destination components): TRUE if so, FALSE otherwise.
+ *
+ * Note: KIL, LIT and SCS are intentionally not part of the list.
+ */
+static boolean
+is_scalar_op(unsigned op)
+{
+	boolean scalar = FALSE;
+
+	switch (op) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP2:
+	case TGSI_OPCODE_DP3:
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SIN:
+		scalar = TRUE;
+		break;
+	default:
+		break;
+	}
+
+	return scalar;
+}
+
+/* Returns a bitmask indicating which dst components depend
+ * on source s, component c (reverse of nv50_tgsi_src_mask).
+ *
+ * op - TGSI opcode
+ * s  - index of the source operand
+ * c  - component of that operand
+ *
+ * Must not be called for IF and KIL.
+ */
+static unsigned
+nv50_tgsi_dst_revdep(unsigned op, int s, int c)
+{
+	/* XPD: dst components that read source component x, y, z, w */
+	static const unsigned xpd_deps[4] = { 0x6, 0x5, 0x3, 0x0 };
+	unsigned deps;
+
+	if (is_scalar_op(op))
+		return 0x1;
+
+	switch (op) {
+	case TGSI_OPCODE_DST:
+		deps = (1 << c) & (s ? 0xa : 0x6);
+		break;
+	case TGSI_OPCODE_XPD:
+		if (c >= 0 && c <= 3) {
+			deps = xpd_deps[c];
+		} else {
+			assert(0);
+			deps = 0x0;
+		}
+		break;
+	case TGSI_OPCODE_EXP:
+	case TGSI_OPCODE_LOG:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXB:
+	case TGSI_OPCODE_TXL:
+	case TGSI_OPCODE_TXP:
+		/* these take care of dangerous swizzles themselves */
+		deps = 0x0;
+		break;
+	case TGSI_OPCODE_IF:
+	case TGSI_OPCODE_KIL:
+		/* don't call this function for these ops */
+		assert(0);
+		deps = 0;
+		break;
+	default:
+		/* linear vector instruction */
+		deps = (1 << c);
+		break;
+	}
+
+	return deps;
+}
+
+/* Check whether instruction e carries condition code cc in its predicate
+ * field (bits 7..10 of the second word). Only long instructions without
+ * immediate are examined; for all others the answer is FALSE.
+ */
+static INLINE boolean
+has_pred(struct nv50_program_exec *e, unsigned cc)
+{
+	if (is_long(e) && !is_immd(e))
+		return ((e->inst[1] & 0x780) == (cc << 7));
+
+	return FALSE;
+}
+
+/* on ENDIF see if we can do "@p0.neu single_op" instead of:
+ *        join_at ENDIF
+ *        @p0.eq bra ENDIF
+ *        single_op
+ * ENDIF: nop.join
+ *
+ * Returns TRUE if the replacement was done, in which case JOIN_AT and BRA
+ * are gone and the single instruction is predicated; FALSE if nothing was
+ * changed.
+ */
+static boolean
+nv50_kill_branch(struct nv50_pc *pc)
+{
+	int lvl = pc->if_lvl;
+
+	/* only if exactly one instruction follows the branch ... */
+	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
+		return FALSE;
+	/* ... and that instruction has no immediate */
+	if (is_immd(pc->p->exec_tail))
+		return FALSE;
+
+	/* if ccode == 'true', the BRA is from an ELSE and the predicate
+	 * reg may no longer be valid, since we currently always use $p0
+	 */
+	if (has_pred(pc->if_insn[lvl], 0xf))
+		return FALSE;
+	/* NOTE(review): if_insn[lvl] was already dereferenced above, so this
+	 * assert cannot catch a NULL if_insn anymore
+	 */
+	assert(pc->if_insn[lvl] && pc->if_join[lvl]);
+
+	/* We'll use the exec allocated for JOIN_AT (we can't easily
+	 * access nv50_program_exec's prev).
+	 */
+	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
+
+	/* copy the single instruction over JOIN_AT ... */
+	*pc->if_join[lvl] = *pc->p->exec_tail;
+
+	/* ... and drop BRA and the original of the single instruction */
+	FREE(pc->if_insn[lvl]);
+	FREE(pc->p->exec_tail);
+
+	/* the copy is the new end of the list and gets predicated on $p0 */
+	pc->p->exec_tail = pc->if_join[lvl];
+	pc->p->exec_tail->next = NULL;
+	set_pred(pc, 0xd, 0, pc->p->exec_tail);
+
+	return TRUE;
+}
+
+/* Emit MOVs for all results whose current register (hw) is not the one
+ * recorded in rhw: the value is moved into a temporary register with
+ * hw = rhw. Results where hw or rhw is negative are left alone.
+ */
+static void
+nv50_fp_move_results(struct nv50_pc *pc)
+{
+	struct nv50_reg out;
+	unsigned idx;
+
+	ctor_reg(&out, P_TEMP, -1, -1);
+
+	for (idx = 0; idx < pc->result_nr * 4; ++idx) {
+		struct nv50_reg *res = &pc->result[idx];
+
+		if (res->rhw >= 0 && res->hw >= 0 && res->rhw != res->hw) {
+			out.hw = res->rhw;
+			emit_mov(pc, &out, res);
+		}
+	}
+}
+
+static boolean
+nv50_program_tx_insn(struct nv50_pc *pc,
+		     const struct tgsi_full_instruction *inst)
+{
+	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
+	unsigned mask, sat, unit = 0;
+	int i, c;
+
+	mask = inst->Dst[0].Register.WriteMask;
+	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
+
+	memset(src, 0, sizeof(src));
+
+	for (c = 0; c < 4; c++) {
+		if ((mask & (1 << c)) && !pc->r_dst[c])
+			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
+		else
+			dst[c] = pc->r_dst[c];
+		rdst[c] = dst[c];
+	}
+
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fs = &inst->Src[i];
+		unsigned src_mask;
+		int mod_supp;
+
+		src_mask = nv50_tgsi_src_mask(inst, i);
+		mod_supp = get_supported_mods(inst, i);
+
+		if (fs->Register.File == TGSI_FILE_SAMPLER)
+			unit = fs->Register.Index;
+
+		for (c = 0; c < 4; c++)
+			if (src_mask & (1 << c))
+				src[i][c] = tgsi_src(pc, c, fs, mod_supp);
+	}
+
+	brdc = temp = pc->r_brdc;
+	if (brdc && brdc->type != P_TEMP) {
+		temp = temp_temp(pc, NULL);
+		if (sat)
+			brdc = temp;
+	} else
+	if (sat) {
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
+				continue;
+			/* rdst[c] = dst[c]; */ /* done above */
+			dst[c] = temp_temp(pc, NULL);
+		}
+	}
+
+	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
+
+	switch (inst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_ABS | CVT_F32_F32);
+		}
+		break;
+	case TGSI_OPCODE_ADD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_add(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_AND:
+	case TGSI_OPCODE_XOR:
+	case TGSI_OPCODE_OR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
+				    inst->Instruction.Opcode);
+		}
+		break;
+	case TGSI_OPCODE_ARL:
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, temp, src[0][c], -1,
+				 CVT_FLOOR | CVT_S32_F32);
+			emit_arl(pc, dst[c], temp, 4);
+		}
+		break;
+	case TGSI_OPCODE_BGNLOOP:
+		pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
+		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
+		terminate_mbb(pc);
+		break;
+	case TGSI_OPCODE_BGNSUB:
+		assert(!pc->in_subroutine);
+		pc->in_subroutine = TRUE;
+		/* probably not necessary, but align to 8 byte boundary */
+		if (!is_long(pc->p->exec_tail))
+			convert_to_long(pc, pc->p->exec_tail);
+		break;
+	case TGSI_OPCODE_BRK:
+		assert(pc->loop_lvl > 0);
+		emit_break(pc, -1, 0);
+		break;
+	case TGSI_OPCODE_CAL:
+		assert(inst->Label.Label < pc->insn_nr);
+		emit_call(pc, -1, 0)->param.index = inst->Label.Label;
+		/* replaced by actual offset in nv50_program_fixup_insns */
+		break;
+	case TGSI_OPCODE_CEIL:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_CEIL | CVT_F32_F32 | CVT_RI);
+		}
+		break;
+	case TGSI_OPCODE_CMP:
+		pc->allow32 = FALSE;
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32);
+			emit_mov(pc, dst[c], src[1][c]);
+			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
+			emit_mov(pc, dst[c], src[2][c]);
+			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
+		}
+		break;
+	case TGSI_OPCODE_CONT:
+		assert(pc->loop_lvl > 0);
+		emit_branch(pc, -1, 0)->param.index =
+			pc->loop_pos[pc->loop_lvl - 1];
+		break;
+	case TGSI_OPCODE_COS:
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, NV50_FLOP_COS, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc, NULL);
+		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, NV50_FLOP_COS, brdc, temp);
+		break;
+	case TGSI_OPCODE_DDX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_ddx(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_DDY:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_ddy(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
+		break;
+	case TGSI_OPCODE_DP4:
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, temp, src[0][2], src[1][2], temp);
+		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
+		break;
+	case TGSI_OPCODE_DPH:
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, temp, src[0][2], src[1][2], temp);
+		emit_add(pc, brdc, src[1][3], temp);
+		break;
+	case TGSI_OPCODE_DST:
+		if (mask & (1 << 1))
+			emit_mul(pc, dst[1], src[0][1], src[1][1]);
+		if (mask & (1 << 2))
+			emit_mov(pc, dst[2], src[0][2]);
+		if (mask & (1 << 3))
+			emit_mov(pc, dst[3], src[1][3]);
+		if (mask & (1 << 0))
+			emit_mov_immdval(pc, dst[0], 1.0f);
+		break;
+	case TGSI_OPCODE_ELSE:
+		emit_branch(pc, -1, 0);
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		terminate_mbb(pc);
+		break;
+	case TGSI_OPCODE_EMIT:
+		emit_prim_cmd(pc, 1);
+		break;
+	case TGSI_OPCODE_ENDIF:
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+
+		/* try to replace branch over 1 insn with a predicated insn */
+		if (nv50_kill_branch(pc) == TRUE)
+			break;
+
+		if (pc->if_join[pc->if_lvl]) {
+			pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size;
+			pc->if_join[pc->if_lvl] = NULL;
+		}
+		terminate_mbb(pc);
+		/* emit a NOP as join point, we could set it on the next
+		 * one, but would have to make sure it is long and !immd
+		 */
+		JOIN_ON(emit_nop(pc));
+		break;
+	case TGSI_OPCODE_ENDLOOP:
+		emit_branch(pc, -1, 0)->param.index =
+			pc->loop_pos[--pc->loop_lvl];
+		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
+		terminate_mbb(pc);
+		break;
+	case TGSI_OPCODE_ENDPRIM:
+		emit_prim_cmd(pc, 2);
+		break;
+	case TGSI_OPCODE_ENDSUB:
+		assert(pc->in_subroutine);
+		terminate_mbb(pc);
+		pc->in_subroutine = FALSE;
+		break;
+	case TGSI_OPCODE_EX2:
+		emit_preex2(pc, temp, src[0][0]);
+		emit_flop(pc, NV50_FLOP_EX2, brdc, temp);
+		break;
+	case TGSI_OPCODE_EXP:
+	{
+		struct nv50_reg *t[2];
+
+		assert(!temp);
+		t[0] = temp_temp(pc, NULL);
+		t[1] = temp_temp(pc, NULL);
+
+		if (mask & 0x6)
+			emit_mov(pc, t[0], src[0][0]);
+		if (mask & 0x3)
+			emit_flr(pc, t[1], src[0][0]);
+
+		if (mask & (1 << 1))
+			emit_sub(pc, dst[1], t[0], t[1]);
+		if (mask & (1 << 0)) {
+			emit_preex2(pc, t[1], t[1]);
+			emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]);
+		}
+		if (mask & (1 << 2)) {
+			emit_preex2(pc, t[0], t[0]);
+			emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]);
+		}
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0f);
+	}
+		break;
+	case TGSI_OPCODE_F2I:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_S32_F32);
+		}
+		break;
+	case TGSI_OPCODE_F2U:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_U32_F32);
+		}
+		break;
+	case TGSI_OPCODE_FLR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_flr(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_FRC:
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_flr(pc, temp, src[0][c]);
+			emit_sub(pc, dst[c], src[0][c], temp);
+		}
+		break;
+	case TGSI_OPCODE_I2F:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32);
+		}
+		break;
+	case TGSI_OPCODE_IF:
+		assert(pc->if_lvl < NV50_MAX_COND_NESTING);
+		emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32);
+		pc->if_join[pc->if_lvl] = emit_joinat(pc);
+		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
+		terminate_mbb(pc);
+		break;
+	case TGSI_OPCODE_IMAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_IMIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_INEG:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_S32_S32 | CVT_NEG);
+		}
+		break;
+	case TGSI_OPCODE_KIL:
+		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
+		emit_kil(pc, src[0][0]);
+		emit_kil(pc, src[0][1]);
+		emit_kil(pc, src[0][2]);
+		emit_kil(pc, src[0][3]);
+		break;
+	case TGSI_OPCODE_KILP:
+		emit_kil(pc, NULL);
+		break;
+	case TGSI_OPCODE_LIT:
+		emit_lit(pc, &dst[0], mask, &src[0][0]);
+		break;
+	case TGSI_OPCODE_LG2:
+		emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]);
+		break;
+	case TGSI_OPCODE_LOG:
+	{
+		struct nv50_reg *t[2];
+
+		t[0] = temp_temp(pc, NULL);
+		if (mask & (1 << 1))
+			t[1] = temp_temp(pc, NULL);
+		else
+			t[1] = t[0];
+
+		emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32);
+		emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]);
+		if (mask & (1 << 2))
+			emit_mov(pc, dst[2], t[1]);
+		emit_flr(pc, t[1], t[1]);
+		if (mask & (1 << 0))
+			emit_mov(pc, dst[0], t[1]);
+		if (mask & (1 << 1)) {
+			t[1]->mod = NV50_MOD_NEG;
+			emit_preex2(pc, t[1], t[1]);
+			t[1]->mod = 0;
+			emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]);
+			emit_mul(pc, dst[1], t[0], t[1]);
+		}
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0f);
+	}
+		break;
+	case TGSI_OPCODE_LRP:
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_sub(pc, temp, src[1][c], src[2][c]);
+			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
+		}
+		break;
+	case TGSI_OPCODE_MAD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+		}
+		break;
+	case TGSI_OPCODE_MAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_MIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_MOV:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_MUL:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_NOT:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_not(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_POW:
+		emit_pow(pc, brdc, src[0][0], src[1][0]);
+		break;
+	case TGSI_OPCODE_RCP:
+		if (!sat && popcnt4(mask) == 1)
+			brdc = dst[ffs(mask) - 1];
+		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
+		break;
+	case TGSI_OPCODE_RET:
+		if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine)
+			nv50_fp_move_results(pc);
+		emit_ret(pc, -1, 0);
+		break;
+	case TGSI_OPCODE_RSQ:
+		if (!sat && popcnt4(mask) == 1)
+			brdc = dst[ffs(mask) - 1];
+		src[0][0]->mod |= NV50_MOD_ABS;
+		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
+		break;
+	case TGSI_OPCODE_SAD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+		}
+		break;
+	case TGSI_OPCODE_SCS:
+		temp = temp_temp(pc, NULL);
+		if (mask & 3)
+			emit_precossin(pc, temp, src[0][0]);
+		if (mask & (1 << 0))
+			emit_flop(pc, NV50_FLOP_COS, dst[0], temp);
+		if (mask & (1 << 1))
+			emit_flop(pc, NV50_FLOP_SIN, dst[1], temp);
+		if (mask & (1 << 2))
+			emit_mov_immdval(pc, dst[2], 0.0);
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0);
+		break;
+	case TGSI_OPCODE_SHL:
+	case TGSI_OPCODE_ISHR:
+	case TGSI_OPCODE_USHR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_shift(pc, dst[c], src[0][c], src[1][c],
+				   inst->Instruction.Opcode);
+		}
+		break;
+	case TGSI_OPCODE_SIN:
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, NV50_FLOP_SIN, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc, NULL);
+		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
+		break;
+	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+	case TGSI_OPCODE_ISLT:
+	case TGSI_OPCODE_ISGE:
+	case TGSI_OPCODE_USEQ:
+	case TGSI_OPCODE_USGE:
+	case TGSI_OPCODE_USLT:
+	case TGSI_OPCODE_USNE:
+	{
+		uint8_t cc, ty;
+
+		map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty);
+
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty);
+		}
+	}
+		break;
+	case TGSI_OPCODE_SUB:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_sub(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_TEX:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->Texture.Texture, FALSE, 0);
+		break;
+	case TGSI_OPCODE_TXB:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->Texture.Texture, FALSE, -1);
+		break;
+	case TGSI_OPCODE_TXL:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->Texture.Texture, FALSE, 1);
+		break;
+	case TGSI_OPCODE_TXP:
+		emit_tex(pc, dst, mask, src[0], unit,
+			 inst->Texture.Texture, TRUE, 0);
+		break;
+	case TGSI_OPCODE_TRUNC:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_F32_F32 | CVT_RI);
+		}
+		break;
+	case TGSI_OPCODE_U2F:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32);
+		}
+		break;
+	case TGSI_OPCODE_UADD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_add_b32(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMAD:
+	{
+		assert(!temp);
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+				     temp);
+			emit_shl_imm(pc, temp, temp, 16);
+			emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0,
+				     temp);
+			emit_add_b32(pc, dst[c], temp, src[2][c]);
+		}
+	}
+		break;
+	case TGSI_OPCODE_UMUL:
+	{
+		assert(!temp);
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+				     temp);
+			emit_shl_imm(pc, temp, temp, 16);
+			emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0,
+				     temp);
+		}
+	}
+		break;
+	case TGSI_OPCODE_XPD:
+		temp = temp_temp(pc, NULL);
+		if (mask & (1 << 0)) {
+			emit_mul(pc, temp, src[0][2], src[1][1]);
+			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
+		}
+		if (mask & (1 << 1)) {
+			emit_mul(pc, temp, src[0][0], src[1][2]);
+			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
+		}
+		if (mask & (1 << 2)) {
+			emit_mul(pc, temp, src[0][1], src[1][0]);
+			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
+		}
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0);
+		break;
+	case TGSI_OPCODE_END:
+		if (pc->p->type == PIPE_SHADER_FRAGMENT)
+			nv50_fp_move_results(pc);
+
+		if (!pc->p->exec_tail ||
+		    is_immd(pc->p->exec_tail) ||
+		    is_join(pc->p->exec_tail) ||
+		    is_control_flow(pc->p->exec_tail))
+			emit_nop(pc);
+
+		/* last insn must be long so it can have the exit bit set */
+		if (!is_long(pc->p->exec_tail))
+			convert_to_long(pc, pc->p->exec_tail);
+
+		pc->p->exec_tail->inst[1] |= 1; /* set exit bit */
+
+		terminate_mbb(pc);
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	if (brdc) {
+		if (sat)
+			emit_sat(pc, brdc, brdc);
+		for (c = 0; c < 4; c++)
+			if ((mask & (1 << c)) && dst[c] != brdc)
+				emit_mov(pc, dst[c], brdc);
+	} else
+	if (sat) {
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			/* In this case we saturate later, and dst[c] won't
+			 * be another temp_temp (and thus lost), since rdst
+			 * already is TEMP (see above). */
+			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
+				continue;
+			emit_sat(pc, rdst[c], dst[c]);
+		}
+	}
+
+	kill_temp_temp(pc, NULL);
+	pc->reg_instance_nr = 0;
+
+	return TRUE;
+}
+
+/* Pre-pass over a single TGSI instruction.
+ *
+ * Stamps every TEMPORARY/OUTPUT component that is written and every
+ * TEMPORARY/INPUT component that is read with the current instruction
+ * number (pc->insn_nr) in its 'acc' field, and records which input feeds
+ * the edge flag output when it is written by a plain MOV.
+ */
+static void
+prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
+{
+	struct nv50_reg *r, *reg = NULL;
+	const struct tgsi_full_src_register *src;
+	const struct tgsi_dst_register *dst;
+	unsigned i, c, k, mask;
+
+	/* only the first destination register is inspected */
+	dst = &insn->Dst[0].Register;
+	mask = dst->WriteMask;
+
+        if (dst->File == TGSI_FILE_TEMPORARY)
+		reg = pc->temp;
+        else
+	if (dst->File == TGSI_FILE_OUTPUT) {
+		reg = pc->result;
+
+		/* MOV edgeflag_out, INPUT[n]: remember n as the edge flag input */
+		if (insn->Instruction.Opcode == TGSI_OPCODE_MOV &&
+		    dst->Index == pc->edgeflag_out &&
+		    insn->Src[0].Register.File == TGSI_FILE_INPUT)
+			pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index;
+	}
+
+	/* mark written components; other destination files are not tracked */
+	if (reg) {
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			reg[dst->Index * 4 + c].acc = pc->insn_nr;
+		}
+	}
+
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		src = &insn->Src[i];
+
+		if (src->Register.File == TGSI_FILE_TEMPORARY)
+			reg = pc->temp;
+		else
+		if (src->Register.File == TGSI_FILE_INPUT)
+			reg = pc->attr;
+		else
+			continue;
+
+		/* components of source i that this opcode actually reads */
+		mask = nv50_tgsi_src_mask(insn, i);
+
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			/* k is the register component selected by the swizzle */
+			k = tgsi_util_get_full_src_register_swizzle(src, c);
+
+			r = &reg[src->Register.Index * 4 + k];
+
+			/* If used before written, pre-allocate the reg,
+			 * lest we overwrite results from a subroutine.
+			 */
+			if (!r->acc && r->type == P_TEMP)
+				alloc_reg(pc, r);
+
+			r->acc = pc->insn_nr;
+		}
+	}
+}
+
+/* Returns a bitmask indicating which dst components need to be
+ * written to temporaries first to avoid 'corrupting' sources.
+ *
+ * m[i]   (out) indicate component to write in the i-th position
+ * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
+ *
+ * The returned mask is indexed by write position i (bit i refers to
+ * component m[i]), not by component number.
+ */
+static unsigned
+nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
+{
+	unsigned i, c, x, unsafe = 0;
+
+	/* start with the identity order x, y, z, w */
+	for (c = 0; c < 4; c++)
+		m[c] = c;
+
+	/* Swap as long as a dst component written earlier is depended on
+	 * by one written later, but the next one isn't depended on by it.
+	 */
+	for (c = 0; c < 3; c++) {
+		if (rdep[m[c + 1]] & (1 << m[c]))
+			continue; /* if next one is depended on by us */
+		for (i = c + 1; i < 4; i++)
+			/* if we are depended on by a later one */
+			if (rdep[m[c]] & (1 << m[i]))
+				break;
+		if (i == 4)
+			continue;
+		/* now, swap */
+		x = m[c];
+		m[c] = m[c + 1];
+		m[c + 1] = x;
+
+		/* restart */
+		/* NOTE(review): the loop's c++ still runs after this assignment,
+		 * so scanning resumes at position 1 rather than 0 - confirm that
+		 * this is intended. Anything left unresolved is caught below.
+		 */
+		c = 0;
+	}
+
+	/* mark dependencies that could not be resolved by reordering */
+	for (i = 0; i < 3; ++i)
+		for (c = i + 1; c < 4; ++c)
+			if (rdep[m[i]] & (1 << m[c]))
+				unsafe |= (1 << i);
+
+	/* NOTE: $unsafe is with respect to order, not component */
+	return unsafe;
+}
+
+/* Select a suitable dst register for broadcasting scalar results,
+ * or return NULL if we have to allocate an extra TEMP.
+ *
+ * If e.g. only 1 component is written, we may also emit the final
+ * result to a write-only register.
+ *
+ * pc:   translation context, passed through to tgsi_dst()
+ * fd:   destination register of the instruction
+ * mask: dst components that also occur among the sources and therefore
+ *       must not be used as the broadcast register
+ */
+static struct nv50_reg *
+tgsi_broadcast_dst(struct nv50_pc *pc,
+		   const struct tgsi_full_dst_register *fd, unsigned mask)
+{
+	const unsigned wmask = fd->Register.WriteMask;
+
+	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
+		/* first written component that is not also read as a source */
+		int c = ffs(~mask & wmask);
+		if (c)
+			return tgsi_dst(pc, c - 1, fd);
+	} else if (wmask) {
+		/* Non-TEMP destinations are only usable when exactly one
+		 * component is written. An empty write mask is rejected up
+		 * front: ffs() would return 0 and the shift below would be
+		 * by -1, which is undefined.
+		 */
+		int c = ffs(wmask) - 1;
+		if ((1u << c) == wmask)
+			return tgsi_dst(pc, c, fd);
+	}
+
+	return NULL;
+}
+
+/* Scan source swizzles and return a bitmask indicating dst regs that
+ * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
+ *
+ * insn: instruction to analyse (only Dst[0] is considered)
+ * rdep: (out) cleared, then rdep[c] accumulates the dst components that
+ *       need dst component c as a source, per nv50_tgsi_dst_revdep()
+ */
+static unsigned
+nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
+		       unsigned rdep[4])
+{
+	const struct tgsi_full_dst_register *fd = &insn->Dst[0];
+	const struct tgsi_full_src_register *fs;
+	unsigned i, deqs = 0;
+
+	for (i = 0; i < 4; ++i)
+		rdep[i] = 0;
+
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
+		int ms = get_supported_mods(insn, i);
+
+		/* only sources naming the very same register as dst matter */
+		fs = &insn->Src[i];
+		if (fs->Register.File != fd->Register.File ||
+		    fs->Register.Index != fd->Register.Index)
+			continue;
+
+		for (chn = 0; chn < 4; ++chn) {
+			unsigned s, c;
+
+			if (!(mask & (1 << chn))) /* src is not read */
+				continue;
+			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
+			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
+
+			/* the component read is not overwritten by this insn */
+			if (!(fd->Register.WriteMask & (1 << c)))
+				continue;
+
+			/* Skip sources whose sign mode is not among the
+			 * modifiers reported by get_supported_mods().
+			 * NOTE(review): presumably such sources get copied to
+			 * a temporary first and so cannot alias dst - confirm.
+			 * The (ms & 3) test assumes NEG | ABS == 3 - confirm.
+			 */
+			if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG))
+					continue;
+			if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS))
+					continue;
+			if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3))
+					continue;
+
+			rdep[c] |= nv50_tgsi_dst_revdep(
+				insn->Instruction.Opcode, i, chn);
+			deqs |= (1 << c);
+		}
+	}
+
+	return deqs;
+}
+
+/* Translate one TGSI instruction token, taking care of destination
+ * components that are also read as sources.
+ *
+ * Scalar opcodes get a broadcast register and are emitted in one go.
+ * Other opcodes are emitted in one go when no dst/src overlap exists;
+ * otherwise they are emitted one component at a time in the order
+ * computed by nv50_revdep_reorder(), with unresolvable components
+ * written to temporaries (pc->r_dst[]) that are copied back afterwards.
+ *
+ * Returns FALSE as soon as nv50_program_tx_insn() fails.
+ */
+static boolean
+nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+{
+	/* local copy, so the write mask can be narrowed per component */
+	struct tgsi_full_instruction insn = tok->FullInstruction;
+	const struct tgsi_full_dst_register *fd;
+	unsigned i, deqs, rdep[4], m[4];
+
+	/* fd keeps referring to the unmodified destination of the token */
+	fd = &tok->FullInstruction.Dst[0];
+	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
+
+	if (is_scalar_op(insn.Instruction.Opcode)) {
+		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
+		if (!pc->r_brdc)
+			pc->r_brdc = temp_temp(pc, NULL);
+		return nv50_program_tx_insn(pc, &insn);
+	}
+	pc->r_brdc = NULL;
+
+	/* no overlap between dst and sources, or no reverse dependencies */
+	if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3]))
+		return nv50_program_tx_insn(pc, &insn);
+
+	/* from here on deqs is indexed by write position, not component */
+	deqs = nv50_revdep_reorder(m, rdep);
+
+	for (i = 0; i < 4; ++i) {
+		assert(pc->r_dst[m[i]] == NULL);
+
+		insn.Dst[0].Register.WriteMask =
+			fd->Register.WriteMask & (1 << m[i]);
+
+		if (!insn.Dst[0].Register.WriteMask)
+			continue;
+
+		/* redirect this component into a temporary */
+		if (deqs & (1 << i))
+			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
+
+		/* NOTE(review): temporaries already placed in pc->r_dst[] are
+		 * neither freed nor cleared on this early return.
+		 */
+		if (!nv50_program_tx_insn(pc, &insn))
+			return FALSE;
+	}
+
+	/* copy redirected components to their real destination */
+	for (i = 0; i < 4; i++) {
+		struct nv50_reg *reg = pc->r_dst[i];
+		if (!reg)
+			continue;
+		pc->r_dst[i] = NULL;
+
+		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
+		else
+			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
+		free_temp(pc, reg);
+	}
+
+	return TRUE;
+}
+
+/* Emit the interpolation instruction that loads fragment input 'reg'.
+ *
+ * Perspective modes need an extra register (interpolated, then passed
+ * through RCP); it is created on first use and cached in pc->iv_c for
+ * centroid modes or in pc->iv_p otherwise.
+ */
+static void
+load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	const unsigned mode = pc->interp_mode[reg->index];
+	struct nv50_reg **slot;
+
+	/* centroid and non-centroid interpolation use separate registers */
+	if (mode & INTERP_CENTROID)
+		slot = &pc->iv_c;
+	else
+		slot = &pc->iv_p;
+
+	if (!*slot && (mode & INTERP_PERSPECTIVE)) {
+		struct nv50_reg *w = alloc_temp(pc, NULL);
+
+		*slot = w;
+		w->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
+
+		emit_interp(pc, w, NULL, mode & INTERP_CENTROID);
+		emit_flop(pc, NV50_FLOP_RCP, w, w);
+
+		/* XXX: when loading interpolants dynamically, move these
+		 * to the program head, or make sure it can't be skipped.
+		 */
+	}
+
+	emit_interp(pc, reg, *slot, mode);
+}
+
+/* The face input is always at v[255] (varying space), with a
+ * value of 0 for back-facing, and 0xffffffff for front-facing.
+ *
+ * Loads that input flat into a temporary and converts it to a float
+ * in 'sv' using two predicated conversions: one of the raw value with
+ * CVT_ABS, one of its bitwise complement.
+ * NOTE(review): presumably this yields +1.0 for front and -1.0 for back
+ * faces; the meaning of predicate code 0x2 and of r_pred == 0 cannot be
+ * told from this function - confirm against set_pred()/emit_cvt().
+ */
+static void
+load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv)
+{
+	struct nv50_reg *temp = alloc_temp(pc, NULL);
+	int r_pred = 0;
+
+	/* fetch v[255] without interpolation */
+	temp->rhw = 255;
+	emit_interp(pc, temp, NULL, INTERP_FLAT);
+
+	/* first conversion; r_pred is handed to emit_cvt() here, whereas
+	 * the second conversion below passes -1 in the same position
+	 */
+	emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32);
+
+	/* both following instructions get predicated on r_pred */
+	emit_not(pc, temp, temp);
+	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
+	emit_cvt(pc, sv, temp, -1, CVT_F32_S32);
+	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
+
+	free_temp(pc, temp);
+}
+
+/* Replace system value 'index' by a fresh TEMP holding the sum of its
+ * current register and the constant described below (startInstance).
+ */
+static void
+load_instance_id(struct nv50_pc *pc, unsigned index)
+{
+	struct nv50_reg *sv = &pc->sysval[index];
+	struct nv50_reg sum, start;
+
+	ctor_reg(&sum, P_TEMP, -1, -1);
+
+	/* startInstance */
+	ctor_reg(&start, P_CONST, -1, 24);
+	start.buf_index = 2;
+
+	emit_add_b32(pc, &sum, sv, &start);
+
+	/* from now on the system value refers to the computed sum */
+	*sv = sum;
+}
+
+/* Fill in the semantic name (sn) and semantic index (si) of every
+ * configured program input and output, looked up in the shader info
+ * tables by the TGSI register id stored in each entry.
+ */
+static void
+copy_semantic_info(struct nv50_program *p)
+{
+	unsigned slot;
+
+	/* inputs */
+	for (slot = 0; slot < p->cfg.in_nr; ++slot) {
+		const unsigned tgsi_id = p->cfg.in[slot].id;
+
+		p->cfg.in[slot].sn = p->info.input_semantic_name[tgsi_id];
+		p->cfg.in[slot].si = p->info.input_semantic_index[tgsi_id];
+	}
+
+	/* outputs */
+	for (slot = 0; slot < p->cfg.out_nr; ++slot) {
+		const unsigned tgsi_id = p->cfg.out[slot].id;
+
+		p->cfg.out[slot].sn = p->info.output_semantic_name[tgsi_id];
+		p->cfg.out[slot].si = p->info.output_semantic_index[tgsi_id];
+	}
+}
+
+/* First pass over the TGSI token stream, run before code generation.
+ *
+ * - registers immediates (ctor_immd_4f32),
+ * - evaluates declarations (special outputs, interpolation modes,
+ *   system values),
+ * - counts instructions in pc->insn_nr and records register usage via
+ *   prep_inspect_insn(),
+ * - afterwards assigns hardware ids to inputs/outputs depending on the
+ *   program type and sets up the pc->immd register array.
+ *
+ * Returns TRUE on success, FALSE on an unknown declaration file or if
+ * the immediate register array cannot be allocated. The interpolation
+ * helper temporaries pc->iv_p / pc->iv_c are released in either case.
+ */
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+	struct tgsi_parse_context tp;
+	struct nv50_program *p = pc->p;
+	boolean ret = FALSE;
+	unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0;
+
+	tgsi_parse_init(&tp, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&tp)) {
+		const union tgsi_full_token *tok = &tp.FullToken;
+
+		tgsi_parse_token(&tp);
+		switch (tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm =
+				&tp.FullToken.FullImmediate;
+
+			ctor_immd_4f32(pc, imm->u[0].Float,
+				       imm->u[1].Float,
+				       imm->u[2].Float,
+				       imm->u[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *d;
+			unsigned si, last, first, mode;
+
+			d = &tp.FullToken.FullDeclaration;
+			first = d->Range.First;
+			last = d->Range.Last;
+
+			switch (d->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				break;
+			case TGSI_FILE_OUTPUT:
+				/* only semantic outputs of non-fragment
+				 * programs need special treatment
+				 */
+				if (!d->Declaration.Semantic ||
+				    p->type == PIPE_SHADER_FRAGMENT)
+					break;
+
+				si = d->Semantic.Index;
+				switch (d->Semantic.Name) {
+				case TGSI_SEMANTIC_BCOLOR:
+					/* out_nr is cut back to end before
+					 * this output
+					 */
+					p->cfg.two_side[si].hw = first;
+					if (p->cfg.out_nr > first)
+						p->cfg.out_nr = first;
+					break;
+				case TGSI_SEMANTIC_PSIZE:
+					p->cfg.psiz = first;
+					if (p->cfg.out_nr > first)
+						p->cfg.out_nr = first;
+					break;
+				case TGSI_SEMANTIC_EDGEFLAG:
+					pc->edgeflag_out = first;
+					break;
+					/*
+				case TGSI_SEMANTIC_CLIP_DISTANCE:
+					p->cfg.clpd = MIN2(p->cfg.clpd, first);
+					break;
+					*/
+				default:
+					break;
+				}
+				break;
+			case TGSI_FILE_INPUT:
+			{
+				/* interpolation modes only matter for FPs */
+				if (p->type != PIPE_SHADER_FRAGMENT)
+					break;
+
+				switch (d->Declaration.Interpolate) {
+				case TGSI_INTERPOLATE_CONSTANT:
+					mode = INTERP_FLAT;
+					flat_nr++;
+					break;
+				case TGSI_INTERPOLATE_PERSPECTIVE:
+					mode = INTERP_PERSPECTIVE;
+					/* bit 27 of regs[1]: the 4th of the
+					 * FragCoord component bits set below
+					 */
+					p->cfg.regs[1] |= 0x08 << 24;
+					break;
+				default:
+					mode = INTERP_LINEAR;
+					break;
+				}
+				if (d->Declaration.Centroid)
+					mode |= INTERP_CENTROID;
+
+				assert(last < 32);
+				for (i = first; i <= last; i++)
+					pc->interp_mode[i] = mode;
+			}
+				break;
+			case TGSI_FILE_SYSTEM_VALUE:
+				assert(d->Declaration.Semantic);
+				switch (d->Semantic.Name) {
+				case TGSI_SEMANTIC_FACE:
+					assert(p->type == PIPE_SHADER_FRAGMENT);
+					load_frontfacing(pc,
+							 &pc->sysval[first]);
+					break;
+				case TGSI_SEMANTIC_INSTANCEID:
+					assert(p->type == PIPE_SHADER_VERTEX);
+					instance_id = first;
+					/* bit 4 of regs[0] is tested again
+					 * below to assign the register
+					 */
+					p->cfg.regs[0] |= (1 << 4);
+					break;
+				case TGSI_SEMANTIC_PRIMID:
+					assert(p->type != PIPE_SHADER_VERTEX);
+					p->cfg.prim_id = first;
+					break;
+					/*
+				case TGSI_SEMANTIC_PRIMIDIN:
+					assert(p->type == PIPE_SHADER_GEOMETRY);
+					pc->sysval[first].hw = 6;
+					p->cfg.regs[0] |= (1 << 8);
+					break;
+				case TGSI_SEMANTIC_VERTEXID:
+					assert(p->type == PIPE_SHADER_VERTEX);
+					vertex_id = first;
+					p->cfg.regs[0] |= (1 << 12) | (1 << 0);
+					break;
+					*/
+				}
+				break;
+			case TGSI_FILE_ADDRESS:
+			case TGSI_FILE_CONSTANT:
+			case TGSI_FILE_SAMPLER:
+				break;
+			default:
+				NOUVEAU_ERR("bad decl file %d\n",
+					    d->Declaration.File);
+				goto out_err;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			pc->insn_nr++;
+			prep_inspect_insn(pc, &tok->FullInstruction);
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) {
+		int rid = 0;
+
+		/* inputs: consecutive ids for the components that are used */
+		if (p->type == PIPE_SHADER_GEOMETRY) {
+			for (i = 0; i < pc->attr_nr; ++i) {
+				p->cfg.in[i].hw = rid;
+				p->cfg.in[i].id = i;
+
+				for (c = 0; c < 4; ++c) {
+					int n = i * 4 + c;
+					if (!pc->attr[n].acc)
+						continue;
+					pc->attr[n].hw = rid++;
+					p->cfg.in[i].mask |= 1 << c;
+				}
+			}
+		} else {
+			for (i = 0; i < pc->attr_nr * 4; ++i) {
+				if (pc->attr[i].acc) {
+					pc->attr[i].hw = rid++;
+					p->cfg.attr[i / 32] |= 1 << (i % 32);
+				}
+			}
+			/* system values follow the regular attributes */
+			if (p->cfg.regs[0] & (1 << 0))
+				pc->sysval[vertex_id].hw = rid++;
+			if (p->cfg.regs[0] & (1 << 4)) {
+				pc->sysval[instance_id].hw = rid++;
+				load_instance_id(pc, instance_id);
+			}
+		}
+
+		/* outputs: consecutive ids for the components written */
+		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
+			p->cfg.out[i].hw = rid;
+			p->cfg.out[i].id = i;
+
+			for (c = 0; c < 4; ++c) {
+				int n = i * 4 + c;
+				if (!pc->result[n].acc)
+					continue;
+				pc->result[n].hw = rid++;
+				p->cfg.out[i].mask |= 1 << c;
+			}
+		}
+		if (p->cfg.prim_id < 0x40) {
+			/* GP has to write to PrimitiveID */
+			ctor_reg(&pc->sysval[p->cfg.prim_id],
+				 P_RESULT, p->cfg.prim_id, rid);
+			p->cfg.prim_id = rid++;
+		}
+
+		/* values below 0x40 are TGSI output indices recorded while
+		 * parsing declarations; replace them by the assigned entry
+		 */
+		for (c = 0; c < 2; ++c)
+			if (p->cfg.two_side[c].hw < 0x40)
+				p->cfg.two_side[c] = p->cfg.out[
+					p->cfg.two_side[c].hw];
+
+		if (p->cfg.psiz < 0x40)
+			p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw;
+
+		copy_semantic_info(p);
+	} else
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		int rid = 0, aid;
+		/* n indexes non-flat inputs from the front of cfg.in[],
+		 * m indexes flat inputs placed behind them
+		 */
+		unsigned n = 0, m = pc->attr_nr - flat_nr;
+
+		pc->allow32 = TRUE;
+
+		/* do we read FragCoord ? */
+		if (pc->attr_nr &&
+		    p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
+			/* select FCRD components we want accessible */
+			for (c = 0; c < 4; ++c)
+				if (pc->attr[c].acc)
+					p->cfg.regs[1] |= 1 << (24 + c);
+			aid = 0;
+		} else /* offset by 1 if FCRD.w is needed for pinterp */
+			aid = popcnt4(p->cfg.regs[1] >> 24);
+
+		/* non-flat interpolants have to be mapped to
+		 * the lower hardware IDs, so sort them:
+		 */
+		for (i = 0; i < pc->attr_nr; i++) {
+			if (pc->interp_mode[i] == INTERP_FLAT)
+				p->cfg.in[m++].id = i;
+			else {
+				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
+					p->cfg.in[n].linear = TRUE;
+				p->cfg.in[n++].id = i;
+			}
+		}
+		copy_semantic_info(p);
+
+		/* assign ids in sorted order and emit the input loads */
+		for (n = 0; n < pc->attr_nr; ++n) {
+			p->cfg.in[n].hw = rid = aid;
+			i = p->cfg.in[n].id;
+
+			if (p->info.input_semantic_name[i] ==
+			    TGSI_SEMANTIC_FACE) {
+				load_frontfacing(pc, &pc->attr[i * 4]);
+				continue;
+			}
+
+			for (c = 0; c < 4; ++c) {
+				if (!pc->attr[i * 4 + c].acc)
+					continue;
+				pc->attr[i * 4 + c].rhw = rid++;
+				p->cfg.in[n].mask |= 1 << c;
+
+				load_interpolant(pc, &pc->attr[i * 4 + c]);
+			}
+			aid += popcnt4(p->cfg.in[n].mask);
+		}
+
+		/* number of FragCoord components enabled in regs[1] */
+		m = popcnt4(p->cfg.regs[1] >> 24);
+
+		/* set count of non-position inputs and of non-flat
+		 * non-position inputs for FP_INTERPOLANT_CTRL
+		 */
+		p->cfg.regs[1] |= aid - m;
+
+		if (flat_nr) {
+			i = p->cfg.in[pc->attr_nr - flat_nr].hw;
+			p->cfg.regs[1] |= (i - m) << 16;
+		} else
+			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
+
+		/* mark color semantic for light-twoside */
+		n = 0x80;
+		for (i = 0; i < p->cfg.in_nr; i++) {
+			if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) {
+				n = MIN2(n, p->cfg.in[i].hw - m);
+				p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i];
+
+				p->cfg.regs[0] += /* increase colour count */
+					popcnt4(p->cfg.in[i].mask) << 16;
+			}
+		}
+		/* n stays 0x80 when no colour input was found */
+		if (n < 0x80)
+			p->cfg.regs[0] += n;
+
+		if (p->cfg.prim_id < 0x40) {
+			pc->sysval[p->cfg.prim_id].rhw = rid++;
+			emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL,
+				    INTERP_FLAT);
+			/* increase FP_INTERPOLANT_CTRL_COUNT */
+			p->cfg.regs[1] += 1;
+		}
+
+		/* Initialize FP results:
+		 * FragDepth is always first TGSI and last hw output
+		 */
+		i = p->info.writes_z ? 4 : 0;
+		for (rid = 0; i < pc->result_nr * 4; i++)
+			pc->result[i].rhw = rid++;
+		if (p->info.writes_z)
+			pc->result[2].rhw = rid++;
+
+		p->cfg.high_result = rid;
+
+		/* separate/different colour results for MRTs ? */
+		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
+			p->cfg.regs[2] |= 1;
+	}
+
+	/* one nv50_reg per immediate component, ids assigned in order */
+	if (pc->immd_nr) {
+		int rid = 0;
+
+		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->immd)
+			goto out_err;
+
+		for (i = 0; i < pc->immd_nr; i++) {
+			for (c = 0; c < 4; c++, rid++)
+				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
+		}
+	}
+
+	ret = TRUE;
+out_err:
+	/* NOTE(review): the pointers are not reset after being freed */
+	if (pc->iv_p)
+		free_temp(pc, pc->iv_p);
+	if (pc->iv_c)
+		free_temp(pc, pc->iv_c);
+
+	tgsi_parse_free(&tp);
+	return ret;
+}
+
+/* Release a translation context together with the register arrays
+ * that ctor_nv50_pc(), nv50_program_tx_prep() and nv50_program_tx()
+ * allocated for it. Members that were never allocated are skipped.
+ */
+static void
+free_nv50_pc(struct nv50_pc *pc)
+{
+	if (pc->immd)
+		FREE(pc->immd);
+	if (pc->param)
+		FREE(pc->param);
+	if (pc->result)
+		FREE(pc->result);
+	if (pc->attr)
+		FREE(pc->attr);
+	if (pc->temp)
+		FREE(pc->temp);
+	/* the address register table is allocated in ctor_nv50_pc() as
+	 * well and used to be leaked here
+	 */
+	if (pc->addr)
+		FREE(pc->addr);
+	if (pc->sysval)
+		FREE(pc->sysval);
+	if (pc->insn_pos)
+		FREE(pc->insn_pos);
+
+	FREE(pc);
+}
+
+/* Translate a gallium PIPE_PRIM_* geometry shader output primitive
+ * into the matching NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_* value.
+ * Any other primitive type is reported and aborts the process.
+ */
+static INLINE uint32_t
+nv50_map_gs_output_prim(unsigned pprim)
+{
+	if (pprim == PIPE_PRIM_POINTS)
+		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
+
+	if (pprim == PIPE_PRIM_LINE_STRIP)
+		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
+
+	if (pprim == PIPE_PRIM_TRIANGLE_STRIP)
+		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
+
+	/* not a primitive type this function can map */
+	NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim);
+	abort();
+	return 0;
+}
+
+/* Initialise a zeroed translation context for program 'p'.
+ *
+ * Derives the register file sizes from the shader info, sets the
+ * per-shader-type defaults in p->cfg, evaluates the geometry shader
+ * properties and allocates the register arrays.
+ *
+ * Returns FALSE if an allocation fails; arrays allocated up to that
+ * point stay attached to 'pc' and are released by free_nv50_pc().
+ */
+static boolean
+ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
+{
+	int i, c;
+	/* register types of inputs [0] and outputs [1] */
+	unsigned rtype[2] = { P_ATTR, P_RESULT };
+
+	pc->p = p;
+	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
+	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
+	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
+	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
+	assert(pc->addr_nr <= 2);
+	pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;
+
+	p->cfg.high_temp = 4;
+
+	/* nv50_program_tx_prep() only processes these when they have been
+	 * lowered below 0x40
+	 */
+	p->cfg.two_side[0].hw = 0x40;
+	p->cfg.two_side[1].hw = 0x40;
+	p->cfg.prim_id = 0x40;
+
+	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;
+
+	for (i = 0; i < p->info.num_properties; ++i) {
+		unsigned *data = &p->info.properties[i].data[0];
+
+		switch (p->info.properties[i].name) {
+		case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+			p->cfg.prim_type = nv50_map_gs_output_prim(data[0]);
+			break;
+		case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
+			p->cfg.vert_count = data[0];
+			break;
+		default:
+			break;
+		}
+	}
+
+	switch (p->type) {
+	case PIPE_SHADER_VERTEX:
+		p->cfg.psiz = 0x40;
+		p->cfg.clpd = 0x40;
+		p->cfg.out_nr = pc->result_nr;
+		break;
+	case PIPE_SHADER_GEOMETRY:
+		/* both must have been provided as properties above */
+		assert(p->cfg.prim_type);
+		assert(p->cfg.vert_count);
+
+		p->cfg.psiz = 0x80;
+		p->cfg.clpd = 0x80;
+		p->cfg.prim_id = 0x80;
+		p->cfg.out_nr = pc->result_nr;
+		p->cfg.in_nr = pc->attr_nr;
+
+		p->cfg.two_side[0].hw = 0x80;
+		p->cfg.two_side[1].hw = 0x80;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		/* FP inputs and results are created as P_TEMP registers */
+		rtype[0] = rtype[1] = P_TEMP;
+
+		p->cfg.regs[0] = 0x01000004;
+		p->cfg.in_nr = pc->attr_nr;
+
+		if (p->info.writes_z) {
+			p->cfg.regs[2] |= 0x00000100;
+			p->cfg.regs[3] |= 0x00000011;
+		}
+		if (p->info.uses_kill)
+			p->cfg.regs[2] |= 0x00100000;
+		break;
+	}
+
+	/* one nv50_reg per component, i.e. 4 per TGSI register */
+	if (pc->temp_nr) {
+		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->temp)
+			return FALSE;
+
+		for (i = 0; i < pc->temp_nr * 4; ++i)
+			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
+	}
+
+	if (pc->attr_nr) {
+		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->attr)
+			return FALSE;
+
+		for (i = 0; i < pc->attr_nr * 4; ++i)
+			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
+	}
+
+	if (pc->result_nr) {
+		unsigned nr = pc->result_nr * 4;
+
+		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->result)
+			return FALSE;
+
+		for (i = 0; i < nr; ++i)
+			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
+	}
+
+	if (pc->param_nr) {
+		int rid = 0;
+
+		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->param)
+			return FALSE;
+
+		/* constants get consecutive ids right away */
+		for (i = 0; i < pc->param_nr; ++i)
+			for (c = 0; c < 4; ++c, ++rid)
+				ctor_reg(&pc->param[rid], P_CONST, i, rid);
+	}
+
+	if (pc->addr_nr) {
+		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
+		if (!pc->addr)
+			return FALSE;
+	}
+	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
+		ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1);
+
+	if (pc->sysval_nr) {
+		/* The elements are full nv50_reg structures (they are
+		 * constructed in place below and assigned by value in
+		 * load_instance_id()), so the element size must be that of
+		 * the structure, not of a pointer to it.
+		 */
+		pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg));
+		if (!pc->sysval)
+			return FALSE;
+		/* will only ever use SYSTEM_VALUE[i].x (hopefully) */
+		for (i = 0; i < pc->sysval_nr; ++i)
+			ctor_reg(&pc->sysval[i], rtype[0], i, -1);
+	}
+
+	return TRUE;
+}
+
+/* Post-process the generated instruction list.
+ *
+ * Converts every short (32 bit) instruction that would otherwise be
+ * left without a short partner into a long one, adjusting branch
+ * targets and the recorded TGSI instruction positions that lie at or
+ * behind the grown instruction, and finally resolves CALL targets
+ * through pc->insn_pos[] if the program uses TGSI_OPCODE_CAL.
+ */
+static void
+nv50_program_fixup_insns(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e, **bra_list;
+	unsigned i, n, pos;
+
+	/* NOTE(review): the result of CALLOC is not checked before use */
+	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
+
+	/* Collect branch instructions, we need to adjust their offsets
+	 * when converting 32 bit instructions to 64 bit ones
+	 */
+	for (n = 0, e = pc->p->exec_head; e; e = e->next)
+		if (e->param.index >= 0 && !e->param.mask)
+			bra_list[n++] = e;
+
+	/* Make sure we don't have any single 32 bit instructions. */
+	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
+		/* pos counts 32 bit words up to and including e */
+		pos += is_long(e) ? 2 : 1;
+
+		/* an odd position in front of a long instruction or at the
+		 * end of the program means e is an unpaired short one
+		 */
+		if ((pos & 1) && (!e->next || is_long(e->next))) {
+			for (i = 0; i < n; ++i)
+				if (bra_list[i]->param.index >= pos)
+					bra_list[i]->param.index += 1;
+			for (i = 0; i < pc->insn_nr; ++i)
+				if (pc->insn_pos[i] >= pos)
+					pc->insn_pos[i] += 1;
+			convert_to_long(pc, e);
+			++pos;
+		}
+	}
+
+	FREE(bra_list);
+
+	if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL])
+		return;
+
+	/* fill in CALL offsets */
+	for (e = pc->p->exec_head; e; e = e->next) {
+		/* NOTE(review): presumably this bit pattern identifies a call
+		 * whose param.index still holds a TGSI instruction number -
+		 * confirm against the CAL emission code.
+		 */
+		if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2)
+			e->param.index = pc->insn_pos[e->param.index];
+	}
+}
+
+/* Translate the TGSI tokens of program 'p' into nv50 instructions.
+ *
+ * Sets up a temporary translation context, runs the preparation pass,
+ * translates every instruction token, fixes up the instruction list and
+ * stores the parameter/immediate counts and the immediate buffer in 'p'.
+ *
+ * Returns TRUE on success and FALSE if any stage, including a memory
+ * allocation, fails. The translation context is always released.
+ */
+static boolean
+nv50_program_tx(struct nv50_program *p)
+{
+	struct tgsi_parse_context parse;
+	struct nv50_pc *pc;
+	boolean ret;
+
+	pc = CALLOC_STRUCT(nv50_pc);
+	if (!pc)
+		return FALSE;
+
+	ret = ctor_nv50_pc(pc, p);
+	if (ret == FALSE)
+		goto out_cleanup;
+
+	ret = nv50_program_tx_prep(pc);
+	if (ret == FALSE)
+		goto out_cleanup;
+
+	/* position of each TGSI instruction in the generated code; the
+	 * table is indexed below and in nv50_program_fixup_insns(), so a
+	 * failed allocation must not go unnoticed
+	 */
+	pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned));
+	if (pc->insn_nr && !pc->insn_pos) {
+		ret = FALSE;
+		goto out_cleanup;
+	}
+
+	tgsi_parse_init(&parse, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		const union tgsi_full_token *tok = &parse.FullToken;
+
+		/* previously allow32 was FALSE for first & last instruction */
+		pc->allow32 = TRUE;
+
+		tgsi_parse_token(&parse);
+
+		switch (tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			/* remember where this instruction's code starts */
+			pc->insn_pos[pc->insn_cur] = pc->p->exec_size;
+			++pc->insn_cur;
+			ret = nv50_tgsi_insn(pc, tok);
+			if (ret == FALSE)
+				goto out_err;
+			break;
+		default:
+			break;
+		}
+	}
+
+	nv50_program_fixup_insns(pc);
+
+	/* counts are stored per component (4 per register) */
+	p->param_nr = pc->param_nr * 4;
+	p->immd_nr = pc->immd_nr * 4;
+	p->immd = pc->immd_buf;
+
+out_err:
+	tgsi_parse_free(&parse);
+
+out_cleanup:
+	free_nv50_pc(pc);
+	return ret;
+}
+
+/* Run the TGSI translation for 'p' and flag the program as translated.
+ * A failed translation trips an assertion; 'nv50' is not used.
+ */
+static void
+nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
+{
+	const boolean ok = nv50_program_tx(p);
+
+	if (!ok)
+		assert(0);
+
+	p->translated = TRUE;
+}
+
+/* Write 'count' 32 bit words from 'map' into constant buffer 'cbuf',
+ * beginning at word offset 'start'. The data is pushed through the
+ * command stream in pieces of at most 2047 words each.
+ */
+static void
+nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
+			unsigned start, unsigned count, unsigned cbuf)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	unsigned chunk;
+
+	for (; count; count -= chunk) {
+		/* size of this piece */
+		chunk = count;
+		if (chunk > 2047)
+			chunk = 2047;
+
+		/* select buffer and offset, then stream the data words */
+		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
+		OUT_RING  (chan, (cbuf << 0) | (start << 8));
+		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, chunk);
+		OUT_RINGp (chan, map, chunk);
+
+		start += chunk;
+		map += chunk;
+	}
+}
+
+/* Make sure the immediates and constants of program 'p' are uploaded.
+ *
+ * Immediates get a range in the screen's immediate heap the first time
+ * (or whenever p->data[0] has been released) and are uploaded to
+ * NV50_CB_PMISC at that range. Constants are read from the constant
+ * buffer bound for the program's shader type and uploaded on each call.
+ */
+static void
+nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
+{
+	struct pipe_context *pipe = &nv50->pipe;
+	struct pipe_transfer *transfer;
+
+	if (!p->data[0] && p->immd_nr) {
+		struct nouveau_resource *heap = nv50->screen->immd_heap;
+
+		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
+			/* no room: release the ranges of other programs
+			 * until the request fits, then retry once
+			 */
+			while (heap->next && heap->size < p->immd_nr) {
+				struct nv50_program *evict = heap->next->priv;
+				nouveau_resource_free(&evict->data[0]);
+			}
+
+			if (nouveau_resource_alloc(heap, p->immd_nr, p,
+						   &p->data[0]))
+				assert(0);
+		}
+
+		/* immediates only need to be uploaded again when freed */
+		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
+					 p->immd_nr, NV50_CB_PMISC);
+	}
+
+	assert(p->param_nr <= 16384);
+
+	if (p->param_nr) {
+		unsigned cb;
+		/* NOTE(review): map is not checked for NULL before use */
+		uint32_t *map = pipe_buffer_map(pipe,
+						nv50->constbuf[p->type],
+						PIPE_TRANSFER_READ,
+						&transfer);
+		/* every shader type has its own hardware constant buffer */
+		switch (p->type) {
+		case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break;
+		case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break;
+		default:
+			cb = NV50_CB_PVP;
+			assert(p->type == PIPE_SHADER_VERTEX);
+			break;
+		}
+
+		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
+		pipe_buffer_unmap(pipe, nv50->constbuf[p->type],
+				  transfer);
+	}
+}
+
+/* Upload the machine code of program 'p' to its buffer object.
+ *
+ * The buffer object is created on first use. Code is (re)uploaded when
+ * the buffer is new or when the start of the program's immediate range
+ * differs from the one the code was last patched for. Before upload,
+ * constant buffer offsets and branch targets are patched into the
+ * instruction words.
+ */
+static void
+nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program_exec *e;
+	uint32_t *up, i;
+	boolean upload = FALSE;
+	unsigned offset;
+	int width;
+
+	if (!p->bo) {
+		/* NOTE(review): the result of nouveau_bo_new is not checked */
+		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
+			       p->exec_size * 4, &p->bo);
+		upload = TRUE;
+	}
+
+	/* the immediates moved, so the patched offsets are stale */
+	if (p->data[0] && p->data[0]->start != p->data_start[0])
+		upload = TRUE;
+
+	if (!upload)
+		return;
+
+	/* staging copy; exec_size counts 32 bit words (see assert below)
+	 * NOTE(review): the result of MALLOC is not checked before use
+	 */
+	up = MALLOC(p->exec_size * 4);
+
+	for (i = 0, e = p->exec_head; e; e = e->next) {
+		unsigned ei, ci, bs;
+
+		if (e->param.index >= 0 && e->param.mask) {
+			/* constant buffer access: bs is the buffer selector
+			 * taken from bits 22..24 of the second word
+			 */
+			bs = (e->inst[1] >> 22) & 0x07;
+			assert(bs < 2);
+			/* word of the instruction holding the field */
+			ei = e->param.shift >> 5;
+			ci = e->param.index;
+			/* buffer 0 accesses are relative to the start of
+			 * this program's immediate range
+			 */
+			if (bs == 0)
+				ci += p->data[bs]->start;
+
+			e->inst[ei] &= ~e->param.mask;
+			e->inst[ei] |= (ci << e->param.shift);
+		} else
+		if (e->param.index >= 0) {
+			/* zero mask means param is a jump/branch offset */
+			assert(!(e->param.index & 1));
+			/* seem to be 8 byte steps */
+			ei = (e->param.index >> 1) + 0 /* START_ID */;
+
+			/* the target occupies bits 12..27 of the first word */
+			e->inst[0] &= 0xf0000fff;
+			e->inst[0] |= ei << 12;
+		}
+
+		up[i++] = e->inst[0];
+		if (is_long(e))
+			up[i++] = e->inst[1];
+	}
+	assert(i == p->exec_size);
+
+	/* remember which immediate offset the code was patched for */
+	if (p->data[0])
+		p->data_start[0] = p->data[0]->start;
+
+#ifdef NV50_PROGRAM_DUMP
+	NOUVEAU_ERR("-------\n");
+	for (e = p->exec_head; e; e = e->next) {
+		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
+		if (is_long(e))
+			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
+	}
+#endif
+
+	/* SIFC_HEIGHT/SIFC_WIDTH of 65536 do not work, and are not reported
+	 * as data error either. hw bug ? */
+#define SIFC_MAX_WIDTH (65536 - 256)
+	/* offset and width are in bytes; upload in pieces of at most
+	 * SIFC_MAX_WIDTH bytes
+	 */
+	offset = 0;
+	width = p->exec_size * 4;
+	while (width > 0) {
+		nv50_upload_sifc(nv50, p->bo, offset, NOUVEAU_BO_VRAM,
+				 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
+				 &up[offset / 4], NV50_2D_SIFC_FORMAT_R8_UNORM,
+				 0, 0, 0, MIN2(SIFC_MAX_WIDTH, width), 1, 1);
+		width -= SIFC_MAX_WIDTH;
+		offset += SIFC_MAX_WIDTH;
+	}
+	BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1);
+	OUT_RING  (chan, 0);
+
+	FREE(up);
+}
+
+/* Validate the bound vertex program and build its state object.
+ *
+ * The program is translated on first use; its data and code are
+ * brought up to date on every call. Returns NULL when the vertex
+ * program is not flagged dirty (NV50_NEW_VERTPROG), otherwise a new
+ * state object with the program address, the enabled attribute masks,
+ * the register allocation sizes and the start offset.
+ */
+struct nouveau_stateobj *
+nv50_vertprog_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *p = nv50->vertprog;
+	struct nouveau_stateobj *so;
+
+	if (!p->translated) {
+		nv50_program_validate(nv50, p);
+		if (!p->translated)
+			assert(0);
+	}
+
+	nv50_program_validate_data(nv50, p);
+	nv50_program_validate_code(nv50, p);
+
+	if (!(nv50->dirty & NV50_NEW_VERTPROG))
+		return NULL;
+
+	/* 5 methods, 7 data words and 2 relocations are emitted below */
+	so = so_new(5, 7, 2);
+	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_LOW, 0, 0);
+	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
+	so_data  (so, p->cfg.attr[0]);
+	so_data  (so, p->cfg.attr[1]);
+	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
+	so_data  (so, p->cfg.high_result);
+	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1);
+	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
+	so_data  (so, 0); /* program start offset */
+	return so;
+}
+
+/* Validate the bound fragment program and build its state object.
+ *
+ * The program is translated on first use; its data and code are
+ * brought up to date on every call. Returns NULL when the fragment
+ * program is not flagged dirty (NV50_NEW_FRAGPROG), otherwise a new
+ * state object with the program address, the register allocation
+ * sizes, the control words cfg.regs[2]/[3] and the start offset.
+ */
+struct nouveau_stateobj *
+nv50_fragprog_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *p = nv50->fragprog;
+	struct nouveau_stateobj *so;
+
+	if (!p->translated) {
+		nv50_program_validate(nv50, p);
+		if (!p->translated)
+			assert(0);
+	}
+
+	nv50_program_validate_data(nv50, p);
+	nv50_program_validate_code(nv50, p);
+
+	if (!(nv50->dirty & NV50_NEW_FRAGPROG))
+		return NULL;
+
+	/* 6 methods, 7 data words and 2 relocations are emitted below */
+	so = so_new(6, 7, 2);
+	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		      NOUVEAU_BO_LOW, 0, 0);
+	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
+	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
+	so_data  (so, p->cfg.high_result);
+	so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
+	so_data  (so, p->cfg.regs[2]);
+	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
+	so_data  (so, p->cfg.regs[3]);
+	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
+	so_data  (so, 0); /* program start offset */
+	return so;
+}
+
+/**
+ * Validate the bound geometry program and describe it to the hardware.
+ *
+ * The program is translated on first use, then its data and code validation
+ * steps are run.  A state object is only built when NV50_NEW_GEOMPROG is set
+ * in nv50->dirty; otherwise NULL is returned.  nv50->geomprog is
+ * dereferenced unconditionally, so a geometry program must be bound.
+ */
+struct nouveau_stateobj *
+nv50_geomprog_validate(struct nv50_context *nv50)
+{
+	const unsigned reloc_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+	struct nv50_program *gp = nv50->geomprog;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *state;
+
+	/* translate lazily; a program that still is not translated is fatal */
+	if (!gp->translated) {
+		nv50_program_validate(nv50, gp);
+		if (!gp->translated)
+			assert(0);
+	}
+
+	nv50_program_validate_data(nv50, gp);
+	nv50_program_validate_code(nv50, gp);
+
+	if ((nv50->dirty & NV50_NEW_GEOMPROG) == 0)
+		return NULL;
+
+	state = so_new(6, 7, 2);
+
+	/* program address, high word then low word */
+	so_method(state, tesla, NV50TCL_GP_ADDRESS_HIGH, 2);
+	so_reloc (state, gp->bo, 0, reloc_flags | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (state, gp->bo, 0, reloc_flags | NOUVEAU_BO_LOW, 0, 0);
+
+	so_method(state, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1);
+	so_data  (state, gp->cfg.high_temp);
+
+	so_method(state, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1);
+	so_data  (state, gp->cfg.high_result);
+
+	so_method(state, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1);
+	so_data  (state, gp->cfg.prim_type);
+
+	so_method(state, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1);
+	so_data  (state, gp->cfg.vert_count);
+
+	so_method(state, tesla, NV50TCL_GP_START_ID, 1);
+	so_data  (state, 0);
+
+	return state;
+}
+
+/**
+ * Fill the point coordinate replacement map.
+ *
+ * pntc[] holds one 4-bit entry per FP input component, 8 entries per word,
+ * indexed by result map slot.  An entry is set to (component + 1) for every
+ * component of a GENERIC FP input that is either not written by the previous
+ * stage (assumed to be PointCoord) or whose semantic index is enabled in
+ * sprite_coord_enable; all other entries stay 0.
+ *
+ * \param nv50  context providing the bound programs and rasterizer state
+ * \param pntc  output array of 8 words, cleared here
+ * \param base  result map slot of the first FP input
+ * \return 0 for PIPE_SPRITE_COORD_LOWER_LEFT, otherwise 0x10
+ */
+static uint32_t
+nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
+{
+	struct nv50_program *vp;
+	struct nv50_program *fp = nv50->fragprog;
+	unsigned i, c, m = base;
+	uint32_t origin = 0x00000010;
+
+	/* the stage feeding the FP is the GP if one is bound, else the VP */
+	vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog;
+
+	/* XXX: this might not work correctly in all cases yet - we'll
+	 * just assume that an FP generic input that is not written in
+	 * the VP is PointCoord.
+	 */
+	memset(pntc, 0, 8 * sizeof(uint32_t));
+
+	for (i = 0; i < fp->cfg.in_nr; i++) {
+		/* n: number of slots this input occupies (popcnt4 presumably
+		 * counts the set bits of the 4-bit mask) */
+		unsigned j, n = popcnt4(fp->cfg.in[i].mask);
+
+		if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) {
+			m += n;
+			continue;
+		}
+
+		for (j = 0; j < vp->cfg.out_nr; ++j)
+			if (vp->cfg.out[j].sn ==  fp->cfg.in[i].sn &&
+			    vp->cfg.out[j].si == fp->cfg.in[i].si)
+				break;
+
+		/* "Found" must use the same bound as the search loop above.
+		 * Comparing against info.num_outputs would treat a failed
+		 * search as a match whenever out_nr < num_outputs and then
+		 * read cfg.out[out_nr], which is not a matched output.
+		 */
+		if (j < vp->cfg.out_nr) {
+			ubyte enable =
+				 (nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1;
+
+			if (enable == 0) {
+				m += n;
+				continue;
+			}
+		}
+
+		/* this is either PointCoord or replaced by sprite coords */
+		for (c = 0; c < 4; c++) {
+			if (!(fp->cfg.in[i].mask & (1 << c)))
+				continue;
+			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+			++m;
+		}
+	}
+	return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin);
+}
+
+/**
+ * Append the components of one FP input to the byte-wise result map.
+ *
+ * For each of the four components read by the FP (fpi->mask) one map byte is
+ * written at index mid: the hardware output register when the producing
+ * stage writes that component (vpo->mask), otherwise zval, or zval + 1 for
+ * the fourth component.  Output registers are counted up from vpo->hw, one
+ * per written component.  If the input is flagged linear, the bit for each
+ * used slot is also set in lin[].
+ *
+ * \return the next free map index
+ */
+static int
+nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4],
+	      struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+{
+	uint8_t *bytes = (uint8_t *)map32;
+	const uint8_t fp_mask = fpi->mask;
+	const uint8_t out_mask = vpo->mask;
+	uint8_t out_reg = vpo->hw;
+	int comp;
+
+	for (comp = 0; comp < 4; ++comp) {
+		const int written = (out_mask >> comp) & 1;
+		const int wanted = (fp_mask >> comp) & 1;
+
+		if (wanted) {
+			if (fpi->linear == TRUE)
+				lin[mid / 32] |= 1 << (mid % 32);
+
+			if (written)
+				bytes[mid] = out_reg;
+			else if (comp == 3)
+				bytes[mid] = zval + 1;
+			else
+				bytes[mid] = zval;
+
+			mid++;
+		}
+
+		/* every written component occupies one output register */
+		if (written)
+			out_reg++;
+	}
+
+	return mid;
+}
+
+/**
+ * Build the state object that links the outputs of the last vertex
+ * processing stage (the geometry program if one is bound, otherwise the
+ * vertex program) to the inputs of the fragment program.
+ *
+ * A byte-granular result map is assembled in map[]: HPOS first, then clip
+ * distances, the optional two-sided colours, the regular FP inputs, and
+ * finally the primitive id and point size slots when those are in use.
+ * FP input components without a matching output receive the filler value
+ * zval (zval + 1 for the fourth component), see nv50_vec4_map().
+ *
+ * The returned state object also carries the semantic map registers, the
+ * interpolant control word, the noperspective bitmap, the point sprite
+ * setup (only if sprite_coord_enable is non-zero) and the GP enable flag.
+ */
+struct nouveau_stateobj *
+nv50_fp_linkage_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nv50_program *fp = nv50->fragprog;
+	struct nouveau_stateobj *so;
+	struct nv50_sreg4 dummy;
+	int i, n, c, m = 0;
+	uint32_t map[16], lin[4], reg[6], pcrd[8];
+	uint8_t zval = 0x40;
+
+	/* with a GP bound, it is the stage feeding the FP and the filler
+	 * value for unwritten components changes */
+	if (nv50->geomprog) {
+		vp = nv50->geomprog;
+		zval = 0x80;
+	}
+	memset(map, 0, sizeof(map));
+	memset(lin, 0, sizeof(lin));
+
+	reg[1] = 0x00000004; /* low and high clip distance map ids */
+	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
+	reg[3] = 0x00000000; /* point size map id & enable */
+	reg[5] = 0x00000000; /* primitive ID map slot */
+	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
+	reg[4] = fp->cfg.regs[1]; /* interpolant info */
+
+	/* dummy stands in for the FP side of HPOS here ... */
+	dummy.linear = FALSE;
+	dummy.mask = 0xf; /* map all components of HPOS */
+	m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]);
+
+	/* ... and, with an empty mask, for a missing output further down */
+	dummy.mask = 0x0;
+
+	/* clip distances, one map byte each */
+	if (vp->cfg.clpd < 0x40) {
+		for (c = 0; c < vp->cfg.clpd_nr; ++c) {
+			map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8);
+			++m;
+		}
+		reg[1] = (m << 8);
+	}
+
+	reg[0] |= m << 8; /* adjust BFC0 id */
+
+	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
+	if (nv50->rasterizer->pipe.light_twoside) {
+		struct nv50_sreg4 *vpo = &vp->cfg.two_side[0];
+		struct nv50_sreg4 *fpi = &fp->cfg.two_side[0];
+
+		m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]);
+		m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]);
+	}
+
+	reg[0] += m - 4; /* adjust FFC0 id */
+	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
+
+	for (i = 0; i < fp->cfg.in_nr; i++) {
+		/* maybe even remove these from cfg.io */
+		if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION ||
+		    fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE)
+			continue;
+
+		/* look up the output with the same semantic name and index */
+		for (n = 0; n < vp->cfg.out_nr; ++n)
+			if (vp->cfg.out[n].sn == fp->cfg.in[i].sn &&
+			    vp->cfg.out[n].si == fp->cfg.in[i].si)
+				break;
+
+		/* no match: map against dummy so every component gets filler */
+		m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i],
+				  (n < vp->cfg.out_nr) ?
+				  &vp->cfg.out[n] : &dummy);
+	}
+	/* PrimitiveID either is replaced by the system value, or
+	 * written by the geometry shader into an output register
+	 */
+	if (fp->cfg.prim_id < 0x40) {
+		map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8);
+		reg[5] = m++;
+	}
+
+	if (nv50->rasterizer->pipe.point_size_per_vertex) {
+		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
+		reg[3] = (m++ << 4) | 1;
+	}
+
+	/* now fill the stateobj (at most 28 so_data)  */
+	so = so_new(10, 54, 0);
+
+	/* n: number of 32-bit words needed for m map bytes */
+	n = (m + 3) / 4;
+	assert(m <= 64);
+	if (vp->type == PIPE_SHADER_GEOMETRY) {
+		so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1);
+		so_data  (so, m);
+		so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n);
+		so_datap (so, map, n);
+	} else {
+		so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+		so_data  (so, vp->cfg.regs[0]);
+
+		so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1);
+		so_data  (so, reg[5]);
+
+		so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+		so_data  (so, m);
+		so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+		so_datap (so, map, n);
+	}
+
+	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+	so_datap (so, reg, 4);
+
+	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+	so_data  (so, reg[4]);
+
+	so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
+	so_datap (so, lin, 4);
+
+	if (nv50->rasterizer->pipe.sprite_coord_enable) {
+		/* base slot is the start of the regular FP inputs, as stored
+		 * in bits 8..15 of reg[4] above */
+		so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
+		so_data  (so,
+			  nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff));
+
+		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
+		so_datap (so, pcrd, 8);
+	}
+
+	so_method(so, tesla, NV50TCL_GP_ENABLE, 1);
+	so_data  (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0);
+
+	return so;
+}
+
+/**
+ * Append the VP result map entries for all GP inputs.
+ *
+ * For each component a GP input reads, one byte is written at map index m:
+ * the VP hardware output register if the VP writes that component, else the
+ * filler 0x40 (0x41 for the fourth component).  A GP input without a matching
+ * VP output gets filler for all of its components.
+ *
+ * \return the next free map index
+ */
+static int
+construct_vp_gp_mapping(uint32_t *map32, int m,
+			struct nv50_program *vp, struct nv50_program *gp)
+{
+	uint8_t *bytes = (uint8_t *)map32;
+	int in, out, comp;
+
+	for (in = 0; in < gp->cfg.in_nr; ++in) {
+		const uint8_t gp_mask = gp->cfg.in[in].mask;
+		uint8_t vp_mask = 0;
+		uint8_t out_reg = 0;
+
+		/* find the VP output with the same semantic name and index */
+		for (out = 0; out < vp->cfg.out_nr; ++out) {
+			if (vp->cfg.out[out].sn != gp->cfg.in[in].sn ||
+			    vp->cfg.out[out].si != gp->cfg.in[in].si)
+				continue;
+
+			vp_mask = vp->cfg.out[out].mask;
+			out_reg = vp->cfg.out[out].hw;
+			break;
+		}
+
+		for (comp = 0; comp < 4; ++comp) {
+			const int written = (vp_mask >> comp) & 1;
+			const int wanted = (gp_mask >> comp) & 1;
+
+			if (wanted) {
+				if (written)
+					bytes[m++] = out_reg;
+				else
+					bytes[m++] = (comp == 3) ? 0x41 : 0x40;
+			}
+
+			/* every written component occupies one register */
+			out_reg += written;
+		}
+	}
+
+	return m;
+}
+
+/**
+ * Build the state object linking vertex program outputs to geometry program
+ * inputs.  Returns NULL when no geometry program is bound.
+ */
+struct nouveau_stateobj *
+nv50_gp_linkage_validate(struct nv50_context *nv50)
+{
+	struct nv50_program *gp = nv50->geomprog;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *state;
+	uint32_t map[16];
+	int slots, words;
+
+	if (gp == NULL)
+		return NULL;
+
+	memset(map, 0, sizeof(map));
+	slots = construct_vp_gp_mapping(map, 0, vp, gp);
+
+	state = so_new(3, 24 - 3, 0);
+
+	so_method(state, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+	so_data  (state, vp->cfg.regs[0] | gp->cfg.regs[0]);
+
+	assert(slots <= 32);
+	so_method(state, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+	so_data  (state, slots);
+
+	/* the map is uploaded in 32-bit words, four slots per word */
+	words = (slots + 3) / 4;
+	so_method(state, tesla, NV50TCL_VP_RESULT_MAP(0), words);
+	so_datap (state, map, words);
+
+	return state;
+}
+
+/**
+ * Release everything a translated program owns and mark it untranslated.
+ *
+ * Frees the instruction list, drops the reference on the code buffer object,
+ * frees the immediates and releases the data resource.  The nv50_program
+ * structure itself is not freed; afterwards it is in a state where it can be
+ * translated again or destroyed a second time.
+ *
+ * \param nv50  unused here
+ * \param p     program to reset
+ */
+void
+nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
+{
+	while (p->exec_head) {
+		struct nv50_program_exec *e = p->exec_head;
+
+		p->exec_head = e->next;
+		FREE(e);
+	}
+	p->exec_tail = NULL;
+	p->exec_size = 0;
+
+	nouveau_bo_ref(NULL, &p->bo);
+
+	/* Clear the pointer as well: the struct stays alive (translated is
+	 * reset below), so a dangling immd would be freed again by the next
+	 * destroy or reused by a later translation.
+	 */
+	FREE(p->immd);
+	p->immd = NULL;
+	p->immd_nr = 0;
+
+	nouveau_resource_free(&p->data[0]);
+
+	p->translated = 0;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
new file mode 100644
index 0000000000..1e3ad6bff0
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -0,0 +1,75 @@
+#ifndef __NV50_PROGRAM_H__
+#define __NV50_PROGRAM_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+/* One node of a program's singly linked instruction list
+ * (nv50_program::exec_head / exec_tail).
+ */
+struct nv50_program_exec {
+	struct nv50_program_exec *next; /* next node, NULL at the end */
+
+	unsigned inst[2]; /* instruction words */
+	/* NOTE(review): presumably describes where a parameter reference is
+	 * patched into inst[] - confirm against the translator. */
+	struct {
+		int index;
+		unsigned mask;
+		unsigned shift;
+	} param;
+};
+
+/* Description of one 4-component shader input or output register. */
+struct nv50_sreg4 {
+	uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
+	uint8_t id; /* tgsi index */
+
+	uint8_t mask; /* one bit per used component, bit 0 = first component */
+	boolean linear; /* when TRUE, the linkage code sets the slot's bit in
+			 * the noperspective bitmap */
+
+	ubyte sn, si; /* semantic name & index */
+};
+
+/* A shader program (vertex, geometry or fragment) together with its
+ * translated code and the configuration derived from it.
+ */
+struct nv50_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+	boolean translated; /* cleared by nv50_program_destroy() */
+
+	unsigned type; /* compared against PIPE_SHADER_GEOMETRY by the linkage code */
+	/* singly linked list of instructions */
+	struct nv50_program_exec *exec_head;
+	struct nv50_program_exec *exec_tail;
+	unsigned exec_size;
+	struct nouveau_resource *data[1];
+	unsigned data_start[1];
+
+	struct nouveau_bo *bo; /* buffer whose address is programmed as the code address */
+
+	uint32_t *immd; /* freed by nv50_program_destroy() */
+	unsigned immd_nr;
+	unsigned param_nr;
+
+	struct {
+		unsigned high_temp;
+		unsigned high_result;
+
+		uint32_t attr[2];
+		uint32_t regs[4];
+
+		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
+		unsigned in_nr, out_nr;
+		struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS];
+		struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS];
+
+		/* FP colour inputs, VP/GP back colour outputs */
+		struct nv50_sreg4 two_side[2];
+
+		/* GP only */
+		unsigned vert_count;
+		uint8_t prim_type;
+
+		/* VP & GP only */
+		uint8_t clpd, clpd_nr; /* first clip distance output and count;
+					* clpd >= 0x40 disables the mapping */
+		uint8_t psiz;
+		uint8_t edgeflag_in; /* values >= 16 mean no edge flag input */
+
+		/* FP & GP only */
+		uint8_t prim_id; /* values >= 0x40 mean primitive id is unused */
+	} cfg;
+};
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
new file mode 100644
index 0000000000..c3ac804146
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -0,0 +1,361 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nouveau/nouveau_util.h"
+#include "nv50_context.h"
+#include "nv50_resource.h"
+
+/* State shared by the emit callbacks while vertices are pushed inline. */
+struct push_context {
+   struct nv50_context *nv50;
+
+   unsigned vtx_size; /* size of one pushed vertex in 32-bit words */
+
+   void *idxbuf; /* mapped index buffer, NULL for non-indexed draws */
+   int32_t idxbias; /* added to each index by the *_biased emitters */
+   unsigned idxsize; /* size of one index in bytes */
+
+   float edgeflag; /* last edge flag value sent to the hardware */
+   int edgeflag_attr; /* attr[] slot holding the edge flag, >= 16 if none */
+
+   /* one entry per pushed vertex element */
+   struct {
+      void *map; /* address of the element in vertex 0 */
+      unsigned stride; /* byte distance between consecutive vertices */
+      unsigned divisor; /* instance divisor, 0 for per-vertex data */
+      unsigned step; /* instances counted since map was last advanced */
+      void (*push)(struct nouveau_channel *, void *); /* emits one element */
+   } attr[16];
+   unsigned attr_nr; /* number of valid attr[] entries */
+};
+
+/* Push one 32-bit word read from data. */
+static void
+emit_b32_1(struct nouveau_channel *chan, void *data)
+{
+   const uint32_t *word = data;
+
+   OUT_RING(chan, *word);
+}
+
+/* Push two consecutive 32-bit words read from data. */
+static void
+emit_b32_2(struct nouveau_channel *chan, void *data)
+{
+   const uint32_t *words = data;
+   unsigned i;
+
+   for (i = 0; i < 2; ++i)
+      OUT_RING(chan, words[i]);
+}
+
+/* Push three consecutive 32-bit words read from data. */
+static void
+emit_b32_3(struct nouveau_channel *chan, void *data)
+{
+   const uint32_t *words = data;
+   unsigned i;
+
+   for (i = 0; i < 3; ++i)
+      OUT_RING(chan, words[i]);
+}
+
+/* Push four consecutive 32-bit words read from data. */
+static void
+emit_b32_4(struct nouveau_channel *chan, void *data)
+{
+   const uint32_t *words = data;
+   unsigned i;
+
+   for (i = 0; i < 4; ++i)
+      OUT_RING(chan, words[i]);
+}
+
+/* Push one 16-bit value read from data, zero-extended to a ring word. */
+static void
+emit_b16_1(struct nouveau_channel *chan, void *data)
+{
+   const uint16_t *half = data;
+
+   OUT_RING(chan, *half);
+}
+
+/* Push three 16-bit values read from data as two ring words: the first two
+ * packed into one word (first value in the low half), the third on its own.
+ */
+static void
+emit_b16_3(struct nouveau_channel *chan, void *data)
+{
+   uint16_t *v = data;
+
+   /* Widen before shifting: v[1] promotes to signed int, and shifting a
+    * value >= 0x8000 left by 16 would overflow it. */
+   OUT_RING(chan, ((uint32_t)v[1] << 16) | v[0]);
+   OUT_RING(chan, v[2]);
+}
+
+/* Push one byte read from data, zero-extended to a ring word. */
+static void
+emit_b08_1(struct nouveau_channel *chan, void *data)
+{
+   const uint8_t *byte = data;
+
+   OUT_RING(chan, *byte);
+}
+
+/* Push three bytes read from data packed into one ring word, first byte in
+ * the lowest bits.
+ */
+static void
+emit_b08_3(struct nouveau_channel *chan, void *data)
+{
+   const uint8_t *bytes = data;
+
+   OUT_RING(chan, bytes[0] | (bytes[1] << 8) | (bytes[2] << 16));
+}
+
+/**
+ * Push all attributes of vertex number n into the ring.
+ *
+ * If an edge flag attribute is present (edgeflag_attr < 16), its value for
+ * this vertex is read as a float and EDGEFLAG_ENABLE is re-emitted whenever
+ * it differs from the last value sent.  Then one VERTEX_DATA packet of
+ * ctx->vtx_size words is started and each attribute's push callback emits
+ * its share of it.
+ */
+static INLINE void
+emit_vertex(struct push_context *ctx, unsigned n)
+{
+   struct nouveau_grobj *tesla = ctx->nv50->screen->tesla;
+   struct nouveau_channel *chan = tesla->channel;
+   unsigned i;
+
+   if (ctx->edgeflag_attr < 16) {
+      /* the address is computed in bytes; the conversion to float * must be
+       * explicit, the two pointer types are not assignment compatible */
+      float *edgeflag = (float *)
+         ((uint8_t *)ctx->attr[ctx->edgeflag_attr].map +
+          ctx->attr[ctx->edgeflag_attr].stride * n);
+
+      if (*edgeflag != ctx->edgeflag) {
+         BEGIN_RING(chan, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+         OUT_RING  (chan, *edgeflag ? 1 : 0);
+         ctx->edgeflag = *edgeflag;
+      }
+   }
+
+   BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, ctx->vtx_size);
+   for (i = 0; i < ctx->attr_nr; i++)
+      ctx->attr[i].push(chan,
+                        (uint8_t *)ctx->attr[i].map + ctx->attr[i].stride * n);
+}
+
+/* Primitive splitter callback: set the hardware edge flag to 1 when enabled
+ * is non-zero, else to 0.
+ */
+static void
+emit_edgeflag(void *priv, boolean enabled)
+{
+   struct push_context *push = priv;
+   struct nouveau_grobj *tesla = push->nv50->screen->tesla;
+   struct nouveau_channel *chan = tesla->channel;
+   const unsigned value = enabled ? 1 : 0;
+
+   BEGIN_RING(chan, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+   OUT_RING  (chan, value);
+}
+
+/* Emit count vertices selected by 8-bit indices, beginning at index position
+ * start.
+ */
+static void
+emit_elt08(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   const uint8_t *indices = push->idxbuf;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, indices[start + i]);
+}
+
+/* Emit count vertices selected by 8-bit indices, beginning at index position
+ * start, adding the index bias to every index.
+ */
+static void
+emit_elt08_biased(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   const uint8_t *indices = push->idxbuf;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, indices[start + i] + push->idxbias);
+}
+
+/* Emit count vertices selected by 16-bit indices, beginning at index
+ * position start.
+ */
+static void
+emit_elt16(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   const uint16_t *indices = push->idxbuf;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, indices[start + i]);
+}
+
+/* Emit count vertices selected by 16-bit indices, beginning at index
+ * position start, adding the index bias to every index.
+ */
+static void
+emit_elt16_biased(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   const uint16_t *indices = push->idxbuf;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, indices[start + i] + push->idxbias);
+}
+
+/* Emit count vertices selected by 32-bit indices, beginning at index
+ * position start.
+ */
+static void
+emit_elt32(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   const uint32_t *indices = push->idxbuf;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, indices[start + i]);
+}
+
+/* Emit count vertices selected by 32-bit indices, beginning at index
+ * position start, adding the index bias to every index.
+ */
+static void
+emit_elt32_biased(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   const uint32_t *indices = push->idxbuf;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, indices[start + i] + push->idxbias);
+}
+
+/* Emit count consecutive vertices of a non-indexed draw, beginning with
+ * vertex number start.
+ */
+static void
+emit_verts(void *priv, unsigned start, unsigned count)
+{
+   struct push_context *push = priv;
+   unsigned i;
+
+   for (i = 0; i < count; ++i)
+      emit_vertex(push, start + i);
+}
+
+/**
+ * Draw by pushing vertex data inline through the command ring.
+ *
+ * Only vertex elements whose bit is set in nv50->vbo_fifo are pushed.  Their
+ * buffers (and the index buffer, if any) are mapped for reading, an emit
+ * callback is selected for each element from its component size and count,
+ * and then, for every instance, the primitive is split into pieces that fit
+ * into the space available in the ring.
+ *
+ * idxbuf   index buffer, or NULL for a non-indexed draw
+ * idxsize  size of one index in bytes; 1 and 2 select the 8- and 16-bit
+ *          emitters, any other value the 32-bit one
+ * idxbias  value added to every index; 0 selects the unbiased emitters
+ * mode, start, count   handed to u_split_prim_init() for each instance
+ * i_start, i_count     first instance and number of instances
+ *
+ * Returns early if a buffer cannot be mapped, if an element has a component
+ * size other than 8, 16 or 32 bits, or if nv50_state_validate() fails after
+ * FIRE_RING.
+ */
+void
+nv50_push_elements_instanced(struct pipe_context *pipe,
+                             struct pipe_resource *idxbuf,
+                             unsigned idxsize, int idxbias,
+                             unsigned mode, unsigned start, unsigned count,
+                             unsigned i_start, unsigned i_count)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   struct nouveau_channel *chan = tesla->channel;
+   struct push_context ctx;
+   const unsigned p_overhead = 4 + /* begin/end */
+                               4; /* potential edgeflag enable/disable */
+   const unsigned v_overhead = 1 + /* VERTEX_DATA packet header */
+                               2; /* potential edgeflag modification */
+   struct u_split_prim s;
+   unsigned vtx_size;
+   boolean nzi = FALSE; /* set once the first instance has been drawn */
+   int i;
+
+   ctx.nv50 = nv50;
+   ctx.attr_nr = 0;
+   ctx.idxbuf = NULL;
+   ctx.vtx_size = 0;
+   /* 0.5 equals neither 0.0 nor 1.0 - presumably chosen so that the first
+    * edge flag seen always differs and gets emitted (TODO confirm) */
+   ctx.edgeflag = 0.5f;
+   ctx.edgeflag_attr = nv50->vertprog->cfg.edgeflag_in;
+
+   /* map vertex buffers, determine vertex size */
+   for (i = 0; i < nv50->vtxelt->num_elements; i++) {
+      struct pipe_vertex_element *ve = &nv50->vtxelt->pipe[i];
+      struct pipe_vertex_buffer *vb = &nv50->vtxbuf[ve->vertex_buffer_index];
+      struct nouveau_bo *bo = nv50_resource(vb->buffer)->bo;
+      unsigned size, nr_components, n;
+
+      if (!(nv50->vbo_fifo & (1 << i)))
+         continue;
+      n = ctx.attr_nr++;
+
+      if (nouveau_bo_map(bo, NOUVEAU_BO_RD)) {
+         assert(bo->map);
+         return;
+      }
+      /* the pointer taken from bo->map keeps being used after the unmap
+       * call below */
+      ctx.attr[n].map = (uint8_t *)bo->map + vb->buffer_offset + ve->src_offset;
+      nouveau_bo_unmap(bo);
+
+      ctx.attr[n].stride = vb->stride;
+      ctx.attr[n].divisor = ve->instance_divisor;
+      if (ctx.attr[n].divisor) {
+         ctx.attr[n].step = i_start % ve->instance_divisor;
+         ctx.attr[n].map = (uint8_t *)ctx.attr[n].map + i_start * vb->stride;
+      }
+
+      /* choose the emitter by the bit size of the first component and the
+       * number of components; vtx_size accumulates the ring words used */
+      size = util_format_get_component_bits(ve->src_format,
+                                            UTIL_FORMAT_COLORSPACE_RGB, 0);
+      nr_components = util_format_get_nr_components(ve->src_format);
+      switch (size) {
+      case 8:
+         /* up to four bytes always fit into a single word */
+         switch (nr_components) {
+         case 1: ctx.attr[n].push = emit_b08_1; break;
+         case 2: ctx.attr[n].push = emit_b16_1; break;
+         case 3: ctx.attr[n].push = emit_b08_3; break;
+         case 4: ctx.attr[n].push = emit_b32_1; break;
+         }
+         ctx.vtx_size++;
+         break;
+      case 16:
+         /* two 16-bit components per word, rounded up */
+         switch (nr_components) {
+         case 1: ctx.attr[n].push = emit_b16_1; break;
+         case 2: ctx.attr[n].push = emit_b32_1; break;
+         case 3: ctx.attr[n].push = emit_b16_3; break;
+         case 4: ctx.attr[n].push = emit_b32_2; break;
+         }
+         ctx.vtx_size += (nr_components + 1) >> 1;
+         break;
+      case 32:
+         /* one word per component */
+         switch (nr_components) {
+         case 1: ctx.attr[n].push = emit_b32_1; break;
+         case 2: ctx.attr[n].push = emit_b32_2; break;
+         case 3: ctx.attr[n].push = emit_b32_3; break;
+         case 4: ctx.attr[n].push = emit_b32_4; break;
+         }
+         ctx.vtx_size += nr_components;
+         break;
+      default:
+         assert(0);
+         return;
+      }
+   }
+   /* ring words reserved per vertex, including packet overhead */
+   vtx_size = ctx.vtx_size + v_overhead;
+
+   /* map index buffer, if present */
+   if (idxbuf) {
+      struct nouveau_bo *bo = nv50_resource(idxbuf)->bo;
+
+      if (nouveau_bo_map(bo, NOUVEAU_BO_RD)) {
+         assert(bo->map);
+         return;
+      }
+      ctx.idxbuf = bo->map;
+      ctx.idxbias = idxbias;
+      ctx.idxsize = idxsize;
+      nouveau_bo_unmap(bo);
+   }
+
+   /* set up the primitive splitter callbacks */
+   s.priv = &ctx;
+   s.edge = emit_edgeflag;
+   if (idxbuf) {
+      if (idxsize == 1)
+         s.emit = idxbias ? emit_elt08_biased : emit_elt08;
+      else
+      if (idxsize == 2)
+         s.emit = idxbias ? emit_elt16_biased : emit_elt16;
+      else
+         s.emit = idxbias ? emit_elt32_biased : emit_elt32;
+   } else
+      s.emit = emit_verts;
+
+   /* per-instance loop */
+   BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+   OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+   OUT_RING  (chan, i_start);
+   while (i_count--) {
+      unsigned max_verts;
+      boolean done;
+
+      /* advance instanced attributes by one element each time their step
+       * counter reaches the divisor */
+      for (i = 0; i < ctx.attr_nr; i++) {
+         if (!ctx.attr[i].divisor ||
+              ctx.attr[i].divisor != ++ctx.attr[i].step)
+            continue;
+         ctx.attr[i].step = 0;
+         ctx.attr[i].map = (uint8_t *)ctx.attr[i].map + ctx.attr[i].stride;
+      }
+
+      u_split_prim_init(&s, mode, start, count);
+      do {
+         /* require room for the primitive overhead plus six vertices,
+          * otherwise submit and re-validate state first */
+         if (AVAIL_RING(chan) < p_overhead + (6 * vtx_size)) {
+            FIRE_RING(chan);
+            if (!nv50_state_validate(nv50, p_overhead + (6 * vtx_size))) {
+               assert(0);
+               return;
+            }
+         }
+
+         /* number of vertices that fit into the remaining space */
+         max_verts  = AVAIL_RING(chan);
+         max_verts -= p_overhead;
+         max_verts /= vtx_size;
+
+         BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+         OUT_RING  (chan, nv50_prim(s.mode) | (nzi ? (1 << 28) : 0));
+         done = u_split_prim_next(&s, max_verts);
+         BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+         OUT_RING  (chan, 0);
+      } while (!done);
+
+      nzi = TRUE;
+   }
+}
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
new file mode 100644
index 0000000000..53f94820ce
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+
+#include "nv50_context.h"
+
+/* Driver-side query object; handed out to the state tracker cast to
+ * struct pipe_query *.
+ */
+struct nv50_query {
+	struct nouveau_bo *bo; /* buffer the hardware writes the result into */
+	unsigned type; /* query type passed to create_query */
+	boolean ready; /* TRUE once result has been read back from bo */
+	uint64_t result; /* cached result, valid while ready is TRUE */
+};
+
+/* Convert the opaque pipe_query handle back into the driver's query type. */
+static INLINE struct nv50_query *
+nv50_query(struct pipe_query *pipe)
+{
+	struct nv50_query *query = (struct nv50_query *)pipe;
+
+	return query;
+}
+
+/**
+ * Create a query object backed by a small mappable GART buffer.
+ *
+ * Only PIPE_QUERY_OCCLUSION_COUNTER is expected (asserted).
+ *
+ * \return the new query, or NULL if allocating the query structure or its
+ *         buffer object fails
+ */
+static struct pipe_query *
+nv50_query_create(struct pipe_context *pipe, unsigned type)
+{
+	struct nouveau_device *dev = nouveau_screen(pipe->screen)->device;
+	struct nv50_query *q;
+	int ret;
+
+	/* check the requested type; the freshly zeroed q->type says nothing
+	 * about what the caller asked for */
+	assert (type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	q = CALLOC_STRUCT(nv50_query);
+	if (!q)
+		return NULL;
+	q->type = type;
+
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 256,
+			     16, &q->bo);
+	if (ret) {
+		FREE(q);
+		return NULL;
+	}
+
+	return (struct pipe_query *)q;
+}
+
+/* Drop the query's buffer reference and free the query; NULL is ignored. */
+static void
+nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv50_query *query = nv50_query(pq);
+
+	if (query == NULL)
+		return;
+
+	nouveau_bo_ref(NULL, &query->bo);
+	FREE(query);
+}
+
+/* Start a query: reset and enable the sample counter and mark any cached
+ * result as stale.
+ */
+static void
+nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv50_query *query = nv50_query(pq);
+	struct nv50_context *ctx = nv50_context(pipe);
+	struct nouveau_grobj *tesla = ctx->screen->tesla;
+	struct nouveau_channel *chan = ctx->screen->base.channel;
+
+	BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_RESET, 1);
+	OUT_RING  (chan, 1);
+
+	BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_ENABLE, 1);
+	OUT_RING  (chan, 1);
+
+	query->ready = FALSE;
+}
+
+/* End a query: ask the hardware to write the result into the query's buffer
+ * object, then disable the sample counter.
+ */
+static void
+nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_query *q = nv50_query(pq);
+
+	MARK_RING (chan, 5, 2); /* flush on lack of space or relocs */
+	/* four data words: buffer address (high, low) followed by two
+	 * constant words */
+	BEGIN_RING(chan, tesla, NV50TCL_QUERY_ADDRESS_HIGH, 4);
+	OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+	OUT_RING  (chan, 0x00000000);
+	OUT_RING  (chan, 0x0100f002);
+
+	BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_ENABLE, 1);
+	OUT_RING  (chan, 0);
+}
+
+/**
+ * Fetch the result of a query.
+ *
+ * On the first successful call the value is read from the second 32-bit
+ * word of the query buffer and cached; later calls return the cached value.
+ * If wait is zero the buffer is mapped with NOUVEAU_BO_NOWAIT.
+ *
+ * \param vresult  receives the result as a uint64_t
+ * \return false if the buffer could not be mapped (vresult is then left
+ *         untouched), otherwise the ready flag
+ */
+static boolean
+nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, void *vresult)
+{
+	struct nv50_query *query = nv50_query(pq);
+	uint64_t *out = (uint64_t*)vresult;
+
+	if (!query->ready) {
+		const uint32_t *words;
+		int err;
+
+		if (wait)
+			err = nouveau_bo_map(query->bo, NOUVEAU_BO_RD | 0);
+		else
+			err = nouveau_bo_map(query->bo, NOUVEAU_BO_RD |
+					     NOUVEAU_BO_NOWAIT);
+		if (err)
+			return false;
+
+		words = (const uint32_t *)query->bo->map;
+		query->result = words[1];
+		query->ready = TRUE;
+		nouveau_bo_unmap(query->bo);
+	}
+
+	*out = query->result;
+	return query->ready;
+}
+
+/**
+ * Set up conditional rendering.
+ *
+ * A NULL query switches the condition mode to ALWAYS.  Otherwise the
+ * condition is pointed at the query's buffer with mode RES; for the two
+ * WAIT modes, method 0x0110 is emitted first.
+ */
+static void
+nv50_render_condition(struct pipe_context *pipe,
+		      struct pipe_query *pq, uint mode)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+	struct nouveau_grobj *tesla = ctx->screen->tesla;
+	struct nouveau_channel *chan = ctx->screen->base.channel;
+	struct nv50_query *query;
+	boolean must_wait;
+
+	if (pq == NULL) {
+		BEGIN_RING(chan, tesla, NV50TCL_COND_MODE, 1);
+		OUT_RING  (chan, NV50TCL_COND_MODE_ALWAYS);
+		return;
+	}
+
+	query = nv50_query(pq);
+	must_wait = (mode == PIPE_RENDER_COND_WAIT ||
+		     mode == PIPE_RENDER_COND_BY_REGION_WAIT);
+
+	if (must_wait) {
+		/* XXX: big fence, FIFO semaphore might be better */
+		BEGIN_RING(chan, tesla, 0x0110, 1);
+		OUT_RING  (chan, 0);
+	}
+
+	BEGIN_RING(chan, tesla, NV50TCL_COND_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, query->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, query->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RING  (chan, NV50TCL_COND_MODE_RES);
+}
+
+/* Install the query entry points of this file into the context's pipe
+ * function table.
+ */
+void
+nv50_init_query_functions(struct nv50_context *nv50)
+{
+	struct pipe_context *pipe = &nv50->pipe;
+
+	/* object lifetime */
+	pipe->create_query = nv50_query_create;
+	pipe->destroy_query = nv50_query_destroy;
+
+	/* measurement */
+	pipe->begin_query = nv50_query_begin;
+	pipe->end_query = nv50_query_end;
+	pipe->get_query_result = nv50_query_result;
+
+	/* conditional rendering */
+	pipe->render_condition = nv50_render_condition;
+}
diff --git a/src/gallium/drivers/nv50/nv50_resource.c b/src/gallium/drivers/nv50/nv50_resource.c
new file mode 100644
index 0000000000..cfdb60418b
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_resource.c
@@ -0,0 +1,67 @@
+
+#include "pipe/p_context.h"
+#include "nv50_resource.h"
+#include "nouveau/nouveau_screen.h"
+
+
+/* This doesn't look quite right - this query is supposed to ask
+ * whether the particular context has references to the resource in
+ * any unflushed rendering command buffer, and hence requires a
+ * pipe->flush() for serializing some modification to that resource.
+ *
+ * This seems to be answering the question of whether the resource is
+ * currently on hardware.
+ */
+/* Report the reference flags of the resource's buffer object; face and
+ * level are ignored.
+ */
+static unsigned int
+nv50_resource_is_referenced(struct pipe_context *pipe,
+			    struct pipe_resource *resource,
+			    unsigned face, unsigned level)
+{
+	struct nouveau_bo *bo = nv50_resource(resource)->bo;
+
+	return nouveau_reference_flags(bo);
+}
+
+/* Create a resource: PIPE_BUFFER targets become buffers, every other target
+ * becomes a miptree.
+ */
+static struct pipe_resource *
+nv50_resource_create(struct pipe_screen *screen,
+		     const struct pipe_resource *template)
+{
+	const boolean is_buffer = (template->target == PIPE_BUFFER);
+
+	return is_buffer ? nv50_buffer_create(screen, template)
+			 : nv50_miptree_create(screen, template);
+}
+
+/* Wrap a winsys handle in a resource.  Only textures are supported; for a
+ * PIPE_BUFFER template NULL is returned.
+ */
+static struct pipe_resource *
+nv50_resource_from_handle(struct pipe_screen * screen,
+			  const struct pipe_resource *template,
+			  struct winsys_handle *whandle)
+{
+	if (template->target != PIPE_BUFFER)
+		return nv50_miptree_from_handle(screen, template, whandle);
+
+	return NULL;
+}
+
+/* Install the per-context resource entry points: the generic vtbl-based
+ * transfer helpers plus the driver's is_resource_referenced.
+ */
+void
+nv50_init_resource_functions(struct pipe_context *pcontext)
+{
+	/* driver specific */
+	pcontext->is_resource_referenced = nv50_resource_is_referenced;
+
+	/* transfer lifetime */
+	pcontext->get_transfer = u_get_transfer_vtbl;
+	pcontext->transfer_destroy = u_transfer_destroy_vtbl;
+
+	/* transfer access */
+	pcontext->transfer_map = u_transfer_map_vtbl;
+	pcontext->transfer_unmap = u_transfer_unmap_vtbl;
+	pcontext->transfer_flush_region = u_transfer_flush_region_vtbl;
+	pcontext->transfer_inline_write = u_transfer_inline_write_vtbl;
+}
+
+/* Install the per-screen resource and surface entry points. */
+void
+nv50_screen_init_resource_functions(struct pipe_screen *pscreen)
+{
+	/* resource creation and import */
+	pscreen->resource_create = nv50_resource_create;
+	pscreen->user_buffer_create = nv50_user_buffer_create;
+	pscreen->resource_from_handle = nv50_resource_from_handle;
+
+	/* generic vtbl-based helpers */
+	pscreen->resource_get_handle = u_resource_get_handle_vtbl;
+	pscreen->resource_destroy = u_resource_destroy_vtbl;
+
+	/* texture surfaces */
+	pscreen->get_tex_surface = nv50_miptree_surface_new;
+	pscreen->tex_surface_destroy = nv50_miptree_surface_del;
+}
diff --git a/src/gallium/drivers/nv50/nv50_resource.h b/src/gallium/drivers/nv50/nv50_resource.h
new file mode 100644
index 0000000000..f435a5892e
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_resource.h
@@ -0,0 +1,98 @@
+
+#ifndef NV50_RESOURCE_H
+#define NV50_RESOURCE_H
+
+#include "util/u_transfer.h"
+
+#include "nouveau/nouveau_winsys.h"
+
+struct pipe_resource;
+struct nouveau_bo;
+
+
+/* This gets further specialized into either buffer or texture
+ * structures.  In the future we'll want to remove much of that
+ * distinction, but for now try to keep as close to the existing code
+ * as possible and use the vtbl struct to choose between the two
+ * underlying implementations.
+ */
+/* Common head of all nv50 resources.  base comes first so that a
+ * struct pipe_resource * can be cast to struct nv50_resource *.
+ */
+struct nv50_resource {
+	struct pipe_resource base;
+	const struct u_resource_vtbl *vtbl; /* buffer or texture implementation */
+	struct nouveau_bo *bo; /* backing buffer object */
+};
+
+/* Layout information for one mipmap level of a miptree. */
+struct nv50_miptree_level {
+	int *image_offset; /* NOTE(review): presumably one offset per image
+			    * (see nv50_miptree::image_nr) - confirm */
+	unsigned pitch;
+	unsigned tile_mode;
+};
+
+#define NV50_MAX_TEXTURE_LEVELS 16
+
+/* Texture resource.  base comes first so that a struct pipe_resource * can
+ * be cast to struct nv50_miptree *.
+ */
+struct nv50_miptree {
+	struct nv50_resource base;
+
+	struct nv50_miptree_level level[NV50_MAX_TEXTURE_LEVELS]; /* per-level layout */
+	int image_nr;
+	int total_size;
+};
+
+/* Downcast a generic resource pointer to the driver's miptree type. */
+static INLINE struct nv50_miptree *
+nv50_miptree(struct pipe_resource *pt)
+{
+	struct nv50_miptree *mt = (struct nv50_miptree *)pt;
+
+	return mt;
+}
+
+
+/* Downcast a generic resource pointer to the driver's resource type. */
+static INLINE struct nv50_resource *
+nv50_resource(struct pipe_resource *resource)
+{
+	struct nv50_resource *res = (struct nv50_resource *)resource;
+
+	return res;
+}
+
+/* is resource mapped into the GPU's address space (i.e. VRAM or GART) ?
+ *
+ * Returns TRUE when the backing buffer object has a non-zero handle.
+ */
+static INLINE boolean
+nv50_resource_mapped_by_gpu(struct pipe_resource *resource)
+{
+   /* Compare explicitly: returning the handle itself would narrow it to
+    * boolean, so a non-zero handle whose low bits are all zero could read
+    * as FALSE. */
+   return nv50_resource(resource)->bo->handle != 0;
+}
+
+void
+nv50_init_resource_functions(struct pipe_context *pcontext);
+
+void
+nv50_screen_init_resource_functions(struct pipe_screen *pscreen);
+
+/* Internal functions
+ */
+struct pipe_resource *
+nv50_miptree_create(struct pipe_screen *pscreen,
+		    const struct pipe_resource *tmp);
+
+struct pipe_resource *
+nv50_miptree_from_handle(struct pipe_screen *pscreen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle);
+
+struct pipe_resource *
+nv50_buffer_create(struct pipe_screen *pscreen,
+		   const struct pipe_resource *template);
+
+struct pipe_resource *
+nv50_user_buffer_create(struct pipe_screen *screen,
+			void *ptr,
+			unsigned bytes,
+			unsigned usage);
+
+
+struct pipe_surface *
+nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags);
+
+void
+nv50_miptree_surface_del(struct pipe_surface *ps);
+
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
new file mode 100644
index 0000000000..21908bcd3c
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -0,0 +1,542 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "util/u_format_s3tc.h"
+#include "pipe/p_screen.h"
+
+#include "nv50_context.h"
+#include "nv50_screen.h"
+#include "nv50_resource.h"
+
+#include "nouveau/nouveau_stateobj.h"
+
+static boolean /* TRUE when format can be used with the requested bind flags */
+nv50_screen_is_format_supported(struct pipe_screen *pscreen,
+				enum pipe_format format,
+				enum pipe_texture_target target,
+				unsigned sample_count,
+				unsigned tex_usage, unsigned geom_flags) /* geom_flags is not examined */
+{
+	if (sample_count > 1) /* more than one sample is always rejected */
+		return FALSE;
+
+	if (tex_usage & PIPE_BIND_RENDER_TARGET) { /* tested first: the branches below are then skipped */
+		switch (format) {
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_R16G16B16A16_SNORM:
+		case PIPE_FORMAT_R16G16B16A16_UNORM:
+		case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		case PIPE_FORMAT_R16G16_SNORM:
+		case PIPE_FORMAT_R16G16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else
+	if (tex_usage & PIPE_BIND_DEPTH_STENCIL) {
+		switch (format) {
+		case PIPE_FORMAT_Z32_FLOAT:
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_Z24X8_UNORM:
+		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+			return TRUE;
+		default:
+			break;
+		}
+	} else { /* neither render target nor depth/stencil binding requested */
+		if (tex_usage & PIPE_BIND_SAMPLER_VIEW) {
+			switch (format) {
+			case PIPE_FORMAT_DXT1_RGB:
+			case PIPE_FORMAT_DXT1_RGBA:
+			case PIPE_FORMAT_DXT3_RGBA:
+			case PIPE_FORMAT_DXT5_RGBA:
+				return util_format_s3tc_enabled; /* DXT support follows the s3tc flag */
+			default:
+				break;
+			}
+		}
+		switch (format) {
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_B8G8R8A8_SRGB:
+		case PIPE_FORMAT_B8G8R8X8_SRGB:
+		case PIPE_FORMAT_B5G5R5A1_UNORM:
+		case PIPE_FORMAT_B4G4R4A4_UNORM:
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+		case PIPE_FORMAT_L8A8_UNORM:
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+		case PIPE_FORMAT_Z32_FLOAT:
+		case PIPE_FORMAT_R16G16B16A16_SNORM:
+		case PIPE_FORMAT_R16G16B16A16_UNORM:
+		case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		case PIPE_FORMAT_R16G16_SNORM:
+		case PIPE_FORMAT_R16G16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE; /* format not listed for the selected usage */
+}
+
+static int /* integer capability query; unknown caps are logged and answered with 0 */
+nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 32;
+	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+		return 32;
+	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+		return 64;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 1;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 1;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 8;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+        case PIPE_CAP_TIMER_QUERY:
+		return 0;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+		return 1;
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+		return 1;
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+		return 1;
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+		return 0;
+	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+		return 1;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+		return 1;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+		return 0;
+	case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+	case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+	case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+	case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+	case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS: /* arbitrary limit */
+		return 16384;
+	case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+	case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH: /* need stack bo */
+		return 4;
+	case PIPE_CAP_MAX_VS_INPUTS:
+		return 16;
+	case PIPE_CAP_MAX_FS_INPUTS: /* 128 / 4 with GP */
+		return 64 / 4;
+	case PIPE_CAP_MAX_VS_CONSTS:
+	case PIPE_CAP_MAX_FS_CONSTS:
+		return 65536 / 16; /* same 65536 as the parameter bo size in nv50_screen_create */
+	case PIPE_CAP_MAX_VS_ADDRS:
+	case PIPE_CAP_MAX_FS_ADDRS: /* no spilling atm */
+		return 1;
+	case PIPE_CAP_MAX_VS_PREDS:
+	case PIPE_CAP_MAX_FS_PREDS: /* not yet handled */
+		return 0;
+	case PIPE_CAP_MAX_VS_TEMPS:
+	case PIPE_CAP_MAX_FS_TEMPS: /* no spilling atm */
+		return 128 / 4;
+	default: /* cap not listed above */
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv50_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param)
+{ /* float capability limits; pscreen itself is never consulted */
+	if (param == PIPE_CAP_MAX_LINE_WIDTH ||
+	    param == PIPE_CAP_MAX_LINE_WIDTH_AA)
+		return 10.0;
+	else if (param == PIPE_CAP_MAX_POINT_WIDTH ||
+		 param == PIPE_CAP_MAX_POINT_WIDTH_AA)
+		return 64.0;
+	else if (param == PIPE_CAP_MAX_TEXTURE_ANISOTROPY)
+		return 16.0;
+	else if (param == PIPE_CAP_MAX_TEXTURE_LOD_BIAS)
+		return 4.0;
+	else {
+		/* not a float cap handled here: report it and answer 0 */
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static void /* also used by nv50_screen_create() to unwind a partially built screen */
+nv50_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	unsigned i;
+
+	for (i = 0; i < 3; i++) { /* the three parameter buffers allocated in nv50_screen_create */
+		if (screen->constbuf_parm[i])
+			nouveau_bo_ref(NULL, &screen->constbuf_parm[i]);
+	}
+
+	if (screen->constbuf_misc[0]) /* each bo is released only if it was actually created */
+		nouveau_bo_ref(NULL, &screen->constbuf_misc[0]);
+	if (screen->tic)
+		nouveau_bo_ref(NULL, &screen->tic);
+	if (screen->tsc)
+		nouveau_bo_ref(NULL, &screen->tsc);
+
+	nouveau_notifier_free(&screen->sync); /* presumably tolerates a NULL object, as error paths reach here early -- confirm */
+	nouveau_grobj_free(&screen->tesla);
+	nouveau_grobj_free(&screen->eng2d);
+	nouveau_grobj_free(&screen->m2mf);
+	nouveau_resource_destroy(&screen->immd_heap);
+	nouveau_screen_fini(&screen->base);
+	FREE(screen); /* screen was obtained with CALLOC_STRUCT in nv50_screen_create */
+}
+
+#define BGN_RELOC(ch, bo, gr, m, n, fl) /* (count, subchannel, method) word sent via OUT_RELOC */ \
+   OUT_RELOC(ch, bo, ((n) << 18) | ((gr)->subc << 13) | (m), fl, 0, 0)
+
+void /* emits relocation entries for the screen's TIC, TSC and constant buffer bos */
+nv50_screen_relocs(struct nv50_screen *screen)
+{
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *tesla = screen->tesla;
+	unsigned i;
+	const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
+
+	MARK_RING (chan, 28, 26); /* 28 and 26 match the dwords and reloc entries emitted below */
+
+	/* cause grobj autobind */
+	BEGIN_RING(chan, tesla, 0x0100, 1);
+	OUT_RING  (chan, 0);
+
+	BGN_RELOC (chan, screen->tic, tesla, NV50TCL_TIC_ADDRESS_HIGH, 2, rl);
+	OUT_RELOCh(chan, screen->tic, 0, rl);
+	OUT_RELOCl(chan, screen->tic, 0, rl);
+
+	BGN_RELOC (chan, screen->tsc, tesla, NV50TCL_TSC_ADDRESS_HIGH, 2, rl);
+	OUT_RELOCh(chan, screen->tsc, 0, rl);
+	OUT_RELOCl(chan, screen->tsc, 0, rl);
+
+	BGN_RELOC (chan, screen->constbuf_misc[0], /* PMISC buffer: offset 0 of constbuf_misc[0] */
+		   tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3, rl);
+	OUT_RELOCh(chan, screen->constbuf_misc[0], 0, rl);
+	OUT_RELOCl(chan, screen->constbuf_misc[0], 0, rl);
+	OUT_RELOC (chan, screen->constbuf_misc[0],
+		   (NV50_CB_PMISC << 16) | 0x0200, rl, 0, 0);
+
+	BGN_RELOC (chan, screen->constbuf_misc[0], /* AUX buffer: offset 0x200 of the same bo */
+		   tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3, rl);
+	OUT_RELOCh(chan, screen->constbuf_misc[0], 0x200, rl);
+	OUT_RELOCl(chan, screen->constbuf_misc[0], 0x200, rl);
+	OUT_RELOC (chan, screen->constbuf_misc[0],
+		   (NV50_CB_AUX << 16) | 0x0200, rl, 0, 0);
+
+	for (i = 0; i < 3; ++i) { /* one parameter buffer per program type, ids NV50_CB_PVP + i */
+		BGN_RELOC (chan, screen->constbuf_parm[i],
+			   tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3, rl);
+		OUT_RELOCh(chan, screen->constbuf_parm[i], 0, rl);
+		OUT_RELOCl(chan, screen->constbuf_parm[i], 0, rl);
+		OUT_RELOC (chan, screen->constbuf_parm[i],
+			   ((NV50_CB_PVP + i) << 16) | 0x0000, rl, 0, 0);
+	}
+}
+
+struct pipe_screen * /* builds and initialises the nv50 screen; NULL on any failure */
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
+{
+	struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
+	unsigned chipset = dev->chipset;
+	unsigned tesla_class = 0;
+	int ret, i;
+	const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+
+	if (!screen)
+		return NULL;
+	pscreen = &screen->base.base;
+
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) { /* every failure path below unwinds through nv50_screen_destroy() */
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nv50_screen_destroy;
+	pscreen->get_param = nv50_screen_get_param;
+	pscreen->get_paramf = nv50_screen_get_paramf;
+	pscreen->is_format_supported = nv50_screen_is_format_supported;
+	pscreen->context_create = nv50_create;
+
+	nv50_screen_init_resource_functions(pscreen);
+
+	/* DMA engine object */
+	ret = nouveau_grobj_alloc(chan, 0xbeef5039,
+		NV50_MEMORY_TO_MEMORY_FORMAT, &screen->m2mf);
+	if (ret) {
+		NOUVEAU_ERR("Error creating M2MF object: %d\n", ret);
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* 2D object */
+	ret = nouveau_grobj_alloc(chan, 0xbeef502d, NV50_2D, &screen->eng2d);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 2D object: %d\n", ret);
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* 3D object */
+	switch (chipset & 0xf0) { /* 3D class is chosen from the chipset's high nibble */
+	case 0x50:
+		tesla_class = NV50TCL;
+		break;
+	case 0x80:
+	case 0x90:
+		tesla_class = NV84TCL;
+		break;
+	case 0xa0:
+		switch (chipset) {
+		case 0xa0:
+		case 0xaa:
+		case 0xac:
+			tesla_class = NVA0TCL;
+			break;
+		default:
+			tesla_class = NVA8TCL;
+			break;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset);
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class,
+		&screen->tesla);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* this is necessary for the new RING_3D / statebuffer code */
+	BIND_RING(chan, screen->tesla, 7);
+
+	/* Sync notifier */
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* Static M2MF init */
+	BEGIN_RING(chan, screen->m2mf,
+		   NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 3);
+	OUT_RING  (chan, screen->sync->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+
+	/* Static 2D init */
+	BEGIN_RING(chan, screen->eng2d, NV50_2D_DMA_NOTIFY, 4);
+	OUT_RING  (chan, screen->sync->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	BEGIN_RING(chan, screen->eng2d, NV50_2D_OPERATION, 1);
+	OUT_RING  (chan, NV50_2D_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, screen->eng2d, NV50_2D_CLIP_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, screen->eng2d, 0x0888, 1);
+	OUT_RING  (chan, 1);
+
+	/* Static tesla init */
+	BEGIN_RING(chan, screen->tesla, NV50TCL_COND_MODE, 1);
+	OUT_RING  (chan, NV50TCL_COND_MODE_ALWAYS);
+	BEGIN_RING(chan, screen->tesla, NV50TCL_DMA_NOTIFY, 1);
+	OUT_RING  (chan, screen->sync->handle);
+	BEGIN_RING(chan, screen->tesla, NV50TCL_DMA_ZETA, 11);
+	for (i = 0; i < 11; i++)
+		OUT_RING  (chan, chan->vram->handle);
+	BEGIN_RING(chan, screen->tesla,
+		   NV50TCL_DMA_COLOR(0), NV50TCL_DMA_COLOR__SIZE);
+	for (i = 0; i < NV50TCL_DMA_COLOR__SIZE; i++)
+		OUT_RING  (chan, chan->vram->handle);
+
+	BEGIN_RING(chan, screen->tesla, NV50TCL_RT_CONTROL, 1);
+	OUT_RING  (chan, 1);
+
+	/* activate all 32 lanes (threads) in a warp */
+	BEGIN_RING(chan, screen->tesla, NV50TCL_REG_MODE, 1);
+	OUT_RING  (chan, NV50TCL_REG_MODE_STRIPED);
+	BEGIN_RING(chan, screen->tesla, 0x1400, 1);
+	OUT_RING  (chan, 0xf);
+
+	/* max TIC (bits 4:8) & TSC (ignored) bindings, per program type */
+	for (i = 0; i < 3; ++i) {
+		BEGIN_RING(chan, screen->tesla, NV50TCL_TEX_LIMITS(i), 1);
+		OUT_RING  (chan, 0x54);
+	}
+
+	/* origin is top left (set to 1 for bottom left) */
+	BEGIN_RING(chan, screen->tesla, NV50TCL_Y_ORIGIN_BOTTOM, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, screen->tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
+	OUT_RING  (chan, 8);
+
+	/* constant buffers for immediates and VP/FP parameters */
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (32 * 4) * 4,
+			     &screen->constbuf_misc[0]); /* NOTE(review): 512 bytes requested, yet AUX below is defined at offset 0x200 with size 0x200 -- confirm */
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	BEGIN_RING(chan, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->constbuf_misc[0], 0, rl);
+	OUT_RELOCl(chan, screen->constbuf_misc[0], 0, rl);
+	OUT_RING  (chan, (NV50_CB_PMISC << 16) | 0x0200);
+	BEGIN_RING(chan, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->constbuf_misc[0], 0x200, rl);
+	OUT_RELOCl(chan, screen->constbuf_misc[0], 0x200, rl);
+	OUT_RING  (chan, (NV50_CB_AUX << 16) | 0x0200);
+
+	for (i = 0; i < 3; i++) { /* one 65536-byte parameter buffer per program type */
+		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (4096 * 4) * 4,
+				     &screen->constbuf_parm[i]);
+		if (ret) {
+			nv50_screen_destroy(pscreen);
+			return NULL;
+		}
+		BEGIN_RING(chan, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+		OUT_RELOCh(chan, screen->constbuf_parm[i], 0, rl);
+		OUT_RELOCl(chan, screen->constbuf_parm[i], 0, rl);
+		/* CB_DEF_SET_SIZE value of 0x0000 means 65536 */
+		OUT_RING  (chan, ((NV50_CB_PVP + i) << 16) | 0x0000);
+	}
+
+	if (nouveau_resource_init(&screen->immd_heap, 0, 128)) {
+		NOUVEAU_ERR("Error initialising shader immediates heap.\n");
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 3 * 32 * (8 * 4),
+			     &screen->tic); /* room for 3 * 32 entries of 8 dwords each */
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	BEGIN_RING(chan, screen->tesla, NV50TCL_TIC_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->tic, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, screen->tic, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RING  (chan, 3 * 32 - 1);
+
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 3 * 32 * (8 * 4),
+			     &screen->tsc); /* same size as the TIC bo */
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	BEGIN_RING(chan, screen->tesla, NV50TCL_TSC_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->tsc, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, screen->tsc, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RING  (chan, 0); /* ignored if TSC_LINKED (0x1234) == 1 */
+
+	/* map constant buffers:
+	 *  B = buffer ID (maybe more than 1 byte)
+	 *  N = CB index used in shader instruction
+	 *  P = program type (0 = VP, 2 = GP, 3 = FP)
+	 * SET_PROGRAM_CB = 0x000BBNP1
+	 */
+	BEGIN_RING_NI(chan, screen->tesla, NV50TCL_SET_PROGRAM_CB, 8);
+	/* bind immediate buffer */
+	OUT_RING  (chan, 0x001 | (NV50_CB_PMISC << 12));
+	OUT_RING  (chan, 0x021 | (NV50_CB_PMISC << 12));
+	OUT_RING  (chan, 0x031 | (NV50_CB_PMISC << 12));
+	/* bind auxiliary constbuf to immediate data bo */
+	OUT_RING  (chan, 0x201 | (NV50_CB_AUX << 12));
+	OUT_RING  (chan, 0x221 | (NV50_CB_AUX << 12));
+	/* bind parameter buffers */
+	OUT_RING  (chan, 0x101 | (NV50_CB_PVP << 12));
+	OUT_RING  (chan, 0x121 | (NV50_CB_PGP << 12));
+	OUT_RING  (chan, 0x131 | (NV50_CB_PFP << 12));
+
+	/* Vertex array limits - max them out */
+	for (i = 0; i < 16; i++) {
+		BEGIN_RING(chan, screen->tesla,
+			   NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
+		OUT_RING  (chan, 0x000000ff);
+		OUT_RING  (chan, 0xffffffff);
+	}
+
+	BEGIN_RING(chan, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR(0), 2);
+	OUT_RINGf (chan, 0.0f);
+	OUT_RINGf (chan, 1.0f);
+
+	/* no dynamic combination of TIC & TSC entries => only BIND_TIC used */
+	BEGIN_RING(chan, screen->tesla, NV50TCL_LINKED_TSC, 1);
+	OUT_RING  (chan, 1);
+
+	BEGIN_RING(chan, screen->tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (chan, 1); /* default edgeflag to TRUE */
+
+	FIRE_RING (chan);
+
+	screen->force_push = debug_get_bool_option("NV50_ALWAYS_PUSH", FALSE);
+	if(!screen->force_push) /* without the debug option, vertex and index buffers are flagged for GART */
+		screen->base.vertex_buffer_flags = screen->base.index_buffer_flags = NOUVEAU_BO_GART;
+	return pscreen;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
new file mode 100644
index 0000000000..fbf15a7596
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -0,0 +1,41 @@
+#ifndef __NV50_SCREEN_H__
+#define __NV50_SCREEN_H__
+
+#include "nouveau/nouveau_screen.h"
+
+struct nv50_context;
+
+struct nv50_screen { /* per-device driver state; nv50_screen() casts a pipe_screen pointer to this */
+	struct nouveau_screen base; /* first member: nv50_screen_create uses &base.base as the pipe_screen */
+
+	struct nouveau_winsys *nvws;
+
+	struct nv50_context *cur_ctx;
+
+	struct nouveau_grobj *tesla; /* 3D object (class chosen per chipset in nv50_screen_create) */
+	struct nouveau_grobj *eng2d; /* 2D object */
+	struct nouveau_grobj *m2mf; /* DMA engine (memory-to-memory) object */
+	struct nouveau_notifier *sync; /* sync notifier */
+
+	struct nouveau_bo *constbuf_misc[1]; /* backs both the PMISC and the AUX constant buffer */
+	struct nouveau_bo *constbuf_parm[PIPE_SHADER_TYPES]; /* parameter buffers; entries 0..2 are created */
+
+	struct nouveau_resource *immd_heap; /* shader immediates heap */
+
+	struct pipe_resource *strm_vbuf[16];
+
+	struct nouveau_bo *tic; /* bo programmed through NV50TCL_TIC_ADDRESS_HIGH */
+	struct nouveau_bo *tsc; /* bo programmed through NV50TCL_TSC_ADDRESS_HIGH */
+
+	boolean force_push; /* value of the NV50_ALWAYS_PUSH debug option */
+};
+
+/* Reinterpret a generic pipe_screen pointer as an nv50_screen pointer. */
+static INLINE struct nv50_screen *nv50_screen(struct pipe_screen *screen)
+{
+	return (struct nv50_screen *) screen;
+}
+
+extern void nv50_screen_relocs(struct nv50_screen *);
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
new file mode 100644
index 0000000000..f8bff764f2
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -0,0 +1,827 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv50_context.h"
+#include "nv50_texture.h"
+
+#include "nouveau/nouveau_stateobj.h"
+
+/* Convert a PIPE_MASK_* channel mask into the color mask word written to
+ * NV50TCL_COLOR_MASK: R, G, B and A map to bits 0, 4, 8 and 12
+ * respectively.
+ */
+static INLINE uint32_t
+nv50_colormask(unsigned mask)
+{
+	uint32_t hw = 0;
+
+	hw |= (mask & PIPE_MASK_R) ? 0x0001 : 0;
+	hw |= (mask & PIPE_MASK_G) ? 0x0010 : 0;
+	hw |= (mask & PIPE_MASK_B) ? 0x0100 : 0;
+	hw |= (mask & PIPE_MASK_A) ? 0x1000 : 0;
+
+	return hw;
+}
+
+/* Build the blend CSO: per-RT blend enables, RT0 blend equation/factors,
+ * logic op and per-RT color masks.  Returns NULL if the CSO allocation fails.
+ */
+static void *
+nv50_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj);
+	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
+	struct nouveau_stateobj *so;
+	unsigned i, blend_enabled = 0;
+
+	if (!bso) /* out of memory: bail out before bso->pipe is written */
+		return NULL;
+	so = so_new(5, 24, 0); /* XXX ignored: dither */
+
+	so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8);
+	if (cso->independent_blend_enable) {
+		for (i = 0; i < 8; ++i) {
+			so_data(so, cso->rt[i].blend_enable);
+			if (cso->rt[i].blend_enable)
+				blend_enabled = 1;
+		}
+	} else {
+		/* RT0's enable is replicated to all 8 render targets */
+		blend_enabled = cso->rt[0].blend_enable ? 1 : 0;
+		for (i = 0; i < 8; i++)
+			so_data(so, blend_enabled);
+	}
+	if (blend_enabled) {
+		so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5);
+		so_data  (so, nvgl_blend_eqn(cso->rt[0].rgb_func));
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_src_factor));
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_dst_factor));
+		so_data  (so, nvgl_blend_eqn(cso->rt[0].alpha_func));
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_src_factor));
+		so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1);
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_dst_factor));
+	}
+
+	if (cso->logicop_enable == 0 ) {
+		so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	} else {
+		so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	}
+
+	so_method(so, tesla, NV50TCL_COLOR_MASK(0), 8);
+	if (cso->independent_blend_enable)
+		for (i = 0; i < 8; ++i)
+			so_data(so, nv50_colormask(cso->rt[i].colormask));
+	else {
+		uint32_t cmask = nv50_colormask(cso->rt[0].colormask);
+		for (i = 0; i < 8; i++)
+			so_data(so, cmask);
+	}
+
+	bso->pipe = *cso;
+	so_ref(so, &bso->so);
+	so_ref(NULL, &so);
+	return (void *)bso;
+}
+
+/* Make hwcso the current blend CSO and set the NV50_NEW_BLEND dirty bit. */
+static void
+nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *const ctx = nv50_context(pipe);
+	ctx->dirty = ctx->dirty | NV50_NEW_BLEND;
+	ctx->blend = hwcso;
+}
+
+/* Release the blend CSO's reference on its stateobj, then free the CSO. */
+static void
+nv50_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_blend_stateobj *const blend = hwcso;
+	so_ref(NULL, &blend->so);
+	FREE(blend);
+}
+
+/* Map a PIPE_TEX_WRAP_* mode onto the matching NV50TSC_1_0_WRAPS_* value. */
+static INLINE unsigned
+wrap_mode(unsigned wrap)
+{
+	if (wrap == PIPE_TEX_WRAP_REPEAT)
+		return NV50TSC_1_0_WRAPS_REPEAT;
+	if (wrap == PIPE_TEX_WRAP_MIRROR_REPEAT)
+		return NV50TSC_1_0_WRAPS_MIRROR_REPEAT;
+	if (wrap == PIPE_TEX_WRAP_CLAMP_TO_EDGE)
+		return NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE;
+	if (wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER)
+		return NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER;
+	if (wrap == PIPE_TEX_WRAP_CLAMP)
+		return NV50TSC_1_0_WRAPS_CLAMP;
+	if (wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE)
+		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE;
+	if (wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER)
+		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER;
+	if (wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)
+		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP;
+	/* unrecognised mode: complain and fall back to REPEAT */
+	NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+	return NV50TSC_1_0_WRAPS_REPEAT;
+}
+
+/* Encode a pipe_sampler_state into the tsc[] words of a new sampler CSO.
+ * Returns NULL if the CSO cannot be allocated. */
+static void *
+nv50_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv50_sampler_stateobj *sso = CALLOC(1, sizeof(*sso));
+	unsigned *tsc;
+	float limit;
+
+	if (!sso) /* out of memory: do not write through a NULL tsc */
+		return NULL;
+	tsc = sso->tsc; /* zero-filled by CALLOC, hence the |= updates below */
+	tsc[0] = (0x00026000 |
+		  (wrap_mode(cso->wrap_s) << 0) |
+		  (wrap_mode(cso->wrap_t) << 3) |
+		  (wrap_mode(cso->wrap_r) << 6));
+
+	if (cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
+		tsc[1] |= NV50TSC_1_1_MAGF_LINEAR;
+	else /* NEAREST and any unknown filter */
+		tsc[1] |= NV50TSC_1_1_MAGF_NEAREST;
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		tsc[1] |= NV50TSC_1_1_MINF_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		tsc[1] |= NV50TSC_1_1_MINF_NEAREST;
+		break;
+	}
+
+	switch (cso->min_mip_filter) {
+	case PIPE_TEX_MIPFILTER_LINEAR:
+		tsc[1] |= NV50TSC_1_1_MIPF_LINEAR;
+		break;
+	case PIPE_TEX_MIPFILTER_NEAREST:
+		tsc[1] |= NV50TSC_1_1_MIPF_NEAREST;
+		break;
+	case PIPE_TEX_MIPFILTER_NONE:
+	default:
+		tsc[1] |= NV50TSC_1_1_MIPF_NONE;
+		break;
+	}
+
+	if (cso->max_anisotropy >= 16)
+		tsc[0] |= (7 << 20);
+	else
+	if (cso->max_anisotropy >= 12)
+		tsc[0] |= (6 << 20);
+	else {
+		tsc[0] |= (cso->max_anisotropy >> 1) << 20;
+
+		if (cso->max_anisotropy >= 4)
+			tsc[1] |= NV50TSC_1_1_UNKN_ANISO_35;
+		else
+		if (cso->max_anisotropy >= 2)
+			tsc[1] |= NV50TSC_1_1_UNKN_ANISO_15;
+	}
+
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		/* XXX: must be deactivated for non-shadow textures */
+		tsc[0] |= (1 << 9);
+		tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7) << 10;
+	}
+
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+	tsc[1] |= ((int)(limit * 256.0) & 0x1fff) << 12;
+
+	tsc[2] |= ((int)CLAMP(cso->max_lod, 0.0, 15.0) << 20) |
+		  ((int)CLAMP(cso->min_lod, 0.0, 15.0) << 8);
+
+	tsc[4] = fui(cso->border_color[0]);
+	tsc[5] = fui(cso->border_color[1]);
+	tsc[6] = fui(cso->border_color[2]);
+	tsc[7] = fui(cso->border_color[3]);
+
+	sso->normalized = cso->normalized_coords;
+	return (void *)sso;
+}
+
+/* Record nr sampler CSOs for one program type and set NV50_NEW_SAMPLER.
+ * type: 0 = VP, 1 = GP, 2 = FP, which is how the relevant tesla methods
+ * (NV50TCL_BIND_TSC etc.) are indexed. */
+static INLINE void
+nv50_sampler_state_bind(struct pipe_context *pipe, unsigned type,
+			unsigned nr, void **sampler)
+{
+	struct nv50_context *const ctx = nv50_context(pipe);
+	const size_t bytes = nr * sizeof(void *);
+
+	ctx->sampler_nr[type] = nr;
+	memcpy(ctx->sampler[type], sampler, bytes);
+	ctx->dirty |= NV50_NEW_SAMPLER;
+}
+
+static void
+nv50_vp_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+	nv50_sampler_state_bind(pipe, 0, nr, s); /* 0 is the VP index (see nv50_sampler_state_bind) */
+}
+
+static void
+nv50_fp_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+	nv50_sampler_state_bind(pipe, 2, nr, s); /* 2 is the FP index (see nv50_sampler_state_bind) */
+}
+
+static void
+nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso); /* the sampler CSO is the single CALLOC'd struct from nv50_sampler_state_create */
+}
+
+/* Take references on the nr views given for program type p, drop views
+ * bound in slots at or beyond nr, and set the NV50_NEW_TEXTURE dirty bit. */
+static INLINE void
+nv50_set_sampler_views(struct pipe_context *pipe, unsigned p, unsigned nr,
+		       struct pipe_sampler_view **views)
+{
+	struct nv50_context *const ctx = nv50_context(pipe);
+	unsigned slot = 0;
+
+	for (; slot < nr; ++slot)
+		pipe_sampler_view_reference(&ctx->sampler_views[p][slot],
+					    views[slot]);
+	for (; slot < ctx->sampler_view_nr[p]; ++slot)
+		pipe_sampler_view_reference(&ctx->sampler_views[p][slot], NULL);
+
+	ctx->sampler_view_nr[p] = nr;
+	ctx->dirty |= NV50_NEW_TEXTURE;
+}
+
+static void
+nv50_set_vp_sampler_views(struct pipe_context *pipe,
+			  unsigned nr,
+			  struct pipe_sampler_view **views)
+{
+	nv50_set_sampler_views(pipe, 0, nr, views); /* 0: same index nv50_vp_sampler_state_bind uses */
+}
+
+static void
+nv50_set_fp_sampler_views(struct pipe_context *pipe,
+			  unsigned nr,
+			  struct pipe_sampler_view **views)
+{
+	nv50_set_sampler_views(pipe, 2, nr, views); /* 2: same index nv50_fp_sampler_state_bind uses */
+}
+
+static void
+nv50_sampler_view_destroy(struct pipe_context *pipe,
+			  struct pipe_sampler_view *view)
+{
+	pipe_resource_reference(&view->texture, NULL); /* drop the texture reference taken in nv50_create_sampler_view */
+	FREE(nv50_sampler_view(view)); /* frees the enclosing nv50_sampler_view allocation */
+}
+
+static struct pipe_sampler_view * /* new view on texture copied from templ, or NULL on failure */
+nv50_create_sampler_view(struct pipe_context *pipe,
+			 struct pipe_resource *texture,
+			 const struct pipe_sampler_view *templ)
+{
+	struct nv50_sampler_view *view = CALLOC_STRUCT(nv50_sampler_view);
+	if (!view) /* out of memory: fail like a failed nv50_tex_construct() */
+		return NULL;
+	view->pipe = *templ;
+	view->pipe.reference.count = 1;
+	view->pipe.texture = NULL; /* cleared so the reference call below has nothing old to release */
+	pipe_resource_reference(&view->pipe.texture, texture);
+	view->pipe.context = pipe;
+
+	if (!nv50_tex_construct(view)) {
+		nv50_sampler_view_destroy(pipe, &view->pipe); /* releases the texture reference and the view */
+		return NULL;
+	}
+	return &view->pipe;
+}
+
+/* Translate a pipe_rasterizer_state into a stateobj of tesla methods wrapped
+ * in a new rasterizer CSO.  Returns NULL if the CSO cannot be allocated.
+ */
+static void *
+nv50_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv50_rasterizer_stateobj *rso =
+		CALLOC_STRUCT(nv50_rasterizer_stateobj);
+	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
+	struct nouveau_stateobj *so;
+
+	if (!rso) /* out of memory: fail before rso->pipe is written */
+		return NULL;
+	so = so_new(16, 22, 0);
+
+	/* XXX: ignored: point_smooth, multisample,
+	 * point_sprite / sprite_coord_mode */
+
+	so_method(so, tesla, NV50TCL_SCISSOR_ENABLE(0), 1);
+	so_data  (so, cso->scissor);
+
+	so_method(so, tesla, NV50TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT :
+				       NV50TCL_SHADE_MODEL_SMOOTH);
+	so_method(so, tesla, NV50TCL_PROVOKING_VERTEX_LAST, 1);
+	so_data  (so, cso->flatshade_first ? 0 : 1);
+
+	so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	so_data  (so, cso->light_twoside);
+
+	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1);
+	so_data  (so, fui(cso->line_width));
+	so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	if (cso->line_stipple_enable) {
+		so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, tesla, NV50TCL_LINE_STIPPLE_PATTERN, 1);
+		so_data  (so, (cso->line_stipple_pattern << 8) |
+			       cso->line_stipple_factor);
+	} else {
+		so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, tesla, NV50TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
+
+	so_method(so, tesla, NV50TCL_POINT_SPRITE_ENABLE, 1);
+	so_data  (so, cso->point_quad_rasterization ? 1 : 0);
+
+	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3);
+	so_data(so, nvgl_polygon_mode(cso->fill_front));
+	so_data(so, nvgl_polygon_mode(cso->fill_back));
+	so_data(so, cso->poly_smooth ? 1 : 0);
+
+	so_method(so, tesla, NV50TCL_CULL_FACE_ENABLE, 3);
+	so_data  (so, cso->cull_face != PIPE_FACE_NONE);
+	so_data  (so, cso->front_ccw ? NV50TCL_FRONT_FACE_CCW :
+				       NV50TCL_FRONT_FACE_CW);
+	/* PIPE_FACE_NONE takes the default case; the enable word above is 0 then */
+	switch (cso->cull_face) {
+	case PIPE_FACE_FRONT:
+		so_data(so, NV50TCL_CULL_FACE_FRONT);
+		break;
+	case PIPE_FACE_BACK:
+		so_data(so, NV50TCL_CULL_FACE_BACK);
+		break;
+	case PIPE_FACE_FRONT_AND_BACK:
+		so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK);
+		break;
+	default:
+		so_data(so, NV50TCL_CULL_FACE_BACK);
+		break;
+	}
+
+	so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
+
+	so_method(so, tesla, NV50TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	so_data(so, cso->offset_point);
+	so_data(so, cso->offset_line);
+	so_data(so, cso->offset_tri);
+
+	if (cso->offset_point ||
+	    cso->offset_line ||
+	    cso->offset_tri) {
+		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1);
+		so_data  (so, fui(cso->offset_scale));
+		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1);
+		so_data  (so, fui(cso->offset_units * 2.0f));
+	}
+
+	rso->pipe = *cso;
+	so_ref(so, &rso->so);
+	so_ref(NULL, &so);
+	return (void *)rso;
+}
+
+/* Make hwcso the current rasterizer CSO and set NV50_NEW_RASTERIZER. */
+static void
+nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *const ctx = nv50_context(pipe);
+	ctx->dirty = ctx->dirty | NV50_NEW_RASTERIZER;
+	ctx->rasterizer = hwcso;
+}
+
+/* Release the rasterizer CSO's reference on its stateobj, then free it. */
+static void
+nv50_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_rasterizer_stateobj *const rast = hwcso;
+	so_ref(NULL, &rast->so);
+	FREE(rast);
+}
+
+/* Build the depth/stencil/alpha CSO from cso as a stateobj of tesla methods.
+ * Returns NULL if the CSO cannot be allocated. */
+static void *
+nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj);
+	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
+	struct nouveau_stateobj *so;
+
+	if (!zsa) /* out of memory: fail before zsa->pipe is written */
+		return NULL;
+	so = so_new(9, 21, 0);
+
+	so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1);
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1);
+	so_data  (so, cso->depth.enabled ? 1 : 0);
+	if (cso->depth.enabled) { /* the compare function is only sent when testing */
+		so_method(so, tesla, NV50TCL_DEPTH_TEST_FUNC, 1);
+		so_data  (so, nvgl_comparison_op(cso->depth.func));
+	}
+
+	if (cso->stencil[0].enabled) {
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 5);
+		so_data  (so, 1);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_MASK, 2);
+		so_data  (so, cso->stencil[0].writemask);
+		so_data  (so, cso->stencil[0].valuemask);
+	} else {
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	if (cso->stencil[1].enabled) {
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5);
+		so_data  (so, 1);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_MASK, 2);
+		so_data  (so, cso->stencil[1].writemask);
+		so_data  (so, cso->stencil[1].valuemask);
+	} else {
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1);
+	so_data  (so, cso->alpha.enabled ? 1 : 0);
+	if (cso->alpha.enabled) { /* reference value and function only when testing */
+		so_method(so, tesla, NV50TCL_ALPHA_TEST_REF, 2);
+		so_data  (so, fui(cso->alpha.ref_value));
+		so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	}
+
+	zsa->pipe = *cso;
+	so_ref(so, &zsa->so);
+	so_ref(NULL, &so);
+	return (void *)zsa;
+}
+
+/**
+ * Make hwcso the context's current depth/stencil/alpha state and flag it
+ * for revalidation.
+ */
+static void
+nv50_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_ZSA;
+	ctx->zsa = hwcso;
+}
+
+/**
+ * Destroy a depth/stencil/alpha CSO: release its reference on the recorded
+ * state object, then free the wrapper.
+ */
+static void
+nv50_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_zsa_stateobj *dsa = hwcso;
+
+	so_ref(NULL, &dsa->so);
+	FREE(dsa);
+}
+
+/**
+ * Create a vertex program object: duplicate the TGSI tokens from the
+ * template, tag the program as a vertex shader and scan the tokens into
+ * the program's info block.
+ */
+static void *
+nv50_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv50_program *prog = CALLOC_STRUCT(nv50_program);
+
+	prog->type = PIPE_SHADER_VERTEX;
+	prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	tgsi_scan_shader(prog->pipe.tokens, &prog->info);
+
+	return (void *)prog;
+}
+
+/**
+ * Make hwcso the context's current vertex program and flag it for
+ * revalidation.
+ */
+static void
+nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_VERTPROG;
+	ctx->vertprog = hwcso;
+}
+
+/**
+ * Destroy a vertex program: let nv50_program_destroy() tear down the
+ * program, then free the duplicated tokens and the program struct.
+ */
+static void
+nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_program *prog = hwcso;
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	nv50_program_destroy(ctx, prog);
+	FREE((void *)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/**
+ * Create a fragment program object: duplicate the TGSI tokens from the
+ * template, tag the program as a fragment shader and scan the tokens into
+ * the program's info block.
+ */
+static void *
+nv50_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv50_program *prog = CALLOC_STRUCT(nv50_program);
+
+	prog->type = PIPE_SHADER_FRAGMENT;
+	prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	tgsi_scan_shader(prog->pipe.tokens, &prog->info);
+
+	return (void *)prog;
+}
+
+/**
+ * Make hwcso the context's current fragment program and flag it for
+ * revalidation.
+ */
+static void
+nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_FRAGPROG;
+	ctx->fragprog = hwcso;
+}
+
+/**
+ * Destroy a fragment program: let nv50_program_destroy() tear down the
+ * program, then free the duplicated tokens and the program struct.
+ */
+static void
+nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_program *prog = hwcso;
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	nv50_program_destroy(ctx, prog);
+	FREE((void *)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/**
+ * Create a geometry program object: duplicate the TGSI tokens from the
+ * template, tag the program as a geometry shader and scan the tokens into
+ * the program's info block.
+ */
+static void *
+nv50_gp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv50_program *prog = CALLOC_STRUCT(nv50_program);
+
+	prog->type = PIPE_SHADER_GEOMETRY;
+	prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	tgsi_scan_shader(prog->pipe.tokens, &prog->info);
+
+	return (void *)prog;
+}
+
+/**
+ * Make hwcso the context's current geometry program and flag it for
+ * revalidation.
+ *
+ * The program must be stored in the geometry slot: writing it to
+ * nv50->fragprog would replace the bound fragment shader with a geometry
+ * program and leave the geometry shader unbound.
+ */
+static void
+nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->geomprog = hwcso;
+	nv50->dirty |= NV50_NEW_GEOMPROG;
+}
+
+/**
+ * Destroy a geometry program: let nv50_program_destroy() tear down the
+ * program, then free the duplicated tokens and the program struct.
+ */
+static void
+nv50_gp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_program *prog = hwcso;
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	nv50_program_destroy(ctx, prog);
+	FREE((void *)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/**
+ * Store a copy of the blend colour in the context and flag it for
+ * revalidation.
+ */
+static void
+nv50_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_BLEND_COLOUR;
+	ctx->blend_colour = *bcol;
+}
+
+/**
+ * Store a copy of the stencil reference values in the context and flag
+ * them for revalidation.
+ */
+static void
+nv50_set_stencil_ref(struct pipe_context *pipe,
+		     const struct pipe_stencil_ref *sr)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_STENCIL_REF;
+	ctx->stencil_ref = *sr;
+}
+
+/**
+ * Clip state hook.  Intentionally a no-op: the clip state passed in is
+ * neither stored nor flagged dirty.
+ */
+static void
+nv50_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	/* nothing to do */
+}
+
+/**
+ * Sample mask hook.  Intentionally a no-op: the mask is neither stored
+ * nor flagged dirty.
+ */
+static void
+nv50_set_sample_mask(struct pipe_context *pipe,
+		     unsigned sample_mask)
+{
+	/* nothing to do */
+}
+
+/**
+ * Remember the constant buffer bound to a shader stage and flag that
+ * stage's constants dirty.
+ *
+ * Only the vertex, fragment and geometry stages are handled; any other
+ * value of shader is silently ignored.  The index argument is not used,
+ * so a single buffer slot is tracked per stage.
+ */
+static void
+nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 struct pipe_resource *buf )
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	switch (shader) {
+	case PIPE_SHADER_VERTEX:
+		ctx->constbuf[PIPE_SHADER_VERTEX] = buf;
+		ctx->dirty |= NV50_NEW_VERTPROG_CB;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		ctx->constbuf[PIPE_SHADER_FRAGMENT] = buf;
+		ctx->dirty |= NV50_NEW_FRAGPROG_CB;
+		break;
+	case PIPE_SHADER_GEOMETRY:
+		ctx->constbuf[PIPE_SHADER_GEOMETRY] = buf;
+		ctx->dirty |= NV50_NEW_GEOMPROG_CB;
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * Store a copy of the framebuffer state in the context and flag it for
+ * revalidation.  The struct is copied by plain assignment.
+ */
+static void
+nv50_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_FRAMEBUFFER;
+	ctx->framebuffer = *fb;
+}
+
+/**
+ * Store a copy of the polygon stipple pattern in the context and flag it
+ * for revalidation.
+ */
+static void
+nv50_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_STIPPLE;
+	ctx->stipple = *stipple;
+}
+
+/**
+ * Store a copy of the scissor rectangle in the context and flag it for
+ * revalidation.
+ */
+static void
+nv50_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_SCISSOR;
+	ctx->scissor = *s;
+}
+
+/**
+ * Store a copy of the viewport transform in the context and flag it for
+ * revalidation.
+ */
+static void
+nv50_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_VIEWPORT;
+	ctx->viewport = *vpt;
+}
+
+/**
+ * Copy count vertex buffer descriptors into the context, record how many
+ * are bound and flag the vertex arrays dirty.
+ *
+ * No bound on count is checked here; the caller must not pass more
+ * entries than nv50->vtxbuf can hold.
+ */
+static void
+nv50_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->vtxbuf_nr = count;
+	memcpy(ctx->vtxbuf, vb, count * sizeof(vb[0]));
+
+	ctx->dirty |= NV50_NEW_ARRAYS;
+}
+
+/**
+ * Create a vertex elements CSO: copy the element descriptions into a new
+ * state object and let nv50_vtxelt_construct() finish building it.
+ *
+ * Asserts that fewer than 16 elements are supplied.
+ */
+static void *
+nv50_vtxelts_state_create(struct pipe_context *pipe,
+			  unsigned num_elements,
+			  const struct pipe_vertex_element *elements)
+{
+	struct nv50_vtxelt_stateobj *velt = CALLOC_STRUCT(nv50_vtxelt_stateobj);
+
+	assert(num_elements < 16); /* not doing fallbacks yet */
+
+	memcpy(velt->pipe, elements, sizeof(elements[0]) * num_elements);
+	velt->num_elements = num_elements;
+
+	nv50_vtxelt_construct(velt);
+
+	return (void *)velt;
+}
+
+/**
+ * Destroy a vertex elements CSO; the object is a single allocation, so
+ * freeing it is all that is done here.
+ */
+static void
+nv50_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+/**
+ * Make hwcso the context's current vertex elements state and flag the
+ * vertex arrays dirty.
+ */
+static void
+nv50_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *ctx = nv50_context(pipe);
+
+	ctx->dirty |= NV50_NEW_ARRAYS;
+	ctx->vtxelt = hwcso;
+}
+
+/**
+ * Install the driver's CSO create/bind/delete hooks and set_* state hooks
+ * into the context's pipe_context function table.
+ */
+void
+nv50_init_state_functions(struct nv50_context *nv50)
+{
+	struct pipe_context *pipe = &nv50->pipe;
+
+	/* blend */
+	pipe->create_blend_state = nv50_blend_state_create;
+	pipe->bind_blend_state = nv50_blend_state_bind;
+	pipe->delete_blend_state = nv50_blend_state_delete;
+
+	/* samplers and sampler views */
+	pipe->create_sampler_state = nv50_sampler_state_create;
+	pipe->delete_sampler_state = nv50_sampler_state_delete;
+	pipe->bind_fragment_sampler_states = nv50_fp_sampler_state_bind;
+	pipe->bind_vertex_sampler_states = nv50_vp_sampler_state_bind;
+	pipe->set_fragment_sampler_views = nv50_set_fp_sampler_views;
+	pipe->set_vertex_sampler_views = nv50_set_vp_sampler_views;
+	pipe->create_sampler_view = nv50_create_sampler_view;
+	pipe->sampler_view_destroy = nv50_sampler_view_destroy;
+
+	/* rasterizer */
+	pipe->create_rasterizer_state = nv50_rasterizer_state_create;
+	pipe->bind_rasterizer_state = nv50_rasterizer_state_bind;
+	pipe->delete_rasterizer_state = nv50_rasterizer_state_delete;
+
+	/* depth / stencil / alpha */
+	pipe->create_depth_stencil_alpha_state =
+		nv50_depth_stencil_alpha_state_create;
+	pipe->bind_depth_stencil_alpha_state =
+		nv50_depth_stencil_alpha_state_bind;
+	pipe->delete_depth_stencil_alpha_state =
+		nv50_depth_stencil_alpha_state_delete;
+
+	/* vertex shader */
+	pipe->create_vs_state = nv50_vp_state_create;
+	pipe->bind_vs_state = nv50_vp_state_bind;
+	pipe->delete_vs_state = nv50_vp_state_delete;
+
+	/* fragment shader */
+	pipe->create_fs_state = nv50_fp_state_create;
+	pipe->bind_fs_state = nv50_fp_state_bind;
+	pipe->delete_fs_state = nv50_fp_state_delete;
+
+	/* geometry shader */
+	pipe->create_gs_state = nv50_gp_state_create;
+	pipe->bind_gs_state = nv50_gp_state_bind;
+	pipe->delete_gs_state = nv50_gp_state_delete;
+
+	/* immediate (non-CSO) state */
+	pipe->set_blend_color = nv50_set_blend_color;
+	pipe->set_stencil_ref = nv50_set_stencil_ref;
+	pipe->set_clip_state = nv50_set_clip_state;
+	pipe->set_sample_mask = nv50_set_sample_mask;
+	pipe->set_constant_buffer = nv50_set_constant_buffer;
+	pipe->set_framebuffer_state = nv50_set_framebuffer_state;
+	pipe->set_polygon_stipple = nv50_set_polygon_stipple;
+	pipe->set_scissor_state = nv50_set_scissor_state;
+	pipe->set_viewport_state = nv50_set_viewport_state;
+
+	/* vertex elements and vertex buffers */
+	pipe->create_vertex_elements_state = nv50_vtxelts_state_create;
+	pipe->delete_vertex_elements_state = nv50_vtxelts_state_delete;
+	pipe->bind_vertex_elements_state = nv50_vtxelts_state_bind;
+
+	pipe->set_vertex_buffers = nv50_set_vertex_buffers;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
new file mode 100644
index 0000000000..14c3490599
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "util/u_format.h"
+
+#include "nv50_context.h"
+#include "nv50_resource.h"
+#include "nouveau/nouveau_stateobj.h"
+
+/**
+ * Build the state object describing the current framebuffer.
+ *
+ * Emits the render target control word, one block of size/address/format/
+ * tiling commands per colour buffer, the depth/stencil ("zeta") buffer
+ * setup or its disable, and finally viewport, window offset and screen
+ * scissor commands sized to the framebuffer.
+ *
+ * The framebuffer size (w, h) is taken from the first attached surface;
+ * every further surface is asserted to have the same size.  If nothing is
+ * attached, w and h stay 0.
+ *
+ * Unknown colour or depth formats are reported with NOUVEAU_ERR and
+ * replaced by a fallback format rather than aborting.
+ *
+ * \return a new state object owned by the caller
+ */
+static struct nouveau_stateobj *
+validate_fb(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so = so_new(32, 79, 18);
+	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+	/* gw is set once w/h have been taken from some surface */
+	unsigned i, w = 0, h = 0, gw = 0;
+
+	/* Set nr of active RTs and select RT for each colour output.
+	 * FP result 0 always goes to RT[0], bits 4 - 6 are ignored.
+	 * Ambiguous assignment results in no rendering (no DATA_ERROR).
+	 */
+	so_method(so, tesla, NV50TCL_RT_CONTROL, 1);
+	so_data  (so, fb->nr_cbufs |
+		  (0 <<  4) | (1 <<  7) | (2 << 10) | (3 << 13) |
+		  (4 << 16) | (5 << 19) | (6 << 22) | (7 << 25));
+
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		struct pipe_resource *pt = fb->cbufs[i]->texture;
+		struct nouveau_bo *bo = nv50_miptree(pt)->base.bo;
+
+		if (!gw) {
+			w = fb->cbufs[i]->width;
+			h = fb->cbufs[i]->height;
+			gw = 1;
+		} else {
+			assert(w == fb->cbufs[i]->width);
+			assert(h == fb->cbufs[i]->height);
+		}
+
+		so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2);
+		so_data  (so, fb->cbufs[i]->width);
+		so_data  (so, fb->cbufs[i]->height);
+
+		/* 5 dwords: address high, address low, format, tile mode, 0 */
+		so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5);
+		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
+		switch (fb->cbufs[i]->format) {
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+			so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM);
+			break;
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
+			break;
+		case PIPE_FORMAT_B5G6R5_UNORM:
+			so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM);
+			break;
+		case PIPE_FORMAT_R16G16B16A16_SNORM:
+			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_SNORM);
+			break;
+		case PIPE_FORMAT_R16G16B16A16_UNORM:
+			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM);
+			break;
+		case PIPE_FORMAT_R32G32B32A32_FLOAT:
+			so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT);
+			break;
+		case PIPE_FORMAT_R16G16_SNORM:
+			so_data(so, NV50TCL_RT_FORMAT_R16G16_SNORM);
+			break;
+		case PIPE_FORMAT_R16G16_UNORM:
+			so_data(so, NV50TCL_RT_FORMAT_R16G16_UNORM);
+			break;
+		default:
+			/* report and fall back so the data count stays right */
+			NOUVEAU_ERR("AIIII unknown format %s\n",
+			            util_format_name(fb->cbufs[i]->format));
+			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
+			break;
+		}
+		so_data(so, nv50_miptree(pt)->
+				level[fb->cbufs[i]->level].tile_mode << 4);
+		so_data(so, 0x00000000);
+
+		so_method(so, tesla, NV50TCL_RT_ARRAY_MODE, 1);
+		so_data  (so, 1);
+	}
+
+	if (fb->zsbuf) {
+		struct pipe_resource *pt = fb->zsbuf->texture;
+		struct nouveau_bo *bo = nv50_miptree(pt)->base.bo;
+
+		if (!gw) {
+			w = fb->zsbuf->width;
+			h = fb->zsbuf->height;
+			gw = 1;
+		} else {
+			assert(w == fb->zsbuf->width);
+			assert(h == fb->zsbuf->height);
+		}
+
+		/* same 5-dword layout as the colour buffers */
+		so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5);
+		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
+		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
+		switch (fb->zsbuf->format) {
+		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
+			break;
+		case PIPE_FORMAT_Z24X8_UNORM:
+			so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM);
+			break;
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+			so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM);
+			break;
+		case PIPE_FORMAT_Z32_FLOAT:
+			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT);
+			break;
+		default:
+			/* report and fall back so the data count stays right */
+			NOUVEAU_ERR("AIIII unknown format %s\n",
+			            util_format_name(fb->zsbuf->format));
+			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
+			break;
+		}
+		so_data(so, nv50_miptree(pt)->
+				level[fb->zsbuf->level].tile_mode << 4);
+		so_data(so, 0x00000000);
+
+		so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, tesla, NV50TCL_ZETA_HORIZ, 3);
+		so_data  (so, fb->zsbuf->width);
+		so_data  (so, fb->zsbuf->height);
+		so_data  (so, 0x00010001);
+	} else {
+		so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	/* size in the upper 16 bits; the lower 16 bits are left at 0 */
+	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ(0), 2);
+	so_data  (so, w << 16);
+	so_data  (so, h << 16);
+	/* set window lower left corner */
+	so_method(so, tesla, NV50TCL_WINDOW_OFFSET_X, 2);
+	so_data  (so, 0);
+	so_data  (so, 0);
+	/* set screen scissor rectangle */
+	so_method(so, tesla, NV50TCL_SCREEN_SCISSOR_HORIZ, 2);
+	so_data  (so, w << 16);
+	so_data  (so, h << 16);
+
+	return so;
+}
+
+/**
+ * Append commands to so that upload the sampler (TSC) entries of program
+ * type p through the 2D engine's SIFC path.
+ *
+ * Each sampler contributes 8 dwords.  Unbound slots below sampler_nr[p]
+ * are uploaded as 8 zero dwords.  Nothing is emitted when no samplers are
+ * bound for p.
+ *
+ * \param nv50  context whose samplers are uploaded
+ * \param so    state object to append to
+ * \param p     program type index into nv50->sampler / nv50->sampler_nr
+ */
+static void
+nv50_validate_samplers(struct nv50_context *nv50, struct nouveau_stateobj *so,
+		       unsigned p)
+{
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+	unsigned i, j, dw = nv50->sampler_nr[p] * 8;
+
+	if (!dw)
+		return;
+	/* destination: byte offset p * (32 * 8 * 4) into the TSC buffer,
+	 * dw * 4 bytes long; presumably 32 entries of 8 dwords are reserved
+	 * per program type -- TODO confirm
+	 */
+	nv50_so_init_sifc(nv50, so, nv50->screen->tsc, NOUVEAU_BO_VRAM,
+			  p * (32 * 8 * 4), dw * 4);
+
+	so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), dw);
+
+	for (i = 0; i < nv50->sampler_nr[p]; ++i) {
+		if (nv50->sampler[p][i])
+			so_datap(so, nv50->sampler[p][i]->tsc, 8);
+		else {
+			for (j = 0; j < 8; ++j) /* you get punished */
+				so_data(so, 0); /* ... for leaving holes */
+		}
+	}
+}
+
+/**
+ * Return a new reference to the state object recorded in the currently
+ * bound blend CSO.
+ */
+static struct nouveau_stateobj *
+validate_blend(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *ref = NULL;
+
+	so_ref(nv50->blend->so, &ref);
+
+	return ref;
+}
+
+/**
+ * Return a new reference to the state object recorded in the currently
+ * bound depth/stencil/alpha CSO.
+ */
+static struct nouveau_stateobj *
+validate_zsa(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *ref = NULL;
+
+	so_ref(nv50->zsa->so, &ref);
+
+	return ref;
+}
+
+/**
+ * Return a new reference to the state object recorded in the currently
+ * bound rasterizer CSO.
+ */
+static struct nouveau_stateobj *
+validate_rast(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *ref = NULL;
+
+	so_ref(nv50->rasterizer->so, &ref);
+
+	return ref;
+}
+
+/**
+ * Build a state object that loads the four blend colour components, each
+ * passed as the raw bits of its float value.
+ */
+static struct nouveau_stateobj *
+validate_blend_colour(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *so = so_new(1, 4, 0);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	unsigned c;
+
+	so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4);
+	for (c = 0; c < 4; c++)
+		so_data(so, fui(nv50->blend_colour.color[c]));
+
+	return so;
+}
+
+/**
+ * Build a state object that loads the front (index 0) and back (index 1)
+ * stencil reference values.
+ */
+static struct nouveau_stateobj *
+validate_stencil_ref(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *so = so_new(2, 2, 0);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	const struct pipe_stencil_ref *ref = &nv50->stencil_ref;
+
+	/* front face */
+	so_method(so, tesla, NV50TCL_STENCIL_FRONT_FUNC_REF, 1);
+	so_data  (so, ref->ref_value[0]);
+
+	/* back face */
+	so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 1);
+	so_data  (so, ref->ref_value[1]);
+
+	return so;
+}
+
+/**
+ * Build a state object that loads the 32-row polygon stipple pattern.
+ * Every row is byte-swapped with util_bswap32() before being emitted.
+ */
+static struct nouveau_stateobj *
+validate_stipple(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *so = so_new(1, 32, 0);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	int row = 0;
+
+	so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+	while (row < 32) {
+		so_data(so, util_bswap32(nv50->stipple.stipple[row]));
+		row++;
+	}
+
+	return so;
+}
+
+/**
+ * Build a state object that loads the scissor rectangle.  Each of the two
+ * words packs the maximum in the upper 16 bits and the minimum in the
+ * lower bits: first horizontal, then vertical.
+ */
+static struct nouveau_stateobj *
+validate_scissor(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *so = so_new(1, 2, 0);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct pipe_scissor_state *rect = &nv50->scissor;
+
+	so_method(so, tesla, NV50TCL_SCISSOR_HORIZ(0), 2);
+	so_data  (so, (rect->maxx << 16) | rect->minx);
+	so_data  (so, (rect->maxy << 16) | rect->miny);
+
+	return so;
+}
+
+/**
+ * Build a state object that loads the viewport transform (translate and
+ * scale, three components each, as raw float bits), enables the viewport
+ * transform and selects the view volume clipping mode.
+ */
+static struct nouveau_stateobj *
+validate_viewport(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *so = so_new(5, 9, 0);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	unsigned c;
+
+	so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE_X(0), 3);
+	for (c = 0; c < 3; c++)
+		so_data(so, fui(nv50->viewport.translate[c]));
+
+	so_method(so, tesla, NV50TCL_VIEWPORT_SCALE_X(0), 3);
+	for (c = 0; c < 3; c++)
+		so_data(so, fui(nv50->viewport.scale[c]));
+
+	so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
+	so_data  (so, 1);
+
+	/* View volume clip control values:
+	 *   0x0000 = remove whole primitive only (xyz)
+	 *   0x1018 = remove whole primitive only (xy), clamp z
+	 *   0x1080 = clip primitive (xyz)
+	 *   0x1098 = clip primitive (xy), clamp z
+	 */
+	so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+	so_data  (so, 0x1080);
+
+	/* purpose of method 0x0f90 is unknown */
+	so_method(so, tesla, 0x0f90, 1);
+	so_data  (so, 0);
+
+	return so;
+}
+
+/**
+ * Build a state object that uploads the sampler entries for program type
+ * indices 0 (VP) and 2 (FP) and then flushes the TSC.
+ *
+ * The state object is sized from the sampler counts of all three program
+ * type slots, although only slots 0 and 2 are uploaded.
+ */
+static struct nouveau_stateobj *
+validate_sampler(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so;
+	unsigned total;
+
+	total = nv50->sampler_nr[0] +
+		nv50->sampler_nr[1] +
+		nv50->sampler_nr[2];
+
+	so = so_new(1 + 5 * 3, 1 + 19 * 3 + total * 8, 3 * 2);
+
+	nv50_validate_samplers(nv50, so, 0); /* VP */
+	nv50_validate_samplers(nv50, so, 2); /* FP */
+
+	/* flush TSC */
+	so_method(so, tesla, 0x1334, 1);
+	so_data  (so, 0);
+
+	return so;
+}
+
+/**
+ * Return a new reference to the context's vertex buffer state object
+ * (nv50->state.vtxbuf).
+ */
+static struct nouveau_stateobj *
+validate_vtxbuf(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *ref = NULL;
+
+	so_ref(nv50->state.vtxbuf, &ref);
+
+	return ref;
+}
+
+/**
+ * Return a new reference to the context's vertex attribute state object
+ * (nv50->state.vtxattr).
+ */
+static struct nouveau_stateobj *
+validate_vtxattr(struct nv50_context *nv50)
+{
+	struct nouveau_stateobj *ref = NULL;
+
+	so_ref(nv50->state.vtxattr, &ref);
+
+	return ref;
+}
+
+/**
+ * Table of state validators.
+ *
+ * Each entry pairs a function that builds (or references) a state object
+ * with the mask of NV50_NEW_* dirty bits that triggers it.
+ *
+ * The position of an entry is significant: nv50_state_validate() stores
+ * the result of entry i in nv50->state.hw[i], tracks it as bit (1 << i)
+ * of hw_dirty, and refers to entries 0 (fb), 3 (vp), 4 (fp) and 17 (vb)
+ * by number.  Keep those numbers in sync when inserting or reordering
+ * entries.
+ *
+ * The trailing empty entry is counted by validate_list_len; its states
+ * mask is zero, so it never matches a dirty bit and its NULL func is
+ * never called.
+ */
+struct state_validate {
+	struct nouveau_stateobj *(*func)(struct nv50_context *nv50);
+	unsigned states;
+} validate_list[] = {
+	{ validate_fb             , NV50_NEW_FRAMEBUFFER                      },
+	{ validate_blend          , NV50_NEW_BLEND                            },
+	{ validate_zsa            , NV50_NEW_ZSA                              },
+	{ nv50_vertprog_validate  , NV50_NEW_VERTPROG | NV50_NEW_VERTPROG_CB  },
+	{ nv50_fragprog_validate  , NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB  },
+	{ nv50_geomprog_validate  , NV50_NEW_GEOMPROG | NV50_NEW_GEOMPROG_CB  },
+	{ nv50_fp_linkage_validate, NV50_NEW_VERTPROG | NV50_NEW_GEOMPROG |
+				    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER   },
+	{ nv50_gp_linkage_validate, NV50_NEW_VERTPROG | NV50_NEW_GEOMPROG     },
+	{ validate_rast           , NV50_NEW_RASTERIZER                       },
+	{ validate_blend_colour   , NV50_NEW_BLEND_COLOUR                     },
+	{ validate_stencil_ref    , NV50_NEW_STENCIL_REF                      },
+	{ validate_stipple        , NV50_NEW_STIPPLE                          },
+	{ validate_scissor        , NV50_NEW_SCISSOR                          },
+	{ validate_viewport       , NV50_NEW_VIEWPORT                         },
+	{ validate_sampler        , NV50_NEW_SAMPLER                          },
+	{ nv50_tex_validate       , NV50_NEW_TEXTURE | NV50_NEW_SAMPLER       },
+	{ nv50_vbo_validate       , NV50_NEW_ARRAYS                           },
+	{ validate_vtxbuf         , NV50_NEW_ARRAYS                           },
+	{ validate_vtxattr        , NV50_NEW_ARRAYS                           },
+	{}
+};
+/* number of entries in validate_list, including the empty terminator */
+#define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
+
+/**
+ * Bring the hardware state up to date with the context's dirty state.
+ *
+ * Runs every validator in validate_list whose dirty mask intersects
+ * nv50->dirty, stores the resulting state objects in nv50->state.hw[],
+ * and emits all state objects marked in hw_dirty to the channel.  If a
+ * different context was last current on the screen, every state object
+ * this context already holds is re-emitted as well.
+ *
+ * \param nv50         context to validate
+ * \param wait_dwords  extra dwords the caller wants accounted for in the
+ *                     MARK_RING request, on top of the state itself
+ * \return TRUE on success, FALSE if MARK_RING failed (nothing is emitted
+ *         in that case, but nv50->dirty has already been cleared)
+ */
+boolean
+nv50_state_validate(struct nv50_context *nv50, unsigned wait_dwords)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	unsigned nr_relocs = 128, nr_dwords = wait_dwords + 128 + 4;
+	int ret, i;
+
+	/* pass 1: rebuild state objects for everything that is dirty */
+	for (i = 0; i < validate_list_len; i++) {
+		struct state_validate *validate = &validate_list[i];
+		struct nouveau_stateobj *so;
+
+		if (!(nv50->dirty & validate->states))
+			continue;
+
+		so = validate->func(nv50);
+		if (!so)
+			continue;
+
+		nr_dwords += (so->total + so->cur);
+		nr_relocs += so->cur_reloc;
+
+		/* hw[i] takes its own reference; drop the one we were given */
+		so_ref(so, &nv50->state.hw[i]);
+		so_ref(NULL, &so);
+		nv50->state.hw_dirty |= (1 << i);
+	}
+	nv50->dirty = 0;
+
+	/* context switch: schedule all state not already marked above */
+	if (nv50->screen->cur_ctx != nv50) {
+		for (i = 0; i < validate_list_len; i++) {
+			if (!nv50->state.hw[i] ||
+			    (nv50->state.hw_dirty & (1 << i)))
+				continue;
+
+			nr_dwords += (nv50->state.hw[i]->total +
+				      nv50->state.hw[i]->cur);
+			nr_relocs += nv50->state.hw[i]->cur_reloc;
+			nv50->state.hw_dirty |= (1 << i);
+		}
+
+		nv50->screen->cur_ctx = nv50;
+	}
+
+	ret = MARK_RING(chan, nr_dwords, nr_relocs);
+	if (ret) {
+		debug_printf("MARK_RING(%d, %d) failed: %d\n",
+			     nr_dwords, nr_relocs, ret);
+		return FALSE;
+	}
+
+	/* pass 2: emit scheduled state objects, lowest index first */
+	while (nv50->state.hw_dirty) {
+		i = ffs(nv50->state.hw_dirty) - 1;
+		nv50->state.hw_dirty &= ~(1 << i);
+
+		so_emit(chan, nv50->state.hw[i]);
+	}
+
+	/* Yes, really, we need to do this.  If a buffer that is referenced
+	 * on the hardware isn't part of changed state above, without doing
+	 * this the kernel is given no clue that the buffer is being used
+	 * still.  This can cause all sorts of fun issues.
+	 */
+	nv50_tex_relocs(nv50);
+	/* the indices below are positions in validate_list */
+	so_emit_reloc_markers(chan, nv50->state.hw[0]); /* fb */
+	so_emit_reloc_markers(chan, nv50->state.hw[3]); /* vp */
+	so_emit_reloc_markers(chan, nv50->state.hw[4]); /* fp */
+	so_emit_reloc_markers(chan, nv50->state.hw[17]); /* vb */
+	nv50_screen_relocs(nv50->screen);
+
+	/* No idea.. */
+	BEGIN_RING(chan, tesla, 0x142c, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, tesla, 0x142c, 1);
+	OUT_RING  (chan, 0);
+	return TRUE;
+}
+
+/**
+ * Append to so the 2D engine setup for an SIFC upload of size bytes into
+ * bo at the given offset.
+ *
+ * The destination and the source are both described as R8_UNORM, with
+ * size as the SIFC width and 1 as its height.  The caller is expected to
+ * follow up with the NV50_2D_SIFC_DATA method carrying the payload.
+ *
+ * \param nv50    context providing the 2D engine object
+ * \param so      state object to append to
+ * \param bo      destination buffer object
+ * \param reloc   relocation flags for bo; NOUVEAU_BO_WR is always added
+ * \param offset  byte offset into bo
+ * \param size    number of bytes to upload
+ */
+void nv50_so_init_sifc(struct nv50_context *nv50,
+		       struct nouveau_stateobj *so,
+		       struct nouveau_bo *bo, unsigned reloc,
+		       unsigned offset, unsigned size)
+{
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+
+	reloc |= NOUVEAU_BO_WR;
+
+	/* destination surface description */
+	so_method(so, eng2d, NV50_2D_DST_FORMAT, 2);
+	so_data  (so, NV50_2D_DST_FORMAT_R8_UNORM);
+	so_data  (so, 1);
+	/* 5 dwords starting at DST_PITCH; the last two are the address */
+	so_method(so, eng2d, NV50_2D_DST_PITCH, 5);
+	so_data  (so, 262144);
+	so_data  (so, 65536);
+	so_data  (so, 1);
+	so_reloc (so, bo, offset, reloc | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, bo, offset, reloc | NOUVEAU_BO_LOW, 0, 0);
+	/* source (inline data) description */
+	so_method(so, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
+	so_data  (so, 0);
+	so_data  (so, NV50_2D_SIFC_FORMAT_R8_UNORM);
+	/* 10 dwords starting at SIFC_WIDTH: width = size, height = 1; the
+	 * meaning of the remaining values is not visible here
+	 */
+	so_method(so, eng2d, NV50_2D_SIFC_WIDTH, 10);
+	so_data  (so, size);
+	so_data  (so, 1);
+	so_data  (so, 0);
+	so_data  (so, 1);
+	so_data  (so, 0);
+	so_data  (so, 1);
+	so_data  (so, 0);
+	so_data  (so, 0);
+	so_data  (so, 0);
+	so_data  (so, 0);
+}
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
new file mode 100644
index 0000000000..3e61203adf
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define __NOUVEAU_PUSH_H__
+#include <stdint.h>
+#include "nouveau/nouveau_pushbuf.h"
+#include "nv50_context.h"
+#include "nv50_resource.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_pack_color.h"
+
+#include "util/u_format.h"
+
+/* Tell whether the 2D engine can convert this format to and from the other
+ * formats in the same list; TRUE for listed formats, FALSE otherwise.
+ */
+static INLINE boolean
+nv50_2d_format_faithful(enum pipe_format format)
+{
+	boolean faithful;
+
+	switch (format) {
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_SRGB:
+	case PIPE_FORMAT_B8G8R8X8_SRGB:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+	case PIPE_FORMAT_B10G10R10A2_UNORM:
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+		faithful = TRUE;
+		break;
+	default:
+		faithful = FALSE;
+		break;
+	}
+
+	return faithful;
+}
+
+/**
+ * Translate a Gallium format into the 2D engine's surface format value.
+ *
+ * Some formats have no real 2D equivalent and are mapped to a format of
+ * the same size; that only works because copies require the source and
+ * destination formats to be equal.
+ *
+ * \return the NV50_2D_DST_FORMAT_* value, or -1 if the format is not
+ *         supported
+ */
+static INLINE int
+nv50_format(enum pipe_format format)
+{
+	int hw_format = -1; /* unsupported unless matched below */
+
+	switch (format) {
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_A8R8G8B8_UNORM;
+		break;
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_X8R8G8B8_UNORM;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_SRGB:
+		hw_format = NV50_2D_DST_FORMAT_A8R8G8B8_SRGB;
+		break;
+	case PIPE_FORMAT_B8G8R8X8_SRGB:
+		hw_format = NV50_2D_DST_FORMAT_X8R8G8B8_SRGB;
+		break;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_R5G6B5_UNORM;
+		break;
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_A1R5G5B5_UNORM;
+		break;
+	case PIPE_FORMAT_B10G10R10A2_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_A2R10G10B10_UNORM;
+		break;
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_R8_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_R8_UNORM;
+		break;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		hw_format = NV50_2D_DST_FORMAT_R32G32B32A32_FLOAT;
+		break;
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+		hw_format = NV50_2D_DST_FORMAT_R32G32B32X32_FLOAT;
+		break;
+	case PIPE_FORMAT_Z32_FLOAT:
+		hw_format = NV50_2D_DST_FORMAT_R32_FLOAT;
+		break;
+
+	/* stand-ins of equal size; valid only because src format == dst
+	 * format is required
+	 */
+	case PIPE_FORMAT_R16G16_SNORM:
+	case PIPE_FORMAT_R16G16_UNORM:
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+		hw_format = NV50_2D_DST_FORMAT_A8R8G8B8_UNORM;
+		break;
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_B4G4R4A4_UNORM:
+		hw_format = NV50_2D_DST_FORMAT_R16_UNORM;
+		break;
+
+	default:
+		break;
+	}
+
+	return hw_format;
+}
+
+/**
+ * Emit the 2D engine commands that describe a surface as either the blit
+ * destination or the blit source.
+ *
+ * Two layouts are emitted depending on bo->tile_flags: without tile flags
+ * the surface is described with its pitch, otherwise with the tile mode
+ * of the miptree level.
+ *
+ * \param screen  screen owning the 2D engine object
+ * \param ps      surface to describe
+ * \param dst     non-zero to program the destination (written), zero to
+ *                program the source (read)
+ * \return 0 on success, 1 if the surface format has no 2D equivalent
+ */
+static int
+nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
+{
+	struct nv50_miptree *mt = nv50_miptree(ps->texture);
+	struct nouveau_channel *chan = screen->eng2d->channel;
+	struct nouveau_grobj *eng2d = screen->eng2d;
+	struct nouveau_bo *bo = nv50_miptree(ps->texture)->base.bo;
+ 	int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
+ 	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD);
+
+ 	format = nv50_format(ps->format);
+	if (format < 0) {
+		NOUVEAU_ERR("invalid/unsupported surface format: %s\n",
+			    util_format_name(ps->format));
+ 		return 1;
+	}
+
+	/* the remaining methods are addressed relative to mthd, so the
+	 * same code serves both the DST and the SRC register block
+	 */
+ 	if (!bo->tile_flags) {
+ 		BEGIN_RING(chan, eng2d, mthd, 2);
+ 		OUT_RING  (chan, format);
+ 		OUT_RING  (chan, 1);
+ 		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
+		OUT_RING  (chan, mt->level[ps->level].pitch);
+ 		OUT_RING  (chan, ps->width);
+ 		OUT_RING  (chan, ps->height);
+ 		OUT_RELOCh(chan, bo, ps->offset, flags);
+ 		OUT_RELOCl(chan, bo, ps->offset, flags);
+ 	} else {
+ 		BEGIN_RING(chan, eng2d, mthd, 5);
+ 		OUT_RING  (chan, format);
+ 		OUT_RING  (chan, 0);
+		OUT_RING  (chan, mt->level[ps->level].tile_mode << 4);
+ 		OUT_RING  (chan, 1);
+ 		OUT_RING  (chan, 0);
+ 		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
+ 		OUT_RING  (chan, ps->width);
+ 		OUT_RING  (chan, ps->height);
+ 		OUT_RELOCh(chan, bo, ps->offset, flags);
+ 		OUT_RELOCl(chan, bo, ps->offset, flags);
+ 	}
+ 
+	/* disabled: clip rectangle setup; note it refers to a variable
+	 * named surf that does not exist in this function
+	 */
+#if 0
+ 	if (dst) {
+ 		BEGIN_RING(chan, eng2d, NV50_2D_CLIP_X, 4);
+ 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, surf->width);
+ 		OUT_RING  (chan, surf->height);
+ 	}
+#endif
+  
+ 	return 0;
+}
+
+/**
+ * Copy a w x h rectangle from src at (sx, sy) to dst at (dx, dy) using
+ * the 2D engine.
+ *
+ * \return 0 on success; otherwise the non-zero result of MARK_RING or of
+ *         nv50_surface_set() (1 for an unsupported surface format).  Note
+ *         that if the source setup fails, the destination setup has
+ *         already been written to the ring.
+ */
+int
+nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
+		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		     int w, int h)
+{
+	struct nouveau_channel *chan = screen->eng2d->channel;
+	struct nouveau_grobj *eng2d = screen->eng2d;
+	int ret;
+
+	/* presumably reserves ring space and relocs (2 per surface) for
+	 * everything emitted below -- TODO confirm MARK_RING semantics
+	 */
+	ret = MARK_RING(chan, 2*16 + 32, 4);
+	if (ret)
+		return ret;
+
+	ret = nv50_surface_set(screen, dst, 1);
+	if (ret)
+		return ret;
+
+	ret = nv50_surface_set(screen, src, 0);
+	if (ret)
+		return ret;
+
+	/* method 0x088c has no symbolic name here; its purpose is not
+	 * visible from this code
+	 */
+	BEGIN_RING(chan, eng2d, 0x088c, 1);
+	OUT_RING  (chan, 0);
+	/* destination rectangle */
+	BEGIN_RING(chan, eng2d, NV50_2D_BLIT_DST_X, 4);
+	OUT_RING  (chan, dx);
+	OUT_RING  (chan, dy);
+	OUT_RING  (chan, w);
+	OUT_RING  (chan, h);
+	/* NOTE(review): the (0, 1) pairs look like fixed-point scale
+	 * factors of 1.0 (fraction, integer) -- confirm
+	 */
+	BEGIN_RING(chan, eng2d, 0x08c0, 4);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	/* source origin: sx and sy each preceded by a zero dword */
+	BEGIN_RING(chan, eng2d, 0x08d0, 4);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, sx);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, sy);
+
+	return 0;
+}
+
+/**
+ * resource_copy_region hook: copy a width x height region between two
+ * resources.
+ *
+ * Temporary surfaces are created for the selected face/level/z slice of
+ * each resource, the copy is done with nv50_surface_do_copy(), and the
+ * surfaces are destroyed again.  The result of the copy is not checked.
+ *
+ * Asserts that the formats are either equal or both convertible by the
+ * 2D engine.
+ */
+static void
+nv50_surface_copy(struct pipe_context *pipe,
+		  struct pipe_resource *dest, struct pipe_subresource subdst,
+		  unsigned destx, unsigned desty, unsigned destz,
+		  struct pipe_resource *src, struct pipe_subresource subsrc,
+		  unsigned srcx, unsigned srcy, unsigned srcz,
+		  unsigned width, unsigned height)
+{
+	struct nv50_screen *screen = nv50_context(pipe)->screen;
+	struct pipe_surface *src_surf;
+	struct pipe_surface *dst_surf;
+
+	assert((src->format == dest->format) ||
+	       (nv50_2d_format_faithful(src->format) &&
+		nv50_2d_format_faithful(dest->format)));
+
+	/* no bind flags are requested for either temporary surface */
+	src_surf = nv50_miptree_surface_new(pipe->screen, src, subsrc.face,
+					    subsrc.level, srcz, 0);
+	dst_surf = nv50_miptree_surface_new(pipe->screen, dest, subdst.face,
+					    subdst.level, destz, 0);
+
+	nv50_surface_do_copy(screen, dst_surf, destx, desty,
+			     src_surf, srcx, srcy, width, height);
+
+	nv50_miptree_surface_del(src_surf);
+	nv50_miptree_surface_del(dst_surf);
+}
+
+/* XXX this should probably look more along the lines of nv50_clear */
+/**
+ * clear_render_target hook: fill a rectangle of dst with a solid colour
+ * using the 2D engine's rectangle drawing.
+ *
+ * The colour is packed into the surface's format with util_pack_color().
+ * The function returns silently, without drawing, if the format has no
+ * 2D equivalent, if MARK_RING fails or if the surface setup fails.
+ *
+ * \param pipe    context
+ * \param dst     surface to clear
+ * \param rgba    clear colour, four floats
+ * \param dstx    left edge of the rectangle
+ * \param dsty    top edge of the rectangle
+ * \param width   width of the rectangle
+ * \param height  height of the rectangle
+ */
+static void
+nv50_clear_render_target(struct pipe_context *pipe,
+			 struct pipe_surface *dst,
+			 const float *rgba,
+			 unsigned dstx, unsigned dsty,
+			 unsigned width, unsigned height)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nv50_screen *screen = nv50->screen;
+	struct nouveau_channel *chan = screen->eng2d->channel;
+	struct nouveau_grobj *eng2d = screen->eng2d;
+	int format, ret;
+	union util_color uc;
+	util_pack_color(rgba, dst->format, &uc);
+
+	format = nv50_format(dst->format);
+	if (format < 0)
+		return;
+
+	ret = MARK_RING (chan, 16 + 32, 2);
+	if (ret)
+		return;
+
+	ret = nv50_surface_set(screen, dst, 1);
+	if (ret)
+		return;
+
+	/* shape type, colour format and the packed colour */
+	BEGIN_RING(chan, eng2d, NV50_2D_DRAW_SHAPE, 3);
+	OUT_RING  (chan, NV50_2D_DRAW_SHAPE_RECTANGLES);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, uc.ui);
+	/* NOTE(review): width/height are emitted as the third and fourth
+	 * values; if the method expects a second corner point these
+	 * would need to be dstx + width / dsty + height -- confirm against
+	 * the hardware documentation before changing
+	 */
+	BEGIN_RING(chan, eng2d, NV50_2D_DRAW_POINT32_X(0), 4);
+	OUT_RING  (chan, dstx);
+	OUT_RING  (chan, dsty);
+	OUT_RING  (chan, width);
+	OUT_RING  (chan, height);
+
+}
+
+/**
+ * Install the surface copy and render target clear hooks into the
+ * context's pipe_context function table.
+ */
+void
+nv50_init_surface_functions(struct nv50_context *nv50)
+{
+	struct pipe_context *pipe = &nv50->pipe;
+
+	pipe->resource_copy_region = nv50_surface_copy;
+	pipe->clear_render_target = nv50_clear_render_target;
+}
+
+
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
new file mode 100644
index 0000000000..5ea0c1d726
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50_context.h"
+#include "nv50_texture.h"
+#include "nv50_resource.h"
+
+#include "nouveau/nouveau_stateobj.h"
+#include "nouveau/nouveau_reloc.h"
+
+#include "util/u_format.h"
+
+#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f)		\
+[PIPE_FORMAT_##pf] = (						\
+	NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 |	\
+	NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 |	\
+	NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 |	\
+	NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 |	\
+	NV50TIC_0_0_FMT_##f)
+
+#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f)
+
+/* TIC word 0 template for each supported pipe format, indexed by
+ * PIPE_FORMAT_*.  Each entry combines the per-component data type, the
+ * per-component source mapping and the hardware storage format.  Formats
+ * without an entry are left as 0 by the designated initializers.
+ */
+static const uint32_t nv50_texture_formats[PIPE_FORMAT_COUNT] =
+{
+	_(B8G8R8A8_UNORM, UNORM, C2, C1, C0, C3,  8_8_8_8),
+	_(B8G8R8A8_SRGB,  UNORM, C2, C1, C0, C3,  8_8_8_8),
+	_(B8G8R8X8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8),
+	_(B8G8R8X8_SRGB,  UNORM, C2, C1, C0, ONE, 8_8_8_8),
+	_(B5G5R5A1_UNORM, UNORM, C2, C1, C0, C3,  1_5_5_5),
+	_(B4G4R4A4_UNORM, UNORM, C2, C1, C0, C3,  4_4_4_4),
+
+	_(B5G6R5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5),
+
+	_(L8_UNORM, UNORM, C0, C0, C0, ONE, 8),
+	_(L8_SRGB,  UNORM, C0, C0, C0, ONE, 8),
+	_(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8),
+	_(I8_UNORM, UNORM, C0, C0, C0, C0, 8),
+
+	_(L8A8_UNORM, UNORM, C0, C0, C0, C1, 8_8),
+	_(L8A8_SRGB,  UNORM, C0, C0, C0, C1, 8_8),
+
+	_(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1),
+	_(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1),
+	_(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3),
+	_(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5),
+
+	_MIXED(S8_USCALED_Z24_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8),
+	_MIXED(Z24_UNORM_S8_USCALED, UNORM, UINT, UINT, UINT, C0, C0, C0, ONE, 8_24),
+
+	/* The component type must match the signedness of the pipe format. */
+	_(R16G16B16A16_SNORM, SNORM, C0, C1, C2, C3, 16_16_16_16),
+	_(R16G16B16A16_UNORM, UNORM, C0, C1, C2, C3, 16_16_16_16),
+	_(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32),
+
+	_(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16),
+	_(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16),
+
+	_MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH)
+};
+
+#undef _
+#undef _MIXED
+
+/**
+ * Translate one PIPE_SWIZZLE_* selector into the hardware component
+ * source value to program for that channel.
+ *
+ * \param tc   TIC word 0 of the format, holding its native MAPR/G/B/A fields
+ * \param swz  PIPE_SWIZZLE_* selector requested by the sampler view
+ * \return the MAP field value that the selected native channel uses,
+ *         7 for PIPE_SWIZZLE_ONE, 0 for PIPE_SWIZZLE_ZERO or anything else
+ */
+static INLINE uint32_t
+nv50_tic_swizzle(uint32_t tc, unsigned swz)
+{
+	uint32_t mask, shift;
+
+	switch (swz) {
+	case PIPE_SWIZZLE_RED:
+		mask = NV50TIC_0_0_MAPR_MASK;
+		shift = NV50TIC_0_0_MAPR_SHIFT;
+		break;
+	case PIPE_SWIZZLE_GREEN:
+		mask = NV50TIC_0_0_MAPG_MASK;
+		shift = NV50TIC_0_0_MAPG_SHIFT;
+		break;
+	case PIPE_SWIZZLE_BLUE:
+		mask = NV50TIC_0_0_MAPB_MASK;
+		shift = NV50TIC_0_0_MAPB_SHIFT;
+		break;
+	case PIPE_SWIZZLE_ALPHA:
+		mask = NV50TIC_0_0_MAPA_MASK;
+		shift = NV50TIC_0_0_MAPA_SHIFT;
+		break;
+	case PIPE_SWIZZLE_ONE:
+		return 7;
+	case PIPE_SWIZZLE_ZERO:
+	default:
+		return 0;
+	}
+
+	/* Extract the native mapping of the selected source channel. */
+	return (tc & mask) >> shift;
+}
+
+/**
+ * Build the texture image control (TIC) words of a sampler view.
+ *
+ * Words 0 and 2..7 of view->tic are written here; word 1 is not touched.
+ * Word 0 comes from nv50_texture_formats[] for the view's format, with the
+ * component mapping re-routed through the view's swizzle.  The sRGB flag in
+ * word 2 is derived from the format of the underlying miptree, not from the
+ * view's format.
+ *
+ * \param view  sampler view to set up; view->pipe must already be filled in
+ * \return TRUE on success, FALSE (after logging an error) if the texture
+ *         target is not 1D, 2D, 3D or CUBE
+ */
+boolean
+nv50_tex_construct(struct nv50_sampler_view *view)
+{
+	const struct util_format_description *desc;
+	struct nv50_miptree *mt = nv50_miptree(view->pipe.texture);
+	uint32_t swz[4], *tic = view->tic;
+
+	tic[0] = nv50_texture_formats[view->pipe.format];
+
+	/* Replace the format's native mapping by the swizzled one. */
+	swz[0] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_r);
+	swz[1] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_g);
+	swz[2] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_b);
+	swz[3] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_a);
+	tic[0] = (tic[0] & ~NV50TIC_0_0_SWIZZLE_MASK) |
+		(swz[0] << NV50TIC_0_0_MAPR_SHIFT) |
+		(swz[1] << NV50TIC_0_0_MAPG_SHIFT) |
+		(swz[2] << NV50TIC_0_0_MAPB_SHIFT) |
+		(swz[3] << NV50TIC_0_0_MAPA_SHIFT);
+
+	/* Low nibble of tile_mode goes to bits 22+, high nibble to bits 25+
+	 * (the TILE_MODE_Y / TILE_MODE_Z fields).
+	 */
+	tic[2] = 0x50001000;
+	tic[2] |= ((mt->base.bo->tile_mode & 0x0f) << 22) |
+		  ((mt->base.bo->tile_mode & 0xf0) << 21);
+
+	desc = util_format_description(mt->base.base.format);
+	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+		tic[2] |= NV50TIC_0_2_COLORSPACE_SRGB;
+
+	switch (mt->base.base.target) {
+	case PIPE_TEXTURE_1D:
+		tic[2] |= NV50TIC_0_2_TARGET_1D;
+		break;
+	case PIPE_TEXTURE_2D:
+		tic[2] |= NV50TIC_0_2_TARGET_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		tic[2] |= NV50TIC_0_2_TARGET_3D;
+		break;
+	case PIPE_TEXTURE_CUBE:
+		tic[2] |= NV50TIC_0_2_TARGET_CUBE;
+		break;
+	default:
+		NOUVEAU_ERR("invalid texture target: %d\n",
+			    mt->base.base.target);
+		return FALSE;
+	}
+
+	tic[3] = 0x00300000;
+
+	/* Unsigned constant: shifting 1 into bit 31 of a signed int is
+	 * undefined behaviour.
+	 */
+	tic[4] = (1U << 31) | mt->base.base.width0;
+	tic[5] = (mt->base.base.last_level << 28) |
+		(mt->base.base.depth0 << 16) | mt->base.base.height0;
+
+	tic[6] = 0x03000000;
+
+	tic[7] = (view->pipe.last_level << 4) | view->pipe.first_level;
+
+	return TRUE;
+}
+
+/**
+ * Append to a state object the commands that upload the TIC entries of all
+ * sampler views bound to program type p and (in)validate their bindings.
+ *
+ * \param nv50  context holding the current and previously emitted bindings
+ * \param so    state object to append to
+ * \param p     program type index
+ * \return always TRUE
+ */
+static int
+nv50_validate_textures(struct nv50_context *nv50, struct nouveau_stateobj *so,
+		       unsigned p)
+{
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	unsigned unit, j;
+
+	/* Reloc flags for the low and high halves of the texture address;
+	 * the high half is OR'd into TIC word 2.
+	 */
+	const unsigned rll = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW;
+	const unsigned rlh = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH
+		| NOUVEAU_BO_OR;
+
+	/* Each program type has room for 32 entries of 8 dwords in the TIC
+	 * buffer; presumably this sets up a SIFC upload to that region.
+	 */
+	nv50_so_init_sifc(nv50, so, nv50->screen->tic, NOUVEAU_BO_VRAM,
+			  p * (32 * 8 * 4), nv50->sampler_view_nr[p] * 8 * 4);
+
+	for (unit = 0; unit < nv50->sampler_view_nr[p]; ++unit) {
+		struct nv50_sampler_view *view =
+			nv50_sampler_view(nv50->sampler_views[p][unit]);
+
+		/* Exactly 8 data dwords follow for every unit. */
+		so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8);
+		if (view) {
+			uint32_t tic2 = view->tic[2];
+			struct nv50_miptree *mt =
+				nv50_miptree(view->pipe.texture);
+
+			/* The coordinate mode comes from the sampler bound
+			 * to the same unit; the result is cached in the view.
+			 */
+			tic2 &= ~NV50TIC_0_2_NORMALIZED_COORDS;
+			if (nv50->sampler[p][unit]->normalized)
+				tic2 |= NV50TIC_0_2_NORMALIZED_COORDS;
+			view->tic[2] = tic2;
+
+			/* word 0, words 1-2 as relocs, then words 3..7 */
+			so_data  (so, view->tic[0]);
+			so_reloc (so, mt->base.bo, 0, rll, 0, 0);
+			so_reloc (so, mt->base.bo, 0, rlh, tic2, tic2);
+			so_datap (so, &view->tic[3], 5);
+
+			/* Set TEX insn $t src binding $unit in program type p
+			 * to TIC, TSC entry (32 * p + unit), mark valid (1).
+			 */
+			so_method(so, tesla, NV50TCL_BIND_TIC(p), 1);
+			so_data  (so, ((32 * p + unit) << 9) | (unit << 1) | 1);
+		} else {
+			/* Empty slot: upload a zeroed entry, unbind the unit. */
+			for (j = 0; j < 8; ++j)
+				so_data(so, 0);
+			so_method(so, tesla, NV50TCL_BIND_TIC(p), 1);
+			so_data  (so, (unit << 1) | 0);
+		}
+	}
+
+	for (; unit < nv50->state.sampler_view_nr[p]; unit++) {
+		/* Make other bindings invalid. */
+		so_method(so, tesla, NV50TCL_BIND_TIC(p), 1);
+		so_data  (so, (unit << 1) | 0);
+	}
+
+	/* Remember how many units are bound for the next validation. */
+	nv50->state.sampler_view_nr[p] = nv50->sampler_view_nr[p];
+	return TRUE;
+}
+
+/**
+ * Emit relocations that patch the texture address stored in the TIC
+ * buffer for every sampler view bound to one program type.
+ *
+ * \param nv50  context holding the bound sampler views
+ * \param prog  program type index
+ */
+static void
+nv50_emit_texture_relocs(struct nv50_context *nv50, int prog)
+{
+	const unsigned access = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+	struct nouveau_bo *tic_bo = nv50->screen->tic;
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	int i;
+
+	for (i = 0; i < nv50->sampler_view_nr[prog]; i++) {
+		struct nv50_sampler_view *view =
+			nv50_sampler_view(nv50->sampler_views[prog][i]);
+		struct nouveau_bo *tex_bo;
+		unsigned entry;
+
+		if (view == NULL)
+			continue;
+
+		/* byte offset of this unit's 32-byte TIC entry */
+		entry = ((prog * 32) + i) * 32;
+		tex_bo = nv50_miptree(view->pipe.texture)->base.bo;
+
+		/* word 1: low address bits */
+		nouveau_reloc_emit(chan, tic_bo, entry + 4, NULL, tex_bo, 0, 0,
+				   access | NOUVEAU_BO_LOW, 0, 0);
+		/* word 2: high address bits, passed along with tic[2] */
+		nouveau_reloc_emit(chan, tic_bo, entry + 8, NULL, tex_bo, 0, 0,
+				   access | NOUVEAU_BO_HIGH,
+				   view->tic[2], view->tic[2]);
+	}
+}
+
+/**
+ * Emit texture relocations for the fragment program's sampler views,
+ * then for the vertex program's.
+ */
+void
+nv50_tex_relocs(struct nv50_context *nv50)
+{
+	static const int progs[] = { 2 /* FP */, 0 /* VP */ };
+	unsigned i;
+
+	for (i = 0; i < sizeof(progs) / sizeof(progs[0]); ++i)
+		nv50_emit_texture_relocs(nv50, progs[i]);
+}
+
+/**
+ * Build a state object that uploads and binds the sampler views of
+ * program types 0 and 2, followed by a TIC flush.
+ *
+ * \return the new state object, or NULL (after logging an error and
+ *         releasing the object) if validation of either type fails
+ */
+struct nouveau_stateobj *
+nv50_tex_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so;
+	unsigned nr_touched = 0, nr_bound = 0;
+	unsigned p;
+
+	/* Size the state object from the number of units that are bound
+	 * now or were bound at the previous validation.
+	 */
+	for (p = 0; p < 3; ++p) {
+		nr_touched += MAX2(nv50->sampler_view_nr[p],
+				   nv50->state.sampler_view_nr[p]);
+		nr_bound += nv50->sampler_view_nr[p];
+	}
+
+	so = so_new(nr_touched * 2 + 3 * 4 + 1,
+		    nr_touched * 9 + 3 * 19 + 1,
+		    nr_bound * 2 + 3 * 2);
+
+	if (!nv50_validate_textures(nv50, so, 0) ||
+	    !nv50_validate_textures(nv50, so, 2)) {
+		so_ref(NULL, &so);
+
+		NOUVEAU_ERR("failed tex validate\n");
+		return NULL;
+	}
+
+	so_method(so, tesla, 0x1330, 1); /* flush TIC */
+	so_data  (so, 0);
+
+	return so;
+}
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
new file mode 100644
index 0000000000..3475d3e432
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -0,0 +1,188 @@
+#ifndef __NV50_TEXTURE_H__
+#define __NV50_TEXTURE_H__
+
+/* It would be really nice to have these in nouveau_class.h, generated by
+ * renouveau like the rest of the object header - but it is not clear that
+ * renouveau can handle non-object definitions nicely. Needs looking into.
+ */
+
+/* Texture image control block */
+#define NV50TIC_0_0_SWIZZLE_MASK                                  0x3ffc0000
+#define NV50TIC_0_0_MAPA_MASK                                     0x38000000
+#define NV50TIC_0_0_MAPA_SHIFT                                            27
+#define NV50TIC_0_0_MAPA_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPA_C0                                       0x10000000
+#define NV50TIC_0_0_MAPA_C1                                       0x18000000
+#define NV50TIC_0_0_MAPA_C2                                       0x20000000
+#define NV50TIC_0_0_MAPA_C3                                       0x28000000
+#define NV50TIC_0_0_MAPA_ONE                                      0x38000000
+#define NV50TIC_0_0_MAPB_MASK                                     0x07000000
+#define NV50TIC_0_0_MAPB_SHIFT                                            24
+#define NV50TIC_0_0_MAPB_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPB_C0                                       0x02000000
+#define NV50TIC_0_0_MAPB_C1                                       0x03000000
+#define NV50TIC_0_0_MAPB_C2                                       0x04000000
+#define NV50TIC_0_0_MAPB_C3                                       0x05000000
+#define NV50TIC_0_0_MAPB_ONE                                      0x07000000
+#define NV50TIC_0_0_MAPG_MASK                                     0x00e00000
+#define NV50TIC_0_0_MAPG_SHIFT                                            21
+#define NV50TIC_0_0_MAPG_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPG_C0                                       0x00400000
+#define NV50TIC_0_0_MAPG_C1                                       0x00600000
+#define NV50TIC_0_0_MAPG_C2                                       0x00800000
+#define NV50TIC_0_0_MAPG_C3                                       0x00a00000
+#define NV50TIC_0_0_MAPG_ONE                                      0x00e00000
+#define NV50TIC_0_0_MAPR_MASK                                     0x001c0000
+#define NV50TIC_0_0_MAPR_SHIFT                                            18
+#define NV50TIC_0_0_MAPR_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPR_C0                                       0x00080000
+#define NV50TIC_0_0_MAPR_C1                                       0x000c0000
+#define NV50TIC_0_0_MAPR_C2                                       0x00100000
+#define NV50TIC_0_0_MAPR_C3                                       0x00140000
+#define NV50TIC_0_0_MAPR_ONE                                      0x001c0000
+#define NV50TIC_0_0_TYPEA_MASK                                    0x00038000
+#define NV50TIC_0_0_TYPEA_UNORM                                   0x00010000
+#define NV50TIC_0_0_TYPEA_SNORM                                   0x00008000
+#define NV50TIC_0_0_TYPEA_SINT                                    0x00018000
+#define NV50TIC_0_0_TYPEA_UINT                                    0x00020000
+#define NV50TIC_0_0_TYPEA_FLOAT                                   0x00038000
+#define NV50TIC_0_0_TYPEB_MASK                                    0x00007000
+#define NV50TIC_0_0_TYPEB_UNORM                                   0x00002000
+#define NV50TIC_0_0_TYPEB_SNORM                                   0x00001000
+#define NV50TIC_0_0_TYPEB_SINT                                    0x00003000
+#define NV50TIC_0_0_TYPEB_UINT                                    0x00004000
+#define NV50TIC_0_0_TYPEB_FLOAT                                   0x00007000
+#define NV50TIC_0_0_TYPEG_MASK                                    0x00000e00
+#define NV50TIC_0_0_TYPEG_UNORM                                   0x00000400
+#define NV50TIC_0_0_TYPEG_SNORM                                   0x00000200
+#define NV50TIC_0_0_TYPEG_SINT                                    0x00000600
+#define NV50TIC_0_0_TYPEG_UINT                                    0x00000800
+#define NV50TIC_0_0_TYPEG_FLOAT                                   0x00000e00
+#define NV50TIC_0_0_TYPER_MASK                                    0x000001c0
+#define NV50TIC_0_0_TYPER_UNORM                                   0x00000080
+#define NV50TIC_0_0_TYPER_SNORM                                   0x00000040
+#define NV50TIC_0_0_TYPER_SINT                                    0x000000c0
+#define NV50TIC_0_0_TYPER_UINT                                    0x00000100
+#define NV50TIC_0_0_TYPER_FLOAT                                   0x000001c0
+#define NV50TIC_0_0_FMT_MASK                                      0x0000003f
+#define NV50TIC_0_0_FMT_32_32_32_32                               0x00000001
+#define NV50TIC_0_0_FMT_16_16_16_16                               0x00000003
+#define NV50TIC_0_0_FMT_32_32                                     0x00000004
+#define NV50TIC_0_0_FMT_8_8_8_8                                   0x00000008
+#define NV50TIC_0_0_FMT_2_10_10_10                                0x00000009
+#define NV50TIC_0_0_FMT_16_16                                     0x0000000c
+#define NV50TIC_0_0_FMT_32                                        0x0000000f
+#define NV50TIC_0_0_FMT_4_4_4_4                                   0x00000012
+/* #define NV50TIC_0_0_FMT_1_5_5_5                                0x00000013 */
+#define NV50TIC_0_0_FMT_1_5_5_5                                   0x00000014
+#define NV50TIC_0_0_FMT_5_6_5                                     0x00000015
+#define NV50TIC_0_0_FMT_8_8                                       0x00000018
+#define NV50TIC_0_0_FMT_16                                        0x0000001b
+#define NV50TIC_0_0_FMT_8                                         0x0000001d
+#define NV50TIC_0_0_FMT_5_9_9_9                                   0x00000020
+#define NV50TIC_0_0_FMT_10_11_11                                  0x00000021
+#define NV50TIC_0_0_FMT_DXT1                                      0x00000024
+#define NV50TIC_0_0_FMT_DXT3                                      0x00000025
+#define NV50TIC_0_0_FMT_DXT5                                      0x00000026
+#define NV50TIC_0_0_FMT_RGTC1                                     0x00000027
+#define NV50TIC_0_0_FMT_RGTC2                                     0x00000028
+#define NV50TIC_0_0_FMT_24_8                                      0x00000029
+#define NV50TIC_0_0_FMT_8_24                                      0x0000002a
+#define NV50TIC_0_0_FMT_32_DEPTH                                  0x0000002f
+#define NV50TIC_0_0_FMT_32_8                                      0x00000030
+
+#define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff
+#define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0
+
+#define NV50TIC_0_2_COLORSPACE_SRGB                               0x00000400
+#define NV50TIC_0_2_TARGET_1D                                     0x00000000
+#define NV50TIC_0_2_TARGET_2D                                     0x00004000
+#define NV50TIC_0_2_TARGET_3D                                     0x00008000
+#define NV50TIC_0_2_TARGET_CUBE                                   0x0000c000
+#define NV50TIC_0_2_TARGET_1D_ARRAY                               0x00010000
+#define NV50TIC_0_2_TARGET_2D_ARRAY                               0x00014000
+#define NV50TIC_0_2_TARGET_BUFFER                                 0x00018000
+#define NV50TIC_0_2_TARGET_RECT                                   0x0001c000
+/* #define NV50TIC_0_0_TILE_MODE_LINEAR                           0x00040000 */
+#define NV50TIC_0_2_TILE_MODE_Y_MASK                              0x01c00000
+#define NV50TIC_0_2_TILE_MODE_Y_SHIFT                                     22
+#define NV50TIC_0_2_TILE_MODE_Z_MASK                              0x0e000000
+#define NV50TIC_0_2_TILE_MODE_Z_SHIFT                                     25
+#define NV50TIC_0_2_NORMALIZED_COORDS                             0x80000000
+
+#define NV50TIC_0_3_UNKNOWN_MASK                                  0xffffffff
+
+#define NV50TIC_0_4_WIDTH_MASK                                    0x0000ffff
+#define NV50TIC_0_4_WIDTH_SHIFT                                            0
+
+#define NV50TIC_0_5_LAST_LEVEL_MASK                               0xf0000000
+#define NV50TIC_0_5_LAST_LEVEL_SHIFT                                      28
+#define NV50TIC_0_5_DEPTH_MASK                                    0x0fff0000
+#define NV50TIC_0_5_DEPTH_SHIFT                                           16
+#define NV50TIC_0_5_HEIGHT_MASK                                   0x0000ffff
+#define NV50TIC_0_5_HEIGHT_SHIFT                                           0
+#define NV50TIC_0_6_UNKNOWN_MASK                                  0xffffffff
+
+#define NV50TIC_0_7_BASE_LEVEL_MASK                               0x0000000f
+#define NV50TIC_0_7_BASE_LEVEL_SHIFT                                       0
+#define NV50TIC_0_7_MAX_LEVEL_MASK                                0x000000f0
+#define NV50TIC_0_7_MAX_LEVEL_SHIFT                                        4
+
+/* Texture sampler control block */
+#define NV50TSC_1_0_WRAPS_MASK                                   0x00000007
+#define NV50TSC_1_0_WRAPS_REPEAT                                 0x00000000
+#define NV50TSC_1_0_WRAPS_MIRROR_REPEAT                          0x00000001
+#define NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE                          0x00000002
+#define NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER                        0x00000003
+#define NV50TSC_1_0_WRAPS_CLAMP                                  0x00000004
+#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE                   0x00000005
+#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER                 0x00000006
+#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP                           0x00000007
+#define NV50TSC_1_0_WRAPT_MASK                                   0x00000038
+#define NV50TSC_1_0_WRAPT_REPEAT                                 0x00000000
+#define NV50TSC_1_0_WRAPT_MIRROR_REPEAT                          0x00000008
+#define NV50TSC_1_0_WRAPT_CLAMP_TO_EDGE                          0x00000010
+#define NV50TSC_1_0_WRAPT_CLAMP_TO_BORDER                        0x00000018
+#define NV50TSC_1_0_WRAPT_CLAMP                                  0x00000020
+#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_EDGE                   0x00000028
+#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_BORDER                 0x00000030
+#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP                           0x00000038
+#define NV50TSC_1_0_WRAPR_MASK                                   0x000001c0
+#define NV50TSC_1_0_WRAPR_REPEAT                                 0x00000000
+#define NV50TSC_1_0_WRAPR_MIRROR_REPEAT                          0x00000040
+#define NV50TSC_1_0_WRAPR_CLAMP_TO_EDGE                          0x00000080
+#define NV50TSC_1_0_WRAPR_CLAMP_TO_BORDER                        0x000000c0
+#define NV50TSC_1_0_WRAPR_CLAMP                                  0x00000100
+#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_EDGE                   0x00000140
+#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_BORDER                 0x00000180
+#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP                           0x000001c0
+#define NV50TSC_1_0_MAX_ANISOTROPY_MASK                          0x00700000
+
+#define NV50TSC_1_1_MAGF_MASK                                    0x00000003
+#define NV50TSC_1_1_MAGF_NEAREST                                 0x00000001
+#define NV50TSC_1_1_MAGF_LINEAR                                  0x00000002
+#define NV50TSC_1_1_MINF_MASK                                    0x00000030
+#define NV50TSC_1_1_MINF_NEAREST                                 0x00000010
+#define NV50TSC_1_1_MINF_LINEAR                                  0x00000020
+#define NV50TSC_1_1_MIPF_MASK                                    0x000000c0
+#define NV50TSC_1_1_MIPF_NONE                                    0x00000040
+#define NV50TSC_1_1_MIPF_NEAREST                                 0x00000080
+#define NV50TSC_1_1_MIPF_LINEAR                                  0x000000c0
+#define NV50TSC_1_1_LOD_BIAS_MASK                                0x01fff000
+#define NV50TSC_1_1_UNKN_ANISO_15                                0x10000000
+#define NV50TSC_1_1_UNKN_ANISO_35                                0x18000000
+
+#define NV50TSC_1_2_MIN_LOD_MASK                                 0x00000f00
+#define NV50TSC_1_2_MAX_LOD_MASK                                 0x00f00000
+
+#define NV50TSC_1_3_UNKNOWN_MASK                                 0xffffffff
+
+#define NV50TSC_1_4_BORDER_COLOR_RED_MASK                        0xffffffff
+
+#define NV50TSC_1_5_BORDER_COLOR_GREEN_MASK                      0xffffffff
+
+#define NV50TSC_1_6_BORDER_COLOR_BLUE_MASK                       0xffffffff
+
+#define NV50TSC_1_7_BORDER_COLOR_ALPHA_MASK                      0xffffffff
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
new file mode 100644
index 0000000000..f973cf24b9
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -0,0 +1,348 @@
+
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+
+#include "nv50_context.h"
+#include "nv50_transfer.h"
+#include "nv50_resource.h"
+
+/* Driver-private transfer object: a staging buffer plus the layout of the
+ * mip level it shadows.  "base" must stay the first member because
+ * pipe_transfer pointers are cast directly to nv50_transfer.
+ */
+struct nv50_transfer {
+	struct pipe_transfer base;
+	struct nouveau_bo *bo;	/* staging buffer, sized for the whole level */
+	int map_refcnt;		/* outstanding transfer_map calls */
+	unsigned level_offset;	/* offset of the level/face in the miptree bo */
+	unsigned level_tiling;	/* tile mode of the level */
+	int level_pitch;	/* pitch of the level */
+	int level_width;	/* minified level size, from width0/height0/depth0 */
+	int level_height;
+	int level_depth;
+	int level_x;		/* origin of the box: x/y in blocks, z as given */
+	int level_y;
+	int level_z;
+	unsigned nblocksx;	/* size of the whole level in format blocks */
+	unsigned nblocksy;
+};
+
+/**
+ * Copy a rectangle between two buffer objects with the M2MF engine.
+ *
+ * Either side may be linear (bo->tile_flags == 0) or tiled.  For a linear
+ * surface the start position is folded into the byte offset; for a tiled
+ * one the surface dimensions are programmed and the position is given
+ * through the TILING_POSITION method.
+ *
+ * \param src_bo, src_offset, src_pitch, src_tile_mode  source surface
+ * \param sx, sy, sz   start position in the source (x/y scaled by cpp)
+ * \param sw, sh, sd   source surface dimensions, used only when tiled
+ * \param dst_bo, dst_offset, dst_pitch, dst_tile_mode  destination surface
+ * \param dx, dy, dz   start position in the destination
+ * \param dw, dh, dd   destination surface dimensions, used only when tiled
+ * \param cpp          bytes per x unit
+ * \param width        x units per line to copy
+ * \param height       number of lines to copy
+ * \param src_reloc    reloc flags for the source (NOUVEAU_BO_RD is added)
+ * \param dst_reloc    reloc flags for the destination (NOUVEAU_BO_WR added)
+ */
+static void
+nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
+			struct nouveau_bo *src_bo, unsigned src_offset,
+			int src_pitch, unsigned src_tile_mode,
+			int sx, int sy, int sz, int sw, int sh, int sd,
+			struct nouveau_bo *dst_bo, unsigned dst_offset,
+			int dst_pitch, unsigned dst_tile_mode,
+			int dx, int dy, int dz, int dw, int dh, int dd,
+			int cpp, int width, int height,
+			unsigned src_reloc, unsigned dst_reloc)
+{
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	struct nouveau_channel *chan = screen->m2mf->channel;
+	struct nouveau_grobj *m2mf = screen->m2mf;
+
+	src_reloc |= NOUVEAU_BO_RD;
+	dst_reloc |= NOUVEAU_BO_WR;
+
+	/* Space for the layout setup of both surfaces below. */
+	WAIT_RING (chan, 14);
+
+	if (!src_bo->tile_flags) {
+		/* Linear source: pitch only, position goes into the offset. */
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 1);
+		OUT_RING  (chan, 1);
+		BEGIN_RING(chan, m2mf,
+			NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_IN, 1);
+		OUT_RING  (chan, src_pitch);
+		src_offset += (sy * src_pitch) + (sx * cpp);
+	} else {
+		/* Tiled source: tile mode, width in bytes, height, depth
+		 * and the z slice to read.
+		 */
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 6);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, src_tile_mode << 4);
+		OUT_RING  (chan, sw * cpp);
+		OUT_RING  (chan, sh);
+		OUT_RING  (chan, sd);
+		OUT_RING  (chan, sz); /* copying only 1 zslice per call */
+	}
+
+	if (!dst_bo->tile_flags) {
+		/* Linear destination, handled like the linear source. */
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 1);
+		OUT_RING  (chan, 1);
+		BEGIN_RING(chan, m2mf,
+			NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT, 1);
+		OUT_RING  (chan, dst_pitch);
+		dst_offset += (dy * dst_pitch) + (dx * cpp);
+	} else {
+		/* Tiled destination, handled like the tiled source. */
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 6);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, dst_tile_mode << 4);
+		OUT_RING  (chan, dw * cpp);
+		OUT_RING  (chan, dh);
+		OUT_RING  (chan, dd);
+		OUT_RING  (chan, dz); /* copying only 1 zslice per call */
+	}
+
+	/* The copy is issued in slices of at most 2047 lines. */
+	while (height) {
+		int line_count = height > 2047 ? 2047 : height;
+
+		/* NOTE(review): the return value of MARK_RING is ignored. */
+		MARK_RING (chan, 15, 4); /* flush on lack of space or relocs */
+		BEGIN_RING(chan, m2mf,
+			NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH, 2);
+		OUT_RELOCh(chan, src_bo, src_offset, src_reloc);
+		OUT_RELOCh(chan, dst_bo, dst_offset, dst_reloc);
+		BEGIN_RING(chan, m2mf,
+			NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
+		OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
+		OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
+		if (src_bo->tile_flags) {
+			/* Tiled: the position is sent explicitly each slice. */
+			BEGIN_RING(chan, m2mf,
+				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1);
+			OUT_RING  (chan, (sy << 16) | (sx * cpp));
+		} else {
+			/* Linear: advance the offset past this slice. */
+			src_offset += (line_count * src_pitch);
+		}
+		if (dst_bo->tile_flags) {
+			BEGIN_RING(chan, m2mf,
+				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1);
+			OUT_RING  (chan, (dy << 16) | (dx * cpp));
+		} else {
+			dst_offset += (line_count * dst_pitch);
+		}
+		/* Line length in bytes, line count, then two further words
+		 * (presumably format and buffer-notify - TODO confirm).
+		 */
+		BEGIN_RING(chan, m2mf,
+			NV04_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN, 4);
+		OUT_RING  (chan, width * cpp);
+		OUT_RING  (chan, line_count);
+		OUT_RING  (chan, 0x00000101);
+		OUT_RING  (chan, 0);
+		FIRE_RING (chan);
+
+		height -= line_count;
+		sy += line_count;
+		dy += line_count;
+	}
+}
+
+/**
+ * Create a transfer object for a box of one mip level (and cube face).
+ *
+ * A staging buffer large enough for the whole level is allocated in GART
+ * memory.  If the transfer is readable, the box is copied from the miptree
+ * into the staging buffer immediately.
+ *
+ * \param pcontext  context the transfer belongs to
+ * \param pt        resource to transfer from/to
+ * \param sr        level (and, for cube maps, face) to access
+ * \param usage     PIPE_TRANSFER_* flags
+ * \param box       region to access; box->depth must be 1
+ * \return the new transfer, or NULL if allocation fails
+ */
+struct pipe_transfer *
+nv50_miptree_transfer_new(struct pipe_context *pcontext,
+			  struct pipe_resource *pt,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box)
+{
+	struct pipe_screen *pscreen = pcontext->screen;
+	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+	struct nv50_miptree *mt = nv50_miptree(pt);
+	struct nv50_miptree_level *lvl = &mt->level[sr.level];
+	struct nv50_transfer *tx;
+	unsigned nx, ny, image = 0;
+	int ret;
+
+	if (pt->target == PIPE_TEXTURE_CUBE)
+		image = sr.face;
+
+	tx = CALLOC_STRUCT(nv50_transfer);
+	if (!tx)
+		return NULL;
+
+	/* Don't handle 3D transfers yet.
+	 */
+	assert(box->depth == 1);
+
+	pipe_resource_reference(&tx->base.resource, pt);
+	tx->base.sr = sr;
+	tx->base.usage = usage;
+	tx->base.box = *box;
+	tx->nblocksx = util_format_get_nblocksx(pt->format, u_minify(pt->width0, sr.level));
+	tx->nblocksy = util_format_get_nblocksy(pt->format, u_minify(pt->height0, sr.level));
+	tx->base.stride = tx->nblocksx * util_format_get_blocksize(pt->format);
+
+	tx->level_pitch = lvl->pitch;
+	tx->level_width = u_minify(mt->base.base.width0, sr.level);
+	tx->level_height = u_minify(mt->base.base.height0, sr.level);
+	tx->level_depth = u_minify(mt->base.base.depth0, sr.level);
+	tx->level_offset = lvl->image_offset[image];
+	tx->level_tiling = lvl->tile_mode;
+	tx->level_z = box->z;
+	/* The copy engine works in format blocks, not pixels. */
+	tx->level_x = util_format_get_nblocksx(pt->format, box->x);
+	tx->level_y = util_format_get_nblocksy(pt->format, box->y);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+			     tx->nblocksy * tx->base.stride, &tx->bo);
+	if (ret) {
+		/* Drop the resource reference taken above before freeing. */
+		pipe_resource_reference(&tx->base.resource, NULL);
+		FREE(tx);
+		return NULL;
+	}
+
+	if (usage & PIPE_TRANSFER_READ) {
+		nx = util_format_get_nblocksx(pt->format, box->width);
+		ny = util_format_get_nblocksy(pt->format, box->height);
+
+		/* Use the block coordinates of the box, matching the
+		 * write-back in nv50_miptree_transfer_del(); pixel
+		 * coordinates would be wrong for compressed formats.
+		 */
+		nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset,
+					tx->level_pitch, tx->level_tiling,
+					tx->level_x, tx->level_y, tx->level_z,
+					tx->nblocksx, tx->nblocksy,
+					tx->level_depth,
+					tx->bo, 0,
+					tx->base.stride, tx->bo->tile_mode,
+					0, 0, 0,
+					tx->nblocksx, tx->nblocksy, 1,
+					util_format_get_blocksize(pt->format), nx, ny,
+					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART,
+					NOUVEAU_BO_GART);
+	}
+
+	return &tx->base;
+}
+
+/**
+ * Destroy a transfer.  If it was writable, the box is first copied from
+ * the staging buffer back into the miptree; afterwards the staging
+ * buffer, the resource reference and the transfer itself are released.
+ */
+void
+nv50_miptree_transfer_del(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx)
+{
+	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
+
+	if (ptx->usage & PIPE_TRANSFER_WRITE) {
+		struct pipe_resource *res = ptx->resource;
+		struct nv50_miptree *mt = nv50_miptree(res);
+		/* size of the box in format blocks */
+		const unsigned nx =
+			util_format_get_nblocksx(res->format, tx->base.box.width);
+		const unsigned ny =
+			util_format_get_nblocksy(res->format, tx->base.box.height);
+
+		/* staging buffer (origin 0,0,0) -> box position in the level */
+		nv50_transfer_rect_m2mf(pcontext->screen,
+					tx->bo, 0,
+					tx->base.stride, tx->bo->tile_mode,
+					0, 0, 0,
+					tx->nblocksx, tx->nblocksy, 1,
+					mt->base.bo, tx->level_offset,
+					tx->level_pitch, tx->level_tiling,
+					tx->level_x, tx->level_y, tx->level_z,
+					tx->nblocksx, tx->nblocksy,
+					tx->level_depth,
+					util_format_get_blocksize(res->format),
+					nx, ny,
+					NOUVEAU_BO_GART,
+					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART);
+	}
+
+	nouveau_bo_ref(NULL, &tx->bo);
+	pipe_resource_reference(&ptx->resource, NULL);
+	FREE(ptx);
+}
+
+/**
+ * Map the staging buffer of a transfer.  Mappings are reference counted:
+ * only the first call maps the buffer, later calls return the existing
+ * pointer.
+ *
+ * \return the CPU address of the staging buffer, or NULL if mapping fails
+ */
+void *
+nv50_miptree_transfer_map(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx)
+{
+	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
+	unsigned access;
+
+	/* Already mapped: just take another reference. */
+	if (tx->map_refcnt != 0) {
+		tx->map_refcnt++;
+		return tx->bo->map;
+	}
+
+	access  = (ptx->usage & PIPE_TRANSFER_WRITE) ? NOUVEAU_BO_WR : 0;
+	access |= (ptx->usage & PIPE_TRANSFER_READ) ? NOUVEAU_BO_RD : 0;
+
+	/* On failure the count stays at zero. */
+	if (nouveau_bo_map(tx->bo, access))
+		return NULL;
+
+	tx->map_refcnt = 1;
+	return tx->bo->map;
+}
+
+/**
+ * Drop one mapping reference of a transfer; the staging buffer is
+ * unmapped when the count reaches zero.
+ */
+void
+nv50_miptree_transfer_unmap(struct pipe_context *pcontext,
+			    struct pipe_transfer *ptx)
+{
+	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
+
+	tx->map_refcnt--;
+	if (tx->map_refcnt == 0)
+		nouveau_bo_unmap(tx->bo);
+}
+
+
+/**
+ * Upload a rectangle of CPU data into a buffer object by pushing the
+ * pixels through the command stream (2D engine SIFC methods).
+ *
+ * \param nv50        context providing the channel and 2D engine object
+ * \param bo          destination buffer; tiled if bo->tile_flags is set
+ * \param dst_offset  offset of the destination surface inside bo
+ * \param reloc       reloc flags for bo (NOUVEAU_BO_WR is added)
+ * \param dst_format  2D engine format of the destination
+ * \param dst_w       destination surface width
+ * \param dst_h       destination surface height
+ * \param dst_pitch   destination pitch, used only for linear surfaces
+ * \param src         CPU pointer to the first source line
+ * \param src_format  2D engine format of the source data
+ * \param src_pitch   byte distance between source lines
+ * \param x, y        destination position of the rectangle
+ * \param w, h        size of the rectangle
+ * \param cpp         bytes per pixel of the source data
+ */
+void
+nv50_upload_sifc(struct nv50_context *nv50,
+		 struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc,
+		 unsigned dst_format, int dst_w, int dst_h, int dst_pitch,
+		 void *src, unsigned src_format, int src_pitch,
+		 int x, int y, int w, int h, int cpp)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+	/* Each line is pushed as whole dwords, rounded up. */
+	unsigned line_dwords = (w * cpp + 3) / 4;
+
+	reloc |= NOUVEAU_BO_WR;
+
+	MARK_RING (chan, 32, 2); /* flush on lack of space or relocs */
+
+	if (bo->tile_flags) {
+		/* Tiled destination: format, tile mode and two more words. */
+		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5);
+		OUT_RING  (chan, dst_format);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, bo->tile_mode << 4);
+		OUT_RING  (chan, 1);
+		OUT_RING  (chan, 0);
+	} else {
+		/* Linear destination: format and pitch. */
+		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2);
+		OUT_RING  (chan, dst_format);
+		OUT_RING  (chan, 1);
+		BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1);
+		OUT_RING  (chan, dst_pitch);
+	}
+
+	BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 4);
+	OUT_RING  (chan, dst_w);
+	OUT_RING  (chan, dst_h);
+	OUT_RELOCh(chan, bo, dst_offset, reloc);
+	OUT_RELOCl(chan, bo, dst_offset, reloc);
+
+	/* NV50_2D_OPERATION_SRCCOPY assumed already set */
+
+	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, src_format);
+	/* Source size, four constant words (presumably 1:1 scaling factors
+	 * - TODO confirm), then the destination x and y.
+	 */
+	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10);
+	OUT_RING  (chan, w);
+	OUT_RING  (chan, h);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, x);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, y);
+
+	while (h--) {
+		const uint32_t *p = src;
+		unsigned count = line_dwords;
+
+		/* Push the line in chunks of at most 1792 dwords. */
+		while (count) {
+			unsigned nr = MIN2(count, 1792);
+
+			if (AVAIL_RING(chan) <= nr) {
+				FIRE_RING (chan);
+
+				/* Re-emit the destination address relocs
+				 * after submitting the buffer.
+				 */
+				BEGIN_RING(chan, eng2d,
+					   NV50_2D_DST_ADDRESS_HIGH, 2);
+				OUT_RELOCh(chan, bo, dst_offset, reloc);
+				OUT_RELOCl(chan, bo, dst_offset, reloc);
+			}
+			assert(AVAIL_RING(chan) > nr);
+
+			BEGIN_RING(chan, eng2d,
+				   NV50_2D_SIFC_DATA | (2 << 29), nr);
+			OUT_RINGp (chan, p, nr);
+
+			p += nr;
+			count -= nr;
+		}
+
+		src = (uint8_t *) src + src_pitch;
+	}
+}
diff --git a/src/gallium/drivers/nv50/nv50_transfer.h b/src/gallium/drivers/nv50/nv50_transfer.h
new file mode 100644
index 0000000000..663503547c
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_transfer.h
@@ -0,0 +1,31 @@
+
+#ifndef NV50_TRANSFER_H
+#define NV50_TRANSFER_H
+
+#include "pipe/p_state.h"
+
+/*
+ * Forward declarations for the types that are only used through pointers
+ * in the prototypes below.  Without them a struct tag first seen inside a
+ * parameter list only has prototype scope, which makes the declaration
+ * incompatible with the real definition when this header is included first.
+ */
+struct nouveau_bo;
+struct nv50_context;
+struct pipe_context;
+
+/* Create a transfer object for the region 'box' of subresource 'sr' of 'pt'. */
+struct pipe_transfer *
+nv50_miptree_transfer_new(struct pipe_context *pcontext,
+			  struct pipe_resource *pt,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box);
+
+/* Destroy a transfer object created by nv50_miptree_transfer_new(). */
+void
+nv50_miptree_transfer_del(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx);
+
+/* Map a transfer for CPU access; returns the CPU pointer. */
+void *
+nv50_miptree_transfer_map(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx);
+
+/* Undo nv50_miptree_transfer_map(). */
+void
+nv50_miptree_transfer_unmap(struct pipe_context *pcontext,
+			    struct pipe_transfer *ptx);
+
+/*
+ * Upload 'h' rows of 'w' pixels ('cpp' bytes each) from the CPU pointer
+ * 'src' (rows 'src_pitch' bytes apart) into 'bo' at position (x, y), by
+ * pushing the pixel data inline through the 2D engine (SIFC).
+ */
+extern void
+nv50_upload_sifc(struct nv50_context *nv50,
+		 struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc,
+		 unsigned dst_format, int dst_w, int dst_h, int dst_pitch,
+		 void *src, unsigned src_format, int src_pitch,
+		 int x, int y, int w, int h, int cpp);
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
new file mode 100644
index 0000000000..864cb09352
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -0,0 +1,642 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nouveau/nouveau_util.h"
+#include "nv50_context.h"
+#include "nv50_resource.h"
+
+/*
+ * Map the numeric type of a format's first channel to the hardware
+ * vertex attribute type bits.  Returns 0 for types that have no mapping.
+ */
+static INLINE uint32_t
+nv50_vbo_type_to_hw(enum pipe_format format)
+{
+	const struct util_format_description *desc =
+		util_format_description(format);
+
+	assert(desc);
+
+	if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)
+		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT;
+
+	if (desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+		return desc->channel[0].normalized ?
+			NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM :
+			NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED;
+
+	if (desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+		return desc->channel[0].normalized ?
+			NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM :
+			NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED;
+
+	/*
+	 * Pure integer types are not handled yet:
+	 *   PIPE_FORMAT_TYPE_UINT -> NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT
+	 *   PIPE_FORMAT_TYPE_SINT -> NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT
+	 */
+	return 0;
+}
+
+/*
+ * Map (bits per component, number of components) to the hardware vertex
+ * attribute size bits.
+ *
+ * size: width of one component in bits; only 8, 16 and 32 are supported.
+ * nr_c: number of components, 1..4.
+ *
+ * Returns 0 when the combination has no hardware encoding.
+ */
+static INLINE uint32_t
+nv50_vbo_size_to_hw(unsigned size, unsigned nr_c)
+{
+	/* indexed by (size / 2) + (nr_c - 1): rows at 4, 8 and 16 */
+	static const uint32_t hw_values[] = {
+		0, 0, 0, 0,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16,
+		0, 0, 0, 0,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 };
+
+	/* we'd also have R11G11B10 and R10G10B10A2 */
+
+	assert(nr_c > 0 && nr_c <= 4);
+
+	/* keep the table index in bounds even when asserts are compiled out */
+	if (nr_c < 1 || nr_c > 4)
+		return 0;
+
+	/*
+	 * Only 8, 16 and 32 bit components have a row in the table; any other
+	 * width (e.g. 5 or 10 bits) would alias into a neighbouring row and
+	 * silently select the wrong hardware format.
+	 */
+	if (size != 8 && size != 16 && size != 32)
+		return 0;
+	size >>= (3 - 2);
+
+	return hw_values[size + (nr_c - 1)];
+}
+
+/*
+ * Translate a vertex element's source format into the hardware vertex
+ * attribute word (type bits | size bits).
+ *
+ * Aborts after logging an error when the format has no hardware type or
+ * size encoding.
+ */
+static INLINE uint32_t
+nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
+{
+	uint32_t hw_type, hw_size;
+	enum pipe_format pf = ve->src_format;
+	const struct util_format_description *desc;
+	unsigned size, nr_components;
+
+	desc = util_format_description(pf);
+	assert(desc);
+
+	size = util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0);
+	nr_components = util_format_get_nr_components(pf);
+
+	hw_type = nv50_vbo_type_to_hw(pf);
+	hw_size = nv50_vbo_size_to_hw(size, nr_components);
+
+	if (!hw_type || !hw_size) {
+		NOUVEAU_ERR("unsupported vbo format: %s\n", util_format_name(pf));
+		abort();
+		return 0x24e80000;
+	}
+
+	/*
+	 * Use an unsigned constant: shifting 1 into the sign bit of a signed
+	 * int is undefined behaviour.
+	 */
+	if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_Z) /* BGRA */
+		hw_size |= (1u << 31); /* no real swizzle bits :-( */
+
+	return (hw_type | hw_size);
+}
+
+/*
+ * Per vertex element bookkeeping used while emitting instanced draws.
+ * Only filled in by instance_init() for elements with a non-zero divisor.
+ */
+struct instance {
+	struct nouveau_bo *bo;	/* buffer object backing the vertex buffer */
+	unsigned delta;		/* current start offset of the element in bo */
+	unsigned stride;	/* vertex buffer stride, added to delta on advance */
+	unsigned step;		/* instances drawn since delta last advanced */
+	unsigned divisor;	/* instance divisor; 0 = element is not instanced */
+};
+
+/*
+ * Fill the per-element instancing state in 'a' for a draw whose first
+ * instance is 'first'.  Elements without an instance divisor only get
+ * their divisor field (0) written.
+ */
+static void
+instance_init(struct nv50_context *nv50, struct instance *a, unsigned first)
+{
+	int i;
+
+	for (i = 0; i < nv50->vtxelt->num_elements; i++) {
+		struct pipe_vertex_element *elt = &nv50->vtxelt->pipe[i];
+		struct instance *inst = &a[i];
+		struct pipe_vertex_buffer *buf;
+
+		inst->divisor = elt->instance_divisor;
+		if (!inst->divisor)
+			continue;
+
+		buf = &nv50->vtxbuf[elt->vertex_buffer_index];
+
+		inst->bo = nv50_resource(buf->buffer)->bo;
+		inst->stride = buf->stride;
+		inst->step = first % inst->divisor;
+		inst->delta = buf->buffer_offset + elt->src_offset +
+			      (first * inst->stride);
+	}
+}
+
+/*
+ * Emit the vertex array start address of every instanced element for the
+ * next instance, then advance the element's state: once 'divisor'
+ * instances have used the current address it moves on by one stride.
+ */
+static void
+instance_step(struct nv50_context *nv50, struct instance *a)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
+	const unsigned rd_flags = NOUVEAU_BO_RD |
+				  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART;
+	int i;
+
+	for (i = 0; i < nv50->vtxelt->num_elements; i++) {
+		struct instance *inst = &a[i];
+
+		if (inst->divisor == 0)
+			continue;
+
+		BEGIN_RING(chan, tesla,
+			   NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+		OUT_RELOCh(chan, inst->bo, inst->delta, rd_flags);
+		OUT_RELOCl(chan, inst->bo, inst->delta, rd_flags);
+
+		inst->step++;
+		if (inst->step == inst->divisor) {
+			inst->step = 0;
+			inst->delta += inst->stride;
+		}
+	}
+}
+
+/*
+ * Draw 'instanceCount' instances of a non-indexed primitive.
+ *
+ * When vertex data has to be pushed through the FIFO (nv50->vbo_fifo set
+ * by state validation) the draw is handed to
+ * nv50_push_elements_instanced(); otherwise one BEGIN / BUFFER_FIRST / END
+ * sequence is emitted per instance.
+ */
+void
+nv50_draw_arrays_instanced(struct pipe_context *pipe,
+			   unsigned mode, unsigned start, unsigned count,
+			   unsigned startInstance, unsigned instanceCount)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
+	struct instance inst[16];
+	unsigned begin = nv50_prim(mode);
+	unsigned left;
+
+	instance_init(nv50, inst, startInstance);
+	if (!nv50_state_validate(nv50, 10 + 16*3))
+		return;
+
+	if (nv50->vbo_fifo) {
+		nv50_push_elements_instanced(pipe, NULL, 0, 0, mode, start,
+					     count, startInstance,
+					     instanceCount);
+		return;
+	}
+
+	BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+	OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+	OUT_RING  (chan, startInstance);
+
+	for (left = instanceCount; left; --left) {
+		/* make sure one whole instance fits, re-validating after a flush */
+		if (AVAIL_RING(chan) < (7 + 16*3)) {
+			FIRE_RING(chan);
+			if (!nv50_state_validate(nv50, 7 + 16*3)) {
+				assert(0);
+				return;
+			}
+		}
+		instance_step(nv50, inst);
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+		OUT_RING  (chan, begin);
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+		OUT_RING  (chan, start);
+		OUT_RING  (chan, count);
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+		OUT_RING  (chan, 0);
+
+		/* every instance after the first carries bit 28 in VERTEX_BEGIN */
+		begin |= (1 << 28);
+	}
+}
+
+/* Non-instanced draw: a single instance starting at instance 0. */
+void
+nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
+		 unsigned count)
+{
+	const unsigned first_instance = 0;
+	const unsigned num_instances = 1;
+
+	nv50_draw_arrays_instanced(pipe, mode, start, count,
+				   first_instance, num_instances);
+}
+
+/* Private data handed to the inline_* callbacks of the primitive splitter. */
+struct inline_ctx {
+	struct nv50_context *nv50;	/* context whose channel receives the data */
+	void *map;			/* CPU mapping of the index buffer */
+};
+
+/*
+ * Emit 'count' 8-bit indices starting at index 'start' of the mapped
+ * index buffer.  An odd leading index goes out as a single U32 element,
+ * the rest are packed two per dword as U16 elements.
+ */
+static void
+inline_elt08(void *priv, unsigned start, unsigned count)
+{
+	struct inline_ctx *ctx = priv;
+	struct nouveau_grobj *tesla = ctx->nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	const uint8_t *idx = (uint8_t *)ctx->map + start;
+	const unsigned pairs = count >> 1;
+	unsigned i;
+
+	if (count & 1) {
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (chan, idx[0]);
+		idx++;
+	}
+
+	if (pairs == 0)
+		return;
+
+	BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, pairs);
+	for (i = 0; i < pairs; i++, idx += 2)
+		OUT_RING(chan, (idx[1] << 16) | idx[0]);
+}
+
+/*
+ * Emit 'count' 16-bit indices starting at index 'start' of the mapped
+ * index buffer.  An odd leading index goes out as a single U32 element,
+ * the rest are packed two per dword as U16 elements.
+ */
+static void
+inline_elt16(void *priv, unsigned start, unsigned count)
+{
+	struct inline_ctx *ctx = priv;
+	struct nouveau_grobj *tesla = ctx->nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	const uint16_t *idx = (uint16_t *)ctx->map + start;
+	const unsigned pairs = count >> 1;
+	unsigned i;
+
+	if (count & 1) {
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (chan, idx[0]);
+		idx++;
+	}
+
+	if (pairs == 0)
+		return;
+
+	BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, pairs);
+	for (i = 0; i < pairs; i++, idx += 2)
+		OUT_RING(chan, (idx[1] << 16) | idx[0]);
+}
+
+/*
+ * Emit 'count' 32-bit indices starting at index 'start' of the mapped
+ * index buffer; they are copied into the ring unchanged.
+ */
+static void
+inline_elt32(void *priv, unsigned start, unsigned count)
+{
+	struct inline_ctx *ctx = priv;
+	struct nouveau_grobj *tesla = ctx->nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	uint32_t *first = (uint32_t *)ctx->map + start;
+
+	BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U32, count);
+	OUT_RINGp    (chan, first, count);
+}
+
+/* Splitter callback: switch the hardware edge flag on or off. */
+static void
+inline_edgeflag(void *priv, boolean enabled)
+{
+	struct inline_ctx *ctx = priv;
+	struct nouveau_grobj *tesla = ctx->nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	unsigned value = 0;
+
+	if (enabled)
+		value = 1;
+
+	BEGIN_RING(chan, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (chan, value);
+}
+
+/*
+ * Draw indexed primitives by copying the indices from a CPU mapping of
+ * the index buffer directly into the command stream.
+ *
+ * The draw is split into pieces that fit the space left in the ring; each
+ * piece is bracketed by VERTEX_BEGIN / VERTEX_END.  The index buffer
+ * mapping is released on every exit path taken after it was acquired.
+ */
+static void
+nv50_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_resource *indexBuffer, unsigned indexSize,
+			  unsigned mode, unsigned start, unsigned count,
+			  unsigned startInstance, unsigned instanceCount)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct pipe_transfer *transfer;
+	struct instance a[16];
+	struct inline_ctx ctx;
+	struct u_split_prim s;
+	boolean nzi = FALSE;
+	unsigned overhead;
+
+	overhead = 16*3; /* potential instance adjustments */
+	overhead += 4; /* Begin()/End() */
+	overhead += 4; /* potential edgeflag disable/reenable */
+	overhead += 3; /* potentially 3 VTX_ELT_U16/U32 packet headers */
+
+	/* pick the emit callback matching the index width */
+	s.priv = &ctx;
+	if (indexSize == 1)
+		s.emit = inline_elt08;
+	else
+	if (indexSize == 2)
+		s.emit = inline_elt16;
+	else
+		s.emit = inline_elt32;
+	s.edge = inline_edgeflag;
+
+	ctx.nv50 = nv50;
+	ctx.map = pipe_buffer_map(pipe, indexBuffer, PIPE_TRANSFER_READ, &transfer);
+	assert(ctx.map);
+	if (!ctx.map)
+		return;
+
+	instance_init(nv50, a, startInstance);
+	if (!nv50_state_validate(nv50, overhead + 6 + 3))
+		goto out_unmap; /* do not leak the index buffer mapping */
+
+	BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+	OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+	OUT_RING  (chan, startInstance);
+	while (instanceCount--) {
+		unsigned max_verts;
+		boolean done;
+
+		u_split_prim_init(&s, mode, start, count);
+		do {
+			if (AVAIL_RING(chan) < (overhead + 6)) {
+				FIRE_RING(chan);
+				if (!nv50_state_validate(nv50, (overhead + 6))) {
+					assert(0);
+					goto out_unmap;
+				}
+			}
+
+			/* what is left after the fixed overhead holds indices */
+			max_verts = AVAIL_RING(chan) - overhead;
+			if (max_verts > 2047)
+				max_verts = 2047;
+			/* 8 and 16 bit indices are packed two per dword */
+			if (indexSize != 4)
+				max_verts <<= 1;
+			instance_step(nv50, a);
+
+			BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+			OUT_RING  (chan, nv50_prim(s.mode) | (nzi ? (1<<28) : 0));
+			done = u_split_prim_next(&s, max_verts);
+			BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+			OUT_RING  (chan, 0);
+		} while (!done);
+
+		/* every instance after the first sets bit 28 in VERTEX_BEGIN */
+		nzi = TRUE;
+	}
+
+out_unmap:
+	pipe_buffer_unmap(pipe, indexBuffer, transfer);
+}
+
+/*
+ * Draw 'instanceCount' instances of an indexed primitive.
+ *
+ * Depending on the validated state the draw is either pushed through the
+ * FIFO (nv50->vbo_fifo), emitted with inline indices (index buffer not
+ * GPU-mapped, or 8-bit indices), or emitted by submitting the relevant
+ * range of the index buffer object directly.
+ */
+void
+nv50_draw_elements_instanced(struct pipe_context *pipe,
+			     struct pipe_resource *indexBuffer,
+			     unsigned indexSize, int indexBias,
+			     unsigned mode, unsigned start, unsigned count,
+			     unsigned startInstance, unsigned instanceCount)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct instance a[16];
+	unsigned prim = nv50_prim(mode);
+	/*
+	 * Worst case ring usage of one instance: instance adjustments plus
+	 * the five packets of the 16-bit index path (2 dwords each).
+	 */
+	const unsigned per_instance = 10 + 16*3;
+
+	instance_init(nv50, a, startInstance);
+	if (!nv50_state_validate(nv50, 13 + 16*3))
+		return;
+
+	if (nv50->vbo_fifo) {
+		nv50_push_elements_instanced(pipe, indexBuffer, indexSize,
+					     indexBias, mode, start, count,
+					     startInstance, instanceCount);
+		return;
+	}
+
+	/* indices are uint32 internally, so large indexBias means negative */
+	BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_BASE, 1);
+	OUT_RING  (chan, indexBias);
+
+	if (!nv50_resource_mapped_by_gpu(indexBuffer) || indexSize == 1) {
+		nv50_draw_elements_inline(pipe, indexBuffer, indexSize,
+					  mode, start, count, startInstance,
+					  instanceCount);
+		return;
+	}
+
+	BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+	OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+	OUT_RING  (chan, startInstance);
+	while (instanceCount--) {
+		/*
+		 * Check against the same amount that is re-validated below,
+		 * so a whole instance always fits without an implicit flush.
+		 */
+		if (AVAIL_RING(chan) < per_instance) {
+			FIRE_RING(chan);
+			if (!nv50_state_validate(nv50, per_instance)) {
+				assert(0);
+				return;
+			}
+		}
+		instance_step(nv50, a);
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+		OUT_RING  (chan, prim);
+		if (indexSize == 4) {
+			BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32 | 0x30000, 0);
+			OUT_RING  (chan, count);
+			nouveau_pushbuf_submit(chan, 
+					       nv50_resource(indexBuffer)->bo,
+					       start << 2, count << 2);
+		} else
+		if (indexSize == 2) {
+			/* round the range out to whole dwords of two indices */
+			unsigned vb_start = (start & ~1);
+			unsigned vb_end = (start + count + 1) & ~1;
+			unsigned dwords = (vb_end - vb_start) >> 1;
+
+			BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16_SETUP, 1);
+			OUT_RING  (chan, ((start & 1) << 31) | count);
+			BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16 | 0x30000, 0);
+			OUT_RING  (chan, dwords);
+			nouveau_pushbuf_submit(chan,
+					       nv50_resource(indexBuffer)->bo,
+					       vb_start << 1, dwords << 2);
+			BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16_SETUP, 1);
+			OUT_RING  (chan, 0);
+		}
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+		OUT_RING  (chan, 0);
+
+		/* every instance after the first sets bit 28 in VERTEX_BEGIN */
+		prim |= (1 << 28);
+	}
+}
+
+/* Non-instanced indexed draw: a single instance starting at instance 0. */
+void
+nv50_draw_elements(struct pipe_context *pipe,
+		   struct pipe_resource *indexBuffer,
+		   unsigned indexSize, int indexBias,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	const unsigned first_instance = 0;
+	const unsigned num_instances = 1;
+
+	nv50_draw_elements_instanced(pipe, indexBuffer, indexSize, indexBias,
+				     mode, start, count,
+				     first_instance, num_instances);
+}
+
+/*
+ * Emit a constant (zero stride) vertex attribute as immediate state.
+ *
+ * The first vertex of the element is read from the buffer, converted to
+ * floats and appended to the state object *pso, which is created on first
+ * use.  For a one-component attribute that is the vertex program's edge
+ * flag input, the edge flag enable is emitted as well.
+ *
+ * Returns TRUE on success, FALSE if the buffer could not be mapped or the
+ * format has an unexpected number of components.
+ */
+static INLINE boolean
+nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
+		       struct nouveau_stateobj **pso,
+		       struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+
+{
+	struct nouveau_stateobj *so;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_bo *bo = nv50_resource(vb->buffer)->bo;
+	float v[4];
+	int ret;
+	unsigned nr_components = util_format_get_nr_components(ve->src_format);
+
+	ret = nouveau_bo_map(bo, NOUVEAU_BO_RD);
+	if (ret)
+		return FALSE;
+
+	util_format_read_4f(ve->src_format, v, 0, (uint8_t *)bo->map +
+			    (vb->buffer_offset + ve->src_offset), 0,
+			    0, 0, 1, 1);
+	so = *pso;
+	if (!so)
+		/*
+		 * One method per element, plus one extra for the
+		 * EDGEFLAG_ENABLE method that the edge flag attribute emits
+		 * in addition to its own value.
+		 */
+		*pso = so = so_new(nv50->vtxelt->num_elements + 1,
+				   nv50->vtxelt->num_elements * 4, 0);
+
+	switch (nr_components) {
+	case 4:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_4F_X(attrib), 4);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		so_data  (so, fui(v[2]));
+		so_data  (so, fui(v[3]));
+		break;
+	case 3:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_3F_X(attrib), 3);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		so_data  (so, fui(v[2]));
+		break;
+	case 2:
+		so_method(so, tesla, NV50TCL_VTX_ATTR_2F_X(attrib), 2);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		break;
+	case 1:
+		if (attrib == nv50->vertprog->cfg.edgeflag_in) {
+			so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+			so_data  (so, v[0] ? 1 : 0);
+		}
+		so_method(so, tesla, NV50TCL_VTX_ATTR_1F(attrib), 1);
+		so_data  (so, fui(v[0]));
+		break;
+	default:
+		nouveau_bo_unmap(bo);
+		return FALSE;
+	}
+
+	nouveau_bo_unmap(bo);
+	return TRUE;
+}
+
+/*
+ * Pre-compute the hardware attribute word of every vertex element in the
+ * state object.
+ */
+void
+nv50_vtxelt_construct(struct nv50_vtxelt_stateobj *cso)
+{
+	unsigned idx = 0;
+
+	while (idx < cso->num_elements) {
+		cso->hw[idx] = nv50_vbo_vtxelt_to_hw(&cso->pipe[idx]);
+		++idx;
+	}
+}
+
+/*
+ * Build the vertex array state for the currently bound vertex elements
+ * and buffers.
+ *
+ * Three state objects are produced: the per-attribute format words
+ * (returned to the caller), the vertex array setup (stored in
+ * nv50->state.vtxbuf) and the immediate values of constant attributes
+ * (stored in nv50->state.vtxattr, NULL if there are none).  As a side
+ * effect nv50->vbo_fifo is recomputed.
+ *
+ * Returns NULL without touching any state when no vertex buffers are bound.
+ */
+struct nouveau_stateobj *
+nv50_vbo_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *vtxattr;
+	unsigned i, n_ve;
+
+	/* don't validate if Gallium took away our buffers */
+	if (nv50->vtxbuf_nr == 0)
+		return NULL;
+
+	/* decide whether vertex data has to be pushed through the FIFO */
+	nv50->vbo_fifo = 0;
+	if (nv50->screen->force_push ||
+	    nv50->vertprog->cfg.edgeflag_in < 16)
+		nv50->vbo_fifo = 0xffff;
+
+	/* a strided buffer that is not GPU-mapped also forces the FIFO path */
+	for (i = 0; i < nv50->vtxbuf_nr; i++) {
+		if (nv50->vtxbuf[i].stride &&
+		    !nv50_resource_mapped_by_gpu(nv50->vtxbuf[i].buffer))
+			nv50->vbo_fifo = 0xffff;
+	}
+
+	/* cover previously enabled elements too, so they can be disabled */
+	n_ve = MAX2(nv50->vtxelt->num_elements, nv50->state.vtxelt_nr);
+
+	vtxattr = NULL;
+	vtxbuf = so_new(n_ve * 2, n_ve * 5, nv50->vtxelt->num_elements * 4);
+	vtxfmt = so_new(1, n_ve, 0);
+	so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve);
+
+	for (i = 0; i < nv50->vtxelt->num_elements; i++) {
+		struct pipe_vertex_element *ve = &nv50->vtxelt->pipe[i];
+		struct pipe_vertex_buffer *vb =
+			&nv50->vtxbuf[ve->vertex_buffer_index];
+		struct nouveau_bo *bo = nv50_resource(vb->buffer)->bo;
+		uint32_t hw = nv50->vtxelt->hw[i];
+
+		/* zero stride: emit the attribute as an immediate value */
+		if (!vb->stride &&
+		    nv50_vbo_static_attrib(nv50, i, &vtxattr, ve, vb)) {
+			so_data(vtxfmt, hw | (1 << 4));
+
+			so_method(vtxbuf, tesla,
+				  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
+			so_data  (vtxbuf, 0);
+
+			nv50->vbo_fifo &= ~(1 << i);
+			continue;
+		}
+
+		/* FIFO path: the vertex array itself is programmed to 0 */
+		if (nv50->vbo_fifo) {
+			so_data  (vtxfmt, hw | (ve->instance_divisor ? (1 << 4) : i));
+			so_method(vtxbuf, tesla,
+				  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
+			so_data  (vtxbuf, 0);
+			continue;
+		}
+
+		/* regular path: attribute i is fetched from vertex array i */
+		so_data(vtxfmt, hw | i);
+
+		/* instanced elements get stride 0 here */
+		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3);
+		so_data  (vtxbuf, 0x20000000 |
+			  (ve->instance_divisor ? 0 : vb->stride));
+		so_reloc (vtxbuf, bo, vb->buffer_offset +
+			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+			  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (vtxbuf, bo, vb->buffer_offset +
+			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+
+		/* vertex array limits */
+		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
+		so_reloc (vtxbuf, bo, vb->buffer->width0 - 1,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			  NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (vtxbuf, bo, vb->buffer->width0 - 1,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			  NOUVEAU_BO_LOW, 0, 0);
+	}
+	/* elements left over from the previous state: write format 0 */
+	for (; i < n_ve; ++i) {
+		so_data  (vtxfmt, 0x7e080010);
+
+		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
+		so_data  (vtxbuf, 0);
+	}
+	nv50->state.vtxelt_nr = nv50->vtxelt->num_elements;
+
+	/* store the new objects in the context, then drop the local references */
+	so_ref (vtxbuf, &nv50->state.vtxbuf);
+	so_ref (vtxattr, &nv50->state.vtxattr);
+	so_ref (NULL, &vtxbuf);
+	so_ref (NULL, &vtxattr);
+	return vtxfmt;
+}
+
+
diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile
new file mode 100644
index 0000000000..c1d57ca396
--- /dev/null
+++ b/src/gallium/drivers/nvfx/Makefile
@@ -0,0 +1,37 @@
+# Makefile for the nvfx Gallium driver library
+
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nvfx
+
+# Sources compiled into the library, one per line.
+C_SOURCES = \
+	nv04_surface_2d.c \
+	nvfx_buffer.c \
+	nvfx_context.c \
+	nvfx_clear.c \
+	nvfx_draw.c \
+	nvfx_fragprog.c \
+	nvfx_fragtex.c \
+	nv30_fragtex.c \
+	nv40_fragtex.c \
+	nvfx_miptree.c \
+	nvfx_query.c \
+	nvfx_resource.c \
+	nvfx_screen.c \
+	nvfx_state.c \
+	nvfx_state_blend.c \
+	nvfx_state_emit.c \
+	nvfx_state_fb.c \
+	nvfx_state_rasterizer.c \
+	nvfx_state_scissor.c \
+	nvfx_state_stipple.c \
+	nvfx_state_viewport.c \
+	nvfx_state_zsa.c \
+	nvfx_surface.c \
+	nvfx_transfer.c \
+	nvfx_vbo.c \
+	nvfx_vertprog.c
+
+# Extra include path for the shared nouveau headers.
+LIBRARY_INCLUDES = \
+	-I$(TOP)/src/gallium/drivers/nouveau/include
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/nvfx/SConscript b/src/gallium/drivers/nvfx/SConscript
new file mode 100644
index 0000000000..02d931b10e
--- /dev/null
+++ b/src/gallium/drivers/nvfx/SConscript
@@ -0,0 +1,40 @@
+Import('*')
+
+env = env.Clone()
+
+env.PrependUnique(delete_existing=1, CPPPATH = [
+    '#/src/gallium/drivers',
+])
+
+nvfx = env.ConvenienceLibrary(
+    target = 'nvfx',
+    source = [
+        'nv04_surface_2d.c',
+        'nvfx_buffer.c',
+        'nvfx_context.c',
+        'nvfx_clear.c',
+        'nvfx_draw.c',
+        'nvfx_fragprog.c',
+        'nvfx_fragtex.c',
+        'nv30_fragtex.c',
+        'nv40_fragtex.c',
+        'nvfx_miptree.c',
+        'nvfx_query.c',
+        'nvfx_resource.c',
+        'nvfx_screen.c',
+        'nvfx_state.c',
+        'nvfx_state_blend.c',
+        'nvfx_state_emit.c',
+        'nvfx_state_fb.c',
+        'nvfx_state_rasterizer.c',
+        'nvfx_state_scissor.c',
+        'nvfx_state_stipple.c',
+        'nvfx_state_viewport.c',
+        'nvfx_state_zsa.c',
+        'nvfx_surface.c',
+        'nvfx_transfer.c',
+        'nvfx_vbo.c',
+        'nvfx_vertprog.c',
+    ])
+
+Export('nvfx')
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.c b/src/gallium/drivers/nvfx/nv04_surface_2d.c
new file mode 100644
index 0000000000..7acbb505df
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_surface_2d.c
@@ -0,0 +1,532 @@
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_util.h"
+#include "nouveau/nouveau_screen.h"
+#include "nv04_surface_2d.h"
+
+/*
+ * Map a pipe format to the CONTEXT_SURFACES_2D format used for it.
+ * Returns -1 if the format has no mapping.
+ */
+static INLINE int
+nv04_surface_format(enum pipe_format format)
+{
+	int hw_format = -1;
+
+	switch (format) {
+	/* 8 bits per pixel */
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+		break;
+	/* 16 bits per pixel */
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
+		break;
+	/* 32 bits per pixel, colour */
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
+		break;
+	/* 32 bits per pixel, depth/stencil */
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+		break;
+	default:
+		break;
+	}
+
+	return hw_format;
+}
+
+/*
+ * Map a pipe format to the GDI rectangle colour format used when filling
+ * a surface of that format.  Returns -1 if the format has no mapping.
+ */
+static INLINE int
+nv04_rect_format(enum pipe_format format)
+{
+	int hw_format = -1;
+
+	switch (format) {
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+		hw_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+		break;
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+		hw_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		break;
+	default:
+		break;
+	}
+
+	return hw_format;
+}
+
+/*
+ * Map a pipe format to the SCALED_IMAGE_FROM_MEMORY colour format used
+ * for a source of that format.  Returns -1 if the format has no mapping.
+ */
+static INLINE int
+nv04_scaled_image_format(enum pipe_format format)
+{
+	int hw_format = -1;
+
+	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+		hw_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
+		break;
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+		hw_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+		hw_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		hw_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
+		break;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+		hw_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
+		break;
+	default:
+		break;
+	}
+
+	return hw_format;
+}
+
+/*
+ * Interleave the low 12 bits of x and y: bit n of x lands in bit 2n of
+ * the result, bit n of y in bit 2n + 1.  Higher bits are ignored.
+ */
+static INLINE unsigned
+nv04_swizzle_bits_square(unsigned x, unsigned y)
+{
+	unsigned swizzled = 0;
+	unsigned bit;
+
+	for (bit = 0; bit < 12; bit++) {
+		swizzled |= ((x >> bit) & 1) << (2 * bit);
+		swizzled |= ((y >> bit) & 1) << (2 * bit + 1);
+	}
+
+	return swizzled;
+}
+
+/*
+ * Swizzled offset of texel (x, y) in a w x h texture.  A rectangular
+ * swizzled texture is a linear concatenation of swizzled square tiles
+ * whose edge is the smaller of the two dimensions.
+ */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	const unsigned tile = MIN2(w, h);
+	const unsigned in_tile = tile - 1;
+	const unsigned tile_base = ((x | y) & ~in_tile) * tile;
+	const unsigned tile_offset =
+		nv04_swizzle_bits_square(x & in_tile, y & in_tile);
+
+	return tile_base | tile_offset;
+}
+
+/*
+ * Copy a w x h rectangle from a linear surface into a swizzled surface,
+ * using the scaled-image-from-memory object with a 1:1 scale.
+ *
+ * Areas larger than max_w x max_h are copied in several sub-blits.  The
+ * size of each sub-blit is computed per iteration, so a short trailing
+ * column does not shrink the chunk width used for the following rows and
+ * the number of sub-blits stays within what MARK_RING reserved.
+ *
+ * Always returns 0.
+ */
+static int
+nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
+			  struct pipe_surface *dst, int dx, int dy,
+			  struct pipe_surface *src, int sx, int sy,
+			  int w, int h)
+{
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
+	struct nouveau_grobj *swzsurf = ctx->swzsurf;
+	struct nouveau_grobj *sifm = ctx->sifm;
+	struct nouveau_bo *src_bo = ctx->buf(src);
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	/* Max width & height may not be the same on all HW, but must be POT */
+	const unsigned max_w = 1024;
+	const unsigned max_h = 1024;
+	/* size of a full chunk; constant for the whole copy */
+	const unsigned sub_w = w > max_w ? max_w : w;
+	const unsigned sub_h = h > max_h ? max_h : h;
+	unsigned x;
+	unsigned y;
+
+	/* Swizzled surfaces must be POT  */
+	assert(util_is_pot(dst->width) && util_is_pot(dst->height));
+
+	/* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */
+	assert(sub_w == w || util_is_pot(sub_w));
+	assert(sub_h == h || util_is_pot(sub_h));
+
+	MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 +
+			 ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, dst_bo,
+	                 NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
+	OUT_RING  (chan, nv04_surface_format(dst->format) |
+	                 log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+	                 log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
+	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, src_bo,
+	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, swzsurf->handle);
+
+	for (y = 0; y < h; y += sub_h) {
+		/* the last row of chunks may be shorter than sub_h */
+		const unsigned cur_h = MIN2(sub_h, h - y);
+
+		for (x = 0; x < w; x += sub_w) {
+			/* the last column of chunks may be narrower than sub_w */
+			const unsigned cur_w = MIN2(sub_w, w - x);
+
+			assert(!(dst->offset & 63));
+
+			BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
+			OUT_RELOCl(chan, dst_bo, dst->offset,
+				   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+			BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
+			OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+			OUT_RING  (chan, nv04_scaled_image_format(src->format));
+			OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+			OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
+			OUT_RING  (chan, cur_h << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | cur_w);
+			OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
+			OUT_RING  (chan, cur_h << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | cur_w);
+			/* the same value twice: 1 << 20 for both scale factors */
+			OUT_RING  (chan, 1 << 20);
+			OUT_RING  (chan, 1 << 20);
+
+			BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
+			OUT_RING  (chan, cur_h << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | cur_w);
+			OUT_RING  (chan, src_pitch |
+					 NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
+					 NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+			OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
+				   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+			OUT_RING  (chan, 0);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Copy a w x h rectangle between two surfaces with the memory-to-memory
+ * format object, in batches of at most 2047 lines.  Always returns 0.
+ */
+static int
+nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
+		       struct pipe_surface *dst, int dx, int dy,
+		       struct pipe_surface *src, int sx, int sy, int w, int h)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	struct nouveau_bo *src_bo = ctx->buf(src);
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	/* byte offsets of the first pixel to read and to write */
+	unsigned dst_offset = dst->offset + dy * dst_pitch +
+	                      dx * util_format_get_blocksize(dst->texture->format);
+	unsigned src_offset = src->offset + sy * src_pitch +
+	                      sx * util_format_get_blocksize(src->texture->format);
+	int lines;
+
+	MARK_RING (chan, 3 + ((h / 2047) + 1) * 9, 2 + ((h / 2047) + 1) * 2);
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, src_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	for (; h; h -= lines,
+		  src_offset += src_pitch * lines,
+		  dst_offset += dst_pitch * lines) {
+		lines = (h > 2047) ? 2047 : h;
+
+		BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+		OUT_RELOCl(chan, src_bo, src_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCl(chan, dst_bo, dst_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+		OUT_RING  (chan, src_pitch);
+		OUT_RING  (chan, dst_pitch);
+		/* bytes per line */
+		OUT_RING  (chan, w * util_format_get_blocksize(src->texture->format));
+		OUT_RING  (chan, lines);
+		OUT_RING  (chan, 0x0101);
+		OUT_RING  (chan, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * Copy a w x h rectangle between two VRAM surfaces with the blit object.
+ * Returns 1 if the destination format has no 2D surface format, else 0.
+ */
+static int
+nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		       int w, int h)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *blit = ctx->blit;
+	struct nouveau_bo *src_bo = ctx->buf(src);
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	const unsigned rd_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+	const unsigned wr_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+	const int hw_format = nv04_surface_format(dst->format);
+
+	if (hw_format < 0)
+		return 1;
+
+	MARK_RING (chan, 12, 4);
+
+	/* bind source and destination to the 2D surface context */
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, src_bo, rd_flags);
+	OUT_RELOCo(chan, dst_bo, wr_flags);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, hw_format);
+	OUT_RING  (chan, (dst_pitch << 16) | src_pitch);
+	OUT_RELOCl(chan, src_bo, src->offset, rd_flags);
+	OUT_RELOCl(chan, dst_bo, dst->offset, wr_flags);
+
+	/* source point, destination point, size */
+	BEGIN_RING(chan, blit, 0x0300, 3);
+	OUT_RING  (chan, (sy << 16) | sx);
+	OUT_RING  (chan, (dy << 16) | dx);
+	OUT_RING  (chan, ( h << 16) |  w);
+
+	return 0;
+}
+
+/*
+ * Copy a w x h rectangle from src to dst, which must have the same format.
+ *
+ * A copy from a linear source to a non-linear destination that is more
+ * than one pixel wide and high goes through the swizzling path; every
+ * other copy uses M2MF.
+ */
+static void
+nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		  int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		  int w, int h)
+{
+	int src_linear = src->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
+	int dst_linear = dst->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
+	int needs_swizzle;
+
+	assert(src->format == dst->format);
+
+	/* Setup transfer to swizzle the texture to vram if needed */
+	needs_swizzle = src_linear && !dst_linear && w > 1 && h > 1;
+
+	if (needs_swizzle) {
+		nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
+	} else {
+		/* Use M2MF instead of the blitter since it always works
+		 * Any possible performance drop is likely to be not very significant
+		 * and dwarfed anyway by the current buffer management problems
+		 */
+		nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h);
+	}
+}
+
+/*
+ * Fill entry point installed as nv04_surface_2d::fill.
+ *
+ * Fills the w x h rectangle at (dx, dy) of dst with `value`, using the
+ * ctx->rect object drawing onto ctx->surf2d.  dst is bound as both the
+ * source and the destination image of the 2D context surface.
+ *
+ * dst->format must be accepted by both nv04_surface_format() and
+ * nv04_rect_format().  Debug builds assert on that; builds with assertions
+ * compiled out skip the fill instead of sending a negative format code to
+ * the hardware.
+ */
+static void
+nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		  int dx, int dy, int w, int h, unsigned value)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *rect = ctx->rect;
+	struct nouveau_bo *dst_bo = ctx->buf(dst);
+	/* dst is treated as a struct nv04_surface to reach ->pitch. */
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	int cs2d_format, gdirect_format;
+
+	cs2d_format = nv04_surface_format(dst->format);
+	assert(cs2d_format >= 0);
+
+	gdirect_format = nv04_rect_format(dst->format);
+	assert(gdirect_format >= 0);
+
+	/* With NDEBUG the asserts above vanish; never emit a bogus format. */
+	if (cs2d_format < 0 || gdirect_format < 0)
+		return;
+
+	MARK_RING (chan, 16, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, cs2d_format);
+	/* Same pitch for the source (low 16 bits) and destination (high). */
+	OUT_RING  (chan, (dst_pitch << 16) | dst_pitch);
+	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, gdirect_format);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value);
+	BEGIN_RING(chan, rect,
+		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	/* Unlike the blit path, x/w go in the upper 16 bits here. */
+	OUT_RING  (chan, (dx << 16) | dy);
+	OUT_RING  (chan, ( w << 16) |  h);
+}
+
+/*
+ * Release a 2D engine context and everything it owns.
+ *
+ * Accepts a NULL pctx or a NULL *pctx and does nothing in either case.
+ * Otherwise *pctx is reset to NULL, the notifier and each graphics object
+ * are freed, and finally the context structure itself is released.
+ */
+void
+nv04_surface_2d_takedown(struct nv04_surface_2d **pctx)
+{
+	struct nv04_surface_2d *eng;
+
+	if (pctx == NULL)
+		return;
+
+	eng = *pctx;
+	if (eng == NULL)
+		return;
+
+	/* Drop the caller's handle before tearing the context down. */
+	*pctx = NULL;
+
+	nouveau_notifier_free(&eng->ntfy);
+	nouveau_grobj_free(&eng->m2mf);
+	nouveau_grobj_free(&eng->surf2d);
+	nouveau_grobj_free(&eng->swzsurf);
+	nouveau_grobj_free(&eng->rect);
+	nouveau_grobj_free(&eng->blit);
+	nouveau_grobj_free(&eng->sifm);
+
+	FREE(eng);
+}
+
+/*
+ * Create the 2D engine context for a screen.
+ *
+ * Allocates a notifier plus the M2MF, context-surface, blit, rectangle,
+ * swizzled-surface and scaled-image objects on screen->channel, using
+ * consecutive handles starting at 0x88000000, and emits their one-time
+ * setup methods.  The object classes are chosen from
+ * chan->device->chipset.
+ *
+ * On success the returned context has its copy and fill callbacks set.
+ * The buf callback is not assigned here; the caller has to install it
+ * before copy or fill are used.
+ *
+ * Returns NULL when the context cannot be allocated, when any object
+ * allocation fails, or when the chipset family has no known swizzled
+ * surface class.  Anything allocated up to that point is released through
+ * nv04_surface_2d_takedown().
+ */
+struct nv04_surface_2d *
+nv04_surface_2d_init(struct nouveau_screen *screen)
+{
+	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d);
+	struct nouveau_channel *chan = screen->channel;
+	unsigned handle = 0x88000000, class;
+	int ret;
+
+	if (!ctx)
+		return NULL;
+
+	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
+	if (ret)
+		goto fail;
+
+	/* Memory-to-memory copy object. */
+	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+
+	/* 2D context surface shared by the blit and rectangle objects. */
+	if (chan->device->chipset < 0x10)
+		class = NV04_CONTEXT_SURFACES_2D;
+	else
+		class = NV10_CONTEXT_SURFACES_2D;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->surf2d,
+			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+
+	/* Image blit object. */
+	if (chan->device->chipset < 0x10)
+		class = NV04_IMAGE_BLIT;
+	else
+		class = NV12_IMAGE_BLIT;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
+
+	/* Rectangle object used for solid fills. */
+	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
+				  &ctx->rect);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, ctx->rect,
+			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+
+	/* Swizzled surface object, selected by chipset family. */
+	switch (chan->device->chipset & 0xf0) {
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		/* Famous last words: this really can't happen..  If it does
+		 * in a build without assertions, fail the init rather than
+		 * allocating an object with an uninitialised class. */
+		assert(0);
+		goto fail;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
+	if (ret)
+		goto fail;
+
+	/* Scaled-image-from-memory object, selected by chipset family. */
+	switch (chan->device->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x30:
+		class = NV30_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	default:
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
+	if (ret)
+		goto fail;
+
+	ctx->copy = nv04_surface_copy;
+	ctx->fill = nv04_surface_fill;
+	return ctx;
+
+fail:
+	/* Releases whatever was allocated so far and frees ctx itself. */
+	nv04_surface_2d_takedown(&ctx);
+	return NULL;
+}
+
+/*
+ * Create a temporary surface that stands in for ns.
+ *
+ * A new 2D, single-level resource with the format, size, sample count and
+ * bind flags (plus PIPE_BIND_RENDER_TARGET) of ns is created through
+ * pscreen, a surface is obtained for it, and that surface's `backing`
+ * pointer is set to ns.  The temporary surface is returned.
+ *
+ * NOTE(review): ns->base.usage is cleared here and not restored by this
+ * function; the saved value is only passed on to get_tex_surface().
+ * NOTE(review): neither resource_create() nor get_tex_surface() is checked
+ * for a NULL result before the result is used.
+ */
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen,
+			     struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+	struct pipe_resource templ;
+	struct pipe_resource* temp_tex;
+	struct nv04_surface* temp_ns;
+	int temp_flags;
+
+	/* Remember the original usage flags for the temporary surface. */
+	temp_flags = ns->base.usage;
+
+	ns->base.usage = 0;
+
+	memset(&templ, 0, sizeof(templ));
+	templ.format = ns->base.texture->format;
+	templ.target = PIPE_TEXTURE_2D;
+	templ.width0 = ns->base.width;
+	templ.height0 = ns->base.height;
+	templ.depth0 = 1;
+	templ.last_level = 0;
+
+	// TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
+	templ.nr_samples = ns->base.texture->nr_samples;
+
+	templ.bind = ns->base.texture->bind | PIPE_BIND_RENDER_TARGET;
+
+	temp_tex = pscreen->resource_create(pscreen, &templ);
+	temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
+	temp_ns->backing = ns;
+
+	/* NOTE(review): temp_ns->backing was just set to ns, so the
+	 * destination and the source of this copy are the same surface.
+	 * Confirm whether &temp_ns->base was the intended destination. */
+	if(1) /* hmm */
+		eng2d->copy(eng2d, &temp_ns->backing->base,
+			    0, 0, &ns->base,
+			    0, 0, ns->base.width, ns->base.height);
+
+	return temp_ns;
+}
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.h b/src/gallium/drivers/nvfx/nv04_surface_2d.h
new file mode 100644
index 0000000000..2123c3ed08
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_surface_2d.h
@@ -0,0 +1,43 @@
+#ifndef __NV04_SURFACE_2D_H__
+#define __NV04_SURFACE_2D_H__
+
+#include "pipe/p_state.h"
+
+struct nouveau_screen;
+
+/*
+ * Driver surface: a pipe_surface extended with layout information.
+ * `base` is the first member, and the 2D code casts struct pipe_surface
+ * pointers to struct nv04_surface pointers to read `pitch`.
+ */
+struct nv04_surface {
+	struct pipe_surface base;
+	unsigned pitch;
+	/* Surface this one stands in for; set by
+	 * nv04_surface_wrap_for_render() on the temporary it returns. */
+	struct nv04_surface* backing;
+};
+
+/*
+ * 2D engine context: the notifier and graphics objects created by
+ * nv04_surface_2d_init() together with the operations built on them.
+ */
+struct nv04_surface_2d {
+	struct nouveau_notifier *ntfy;
+	struct nouveau_grobj *surf2d;
+	struct nouveau_grobj *swzsurf;
+	struct nouveau_grobj *m2mf;
+	struct nouveau_grobj *rect;
+	struct nouveau_grobj *blit;
+	struct nouveau_grobj *sifm;
+
+	/* Maps a surface to the buffer object holding it.  Not assigned by
+	 * nv04_surface_2d_init(); must be set before copy/fill are called. */
+	struct nouveau_bo *(*buf)(struct pipe_surface *);
+
+	/* Copy a w x h rectangle from (sx, sy) in src to (dx, dy) in dst. */
+	void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst,
+		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		     int w, int h);
+	/* Fill the w x h rectangle at (dx, dy) in dst with `value`. */
+	void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst,
+		     int dx, int dy, int w, int h, unsigned value);
+};
+
+struct nv04_surface_2d *
+nv04_surface_2d_init(struct nouveau_screen *screen);
+
+void
+nv04_surface_2d_takedown(struct nv04_surface_2d **);
+
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
+#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c
new file mode 100644
index 0000000000..dec073ac90
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv30_fragtex.c
@@ -0,0 +1,149 @@
+#include "util/u_format.h"
+
+#include "nvfx_context.h"
+#include "nouveau/nouveau_util.h"
+#include "nvfx_tex.h"
+#include "nvfx_resource.h"
+
+/*
+ * Fold the NV30-specific parts of a sampler CSO into the hardware state.
+ *
+ * Only ORs bits into ps->en and ps->filt, so both must already hold their
+ * base values.  `pipe` is not used.
+ *
+ *   - max_anisotropy selects the 8X, 4X or 2X anisotropy enable bit (none
+ *     below 2).
+ *   - lod_bias is clamped to [-16, 15] and stored as a 13-bit value in
+ *     units of 1/256 in ps->filt.
+ *   - max_lod and min_lod are clamped to [0, 15], truncated to integers and
+ *     placed at bits 14 and 26 of ps->en.
+ */
+void
+nv30_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso)
+{
+	if (cso->max_anisotropy >= 8) {
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+	} else
+	if (cso->max_anisotropy >= 4) {
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+	} else
+	if (cso->max_anisotropy >= 2) {
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+	}
+
+	{
+		float limit;
+
+		/* Encode the clamped bias: an out-of-range value would
+		 * otherwise wrap around when masked to 13 bits. */
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/;
+	}
+}
+
+/*
+ * Builds one struct nv30_texture_format initializer, in field order:
+ * defined = TRUE, pipe = PIPE_FORMAT_<m>, format =
+ * NV34TCL_TX_FORMAT_FORMAT_<tf>, swizzle = the OR of the four S0 and the
+ * four S1 component selectors named by ts0x..ts0w and ts1x..ts1w.
+ */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w)            \
+}
+
+/* One row of the pipe-format to NV30 texture-format translation table. */
+struct nv30_texture_format {
+	boolean defined;	/* TRUE for real rows; zero in the terminator */
+	uint	pipe;		/* PIPE_FORMAT_* value this row matches */
+	int     format;		/* NV34TCL_TX_FORMAT_FORMAT_* bits */
+	int     swizzle;	/* NV34TCL_TX_SWIZZLE_* bits */
+};
+
+/*
+ * Texture formats the NV30 fragment texture path knows how to program.
+ * The final empty entry has `defined` equal to zero and terminates the
+ * scan in nv30_fragtex_format().  Note that the two depth formats are
+ * mapped onto colour hardware formats (R5G6B5 and A8R8G8B8).
+ */
+static struct nv30_texture_format
+nv30_texture_formats[] = {
+	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
+	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
+	_(Z16_UNORM     , R5G6B5  ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(S8_USCALED_Z24_UNORM   , A8R8G8B8,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	{},
+};
+
+/*
+ * Find the nv30_texture_formats row for a pipe format.
+ *
+ * Walks the table until the terminating entry (defined == 0).  Returns the
+ * matching row, or logs an error naming the format and returns NULL when
+ * no row matches.
+ */
+static struct nv30_texture_format *
+nv30_fragtex_format(uint pipe_format)
+{
+	struct nv30_texture_format *entry;
+
+	for (entry = nv30_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
+	return NULL;
+}
+
+
+/*
+ * Program fragment texture unit `unit` from the currently bound sampler
+ * state and sampler view.
+ *
+ * Builds the TX_FORMAT word from the texture's format, mipmap presence,
+ * log2 sizes and target, then emits the eight TX_OFFSET(unit) registers.
+ * On success the format word is cached in nvfx->hw_txf[unit] and the unit's
+ * bit is set in nvfx->hw_samplers.
+ *
+ * Returns without emitting anything when the texture format has no table
+ * entry or the texture target is not handled; both cases are logged.
+ *
+ * NOTE(review): nvfx->fragment_sampler_views[unit] and
+ * nvfx->tex_sampler[unit] are dereferenced without a NULL check -- callers
+ * presumably guarantee both are bound.
+ */
+void
+nv30_fragtex_set(struct nvfx_context *nvfx, int unit)
+{
+	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
+	struct nvfx_miptree *nv30mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
+	struct pipe_resource *pt = &nv30mt->base.base;
+	struct nouveau_bo *bo = nv30mt->base.bo;
+	struct nv30_texture_format *tf;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	uint32_t txf, txs;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv30_fragtex_format(pt->format);
+	if (!tf)
+		return;
+
+	txf  = tf->format;
+	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
+	/* Base sizes are programmed as log2 values. */
+	txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
+	txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+	txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
+	/* NOTE(review): the meaning of the extra 0x10000 bit is not
+	 * documented here. */
+	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	txs = tf->swizzle;
+
+	/* One method header plus eight data words, two of them relocations. */
+	MARK_RING(chan, 9, 2);
+	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
+	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
+		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	OUT_RING(chan, ps->wrap);
+	OUT_RING(chan, NV34TCL_TX_ENABLE_ENABLE | ps->en);
+	OUT_RING(chan, txs);
+	OUT_RING(chan, ps->filt | 0x2000 /*voodoo*/);
+	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
+		       pt->height0);
+	OUT_RING(chan, ps->bcol);
+
+	nvfx->hw_txf[unit] = txf;
+	nvfx->hw_samplers |= (1 << unit);
+}
diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
new file mode 100644
index 0000000000..ec0444c07f
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -0,0 +1,169 @@
+#ifndef __NV30_SHADER_H__
+#define __NV30_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * 128bit opcodes, split into 4 32-bit ones for ease of use.
+ *
+ * Non-native instructions
+ *   ABS - MOV + NV40_VP_INST0_DEST_ABS
+ *   POW - EX2 + MUL + LG2
+ *   SUB - ADD, second source negated
+ *   SWZ - MOV
+ *   XPD -
+ *
+ * Register access
+ *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
+ *   - Only one CONST can be accessed per-instruction (move extras into TEMPs)
+ *
+ * Relative Addressing
+ *   According to the value returned for
+ *   MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB
+ *
+ *   there are only two address registers available.  The destination in the
+ *   ARL instruction is set to TEMP <n> (The temp isn't actually written).
+ *
+ *   When using vanilla ARB_v_p, the proprietary driver will squish both the
+ *   available ADDRESS regs into the first hardware reg in the X and Y
+ *   components.
+ *
+ *   To use an address reg as an index into consts, the CONST_SRC is set to
+ *   (const_base + offset) and INDEX_CONST is set.
+ *
+ *   To access the second address reg use ADDR_REG_SELECT_1. A particular
+ *   component of the address regs is selected with ADDR_SWZ.
+ *
+ *   Only one address register can be accessed per instruction.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details) Conditional
+ * execution of an instruction is enabled by setting COND_TEST_ENABLE, and
+ * selecting the condition which will allow the test to pass with
+ * COND_{FL,LT,...}.  It is possible to swizzle the values in the condition
+ * register, which allows for testing against an individual component.
+ *
+ * Branching:
+ *
+ *   The BRA/CAL instructions seem to follow a slightly different opcode
+ *   layout.  The destination instruction ID (IADDR) overlaps a source field.
+ *   Instruction IDs seem to be numbered based on the UPLOAD_FROM_ID FIFO
+ *   command, and are incremented automatically on each UPLOAD_INST FIFO
+ *   command.
+ *
+ *   Conditional branching is achieved by using the condition tests described
+ *   above.  There don't appear to be dedicated looping instructions, but
+ *   loops can be built using a temp reg + conditional branching.
+ *
+ *   Subroutines may be uploaded before the main program itself, but the first
+ *   executed instruction is determined by the PROGRAM_START_ID FIFO command.
+ *
+ */
+
+/* DWORD 0 */
+
+/*
+ * Flag bits and bit-fields of the first instruction dword.  Each field is
+ * given as a _SHIFT/_MASK pair.  DEST_TEMP_ID_MASK and VEC_DEST_TEMP_MASK
+ * describe the same four bits (16..19); COND_SWZ_ALL covers the four
+ * two-bit condition swizzle fields (bits 3..10) at once.
+ */
+#define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24)
+#define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */
+#define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */
+#define NV30_VP_INST_SRC0_ABS           (1 << 21) /* guess */
+#define NV30_VP_INST_VEC_RESULT         (1 << 20)
+#define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
+#define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
+#define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
+#define NV30_VP_INST_COND_SHIFT          11
+#define NV30_VP_INST_COND_MASK          (0x07 << 11)
+#define NV30_VP_INST_COND_SWZ_X_SHIFT        9
+#define NV30_VP_INST_COND_SWZ_X_MASK        (0x03 <<  9)
+#define NV30_VP_INST_COND_SWZ_Y_SHIFT        7
+#define NV30_VP_INST_COND_SWZ_Y_MASK        (0x03 <<  7)
+#define NV30_VP_INST_COND_SWZ_Z_SHIFT        5
+#define NV30_VP_INST_COND_SWZ_Z_MASK        (0x03 <<  5)
+#define NV30_VP_INST_COND_SWZ_W_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_W_MASK        (0x03 <<  3)
+#define NV30_VP_INST_COND_SWZ_ALL_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_ALL_MASK        (0xFF <<  3)
+#define NV30_VP_INST_ADDR_SWZ_SHIFT        1
+#define NV30_VP_INST_ADDR_SWZ_MASK        (0x03 <<  1)
+#define NV30_VP_INST_SCA_OPCODEH_SHIFT        0
+#define NV30_VP_INST_SCA_OPCODEH_MASK        (0x01 <<  0)
+
+/* DWORD 1 */
+/*
+ * Second instruction dword: the low four bits of the scalar opcode (its
+ * high bit is SCA_OPCODEH in dword 0), the vector opcode, the constant and
+ * input source selectors, and the upper nine bits of source 0.
+ */
+#define NV30_VP_INST_SCA_OPCODEL_SHIFT        28
+#define NV30_VP_INST_SCA_OPCODEL_MASK        (0x0F << 28)
+#define NV30_VP_INST_VEC_OPCODE_SHIFT        23
+#define NV30_VP_INST_VEC_OPCODE_MASK        (0x1F << 23)
+#define NV30_VP_INST_CONST_SRC_SHIFT        14
+#define NV30_VP_INST_CONST_SRC_MASK        (0xFF << 14)
+#define NV30_VP_INST_INPUT_SRC_SHIFT        9    /*NV20*/
+#define NV30_VP_INST_INPUT_SRC_MASK        (0x0F <<  9)  /*NV20*/
+#define NV30_VP_INST_SRC0H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC0H_MASK          (0x1FF << 0)  /*NV20*/
+
+/* Please note: the IADDR fields overlap other fields because they are used
+ * only for branch instructions.  See Branching: label above
+ *
+ * DWORD 2
+ */
+/* Low six bits of source 0, all of source 1, upper eleven bits of source 2. */
+#define NV30_VP_INST_SRC0L_SHIFT        26    /*NV20*/
+#define NV30_VP_INST_SRC0L_MASK         (0x3F  <<26)  /* NV30_VP_SRC0_LOW_MASK << 26 */
+#define NV30_VP_INST_SRC1_SHIFT         11    /*NV20*/
+#define NV30_VP_INST_SRC1_MASK          (0x7FFF<<11)  /*NV20*/
+#define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
+/* NOTE(review): IADDR_SHIFT (2) and IADDR_MASK (bits 28..31) do not describe
+ * the same bit range -- confirm which one matches the hardware. */
+#define NV30_VP_INST_IADDR_SHIFT        2
+#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */
+
+/* DWORD 3 */
+/*
+ * Fourth instruction dword: low four bits of source 2, the four write
+ * masks, and the result register selector (DEST, a four-bit field).  The
+ * indented NV30_VP_INST_DEST_* names are the values stored in that field.
+ */
+#define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
+#define NV30_VP_INST_SRC2L_MASK          (0x0F  <<28)  /*NV20*/
+#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT      24
+#define NV30_VP_INST_STEMP_WRITEMASK_MASK      (0x0F << 24)
+#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT      20
+#define NV30_VP_INST_VTEMP_WRITEMASK_MASK      (0x0F << 20)
+#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT      16
+#define NV30_VP_INST_SDEST_WRITEMASK_MASK      (0x0F << 16)
+#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
+#define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
+#define NV30_VP_INST_DEST_SHIFT        2
+#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#  define NV30_VP_INST_DEST_POS  0
+#  define NV30_VP_INST_DEST_BFC0  1
+#  define NV30_VP_INST_DEST_BFC1  2
+#  define NV30_VP_INST_DEST_COL0  3
+#  define NV30_VP_INST_DEST_COL1  4
+#  define NV30_VP_INST_DEST_FOGC  5
+#  define NV30_VP_INST_DEST_PSZ   6
+/* Texture coordinate n; the argument is parenthesised so that an expression
+ * such as (i << 1) is added as a whole. */
+#  define NV30_VP_INST_DEST_TC(n)  (8+(n))
+
+/* Useful to split the source selection regs into their pieces */
+/*
+ * A source descriptor is 15 bits wide.  Source 0 is stored as its upper nine
+ * bits (SRC0H) and lower six bits (SRC0L); source 2 as its upper eleven bits
+ * (SRC2H) and lower four bits (SRC2L).
+ */
+#define NV30_VP_SRC0_HIGH_SHIFT                                                6
+#define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0
+#define NV30_VP_SRC0_LOW_MASK                                         0x0000003F
+#define NV30_VP_SRC2_HIGH_SHIFT                                                4
+#define NV30_VP_SRC2_HIGH_MASK                                        0x00007FF0
+#define NV30_VP_SRC2_LOW_MASK                                         0x0000000F
+
+
+/* Source-register definition - matches NV20 exactly */
+/* Layout of one 15-bit source descriptor: negate flag, four two-bit swizzle
+ * selectors, temp register index and register type. */
+#define NV30_VP_SRC_NEGATE          (1<<14)
+#define NV30_VP_SRC_SWZ_X_SHIFT        12
+#define NV30_VP_SRC_REG_SWZ_X_MASK        (0x03  <<12)
+#define NV30_VP_SRC_SWZ_Y_SHIFT        10
+#define NV30_VP_SRC_REG_SWZ_Y_MASK        (0x03  <<10)
+#define NV30_VP_SRC_SWZ_Z_SHIFT        8
+#define NV30_VP_SRC_REG_SWZ_Z_MASK        (0x03  << 8)
+#define NV30_VP_SRC_SWZ_W_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_W_MASK        (0x03  << 6)
+#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK        (0xFF  << 6)
+/* NOTE(review): the temp index is shifted by 2, but TEMP_ID_MASK is not
+ * shifted and overlaps REG_TYPE_MASK -- confirm the intended mask. */
+#define NV30_VP_SRC_TEMP_SRC_SHIFT        2
+#define NV30_VP_SRC_REG_TEMP_ID_MASK        (0x0F  << 0)
+#define NV30_VP_SRC_REG_TYPE_SHIFT        0
+#define NV30_VP_SRC_REG_TYPE_MASK        (0x03  << 0)
+#define NV30_VP_SRC_REG_TYPE_TEMP  1
+#define NV30_VP_SRC_REG_TYPE_INPUT  2
+#define NV30_VP_SRC_REG_TYPE_CONST  3 /* guess */
+
+#include "nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c
new file mode 100644
index 0000000000..0068b1ba54
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv40_fragtex.c
@@ -0,0 +1,176 @@
+#include "util/u_format.h"
+#include "nvfx_context.h"
+#include "nvfx_tex.h"
+#include "nvfx_resource.h"
+
+/*
+ * Fold the NV40-specific parts of a sampler CSO into the hardware state.
+ *
+ * Only ORs bits into ps->wrap, ps->en and ps->filt, so all three must
+ * already hold their base values.  `pipe` is not used.
+ *
+ *   - max_anisotropy >= 2 sets bit 5 of ps->wrap and selects one of the
+ *     2X..16X anisotropy enable values.
+ *   - lod_bias is clamped to [-16, 15] and stored as a 13-bit value in
+ *     units of 1/256 in ps->filt.
+ *   - max_lod and min_lod are clamped to [0, 15], scaled by 256 and placed
+ *     at bits 7 and 19 of ps->en.
+ */
+void
+nv40_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso)
+{
+	if (cso->max_anisotropy >= 2) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+		if (cso->max_anisotropy >= 16) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
+		} else
+		if (cso->max_anisotropy >= 12) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
+		} else
+		if (cso->max_anisotropy >= 10) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
+		} else
+		if (cso->max_anisotropy >= 8) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
+		} else
+		if (cso->max_anisotropy >= 6) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
+		} else
+		if (cso->max_anisotropy >= 4) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
+		}
+	}
+
+	{
+		float limit;
+
+		/* Encode the clamped bias: an out-of-range value would
+		 * otherwise wrap around when masked to 13 bits. */
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 7;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 19;
+	}
+}
+
+/*
+ * Builds one struct nv40_texture_format initializer, in field order:
+ * defined = TRUE, pipe = PIPE_FORMAT_<m>, format =
+ * NV40TCL_TEX_FORMAT_FORMAT_<tf>, swizzle = the OR of the S0/S1 component
+ * selectors, sign = the NV34TCL_TX_FILTER_SIGNED_* bits for each channel
+ * whose sx/sy/sz/sw argument is 1.
+ */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |         \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |         \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |         \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),         \
+  ((NV34TCL_TX_FILTER_SIGNED_RED*sx) | (NV34TCL_TX_FILTER_SIGNED_GREEN*sy) |       \
+   (NV34TCL_TX_FILTER_SIGNED_BLUE*sz) | (NV34TCL_TX_FILTER_SIGNED_ALPHA*sw))       \
+}
+
+/* One row of the pipe-format to NV40 texture-format translation table. */
+struct nv40_texture_format {
+	boolean defined;	/* TRUE for real rows; zero in the terminator */
+	uint	pipe;		/* PIPE_FORMAT_* value this row matches */
+	int     format;		/* NV40TCL_TEX_FORMAT_FORMAT_* bits */
+	int     swizzle;	/* NV34TCL_TX_SWIZZLE_* bits */
+	int     sign;		/* NV34TCL_TX_FILTER_SIGNED_* bits */
+};
+
+/*
+ * Texture formats the NV40 fragment texture path knows how to program.
+ * The final empty entry has `defined` equal to zero and terminates the
+ * scan in nv40_fragtex_format().  R16_SNORM is the only row that marks its
+ * channels as signed.
+ */
+static struct nv40_texture_format
+nv40_texture_formats[] = {
+	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
+	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(S8_USCALED_Z24_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	{},
+};
+
+/*
+ * Find the nv40_texture_formats row for a pipe format.
+ *
+ * Walks the table until the terminating entry (defined == 0).  Returns the
+ * matching row, or logs an error naming the format and returns NULL when
+ * no row matches.
+ */
+static struct nv40_texture_format *
+nv40_fragtex_format(uint pipe_format)
+{
+	struct nv40_texture_format *entry;
+
+	for (entry = nv40_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
+	return NULL;
+}
+
+
+/*
+ * Program fragment texture unit `unit` from the currently bound sampler
+ * state and sampler view.
+ *
+ * Builds the TX_FORMAT word from the sampler's fmt bits, the texture's
+ * format, mipmap count, target and linear/swizzled layout, then emits the
+ * eight TX_OFFSET(unit) registers followed by TEX_SIZE1(unit).  On success
+ * the format word is cached in nvfx->hw_txf[unit] and the unit's bit is set
+ * in nvfx->hw_samplers.
+ *
+ * Returns without emitting anything when the texture target is not handled
+ * or the texture format has no table entry.  The latter also trips an
+ * assertion in debug builds.
+ */
+void
+nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
+	struct nvfx_miptree *nv40mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
+	struct nouveau_bo *bo = nv40mt->base.bo;
+	struct pipe_resource *pt = &nv40mt->base.base;
+	struct nv40_texture_format *tf;
+
+	uint32_t txf, txs, txp;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv40_fragtex_format(pt->format);
+	if (!tf) {
+		/* Still loud in debug builds, but never dereference a NULL
+		 * table entry when assertions are compiled out. */
+		assert(0);
+		return;
+	}
+
+	txf  = ps->fmt;
+	txf |= tf->format | 0x8000;
+	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+
+	if (1) /* XXX */
+		txf |= NV34TCL_TX_FORMAT_NO_BORDER;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	/* Linear textures need the LINEAR flag and the level-0 pitch. */
+	if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
+		txp = 0;
+	} else {
+		txp  = nv40mt->level[0].pitch;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
+	}
+
+	txs = tf->swizzle;
+
+	/* 11 dwords are written below; unit 0 reserves two more. */
+	MARK_RING(chan, 11 + 2 * !unit, 2);
+	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
+	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
+			NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	OUT_RING(chan, ps->wrap);
+	OUT_RING(chan, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
+	OUT_RING(chan, txs);
+	OUT_RING(chan, ps->filt | tf->sign | 0x2000 /*voodoo*/);
+	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0);
+	OUT_RING(chan, ps->bcol);
+	OUT_RING(chan, RING_3D(NV40TCL_TEX_SIZE1(unit), 1));
+	OUT_RING(chan, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+
+	nvfx->hw_txf[unit] = txf;
+	nvfx->hw_samplers |= (1 << unit);
+}
diff --git a/src/gallium/drivers/nvfx/nv40_vertprog.h b/src/gallium/drivers/nvfx/nv40_vertprog.h
new file mode 100644
index 0000000000..7337293bab
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv40_vertprog.h
@@ -0,0 +1,177 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK are not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40; its use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                                (0x1F << 27)
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                    (0x1FF << 23)
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                       (7 << 29)
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                    (0x7FF << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+#    define NV40_VP_INST_DEST_TC(n)                                      (7+(n))
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+#include "nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c
new file mode 100644
index 0000000000..05b824b8f7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_buffer.c
@@ -0,0 +1,153 @@
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "nouveau/nouveau_screen.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nvfx_resource.h"
+
+
+/* Currently using separate implementations for buffers and textures,
+ * even though gallium has a unified abstraction of these objects.
+ * Eventually these should be combined, and mechanisms like transfers
+ * be adapted to work for both buffer and texture uploads.
+ */
+static void nvfx_buffer_destroy(struct pipe_screen *pscreen,
+				struct pipe_resource *presource)
+{
+	struct nvfx_resource *res = nvfx_resource(presource);
+
+	nouveau_screen_bo_release(pscreen, res->bo);	/* release the BO first */
+	FREE(res);					/* then free the wrapper */
+}
+
+
+
+
+/* Utility functions for transfer create/destroy are hooked in and
+ * just record the arguments to those functions.
+ */
+static void *
+nvfx_buffer_transfer_map( struct pipe_context *pipe,
+			  struct pipe_transfer *transfer )
+{
+	struct nvfx_resource *res = nvfx_resource(transfer->resource);
+	uint8_t *ptr;
+
+	/* Map the range given by box.x / box.width, using map flags that are
+	 * derived from the transfer's usage. */
+	ptr = nouveau_screen_bo_map_range(pipe->screen, res->bo,
+					  transfer->box.x, transfer->box.width,
+					  nouveau_screen_transfer_flags(transfer->usage));
+
+	/* NULL signals a failed mapping; otherwise the returned pointer is
+	 * advanced by box.x before being handed to the caller. */
+	return ptr ? ptr + transfer->box.x : NULL;
+}
+
+
+
+static void nvfx_buffer_transfer_flush_region( struct pipe_context *pipe,
+					       struct pipe_transfer *transfer,
+					       const struct pipe_box *box)
+{
+	/* The flushed range starts at the transfer's x plus the sub-box's x
+	 * and spans the sub-box's width. */
+	nouveau_screen_bo_map_flush_range(pipe->screen,
+					  nvfx_resource(transfer->resource)->bo,
+					  transfer->box.x + box->x,
+					  box->width);
+}
+
+static void nvfx_buffer_transfer_unmap( struct pipe_context *pipe,
+					struct pipe_transfer *transfer )
+{
+	/* No range is passed: the unmap call only takes the screen and the BO. */
+	struct nouveau_bo *bo = nvfx_resource(transfer->resource)->bo;
+	nouveau_screen_bo_unmap(pipe->screen, bo);
+}
+
+
+/* Buffer resource vtable: u_default_* helpers fill get_handle, get_transfer,
+ * transfer_destroy and transfer_inline_write; the rest is nvfx_buffer_*. */
+struct u_resource_vtbl nvfx_buffer_vtbl = 
+{
+	u_default_resource_get_handle,      /* get_handle */
+	nvfx_buffer_destroy,		     /* resource_destroy */
+	NULL,			    /* is_resource_referenced (no hook installed) */
+	u_default_get_transfer,	     /* get_transfer */
+	u_default_transfer_destroy,	     /* transfer_destroy */
+	nvfx_buffer_transfer_map,	     /* transfer_map */
+	nvfx_buffer_transfer_flush_region,  /* transfer_flush_region */
+	nvfx_buffer_transfer_unmap,	     /* transfer_unmap */
+	u_default_transfer_inline_write   /* transfer_inline_write */
+};
+
+
+
+struct pipe_resource *
+nvfx_buffer_create(struct pipe_screen *pscreen,
+		   const struct pipe_resource *template)
+{
+	struct nvfx_resource *res = CALLOC_STRUCT(nvfx_resource);
+
+	if (res == NULL)
+		return NULL;
+
+	/* Copy the caller's template wholesale, then overwrite the fields
+	 * this function sets itself: vtable, reference count and screen. */
+	res->base = *template;
+	res->vtbl = &nvfx_buffer_vtbl;
+	pipe_reference_init(&res->base.reference, 1);
+	res->base.screen = pscreen;
+
+	/* Request a BO from the template's usage, bind and width0; the
+	 * literal 16 is presumably the requested alignment -- TODO confirm. */
+	res->bo = nouveau_screen_bo_new(pscreen,
+					16,
+					res->base.usage,
+					res->base.bind,
+					res->base.width0);
+	if (res->bo != NULL)
+		return &res->base;
+
+	/* BO creation failed: free the wrapper and report failure. */
+	FREE(res);
+	return NULL;
+}
+
+
+struct pipe_resource *
+nvfx_user_buffer_create(struct pipe_screen *pscreen,
+			void *ptr,
+			unsigned bytes,
+			unsigned usage)
+{
+	struct nvfx_resource *res = CALLOC_STRUCT(nvfx_resource);
+
+	if (res == NULL)
+		return NULL;
+
+	/* No template here, so the resource is described by hand: R8_UNORM,
+	 * 'bytes' wide, 1x1 otherwise.  Note 'usage' lands in the bind field. */
+	res->base.screen = pscreen;
+	res->base.format = PIPE_FORMAT_R8_UNORM;
+	res->base.usage = PIPE_USAGE_IMMUTABLE;
+	res->base.bind = usage;
+	res->base.width0 = bytes;
+	res->base.height0 = 1;
+	res->base.depth0 = 1;
+	res->vtbl = &nvfx_buffer_vtbl;
+	pipe_reference_init(&res->base.reference, 1);
+
+	/* Ask the screen for a BO built from the caller's pointer and size. */
+	res->bo = nouveau_screen_bo_user(pscreen, ptr, bytes);
+	if (res->bo != NULL)
+		return &res->base;
+
+	/* No BO could be obtained: free the wrapper and report failure. */
+	FREE(res);
+	return NULL;
+}
+
diff --git a/src/gallium/drivers/nvfx/nvfx_clear.c b/src/gallium/drivers/nvfx/nvfx_clear.c
new file mode 100644
index 0000000000..2be70fcee4
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_clear.c
@@ -0,0 +1,14 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_clear.h"
+
+#include "nvfx_context.h"
+
+void
+nvfx_clear(struct pipe_context *pipe, unsigned buffers,
+           const float *rgba, double depth, unsigned stencil)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe); /* holds the fb state */
+	util_clear(pipe, &nvfx->framebuffer, buffers, rgba, depth, stencil);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
new file mode 100644
index 0000000000..6d2dc4d5bf
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -0,0 +1,84 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_resource.h"
+
+static void
+nvfx_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nvfx_screen *screen = nvfx_context(pipe)->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	unsigned val;
+
+	/* Texture cache flush: write 2, then 1, to method 0x1fd8. */
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE)
+		for (val = 2; val >= 1; val--) {
+			BEGIN_RING(chan, eng3d, 0x1fd8, 1);
+			OUT_RING  (chan, val);
+		}
+
+	FIRE_RING(chan);
+	if (fence != NULL)	/* no fence object is produced; report NULL */
+		*fence = NULL;
+}
+
+static void
+nvfx_destroy(struct pipe_context *pipe)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	if (ctx->draw != NULL)	/* tear down the draw context, if one exists */
+		draw_destroy(ctx->draw);
+	FREE(ctx);
+}
+
+struct pipe_context *
+nvfx_create(struct pipe_screen *pscreen, void *priv)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	void *prev_private = screen->base.channel->user_private;
+	struct nvfx_context *nvfx = CALLOC(1, sizeof(struct nvfx_context));
+
+	if (!nvfx)
+		return NULL;
+	nvfx->screen = screen;
+	nvfx->nvws = screen->nvws;
+	nvfx->is_nv4x = screen->is_nv4x;
+	nvfx->pipe.winsys = pscreen->winsys;
+	nvfx->pipe.screen = pscreen;
+	nvfx->pipe.priv = priv;
+	nvfx->pipe.destroy = nvfx_destroy;
+	nvfx->pipe.draw_arrays = nvfx_draw_arrays;
+	nvfx->pipe.draw_elements = nvfx_draw_elements;
+	nvfx->pipe.clear = nvfx_clear;
+	nvfx->pipe.flush = nvfx_flush;
+
+	screen->base.channel->user_private = nvfx;
+	nvfx_init_query_functions(nvfx);
+	nvfx_init_surface_functions(nvfx);
+	nvfx_init_state_functions(nvfx);
+	nvfx_init_resource_functions(&nvfx->pipe);
+
+	/* Create, configure, and install fallback swtnl path */
+	nvfx->draw = draw_create(&nvfx->pipe);
+	if (!nvfx->draw) {
+		/* undo the channel hook so it never points at freed memory */
+		screen->base.channel->user_private = prev_private;
+		FREE(nvfx);
+		return NULL;
+	}
+	draw_wide_point_threshold(nvfx->draw, 9999999.0);
+	draw_wide_line_threshold(nvfx->draw, 9999999.0);
+	draw_enable_line_stipple(nvfx->draw, FALSE);
+	draw_enable_point_sprites(nvfx->draw, FALSE);
+	draw_set_rasterize_stage(nvfx->draw, nvfx_draw_render_stage(nvfx));
+
+	/* set these so that we init them on first validation */
+	nvfx->state.scissor_enabled = ~0;
+	nvfx->state.stipple_enabled = ~0;
+	return &nvfx->pipe;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
new file mode 100644
index 0000000000..e48f9f3aa8
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -0,0 +1,251 @@
+#ifndef __NVFX_CONTEXT_H__
+#define __NVFX_CONTEXT_H__
+
+#include <stdio.h>
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#include "nvfx_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) /* stderr, prefixed with function:line */ \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args); /* NOTE(review): ';' is part of the expansion */
+#define NOUVEAU_MSG(fmt, args...) /* stderr, prefixed with "nouveau: " */ \
+	fprintf(stderr, "nouveau: "fmt, ##args); /* NOTE(review): ';' is part of the expansion */
+
+#include "nvfx_screen.h"
+
+#define NVFX_NEW_BLEND		(1 <<  0)
+#define NVFX_NEW_RAST		(1 <<  1)
+#define NVFX_NEW_ZSA		(1 <<  2)
+#define NVFX_NEW_SAMPLER	(1 <<  3)
+#define NVFX_NEW_FB		(1 <<  4)
+#define NVFX_NEW_STIPPLE	(1 <<  5)
+#define NVFX_NEW_SCISSOR	(1 <<  6)
+#define NVFX_NEW_VIEWPORT	(1 <<  7)
+#define NVFX_NEW_BCOL		(1 <<  8)
+#define NVFX_NEW_VERTPROG	(1 <<  9)
+#define NVFX_NEW_FRAGPROG	(1 << 10)
+#define NVFX_NEW_ARRAYS		(1 << 11)
+#define NVFX_NEW_UCP		(1 << 12)
+#define NVFX_NEW_SR		(1 << 13)
+#define NVFX_NEW_VERTCONST	(1 << 14)
+#define NVFX_NEW_FRAGCONST	(1 << 15)
+
+struct nvfx_rasterizer_state {
+	struct pipe_rasterizer_state pipe;	/* gallium-level state this wraps */
+	unsigned sb_len;	/* presumably count of used sb[] words -- TODO confirm */
+	uint32_t sb[32];	/* presumably pre-built HW words -- TODO confirm */
+};
+
+struct nvfx_zsa_state {
+	struct pipe_depth_stencil_alpha_state pipe;	/* gallium-level state this wraps */
+	unsigned sb_len;	/* presumably count of used sb[] words -- TODO confirm */
+	uint32_t sb[26];	/* presumably pre-built HW words -- TODO confirm */
+};
+
+struct nvfx_blend_state {
+	struct pipe_blend_state pipe;	/* gallium-level state this wraps */
+	unsigned sb_len;	/* presumably count of used sb[] words -- TODO confirm */
+	uint32_t sb[13];	/* presumably pre-built HW words -- TODO confirm */
+};
+
+
+struct nvfx_state {
+	unsigned scissor_enabled;	/* nvfx_create() seeds this with ~0 */
+	unsigned stipple_enabled;	/* nvfx_create() seeds this with ~0 */
+	unsigned fp_samplers;
+};
+
+struct nvfx_vtxelt_state {
+	struct pipe_vertex_element pipe[16];	/* room for at most 16 elements */
+	unsigned num_elements;	/* presumably how many pipe[] entries are valid -- TODO confirm */
+};
+
+struct nvfx_render_target {
+	struct nouveau_bo* bo;	/* buffer object; presumably the surface storage -- TODO confirm */
+	unsigned offset;	/* presumably a byte offset within bo -- TODO confirm */
+	unsigned pitch;
+};
+
+struct nvfx_context {
+	struct pipe_context pipe;	/* must stay first: nvfx_context() casts pipe_context* to this type */
+
+	struct nouveau_winsys *nvws;
+	struct nvfx_screen *screen;
+
+	unsigned is_nv4x; /* either 0 or ~0 */
+
+	struct draw_context *draw;	/* draw module used by the fallback swtnl path */
+
+	/* HW state derived from pipe states */
+	struct nvfx_state state;
+	struct {
+		struct nvfx_vertex_program *vertprog;
+
+		unsigned nr_attribs;
+		unsigned hw[PIPE_MAX_SHADER_INPUTS];
+		unsigned draw[PIPE_MAX_SHADER_INPUTS];
+		unsigned emit[PIPE_MAX_SHADER_INPUTS];
+	} swtnl;
+
+	enum {
+		HW, SWTNL, SWRAST
+	} render_mode;
+	unsigned fallback_swtnl;
+
+	/* Context state */
+	unsigned dirty, draw_dirty;
+	struct pipe_scissor_state scissor;
+	unsigned stipple[32];
+	struct pipe_clip_state clip;
+	struct nvfx_vertex_program *vertprog;
+	struct nvfx_fragment_program *fragprog;
+	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+	struct nvfx_rasterizer_state *rasterizer;
+	struct nvfx_zsa_state *zsa;
+	struct nvfx_blend_state *blend;
+	struct pipe_blend_color blend_colour;
+	struct pipe_stencil_ref stencil_ref;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct pipe_resource *idxbuf;
+	unsigned idxbuf_format;
+	struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+	unsigned nr_samplers;
+	unsigned nr_textures;
+	unsigned dirty_samplers;
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct nvfx_vtxelt_state *vtxelt;
+
+	unsigned vbo_bo;
+	unsigned hw_vtxelt_nr;
+	uint8_t hw_samplers;	/* bitmask: nv40_fragtex_set ORs in (1 << unit) */
+	uint32_t hw_txf[8];	/* txf value stored per unit; NOTE(review): 8 slots vs PIPE_MAX_SAMPLERS views -- confirm unit < 8 */
+	struct nvfx_render_target hw_rt[4];
+	struct nvfx_render_target hw_zeta;
+};
+
+static INLINE struct nvfx_context *
+nvfx_context(struct pipe_context *pipe)
+{
+	return (struct nvfx_context *)pipe; /* plain cast: relies on 'pipe' being the first member */
+}
+
+extern struct nvfx_state_entry nvfx_state_blend;
+extern struct nvfx_state_entry nvfx_state_blend_colour;
+extern struct nvfx_state_entry nvfx_state_fragprog;
+extern struct nvfx_state_entry nvfx_state_fragtex;
+extern struct nvfx_state_entry nvfx_state_framebuffer;
+extern struct nvfx_state_entry nvfx_state_rasterizer;
+extern struct nvfx_state_entry nvfx_state_scissor;
+extern struct nvfx_state_entry nvfx_state_sr;
+extern struct nvfx_state_entry nvfx_state_stipple;
+extern struct nvfx_state_entry nvfx_state_vbo;
+extern struct nvfx_state_entry nvfx_state_vertprog;
+extern struct nvfx_state_entry nvfx_state_viewport;
+extern struct nvfx_state_entry nvfx_state_vtxfmt;
+extern struct nvfx_state_entry nvfx_state_zsa;
+
+extern void nvfx_init_query_functions(struct nvfx_context *nvfx);
+extern void nvfx_init_surface_functions(struct nvfx_context *nvfx);
+
+/* nvfx_context.c */
+struct pipe_context *
+nvfx_create(struct pipe_screen *pscreen, void *priv);
+
+/* nvfx_clear.c */
+extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers,
+		       const float *rgba, double depth, unsigned stencil);
+
+/* nvfx_draw.c */
+extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx);
+extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe,
+                                     struct pipe_resource *idxbuf,
+                                     unsigned ib_size, int ib_bias,
+                                     unsigned mode,
+                                     unsigned start, unsigned count);
+extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx);
+
+/* nvfx_fb.c */
+extern void nvfx_state_framebuffer_validate(struct nvfx_context *nvfx);
+void
+nvfx_framebuffer_relocate(struct nvfx_context *nvfx);
+
+/* nvfx_fragprog.c */
+extern void nvfx_fragprog_destroy(struct nvfx_context *,
+				    struct nvfx_fragment_program *);
+extern void nvfx_fragprog_validate(struct nvfx_context *nvfx);
+extern void
+nvfx_fragprog_relocate(struct nvfx_context *nvfx);
+
+/* nvfx_fragtex.c */
+extern void nvfx_fragtex_validate(struct nvfx_context *nvfx);
+extern void
+nvfx_fragtex_relocate(struct nvfx_context *nvfx);
+
+/* nv30_fragtex.c */
+extern void
+nv30_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso);
+extern void nv30_fragtex_set(struct nvfx_context *nvfx, int unit);
+
+/* nv40_fragtex.c */
+extern void
+nv40_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso);
+extern void nv40_fragtex_set(struct nvfx_context *nvfx, int unit);
+
+/* nvfx_state.c */
+extern void nvfx_init_state_functions(struct nvfx_context *nvfx);
+extern void nvfx_state_scissor_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_stipple_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_blend_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_blend_colour_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_viewport_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_rasterizer_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_sr_validate(struct nvfx_context *nvfx);
+extern void nvfx_state_zsa_validate(struct nvfx_context *nvfx);
+
+/* nvfx_state_emit.c */
+extern void nvfx_state_relocate(struct nvfx_context *nvfx);
+extern boolean nvfx_state_validate(struct nvfx_context *nvfx);
+extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx);
+extern void nvfx_state_emit(struct nvfx_context *nvfx);
+
+/* nvfx_transfer.c */
+extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx);
+
+/* nvfx_vbo.c */
+extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx);
+extern void nvfx_vbo_relocate(struct nvfx_context *nvfx);
+extern void nvfx_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern void nvfx_draw_elements(struct pipe_context *pipe,
+                               struct pipe_resource *indexBuffer,
+                               unsigned indexSize, int indexBias,
+                               unsigned mode, unsigned start,
+                               unsigned count);
+
+/* nvfx_vertprog.c */
+extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx);
+extern void nvfx_vertprog_destroy(struct nvfx_context *,
+				  struct nvfx_vertex_program *);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
new file mode 100644
index 0000000000..22cff370b7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -0,0 +1,350 @@
+#include "pipe/p_shader_tokens.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "util/u_pack_color.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pipe.h"
+
+#include "nvfx_context.h"
+
+/* Simple, but crappy, swtnl path; hopefully we won't need to hit this very
+ * often at all.  Uses "quadro style" vertex submission + a fixed vertex
+ * layout to avoid the need to generate a vertex program or vtxfmt.
+ */
+
+/* Final stage of the draw module pipeline: submits primitives to the GPU. */
+struct nvfx_render_stage {
+	/* Must remain the first member: nvfx_render_stage() casts a
+	 * struct draw_stage pointer straight to this type.
+	 */
+	struct draw_stage stage;
+	/* Context whose channel the stage emits commands on. */
+	struct nvfx_context *nvfx;
+	/* BEGIN_END mode currently open on the channel; compared against
+	 * NV34TCL_VERTEX_BEGIN_END_STOP to tell whether a primitive is open.
+	 */
+	unsigned prim;
+};
+
+/* Downcast a generic draw stage to the nvfx render stage that embeds it. */
+static INLINE struct nvfx_render_stage *
+nvfx_render_stage(struct draw_stage *stage)
+{
+	struct nvfx_render_stage *rs = (struct nvfx_render_stage *)stage;
+
+	return rs;
+}
+
+/* Push one software-transformed vertex to the hardware as immediate-mode
+ * vertex attribute methods.
+ *
+ * Walks the attribute table filled in by emit_attrib(): for slot i,
+ * swtnl.draw[i] selects the row of v->data to read, swtnl.hw[i] is the
+ * hardware attribute to write and swtnl.emit[i] is the submission format.
+ */
+static INLINE void
+nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	unsigned i;
+
+	for (i = 0; i < nvfx->swtnl.nr_attribs; i++) {
+		unsigned idx = nvfx->swtnl.draw[i];
+		unsigned hw = nvfx->swtnl.hw[i];
+
+		switch (nvfx->swtnl.emit[i]) {
+		case EMIT_OMIT:
+			break;
+		case EMIT_1F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			break;
+		case EMIT_2F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			break;
+		case EMIT_3F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
+			break;
+		case EMIT_4F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
+			OUT_RING  (chan, fui(v->data[idx][3]));
+			break;
+		case 0xff:
+			/* Driver-private format used for the position attribute
+			 * by nvfx_vtxfmt_validate(): emit x/w, y/w, z/w, 1/w.
+			 */
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(v->data[idx][0] / v->data[idx][3]));
+			OUT_RING  (chan, fui(v->data[idx][1] / v->data[idx][3]));
+			OUT_RING  (chan, fui(v->data[idx][2] / v->data[idx][3]));
+			OUT_RING  (chan, fui(1.0f / v->data[idx][3]));
+			break;
+		case EMIT_4UB:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][0]),
+					    float_to_ubyte(v->data[idx][1]),
+					    float_to_ubyte(v->data[idx][2]),
+					    float_to_ubyte(v->data[idx][3])));
+			/* Without this break the attribute was emitted a second
+			 * time by the BGRA case below, with red and blue swapped.
+			 */
+			break;
+		case EMIT_4UB_BGRA:
+			/* Same packing as EMIT_4UB with components 0 and 2 swapped. */
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][2]),
+					    float_to_ubyte(v->data[idx][1]),
+					    float_to_ubyte(v->data[idx][0]),
+					    float_to_ubyte(v->data[idx][3])));
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+}
+
+/* Submit one primitive of `count` vertices using hardware begin/end mode
+ * `mode`.  Consecutive primitives of the same mode share a single
+ * begin/end pair; the pair is closed early when the push buffer is
+ * running low so that a later flush does not land inside a primitive.
+ */
+static INLINE void
+nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
+	       unsigned mode, unsigned count)
+{
+	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
+	struct nvfx_context *nvfx = rs->nvfx;
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	/* Room for 4xfloat32 + potentially 3 begin/end */
+	const unsigned needed = (count * 20) + 6;
+	unsigned vtx;
+
+	if (AVAIL_RING(chan) < needed) {
+		/* A primitive must never be left open across a flush. */
+		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+			NOUVEAU_ERR("AIII, missed flush\n");
+			assert(0);
+		}
+		FIRE_RING(chan);
+		nvfx_state_emit(nvfx);
+	}
+
+	if (rs->prim != mode) {
+		/* Close whatever primitive is still open ... */
+		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+			OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		}
+
+		/* ... and open the requested one. */
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, mode);
+		rs->prim = mode;
+	}
+
+	vtx = 0;
+	while (vtx < count) {
+		nvfx_render_vertex(nvfx, prim->v[vtx]);
+		vtx++;
+	}
+
+	/* Not enough space left for another primitive of this size:
+	 * terminate now so the next call is free to flush.
+	 */
+	if (AVAIL_RING(chan) < needed) {
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+	}
+}
+
+/* draw_stage point hook: submit a one-vertex primitive in POINTS mode. */
+static void
+nvfx_render_point(struct draw_stage *draw, struct prim_header *prim)
+{
+	const unsigned nr_verts = 1;
+
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_POINTS, nr_verts);
+}
+
+/* draw_stage line hook: submit a two-vertex primitive in LINES mode. */
+static void
+nvfx_render_line(struct draw_stage *draw, struct prim_header *prim)
+{
+	const unsigned nr_verts = 2;
+
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_LINES, nr_verts);
+}
+
+/* draw_stage tri hook: submit a three-vertex primitive in TRIANGLES mode. */
+static void
+nvfx_render_tri(struct draw_stage *draw, struct prim_header *prim)
+{
+	const unsigned nr_verts = 3;
+
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_TRIANGLES, nr_verts);
+}
+
+/* draw_stage flush hook: close the begin/end pair left open by
+ * nvfx_render_prim(), if any.  `flags` is not used.
+ */
+static void
+nvfx_render_flush(struct draw_stage *draw, unsigned flags)
+{
+	struct nvfx_render_stage *rs = nvfx_render_stage(draw);
+	struct nvfx_context *nvfx = rs->nvfx;
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+
+	/* Nothing open, nothing to do. */
+	if (rs->prim == NV34TCL_VERTEX_BEGIN_END_STOP)
+		return;
+
+	BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+	rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+}
+
+/* draw_stage reset_stipple_counter hook: intentionally a no-op; this stage
+ * keeps no stipple state of its own.
+ */
+static void
+nvfx_render_reset_stipple_counter(struct draw_stage *draw)
+{
+}
+
+/* draw_stage destroy hook: release the render stage that embeds `draw`. */
+static void
+nvfx_render_destroy(struct draw_stage *draw)
+{
+	struct draw_stage *stage_mem = draw;
+
+	FREE(stage_mem);
+}
+
+/* Build the pass-through vertex program used by the swtnl path.
+ *
+ * Each MOV copies one fixed vertex input straight to an output:
+ * input 0 -> POSITION, 3 -> COLOR0/BCOLOR0, 4 -> COLOR1/BCOLOR1,
+ * 5 -> FOG.x and inputs 8..15 -> GENERIC 0..7.  These input numbers match
+ * the hardware attribute ids that nvfx_vtxfmt_validate() hands to
+ * emit_attrib().
+ *
+ * Returns NULL if the ureg program cannot be created; otherwise whatever
+ * ureg_create_shader_and_destroy() returns.
+ */
+static struct nvfx_vertex_program *
+nvfx_create_drawvp(struct nvfx_context *nvfx)
+{
+	struct ureg_program *ureg;
+	uint i;
+
+	ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
+	if (ureg == NULL)
+		return NULL;
+
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0), ureg_DECL_vs_input(ureg, 3));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1), ureg_DECL_vs_input(ureg, 4));
+	/* Back-face colours read the same inputs as the front-face ones. */
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 0), ureg_DECL_vs_input(ureg, 3));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 1), ureg_DECL_vs_input(ureg, 4));
+	/* NOTE(review): FOG is declared with semantic index 1 here, while
+	 * nvfx_vtxfmt_validate() looks FOG up with index 0 - confirm that
+	 * the index is irrelevant for this output.
+	 */
+	ureg_MOV(ureg,
+		   ureg_writemask(ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 1), TGSI_WRITEMASK_X),
+		   ureg_DECL_vs_input(ureg, 5));
+	for (i = 0; i < 8; ++i)
+		ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, i), ureg_DECL_vs_input(ureg, 8 + i));
+
+	ureg_END( ureg );
+
+	return ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
+}
+
+/* Create the render stage that terminates the draw module pipeline for
+ * `nvfx`, creating the swtnl pass-through vertex program on first use.
+ *
+ * Returns the embedded draw_stage, or NULL if the stage could not be
+ * allocated.
+ */
+struct draw_stage *
+nvfx_draw_render_stage(struct nvfx_context *nvfx)
+{
+	struct nvfx_render_stage *render = CALLOC_STRUCT(nvfx_render_stage);
+
+	/* Allocation can fail; do not write through a NULL pointer. */
+	if (!render)
+		return NULL;
+
+	if (!nvfx->swtnl.vertprog)
+		nvfx->swtnl.vertprog = nvfx_create_drawvp(nvfx);
+
+	render->nvfx = nvfx;
+	render->stage.draw = nvfx->draw;
+	render->stage.point = nvfx_render_point;
+	render->stage.line = nvfx_render_line;
+	render->stage.tri = nvfx_render_tri;
+	render->stage.flush = nvfx_render_flush;
+	render->stage.reset_stipple_counter = nvfx_render_reset_stipple_counter;
+	render->stage.destroy = nvfx_render_destroy;
+
+	return &render->stage;
+}
+
+/* Draw through the software TNL (draw module) path.
+ *
+ * Validates and emits swtnl state, maps every bound vertex buffer, the
+ * optional index buffer and the vertex shader constant buffer for reading,
+ * hands the mappings to the draw module, runs the draw, then unmaps
+ * everything and flushes both the draw module and the pipe context.
+ *
+ * idxbuf may be NULL for a non-indexed draw; idxbuf_size and idxbuf_bias
+ * are forwarded unchanged to draw_set_mapped_element_buffer().
+ * Returns silently if swtnl state validation fails.
+ *
+ * NOTE(review): the results of pipe_buffer_map() are passed on without a
+ * NULL check.
+ */
+void
+nvfx_draw_elements_swtnl(struct pipe_context *pipe,
+			 struct pipe_resource *idxbuf,
+			 unsigned idxbuf_size, int idxbuf_bias,
+			 unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
+	struct pipe_transfer *ib_transfer = NULL;
+	struct pipe_transfer *cb_transfer = NULL;
+	unsigned i;
+	void *map;
+
+	if (!nvfx_state_validate_swtnl(nvfx))
+		return;
+	nvfx_state_emit(nvfx);
+
+	/* Expose every bound vertex buffer to the draw module. */
+	for (i = 0; i < nvfx->vtxbuf_nr; i++) {
+		map = pipe_buffer_map(pipe, nvfx->vtxbuf[i].buffer,
+                                      PIPE_TRANSFER_READ,
+				      &vb_transfer[i]);
+		draw_set_mapped_vertex_buffer(nvfx->draw, i, map);
+	}
+
+	if (idxbuf) {
+		map = pipe_buffer_map(pipe, idxbuf,
+				      PIPE_TRANSFER_READ,
+				      &ib_transfer);
+		draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, idxbuf_bias, map);
+	} else {
+		/* Non-indexed draw: clear any previously set element buffer. */
+		draw_set_mapped_element_buffer(nvfx->draw, 0, 0, NULL);
+	}
+
+	if (nvfx->constbuf[PIPE_SHADER_VERTEX]) {
+		const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX];
+
+		map = pipe_buffer_map(pipe,
+				      nvfx->constbuf[PIPE_SHADER_VERTEX],
+				      PIPE_TRANSFER_READ,
+				      &cb_transfer);
+		draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0,
+                                                map, nr);
+	}
+
+	draw_arrays(nvfx->draw, mode, start, count);
+
+	/* Release the mappings in the same order they were taken. */
+	for (i = 0; i < nvfx->vtxbuf_nr; i++)
+		pipe_buffer_unmap(pipe, nvfx->vtxbuf[i].buffer, vb_transfer[i]);
+
+	if (idxbuf)
+		pipe_buffer_unmap(pipe, idxbuf, ib_transfer);
+
+	if (nvfx->constbuf[PIPE_SHADER_VERTEX])
+		pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX],
+				  cb_transfer);
+
+	draw_flush(nvfx->draw);
+	pipe->flush(pipe, 0, NULL);
+}
+
+/* Append one entry to the swtnl attribute table: the draw-module output
+ * found for (semantic, index) will be sent to hardware attribute `hw`
+ * using submission format `emit`.
+ */
+static INLINE void
+emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
+	    unsigned semantic, unsigned index)
+{
+	unsigned draw_out;
+	unsigned slot;
+
+	draw_out = draw_find_shader_output(nvfx->draw, semantic, index);
+
+	/* Claim the next free table slot. */
+	slot = nvfx->swtnl.nr_attribs;
+	nvfx->swtnl.nr_attribs++;
+
+	nvfx->swtnl.draw[slot] = draw_out;
+	nvfx->swtnl.emit[slot] = emit;
+	nvfx->swtnl.hw[slot] = hw;
+}
+
+/* Rebuild the swtnl attribute table from the inputs the bound fragment
+ * program reads.  Colours, generics (texcoords) and fog are emitted only
+ * when the fragment program uses them; position is always emitted last.
+ */
+void
+nvfx_vtxfmt_validate(struct nvfx_context *nvfx)
+{
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	unsigned colour_mask = 0, texcoord_mask = 0, need_fog = 0;
+	unsigned n;
+
+	/* Collect which inputs the fragment program wants. */
+	for (n = 0; n < fp->info.num_inputs; n++) {
+		switch (fp->info.input_semantic_name[n]) {
+		case TGSI_SEMANTIC_POSITION:
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			colour_mask |= (1 << fp->info.input_semantic_index[n]);
+			break;
+		case TGSI_SEMANTIC_GENERIC:
+			texcoord_mask |= (1 << fp->info.input_semantic_index[n]);
+			break;
+		case TGSI_SEMANTIC_FOG:
+			need_fog = 1;
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	nvfx->swtnl.nr_attribs = 0;
+
+	/* Colours live in hw attributes 3 and 4. */
+	for (n = 0; n < 2; n++) {
+		if (colour_mask & (1 << n))
+			emit_attrib(nvfx, 3 + n, EMIT_4F, TGSI_SEMANTIC_COLOR, n);
+	}
+
+	/* Generics live in hw attributes 8..15. */
+	for (n = 0; n < 8; n++) {
+		if (texcoord_mask & (1 << n))
+			emit_attrib(nvfx, 8 + n, EMIT_4F, TGSI_SEMANTIC_GENERIC, n);
+	}
+
+	if (need_fog)
+		emit_attrib(nvfx, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
+
+	/* 0xff is the driver-private format handled in nvfx_render_vertex(). */
+	emit_attrib(nvfx, 0, 0xff, TGSI_SEMANTIC_POSITION, 0);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
new file mode 100644
index 0000000000..6772d9bd51
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -0,0 +1,1004 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nvfx_context.h"
+#include "nvfx_shader.h"
+
+/* Capacity of nvfx_fpc::consts[] (one entry per constant reference). */
+#define MAX_CONSTS 128
+/* Capacity of nvfx_fpc::imm[] (one entry per TGSI immediate). */
+#define MAX_IMM 32
+/* Per-translation scratch state for compiling one TGSI fragment shader. */
+struct nvfx_fpc {
+	/* Program being filled in (instructions, constants, control bits). */
+	struct nvfx_fragment_program *fp;
+
+	/* TGSI input register index -> hardware input source selector. */
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
+
+	/* Bitmask of hardware temps currently allocated. */
+	unsigned r_temps;
+	/* Subset of r_temps that release_temps() will free. */
+	unsigned r_temps_discard;
+	/* TGSI output register index -> hardware result register. */
+	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	/* TGSI temporary index -> hardware temp; heap-allocated in
+	 * nvfx_fragprog_prepare(), NULL if the shader declares no temps.
+	 */
+	struct nvfx_sreg *r_temp;
+
+	/* One more than the highest temp index written so far. */
+	int num_regs;
+
+	/* Offset (in 32-bit words) of the instruction being built in fp->insn. */
+	unsigned inst_offset;
+	/* Non-zero once the current instruction has had its 4-word inline
+	 * constant slot appended.
+	 */
+	unsigned have_const;
+
+	struct {
+		/* Constant-buffer index, or -1 when `vals` holds a literal. */
+		int pipe;
+		float vals[4];
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	/* TGSI immediate index -> constant register created for it. */
+	struct nvfx_sreg imm[MAX_IMM];
+	unsigned nr_imm;
+};
+
+/* Allocate the lowest-numbered free hardware temp.  The temp is also marked
+ * for release by the next release_temps() call.  When every temp is in use
+ * an error is logged and temp 0 is returned.
+ */
+static INLINE struct nvfx_sreg
+temp(struct nvfx_fpc *fpc)
+{
+	/* ffs() is 1-based and returns 0 when no bit is set. */
+	const int free_idx = ffs(~fpc->r_temps) - 1;
+
+	if (free_idx >= 0) {
+		fpc->r_temps |= (1 << free_idx);
+		fpc->r_temps_discard |= (1 << free_idx);
+		return nvfx_sr(NVFXSR_TEMP, free_idx);
+	}
+
+	NOUVEAU_ERR("out of temps!!\n");
+	assert(0);
+	return nvfx_sr(NVFXSR_TEMP, 0);
+}
+
+/* Free every temp handed out by temp() since the discard set was last
+ * cleared.
+ */
+static INLINE void
+release_temps(struct nvfx_fpc *fpc)
+{
+	const unsigned discard = fpc->r_temps_discard;
+
+	fpc->r_temps = fpc->r_temps & ~discard;
+	fpc->r_temps_discard = 0;
+}
+
+/* Record a constant reference and return the register that names it.
+ *
+ * pipe >= 0 refers to that slot of the constant buffer and `vals` is
+ * ignored (may be NULL).  pipe == -1 stores the four literal floats in
+ * `vals` instead.
+ *
+ * If the table is full an error is logged and constant 0 is returned,
+ * mirroring what temp() does when it runs out of temps; the previous
+ * assert-only check let release builds write past the end of consts[].
+ */
+static INLINE struct nvfx_sreg
+constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
+{
+	int idx;
+
+	if (fpc->nr_consts >= MAX_CONSTS) {
+		NOUVEAU_ERR("out of constants!!\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_CONST, 0);
+	}
+	idx = fpc->nr_consts++;
+
+	fpc->consts[idx].pipe = pipe;
+	if (pipe == -1)
+		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
+	return nvfx_sr(NVFXSR_CONST, idx);
+}
+
+/* Shorthand for nvfx_fp_arith(): `o` is the opcode suffix that gets pasted
+ * onto NVFX_FP_OP_OPCODE_.
+ */
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nvfx_fp_arith((cc), (s), NVFX_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+/* Shorthand for nvfx_fp_tex().  The s1 and s2 arguments are ignored: the
+ * expansion always passes `none`, which must be in scope at the call site.
+ */
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nvfx_fp_tex((cc), (s), NVFX_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+
+/* Extend the program's instruction buffer by `size` 32-bit words.  The
+ * buffer may move, so pointers into fp->insn taken earlier are stale after
+ * this call.  The new words are not initialised here.
+ */
+static void
+grow_insns(struct nvfx_fpc *fpc, int size)
+{
+	struct nvfx_fragment_program *prog = fpc->fp;
+
+	prog->insn_len += size;
+	prog->insn = realloc(prog->insn, prog->insn_len * sizeof(uint32_t));
+}
+
+/* Encode source operand `src` into slot `pos` (0..2) of the instruction
+ * currently being built at fpc->inst_offset.
+ *
+ * Inputs put their selector in word 0, so an instruction has a single input
+ * field.  Constants append a 4-word slot after the instruction (once per
+ * instruction) that holds either the literal values or zeros plus a patch
+ * record in fp->consts for a constant-buffer value.
+ */
+static void
+emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NVFXSR_INPUT:
+		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+		hw[0] |= (src.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
+		break;
+	case NVFXSR_OUTPUT:
+		/* Outputs are read back as temps with the HALF flag set. */
+		sr |= NVFX_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NVFXSR_TEMP:
+		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NVFX_FP_REG_SRC_SHIFT);
+		break;
+	case NVFXSR_CONST:
+		if (!fpc->have_const) {
+			grow_insns(fpc, 4);
+			fpc->have_const = 1;
+		}
+
+		/* grow_insns() may have moved the buffer: refresh hw. */
+		hw = &fp->insn[fpc->inst_offset];
+		if (fpc->consts[src.index].pipe >= 0) {
+			struct nvfx_fragment_program_data *fpd;
+
+			/* Constant-buffer value: record where to patch it in
+			 * and zero the slot for now.
+			 */
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *
+					     sizeof(*fpd));
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4;
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {
+			/* Literal value: copy it into the slot directly. */
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4);
+		}
+
+		sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
+		break;
+	case NVFXSR_NONE:
+		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NVFX_FP_REG_NEGATE;
+
+	/* Per-source absolute-value flags sit in word 1, bit 29 + pos. */
+	if (src.abs)
+		hw[1] |= (1 << (29 + pos));
+
+	sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));
+
+	/* Source `pos` occupies instruction word pos + 1. */
+	hw[pos + 1] |= sr;
+}
+
+/* Encode the destination register of the instruction currently being built
+ * at fpc->inst_offset, updating the temp high-water mark and the program's
+ * control word as a side effect.
+ */
+static void
+emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst)
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *insn = &fp->insn[fpc->inst_offset];
+
+	if (dst.type == NVFXSR_TEMP) {
+		/* Track how many temps the program touches. */
+		if ((dst.index + 1) > fpc->num_regs)
+			fpc->num_regs = dst.index + 1;
+	} else if (dst.type == NVFXSR_OUTPUT) {
+		if (dst.index == 1)
+			fp->fp_control |= 0xe;
+		else
+			insn[0] |= NVFX_FP_OP_OUT_REG_HALF;
+	} else if (dst.type == NVFXSR_NONE) {
+		insn[0] |= (1 << 30);
+	} else {
+		assert(0);
+	}
+
+	insn[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
+}
+
+/* Append one 4-word instruction to the program.
+ *
+ * sat   - non-zero to set the output saturate flag
+ * op    - hardware opcode (NVFX_FP_OP_OPCODE_*)
+ * dst   - destination; also supplies dst_scale and the condition-code
+ *         update/test/swizzle fields
+ * mask  - hardware write mask
+ * s0-s2 - source operands
+ *
+ * Leaves fpc->inst_offset pointing at the new instruction and resets
+ * fpc->have_const so the first constant source allocates a fresh slot.
+ */
+static void
+nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op,
+	      struct nvfx_sreg dst, int mask,
+	      struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len;
+	fpc->have_const = 0;
+	grow_insns(fpc, 4);
+	/* Take the pointer only after growing: the buffer may have moved. */
+	hw = &fp->insn[fpc->inst_offset];
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NVFX_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
+	hw[0] |= (op << NVFX_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NVFX_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NVFX_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NVFX_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NVFX_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
+
+	/* emit_src() may grow the buffer again, so hw is not used below. */
+	emit_dst(fpc, dst);
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+/* Append a texture instruction: same encoding as nvfx_fp_arith() plus the
+ * texture unit number, and record the unit in the program's sampler mask.
+ */
+static void
+nvfx_fp_tex(struct nvfx_fpc *fpc, int sat, int op, int unit,
+	    struct nvfx_sreg dst, int mask,
+	    struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+{
+	struct nvfx_fragment_program *prog = fpc->fp;
+
+	nvfx_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+
+	/* nvfx_fp_arith() left inst_offset at the instruction just built. */
+	prog->samplers |= (1 << unit);
+	prog->insn[fpc->inst_offset] |= (unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
+}
+
+/* Translate a TGSI source operand into a hardware source register,
+ * carrying over its swizzle, negate and absolute-value modifiers.
+ * Constant operands allocate a new entry in the constant table.  An
+ * unsupported register file is logged and yields a zeroed register with
+ * only the modifiers filled in.
+ */
+static INLINE struct nvfx_sreg
+tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+	struct nvfx_sreg reg = { 0 };
+
+	switch (fsrc->Register.File) {
+	case TGSI_FILE_TEMPORARY:
+		reg = fpc->r_temp[fsrc->Register.Index];
+		break;
+	case TGSI_FILE_INPUT:
+		reg = nvfx_sr(NVFXSR_INPUT,
+			      fpc->attrib_map[fsrc->Register.Index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		reg = constant(fpc, fsrc->Register.Index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(fsrc->Register.Index < fpc->nr_imm);
+		reg = fpc->imm[fsrc->Register.Index];
+		break;
+	case TGSI_FILE_OUTPUT:
+		/* NV40 fragprog result regs are just temps, so this is simple */
+		reg = fpc->r_result[fsrc->Register.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	/* Modifiers always come from the TGSI operand. */
+	reg.swz[0] = fsrc->Register.SwizzleX;
+	reg.swz[1] = fsrc->Register.SwizzleY;
+	reg.swz[2] = fsrc->Register.SwizzleZ;
+	reg.swz[3] = fsrc->Register.SwizzleW;
+	reg.negate = fsrc->Register.Negate;
+	reg.abs = fsrc->Register.Absolute;
+	return reg;
+}
+
+/* Translate a TGSI destination operand into a hardware register.  The NULL
+ * file, and any unsupported file (which is also logged), map to the "none"
+ * register.
+ */
+static INLINE struct nvfx_sreg
+tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+	if (fdst->Register.File == TGSI_FILE_OUTPUT)
+		return fpc->r_result[fdst->Register.Index];
+
+	if (fdst->Register.File == TGSI_FILE_TEMPORARY)
+		return fpc->r_temp[fdst->Register.Index];
+
+	if (fdst->Register.File != TGSI_FILE_NULL)
+		NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
+
+	return nvfx_sr(NVFXSR_NONE, 0);
+}
+
+/* Convert a TGSI write mask into the hardware's write mask bits. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int hw_mask = 0;
+
+	hw_mask |= (tgsi & TGSI_WRITEMASK_X) ? NVFX_FP_MASK_X : 0;
+	hw_mask |= (tgsi & TGSI_WRITEMASK_Y) ? NVFX_FP_MASK_Y : 0;
+	hw_mask |= (tgsi & TGSI_WRITEMASK_Z) ? NVFX_FP_MASK_Z : 0;
+	hw_mask |= (tgsi & TGSI_WRITEMASK_W) ? NVFX_FP_MASK_W : 0;
+
+	return hw_mask;
+}
+
+/* Translate one TGSI instruction into hardware instructions.
+ *
+ * Source operands are resolved first.  An instruction can name only one
+ * input attribute (its selector lives in instruction word 0) and only one
+ * constant or immediate (a single inline slot follows the instruction),
+ * so any further distinct input/constant/immediate is first copied into a
+ * scratch temp with a MOV.  Scratch temps are released before returning
+ * successfully.
+ *
+ * Returns TRUE on success (TGSI END is accepted and emits nothing), FALSE
+ * for an unsupported source file or opcode.
+ */
+static boolean
+nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	const struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_sreg src[3], dst, tmp;
+	int mask, sat, unit = 0;
+	/* Index of the input / constant / immediate already referenced
+	 * directly by this instruction, or -1 if none yet.
+	 */
+	int ai = -1, ci = -1, ii = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	/* Pass 1: temporaries need no extra instructions, resolve them
+	 * before pass 2 starts allocating scratch temps.
+	 */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+		if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+
+	/* Pass 2: everything else, spilling to temps where required. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+
+		switch (fsrc->Register.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->Register.Index) {
+				ai = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->Register.Index) {
+				ci = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->Register.Index) {
+				ii = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->Register.Index;
+			break;
+		case TGSI_FILE_OUTPUT:
+			/* Result registers can be read back like temps
+			 * (tgsi_src() and emit_src() both handle them);
+			 * previously src[i] was left uninitialized here.
+			 */
+			src[i] = tgsi_src(fpc, fsrc);
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->Dst[0]);
+	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		/* Load the condition codes from src0, then do two
+		 * conditional moves: src2 where >= 0, src1 where < 0.
+		 */
+		tmp = nvfx_sr(NVFXSR_NONE, 0);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NVFX_COND_GE;
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		dst.cc_test = NVFX_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DDX:
+		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+			/* Z/W derivatives are computed through the X/Y lanes
+			 * of a scratch temp and then moved into place.
+			 */
+			tmp = temp(fpc);
+			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DDY:
+		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+			/* Same lane shuffle as DDX above. */
+			tmp = temp(fpc);
+			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		/* dot3(src0, src1) + src1.w */
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		/* Load the condition codes from src0, then kill where < 0. */
+		dst = nvfx_sr(NVFXSR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, NVFX_FP_MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NVFX_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, LRP_NV30, dst, mask, src[0], src[1], src[2]);
+		else {
+			/* tmp = src2 - src0 * src2; dst = src0 * src1 + tmp */
+			tmp = temp(fpc);
+			arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+			arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		}
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, POW_NV30, dst, mask, src[0], src[1], none);
+		else {
+			/* 2 ^ (log2(src0.x) * src1.x) */
+			tmp = temp(fpc);
+			arith(fpc, 0, LG2, tmp, NVFX_FP_MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+			arith(fpc, 0, MUL, tmp, NVFX_FP_MASK_X, swz(tmp, X, X, X, X),
+			      swz(src[1], X, X, X, X), none);
+			arith(fpc, sat, EX2, dst, mask,
+			      swz(tmp, X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		if(!nvfx->is_nv4x)
+			arith(fpc, 0, RFL_NV30, dst, mask, src[0], src[1], none);
+		else {
+			/* 2 * dot(src0, src1) / dot(src0, src0) * src0 - src1 */
+			tmp = temp(fpc);
+			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[0], none);
+			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_Y, src[0], src[1], none);
+			arith(fpc, 0, DIV, scale(tmp, 2X), NVFX_FP_MASK_Z,
+			      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+			arith(fpc, sat, MAD, dst, mask,
+			      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		}
+		break;
+	case TGSI_OPCODE_RSQ:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
+		else {
+			/* 2 ^ -(log2(|src0.x|) / 2) */
+			tmp = temp(fpc);
+			arith(fpc, 0, LG2, scale(tmp, INV_2X), NVFX_FP_MASK_X,
+			      abs(swz(src[0], X, X, X, X)), none, none);
+			arith(fpc, sat, EX2, dst, mask,
+			      neg(swz(tmp, X, X, X, X)), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SCS:
+		/* avoid overwriting the source */
+		if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
+		{
+			if (mask & NVFX_FP_MASK_X) {
+				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & NVFX_FP_MASK_Y) {
+				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+		}
+		else
+		{
+			if (mask & NVFX_FP_MASK_Y) {
+				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & NVFX_FP_MASK_X) {
+				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+		}
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		/* src0.yzx * src1.zxy - src0.zxy * src1.yzx; W is not written. */
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~NVFX_FP_MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(fpc);
+	return TRUE;
+}
+
+/* Handle a TGSI input declaration: work out which hardware input source
+ * carries the declared semantic and store it in attrib_map under the
+ * first register of the declared range.
+ *
+ * Returns FALSE (after logging) for an unsupported semantic or an
+ * out-of-range semantic index.
+ */
+static boolean
+nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw_input;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		hw_input = NVFX_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		/* Only two colour inputs exist. */
+		switch (fdec->Semantic.Index) {
+		case 0:
+			hw_input = NVFX_FP_OP_INPUT_SRC_COL0;
+			break;
+		case 1:
+			hw_input = NVFX_FP_OP_INPUT_SRC_COL1;
+			break;
+		default:
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw_input = NVFX_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* Generics map onto the eight texcoord inputs. */
+		if (fdec->Semantic.Index > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		hw_input = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.Index);
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	fpc->attrib_map[fdec->Range.First] = hw_input;
+	return TRUE;
+}
+
+/* Handle a TGSI output declaration: pick the hardware result register for
+ * the declared semantic, record it in r_result under the first register of
+ * the declared range, and reserve that register number in the temp
+ * allocator.
+ *
+ * POSITION uses register 1; COLOR 0..3 use registers 0, 2, 3, 4, of which
+ * only those up to 2 are accepted when the chip is not nv4x.
+ * Returns FALSE (after logging) for anything else.
+ */
+static boolean
+nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	unsigned out_idx = fdec->Range.First;
+	unsigned hw_reg;
+
+	if (fdec->Semantic.Name == TGSI_SEMANTIC_POSITION) {
+		hw_reg = 1;
+	} else if (fdec->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+		const unsigned max_reg = (nvfx->is_nv4x) ? 4 : 2;
+
+		/* Stays at ~0 (and so fails the range check) for any
+		 * colour index outside 0..3.
+		 */
+		hw_reg = ~0;
+		switch (fdec->Semantic.Index) {
+		case 0:
+			hw_reg = 0;
+			break;
+		case 1:
+			hw_reg = 2;
+			break;
+		case 2:
+			hw_reg = 3;
+			break;
+		case 3:
+			hw_reg = 4;
+			break;
+		}
+		if (hw_reg > max_reg) {
+			NOUVEAU_ERR("bad rcol index\n");
+			return FALSE;
+		}
+	} else {
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	fpc->r_result[out_idx] = nvfx_sr(NVFXSR_OUTPUT, hw_reg);
+	fpc->r_temps |= (1 << hw_reg);
+	return TRUE;
+}
+
+/* First pass over the shader tokens: process input/output declarations,
+ * register every immediate as a constant, and allocate one hardware temp
+ * for each declared TGSI temporary.
+ *
+ * Returns FALSE if a declaration is unsupported, if the shader has more
+ * than MAX_IMM immediates, or if the temp table cannot be allocated.  On
+ * failure fpc->r_temp is not left allocated.
+ */
+static boolean
+nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, i;
+
+	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_TEMPORARY:
+				/* Remember the highest temporary declared. */
+				if (fdec->Range.Last > high_temp) {
+					high_temp =
+						fdec->Range.Last;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm;
+			float vals[4];
+
+			imm = &p.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			/* Enforce the bound at runtime as well: an assert
+			 * alone lets release builds overflow imm[].
+			 */
+			if (fpc->nr_imm >= MAX_IMM) {
+				NOUVEAU_ERR("too many immediates\n");
+				goto out_err;
+			}
+
+			vals[0] = imm->u[0].Float;
+			vals[1] = imm->u[1].Float;
+			vals[2] = imm->u[2].Float;
+			vals[3] = imm->u[3].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	/* high_temp becomes the number of temporaries (0 if none declared). */
+	if (++high_temp) {
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		/* The parser is already freed, so return directly. */
+		if (!fpc->r_temp)
+			return FALSE;
+		for (i = 0; i < high_temp; i++)
+			fpc->r_temp[i] = temp(fpc);
+		/* These temps back TGSI temporaries for the whole program:
+		 * keep release_temps() from freeing them.
+		 */
+		fpc->r_temps_discard = 0;
+	}
+
+	return TRUE;
+
+out_err:
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	tgsi_parse_free(&p);
+	return FALSE;
+}
+
+/* Translate fp's TGSI tokens into hardware instructions.
+ *
+ * On success fp->insn/insn_len hold the program (terminated, with a
+ * trailing NOP + END), fp->fp_control carries the temp count and
+ * fp->translated is set to TRUE.  On failure fp->translated is left
+ * untouched and any partially built instruction or constant-patch data is
+ * discarded, so the caller can install a replacement program without
+ * leaking the partial buffer or inheriting stale patch offsets.
+ */
+static void
+nvfx_fragprog_translate(struct nvfx_context *nvfx,
+			struct nvfx_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nvfx_fpc *fpc = NULL;
+
+	fpc = CALLOC(1, sizeof(struct nvfx_fpc));
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->num_regs = 2;
+
+	if (!nvfx_fragprog_prepare(nvfx, fpc)) {
+		FREE(fpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, fp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* The temp count is encoded differently on the two chip families. */
+	if(!nvfx->is_nv4x)
+		fp->fp_control |= (fpc->num_regs-1)/2;
+	else
+		fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
+
+	/* Terminate final instruction */
+	if(fp->insn)
+		fp->insn[fpc->inst_offset] |= 0x00000001;
+
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	fp->insn[fpc->inst_offset + 0] = 0x00000001;
+	fp->insn[fpc->inst_offset + 1] = 0x00000000;
+	fp->insn[fpc->inst_offset + 2] = 0x00000000;
+	fp->insn[fpc->inst_offset + 3] = 0x00000000;
+
+	fp->translated = TRUE;
+out_err:
+	if (!fp->translated) {
+		/* Both arrays were grown with realloc(); release the partial
+		 * program and its constant patch list.
+		 */
+		free(fp->insn);
+		fp->insn = NULL;
+		fp->insn_len = 0;
+		free(fp->consts);
+		fp->consts = NULL;
+		fp->nr_consts = 0;
+	}
+	tgsi_parse_free(&parse);
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	FREE(fpc);
+}
+/* Copy len bytes (callers pass multiples of 4) from src to dst; on big-endian hosts the 16-bit halves of each 32-bit word are swapped. */
+static inline void
+nvfx_fp_memcpy(void* dst, const void* src, size_t len)
+{
+#ifndef WORDS_BIGENDIAN
+	memcpy(dst, src, len);
+#else
+	size_t i;
+	for(i = 0; i < len; i += 4) {
+		uint32_t v = *(const uint32_t*)((const char*)src + i); /* load the word itself; the cast alone only yields its address */
+		*(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
+	}
+#endif
+}
+/* Make nvfx->fragprog current: translate it on first use, refresh its constants in a program slot, then emit the fragment program state. */
+void
+nvfx_fragprog_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	int update = 0;
+	int i;
+
+	if (!fp->translated)
+	{
+		const int min_size = 4096;
+
+		nvfx_fragprog_translate(nvfx, fp);
+		if (!fp->translated) {
+			static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
+			static int warned = 0;
+			if(!warned)
+			{
+				fprintf(stderr, "nvfx: failed to translate fragment program!\n");
+				warned = 1;
+			}
+			if (fp->insn) /* drop the partial program a failed translation leaves behind instead of leaking it */
+				FREE(fp->insn);
+			fp->translated = TRUE; /* use dummy program: we cannot fail here */
+			fp->insn = malloc(sizeof(dummy));
+			memcpy(fp->insn, dummy, sizeof(dummy));
+			fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
+		}
+		update = TRUE;
+
+		fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+		/* each copy of the program occupies a 64-byte-rounded slot; as many slots as fit in min_size share one bo */
+		if(fp->prog_size >= min_size)
+			fp->progs_per_bo = 1;
+		else
+			fp->progs_per_bo = min_size / fp->prog_size;
+		fp->bo_prog_idx = fp->progs_per_bo - 1; /* so the ++ below wraps and sets up the first bo */
+	}
+
+	/* we must update constants even on "just" fragprog changes, because
+	   we don't check whether the current constant buffer matches the latest
+	   one bound to this fragment program */
+	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
+		update = TRUE;
+
+	if(update) {
+		int offset;
+
+		++fp->bo_prog_idx;
+		if(fp->bo_prog_idx >= fp->progs_per_bo)
+		{
+			if(fp->fpbo && !nouveau_bo_busy(fp->fpbo->next->bo, NOUVEAU_BO_WR))
+			{
+				fp->fpbo = fp->fpbo->next;
+			}
+			else
+			{
+				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16);
+				char *map, *buf;
+				/* NOTE(review): the size uses sizeof(struct nvfx_fragment_program), not ..._bo, and the result is unchecked - confirm */
+				if(fp->fpbo)
+				{
+					fpbo->next = fp->fpbo->next;
+					fp->fpbo->next = fpbo;
+				}
+				else
+					fpbo->next = fpbo;
+				fp->fpbo = fpbo;
+				fpbo->bo = 0;
+				nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo);
+				nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC);
+
+				map = fpbo->bo->map;
+				buf = fpbo->insn;
+				for(i = 0; i < fp->progs_per_bo; ++i) /* seed every slot, in the bo mapping and in the CPU-side copy */
+				{
+					memcpy(buf, fp->insn, fp->insn_len * 4);
+					nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4);
+					map += fp->prog_size;
+					buf += fp->prog_size;
+				}
+			}
+			fp->bo_prog_idx = 0;
+		}
+
+		offset = fp->bo_prog_idx * fp->prog_size;
+
+		if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) {
+			struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT];
+			// TODO: avoid using transfers, just directly the buffer
+			struct pipe_transfer* transfer;
+			// TODO: does this check make any sense, or should we do this unconditionally?
+			uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer);
+			uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
+			uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset);
+			for (i = 0; i < fp->nr_consts; ++i) {
+				unsigned off = fp->consts[i].offset;
+				unsigned idx = fp->consts[i].index * 4;
+
+				/* TODO: is checking a good idea? */
+				if(memcmp(&buf[off], &map[idx], 4 * sizeof(uint32_t))) {
+					memcpy(&buf[off], &map[idx], 4 * sizeof(uint32_t));
+					nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t));
+				}
+			}
+			pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer);
+		}
+	}
+
+	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
+		int offset = fp->bo_prog_idx * fp->prog_size;
+		MARK_RING(chan, 8, 1);
+		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
+		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+			      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+			      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+		OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1));
+		OUT_RING(chan, fp->fp_control);
+		if(!nvfx->is_nv4x) {
+			OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1));
+			OUT_RING(chan, (1<<16)|0x4);
+			OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1));
+			OUT_RING(chan, fp->samplers);
+		}
+	}
+}
+/* Emit the two relocations for the active fragment program, using fp->fpbo->bo and the slot offset bo_prog_idx * prog_size. */
+void
+nvfx_fragprog_relocate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nouveau_bo* bo = fp->fpbo->bo;
+	int offset = fp->bo_prog_idx * fp->prog_size;
+	unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART?
+	fp_flags |= NOUVEAU_BO_DUMMY; /* both relocations below carry the DUMMY flag */
+	MARK_RING(chan, 2, 2);
+	OUT_RELOC(chan, bo, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1), fp_flags, 0, 0);
+	OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW |
+		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+}
+/* Release every program bo in fp's circular fpbo list, then the translated instruction array. */
+void
+nvfx_fragprog_destroy(struct nvfx_context *nvfx,
+		      struct nvfx_fragment_program *fp)
+{
+	struct nvfx_fragment_program_bo* fpbo = fp->fpbo;
+	if(fpbo)
+	{
+		do
+		{
+			struct nvfx_fragment_program_bo* next = fpbo->next;
+			nouveau_bo_unmap(fpbo->bo);
+			nouveau_bo_ref(0, &fpbo->bo);
+			os_free_aligned(fpbo); /* pairs with os_malloc_aligned() in nvfx_fragprog_validate(); plain free() is not a valid release */
+			fpbo = next;
+		}
+		while(fpbo != fp->fpbo); /* the list is circular: stop once we are back at the head */
+	}
+
+	if (fp->insn_len)
+		FREE(fp->insn);
+}
+
diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c
new file mode 100644
index 0000000000..0b4a434fec
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_fragtex.c
@@ -0,0 +1,58 @@
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+/* Re-program every texture unit flagged in nvfx->dirty_samplers, then clear the dirty mask. */
+void
+nvfx_fragtex_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned pending = nvfx->dirty_samplers;
+	unsigned slot;
+
+	if (pending == 0)
+		return;
+	/* visit the flagged units from the lowest bit upwards, clearing each bit as it is handled */
+	for (; pending; pending &= ~(1 << slot)) {
+		slot = ffs(pending) - 1;
+
+		if(!nvfx->fragment_sampler_views[slot] || !nvfx->tex_sampler[slot]) {
+			/* view or sampler state missing: switch the unit off (this is OK for nv40 too) */
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_TX_ENABLE(slot), 1));
+			OUT_RING(chan, 0);
+			nvfx->hw_samplers &= ~(1 << slot);
+			continue;
+		}
+
+		if(nvfx->is_nv4x)
+			nv40_fragtex_set(nvfx, slot);
+		else
+			nv30_fragtex_set(nvfx, slot);
+	}
+	nvfx->dirty_samplers = 0;
+}
+/* For every unit set in nvfx->hw_samplers, emit three DUMMY-flagged relocations against the bo of the bound texture. */
+void
+nvfx_fragtex_relocate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned samplers, unit;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	samplers = nvfx->hw_samplers;
+	while (samplers) {
+		struct nvfx_miptree* mt;
+		struct nouveau_bo *bo;
+
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		mt = (struct nvfx_miptree*)nvfx->fragment_sampler_views[unit]->texture;
+		bo = mt->base.bo;
+		/* in order: the TX_OFFSET method header, the offset (LOW), then the cached hw_txf format word (OR with DMA0/DMA1) */
+		MARK_RING(chan, 3, 3);
+		OUT_RELOC(chan, bo, RING_3D(NV34TCL_TX_OFFSET(unit), 2), tex_flags | NOUVEAU_BO_DUMMY, 0, 0);
+		OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_DUMMY, 0, 0);
+		OUT_RELOC(chan, bo, nvfx->hw_txf[unit], tex_flags | NOUVEAU_BO_OR | NOUVEAU_BO_DUMMY,
+				NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_miptree.c b/src/gallium/drivers/nvfx/nvfx_miptree.c
new file mode 100644
index 0000000000..b5639bb464
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_miptree.c
@@ -0,0 +1,310 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+#include "nvfx_transfer.h"
+#include "nv04_surface_2d.h"
+
+/* Currently using separate implementations for buffers and textures,
+ * even though gallium has a unified abstraction of these objects.
+ * Eventually these should be combined, and mechanisms like transfers
+ * be adapted to work for both buffer and texture uploads.
+ */
+/* Fill in mt->level[].pitch, mt->level[].image_offset[] and mt->total_size for the resource described by mt->base.base. */
+static void
+nvfx_miptree_layout(struct nvfx_miptree *mt)
+{
+	struct pipe_resource *pt = &mt->base.base;
+	uint width = pt->width0;
+	uint offset = 0;
+	int nr_faces, l, f;
+	uint wide_pitch = pt->bind & (PIPE_BIND_SAMPLER_VIEW |
+				      PIPE_BIND_DEPTH_STENCIL |
+				      PIPE_BIND_RENDER_TARGET |
+				      PIPE_BIND_DISPLAY_TARGET |
+				      PIPE_BIND_SCANOUT);
+	/* images per level: 6 for cube maps, one per depth slice for 3D textures, otherwise 1 */
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		nr_faces = pt->depth0;
+	} else {
+		nr_faces = 1;
+	}
+	/* pass 1: per-level pitch and offset table. NOTE(review): the CALLOC result is not checked */
+	for (l = 0; l <= pt->last_level; l++) {
+		if (wide_pitch && (pt->flags & NVFX_RESOURCE_FLAG_LINEAR))
+			mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
+		else
+			mt->level[l].pitch = util_format_get_stride(pt->format, width);
+
+		mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = u_minify(width, 1);
+	}
+	/* pass 2: offsets are assigned face-major - every level of face 0, then every level of face 1, ... */
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l < pt->last_level; l++) {
+			mt->level[l].image_offset[f] = offset;
+
+			if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR) &&
+			    u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
+				offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64);
+			else
+				offset += mt->level[l].pitch * u_minify(pt->height0, l);
+		}
+		/* l == pt->last_level here: the final level is placed without the 64-byte rounding */
+		mt->level[l].image_offset[f] = offset;
+		offset += mt->level[l].pitch * u_minify(pt->height0, l);
+	}
+
+	mt->total_size = offset;
+}
+/* get_handle hook: export the miptree's bo through whandle, passing along the level-0 pitch. */
+static boolean
+nvfx_miptree_get_handle(struct pipe_screen *pscreen,
+			struct pipe_resource *ptexture,
+			struct winsys_handle *whandle)
+{
+	struct nvfx_miptree* tree = (struct nvfx_miptree*)ptexture;
+	struct nouveau_bo* storage = tree ? tree->base.bo : NULL;
+
+	/* a missing miptree, or one without a bo, has nothing to export */
+	if (!storage)
+		return FALSE;
+
+	return nouveau_screen_bo_get_handle(pscreen, storage,
+					    tree->level[0].pitch, whandle);
+}
+
+/* resource_destroy hook: release the bo, then each level's offset table, then the miptree itself. */
+static void
+nvfx_miptree_destroy(struct pipe_screen *screen, struct pipe_resource *pt)
+{
+	struct nvfx_miptree *tree = (struct nvfx_miptree *)pt;
+	int lvl = 0;
+
+	nouveau_screen_bo_release(screen, tree->base.bo);
+	/* levels 0..last_level may each own an image_offset array */
+	while (lvl <= pt->last_level) {
+		uint *offsets = tree->level[lvl++].image_offset;
+		if (offsets)
+			FREE(offsets);
+	}
+	FREE(tree);
+}
+
+
+
+/* Resource vtbl installed on every miptree (mt->base.vtbl); is_resource_referenced is left NULL in this table. */
+struct u_resource_vtbl nvfx_miptree_vtbl = 
+{
+   nvfx_miptree_get_handle,	      /* get_handle */
+   nvfx_miptree_destroy,	      /* resource_destroy */
+   NULL,			      /* is_resource_referenced */
+   nvfx_miptree_transfer_new,	      /* get_transfer */
+   nvfx_miptree_transfer_del,     /* transfer_destroy */
+   nvfx_miptree_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   nvfx_miptree_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+/* Create a texture from template pt: pick linear vs swizzled layout, compute the mip layout, allocate the bo. Returns NULL on failure. */
+struct pipe_resource *
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt)
+{
+	struct nvfx_miptree *mt;
+	static int no_swizzle = -1;
+	if(no_swizzle < 0)
+		no_swizzle = debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE);
+	mt = CALLOC_STRUCT(nvfx_miptree);
+	if (!mt)
+		return NULL;
+
+	mt->base.base = *pt;
+	mt->base.vtbl = &nvfx_miptree_vtbl;
+	pipe_reference_init(&mt->base.base.reference, 1);
+	mt->base.base.screen = pscreen;
+
+	/* Swizzled textures must be POT */
+	if (pt->width0 & (pt->width0 - 1) ||
+	    pt->height0 & (pt->height0 - 1))
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	else
+	if (pt->bind & (PIPE_BIND_SCANOUT |
+			PIPE_BIND_DISPLAY_TARGET |
+			PIPE_BIND_DEPTH_STENCIL))
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	else
+	if (pt->usage == PIPE_USAGE_DYNAMIC)
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	else {
+		switch (pt->format) {
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_L8A8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+			/* TODO: we can actually swizzle these formats on nv40, we
+				are just preserving the pre-unification behavior.
+				The whole 2D code is going to be rewritten anyway. */
+			if(nvfx_screen(pscreen)->is_nv4x) {
+				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+				break;
+			}
+		/* TODO: Figure out which formats can be swizzled */
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_R16_SNORM:
+		{
+			if (no_swizzle)
+				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+			break;
+		}
+		default:
+			mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+		}
+	}
+
+	/* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+	 * This also happens for small mipmaps of large textures. */
+	if (pt->bind & PIPE_BIND_RENDER_TARGET &&
+	    util_format_get_stride(pt->format, pt->width0) < 64)
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+
+	nvfx_miptree_layout(mt);
+	mt->base.bo = nouveau_screen_bo_new(pscreen, 256,
+            pt->usage, pt->bind, mt->total_size);
+	if (!mt->base.bo) {
+		int l; /* give back the per-level offset tables nvfx_miptree_layout() allocated */
+		for (l = 0; l <= pt->last_level; l++)
+			if (mt->level[l].image_offset)
+				FREE(mt->level[l].image_offset);
+		FREE(mt);
+		return NULL;
+	}
+	return &mt->base.base;
+}
+/* Wrap a buffer imported through whandle as a linear, single-level 2D miptree; returns NULL if unsupported or on failure. */
+struct pipe_resource *
+nvfx_miptree_from_handle(struct pipe_screen *pscreen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle)
+{
+	struct nvfx_miptree *mt;
+	unsigned stride;
+
+	/* Only supports 2D, non-mipmapped textures for the moment */
+	if (template->target != PIPE_TEXTURE_2D ||
+	    template->last_level != 0 ||
+	    template->depth0 != 1)
+		return NULL;
+
+	mt = CALLOC_STRUCT(nvfx_miptree);
+	if (!mt)
+		return NULL;
+	/* get the single offset slot before importing the bo, so a failed allocation needs no bo cleanup */
+	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	if (!mt->level[0].image_offset) {
+		FREE(mt);
+		return NULL;
+	}
+
+	mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
+	if (mt->base.bo == NULL) {
+		FREE(mt->level[0].image_offset);
+		FREE(mt);
+		return NULL;
+	}
+
+	mt->base.base = *template;
+	mt->base.vtbl = &nvfx_miptree_vtbl;
+	pipe_reference_init(&mt->base.base.reference, 1);
+	mt->base.base.screen = pscreen;
+	mt->level[0].pitch = stride;
+	/* Assume whoever created this buffer expects it to be linear for now */
+	mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+
+	/* XXX: Need to adjust bo refcount?? (nouveau_bo_ref(bo, &mt->base.bo);) */
+	return &mt->base.base;
+}
+
+
+
+
+
+/* Surface helpers, not strictly required to implement the resource vtbl:
+ */
+struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	struct nv04_surface *ns;
+	/* Build a surface for one image (face/level/zslice) of pt; the surface holds a reference on pt. Returns NULL if allocation fails. */
+	ns = CALLOC_STRUCT(nv04_surface);
+	if (!ns)
+		return NULL;
+	pipe_resource_reference(&ns->base.texture, pt);
+	ns->base.format = pt->format;
+	ns->base.width = u_minify(pt->width0, level);
+	ns->base.height = u_minify(pt->height0, level);
+	ns->base.usage = flags;
+	pipe_reference_init(&ns->base.reference, 1);
+	ns->base.face = face;
+	ns->base.level = level;
+	ns->base.zslice = zslice;
+	ns->pitch = mt->level[level].pitch;
+	/* the offset table is indexed by face for cube maps, by zslice for 3D textures, and by 0 otherwise */
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ns->base.offset = mt->level[level].image_offset[face];
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		ns->base.offset = mt->level[level].image_offset[zslice];
+	} else {
+		ns->base.offset = mt->level[level].image_offset[0];
+	}
+
+	/* create a linear temporary that we can render into if
+	 * necessary.
+	 *
+	 * Note that ns->pitch is always a multiple of 64 for linear
+	 * surfaces and swizzled surfaces are POT, so ns->pitch & 63
+	 * is equivalent to (ns->pitch < 64 && swizzled)
+	 */
+
+	if ((ns->pitch & 63) && 
+	    (ns->base.usage & PIPE_BIND_RENDER_TARGET))
+	{
+		struct nv04_surface_2d* eng2d  =
+			((struct nvfx_screen*)pscreen)->eng2d;
+
+		ns = nv04_surface_wrap_for_render(pscreen, eng2d, ns);
+	}
+	/* NOTE(review): the result of nv04_surface_wrap_for_render() is used unchecked - confirm it cannot return NULL */
+	return &ns->base;
+}
+/* Destroy a surface; when it has a backing surface, eng2d->copy is invoked on the pair and the backing is destroyed first. */
+void
+nvfx_miptree_surface_del(struct pipe_surface *ps)
+{
+	struct nv04_surface* surf = (struct nv04_surface*)ps;
+	struct nv04_surface* inner = surf->backing;
+	if(inner)
+	{
+		struct nv04_surface_2d* blit = ((struct nvfx_screen*)ps->texture->screen)->eng2d;
+		/* always copy: the PIPE_BIND_BLIT_DESTINATION usage test that would gate this is disabled */
+		blit->copy(blit, &inner->base, 0, 0, ps, 0, 0, surf->base.width, surf->base.height);
+		nvfx_miptree_surface_del(&inner->base);
+	}
+	pipe_resource_reference(&ps->texture, NULL);
+	FREE(ps);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_query.c b/src/gallium/drivers/nvfx/nvfx_query.c
new file mode 100644
index 0000000000..1dab20c41a
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_query.c
@@ -0,0 +1,138 @@
+#include "pipe/p_context.h"
+
+#include "nvfx_context.h"
+/* Driver-side state behind a struct pipe_query handle. */
+struct nvfx_query {
+	struct list_head list; /* link in nvfx->screen->query_list while a heap slot is held */
+	struct nouveau_resource *object; /* slot from screen->query_heap; freed once the result is read */
+	unsigned type; /* query_type given to create_query */
+	boolean ready; /* TRUE once `result` holds the value read back from the notifier */
+	uint64_t result;
+};
+/* Downcast an opaque pipe_query handle to the driver's query object. */
+static INLINE struct nvfx_query *nvfx_query(struct pipe_query *pipe)
+{
+	struct nvfx_query *query = (struct nvfx_query *)pipe;
+	return query;
+}
+/* create_query hook: allocate a zero-initialised query of query_type; returns NULL when the allocation fails. */
+static struct pipe_query *
+nvfx_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nvfx_query *q;
+
+	q = CALLOC(1, sizeof(struct nvfx_query));
+	if (!q) /* out of memory: report it rather than writing through NULL */
+		return NULL;
+	q->type = query_type;
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); /* the only type this driver handles */
+	return (struct pipe_query *)q;
+}
+/* destroy_query hook: give back a still-held query_heap slot, unlink the query, then free it. */
+static void
+nvfx_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_query *query = (struct nvfx_query *)pq;
+
+	/* the query is only on the screen's list while it owns a heap slot */
+	if (query->object != NULL) {
+		nouveau_resource_free(&query->object);
+		LIST_DEL(&query->list);
+	}
+	FREE(query);
+}
+/* begin_query hook: obtain a query_heap slot for pq, reset its notifier and emit the query-reset methods. */
+static void
+nvfx_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_query *q = nvfx_query(pq);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	uint64_t tmp;
+
+	/* Happens when end_query() is called, then another begin_query()
+	 * without querying the result in-between.  For now we'll wait for
+	 * the existing query to notify completion, but it could be better.
+	 */
+	if (q->object)
+		pipe->get_query_result(pipe, pq, 1, &tmp);
+	/* no free slot: block on the oldest listed query (reading its result frees its slot), then retry */
+	while (nouveau_resource_alloc(nvfx->screen->query_heap, 1, NULL, &q->object))
+	{
+		struct nvfx_query* oldestq;
+		assert(!LIST_IS_EMPTY(&nvfx->screen->query_list));
+		oldestq = LIST_ENTRY(struct nvfx_query, nvfx->screen->query_list.next, list);
+		pipe->get_query_result(pipe, (struct pipe_query*)oldestq, 1, &tmp);
+	}
+
+	LIST_ADDTAIL(&q->list, &nvfx->screen->query_list);
+
+	nouveau_notifier_reset(nvfx->screen->query, q->object->start);
+
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
+
+	q->ready = FALSE;
+}
+/* end_query hook: emit NV34TCL_QUERY_GET for this query's slot and fire the ring. NOTE(review): assumes q->object is set, i.e. begin_query ran. */
+static void
+nvfx_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nvfx_query *q = nvfx_query(pq);
+
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_GET, 1);
+	OUT_RING  (chan, (0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT)); /* offset field is the slot index times 32 */
+	FIRE_RING(chan);
+}
+/* get_query_result hook: store the query value in *vresult (as uint64_t); returns FALSE only if it is not ready and wait is FALSE. */
+static boolean
+nvfx_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, void *vresult)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_query *query = nvfx_query(pq);
+	uint64_t *out = (uint64_t *)vresult;
+
+	/* a value cached by an earlier call needs no notifier access at all */
+	if (!query->ready) {
+		unsigned state = nouveau_notifier_status(nvfx->screen->query,
+							 query->object->start);
+
+		if (state != NV_NOTIFY_STATE_STATUS_COMPLETED) {
+			/* not finished yet: give up or block, as the caller asked */
+			if (wait == FALSE)
+				return FALSE;
+			nouveau_notifier_wait_status(nvfx->screen->query,
+						     query->object->start,
+						     NV_NOTIFY_STATE_STATUS_COMPLETED, 0);
+		}
+		/* latch the value, then release the heap slot and leave the screen's list */
+		query->ready = TRUE;
+		query->result = nouveau_notifier_return_val(nvfx->screen->query,
+							    query->object->start);
+		nouveau_resource_free(&query->object);
+		LIST_DEL(&query->list);
+	}
+
+	*out = query->result;
+	return TRUE;
+}
+/* Install the query entry points on the context. */
+void nvfx_init_query_functions(struct nvfx_context *nvfx)
+{
+	struct pipe_context *ctx = &nvfx->pipe;
+	ctx->get_query_result = nvfx_query_result;
+	ctx->end_query = nvfx_query_end;
+	ctx->begin_query = nvfx_query_begin;
+	ctx->destroy_query = nvfx_query_destroy;
+	ctx->create_query = nvfx_query_create;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c
new file mode 100644
index 0000000000..10cdeed2a3
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_resource.c
@@ -0,0 +1,67 @@
+
+#include "pipe/p_context.h"
+#include "nvfx_resource.h"
+#include "nouveau/nouveau_screen.h"
+
+
+/* This doesn't look quite right - this query is supposed to ask
+ * whether the particular context has references to the resource in
+ * any unflushed rendering command buffer, and hence requires a
+ * pipe->flush() for serializing some modification to that resource.
+ *
+ * This seems to be answering the question of whether the resource is
+ * currently on hardware.
+ */
+static unsigned int
+nvfx_resource_is_referenced(struct pipe_context *pipe, struct pipe_resource *resource,
+			    unsigned face, unsigned level)
+{
+	struct nouveau_bo *storage = ((struct nvfx_resource *)resource)->bo; /* face and level are not consulted */
+	return nouveau_reference_flags(storage);
+}
+/* resource_create hook: buffers and textures have separate implementations, chosen by template->target. */
+static struct pipe_resource *
+nvfx_resource_create(struct pipe_screen *screen,
+		     const struct pipe_resource *template)
+{
+	const int wants_buffer = (template->target == PIPE_BUFFER);
+
+	return wants_buffer ? nvfx_buffer_create(screen, template)
+			    : nvfx_miptree_create(screen, template);
+}
+/* resource_from_handle hook: only textures can be imported; a PIPE_BUFFER template yields NULL. */
+static struct pipe_resource *
+nvfx_resource_from_handle(struct pipe_screen * screen,
+			  const struct pipe_resource *template,
+			  struct winsys_handle *whandle)
+{
+	if (template->target != PIPE_BUFFER)
+		return nvfx_miptree_from_handle(screen, template, whandle);
+
+	return NULL;
+}
+/* Install the context-level resource hooks: the u_*_vtbl transfer dispatchers plus the reference query. */
+void
+nvfx_init_resource_functions(struct pipe_context *pipe)
+{
+	pipe->is_resource_referenced = nvfx_resource_is_referenced;
+	pipe->transfer_inline_write = u_transfer_inline_write_vtbl;
+	pipe->transfer_destroy = u_transfer_destroy_vtbl;
+	pipe->transfer_unmap = u_transfer_unmap_vtbl;
+	pipe->transfer_flush_region = u_transfer_flush_region_vtbl;
+	pipe->transfer_map = u_transfer_map_vtbl;
+	pipe->get_transfer = u_get_transfer_vtbl;
+}
+/* Install the screen-level resource and surface hooks. */
+void
+nvfx_screen_init_resource_functions(struct pipe_screen *pscreen)
+{
+	pscreen->tex_surface_destroy = nvfx_miptree_surface_del;
+	pscreen->get_tex_surface = nvfx_miptree_surface_new;
+	/* resource lifetime: driver-specific creation, vtbl-dispatched handle export and destruction */
+	pscreen->user_buffer_create = nvfx_user_buffer_create;
+	pscreen->resource_destroy = u_resource_destroy_vtbl;
+	pscreen->resource_get_handle = u_resource_get_handle_vtbl;
+	pscreen->resource_from_handle = nvfx_resource_from_handle;
+	pscreen->resource_create = nvfx_resource_create;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h
new file mode 100644
index 0000000000..a68c14cf3f
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_resource.h
@@ -0,0 +1,91 @@
+
+#ifndef NVFX_RESOURCE_H
+#define NVFX_RESOURCE_H
+
+#include "util/u_transfer.h"
+
+struct pipe_resource;
+struct nouveau_bo;
+
+
+/* This gets further specialized into either buffer or texture
+ * structures.  In the future we'll want to remove much of that
+ * distinction, but for now try to keep as close to the existing code
+ * as possible and use the vtbl struct to choose between the two
+ * underlying implementations.
+ */
+struct nvfx_resource {
+	struct pipe_resource base; /* first member, so a pipe_resource pointer can be cast to nvfx_resource */
+	struct u_resource_vtbl *vtbl; /* per-kind implementation, e.g. &nvfx_miptree_vtbl for textures */
+	struct nouveau_bo *bo; /* buffer object holding the resource's storage */
+};
+
+#define NVFX_MAX_TEXTURE_LEVELS  16
+/* Texture specialisation of nvfx_resource: per-level pitch and per-image offsets into the bo. */
+struct nvfx_miptree {
+	struct nvfx_resource base;
+	uint total_size; /* size handed to the bo allocation in nvfx_miptree_create() */
+
+	struct {
+		uint pitch;
+		uint *image_offset; /* heap array, one entry per face / depth slice */
+	} level[NVFX_MAX_TEXTURE_LEVELS];
+
+	unsigned image_nr; /* NOTE(review): no user visible in nvfx_miptree.c - confirm whether still needed */
+};
+/* Downcast a pipe_resource to the driver resource that embeds it as its first member. */
+static INLINE struct nvfx_resource *
+nvfx_resource(struct pipe_resource *resource)
+{
+	return (struct nvfx_resource *)resource;
+}
+/* Return the bo of the resource that surf was created from. */
+static INLINE struct nouveau_bo *
+nvfx_surface_buffer(struct pipe_surface *surf)
+{
+	struct nvfx_resource *owner = (struct nvfx_resource *)surf->texture;
+
+	return owner->bo;
+}
+
+
+void
+nvfx_init_resource_functions(struct pipe_context *pipe);
+
+void
+nvfx_screen_init_resource_functions(struct pipe_screen *pscreen);
+
+
+/* Internal:
+ */
+
+struct pipe_resource *
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt);
+
+struct pipe_resource *
+nvfx_miptree_from_handle(struct pipe_screen *pscreen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle);
+
+struct pipe_resource *
+nvfx_buffer_create(struct pipe_screen *pscreen,
+		   const struct pipe_resource *template);
+
+struct pipe_resource *
+nvfx_user_buffer_create(struct pipe_screen *screen,
+			void *ptr,
+			unsigned bytes,
+			unsigned usage);
+
+
+
+void
+nvfx_miptree_surface_del(struct pipe_surface *ps);
+
+struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags);
+
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
new file mode 100644
index 0000000000..a78d2411a0
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -0,0 +1,521 @@
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/u_format_s3tc.h"
+#include "util/u_simple_screen.h"
+
+#include "nouveau/nouveau_screen.h"
+
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_resource.h"
+
+#define NV30TCL_CHIPSET_3X_MASK 0x00000003
+#define NV34TCL_CHIPSET_3X_MASK 0x00000010
+#define NV35TCL_CHIPSET_3X_MASK 0x000001e0
+
+/* FIXME: It seems nouveau_drm_api.h (../../winsys/drm/nouveau/drm/) should not be included
+* directly to get the pointer to the context front buffer, so nouveau_winsys is copied here.
+* nvfx_screen_surface_format_supported() can then use it to require that FBOs are created
+* with the same number of bits everywhere.
+*/
+struct nouveau_winsys {
+	struct pipe_winsys base;
+	/* NOTE(review): local copy - the layout presumably has to match the winsys' own definition; confirm */
+	struct pipe_screen *pscreen;
+
+	struct pipe_surface *front; /* front buffer; nvfx_screen_surface_format_supported() reads its format and tolerates NULL */
+};
+#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
+#define NV4X_GRCLASS4497_CHIPSETS 0x00005450
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
+/* get_param hook: integer capabilities and limits, several keyed on screen->is_nv4x; unknown caps are logged and reported as 0. */
+static int
+nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		/* TODO: check this */
+		return screen->is_nv4x ? 16 : 8;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return !!screen->is_nv4x;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return screen->is_nv4x ? 4 : 2;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+        case PIPE_CAP_TIMER_QUERY:
+		return 0;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+		return !!screen->is_nv4x;
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+		return 0; /* We have 4 on nv40 - but unsupported currently */
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+		return 0;
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+		return !!screen->is_nv4x;
+	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+		return 16;
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+		/* TODO: on nv40 we have separate color masks */
+		/* TODO: nv40 mrt blending is probably broken */
+		return 0;
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+		return 0;
+	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+		return 0;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+		return 1;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+		return 0;
+	case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+	case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+	case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+	case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS:
+		return 4096;
+	case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH:
+		/* FIXME: is it the dynamic (nv30:0/nv40:24) or the static
+		   value (nv30:0/nv40:4) ? */
+		return screen->is_nv4x ? 4 : 0;
+	case PIPE_CAP_MAX_FS_INPUTS:
+		return 10;
+	case PIPE_CAP_MAX_FS_CONSTS:
+		return screen->is_nv4x ? 224 : 32;
+	case PIPE_CAP_MAX_FS_TEMPS:
+		return 32;
+	case PIPE_CAP_MAX_FS_ADDRS:
+		return screen->is_nv4x ? 1 : 0;
+	case PIPE_CAP_MAX_FS_PREDS:
+		return screen->is_nv4x ? 1 : 0;
+	case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+		return screen->is_nv4x ? 512 : 256;
+	case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+		return screen->is_nv4x ? 512 : 0;
+	case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+		/* FIXME: is it the dynamic (nv30:24/nv40:24) or the static
+		   value (nv30:1/nv40:4) ? */
+		return screen->is_nv4x ? 4 : 1;
+	case PIPE_CAP_MAX_VS_INPUTS:
+		return 16;
+	case PIPE_CAP_MAX_VS_CONSTS:
+		return 256;
+	case PIPE_CAP_MAX_VS_TEMPS:
+		return screen->is_nv4x ? 32 : 13;
+	case PIPE_CAP_MAX_VS_ADDRS:
+		return 2;
+	case PIPE_CAP_MAX_VS_PREDS:
+		return screen->is_nv4x ? 1 : 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+/* get_paramf hook: floating-point limits; unknown caps are logged and reported as 0.0. */
+static float
+nvfx_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+	struct nvfx_screen *hw = nvfx_screen(pscreen);
+
+	if (param == PIPE_CAP_MAX_LINE_WIDTH || param == PIPE_CAP_MAX_LINE_WIDTH_AA)
+		return 10.0;
+
+	if (param == PIPE_CAP_MAX_POINT_WIDTH || param == PIPE_CAP_MAX_POINT_WIDTH_AA)
+		return 64.0;
+
+	/* the remaining limits differ between nv4x and the older chips */
+	if (param == PIPE_CAP_MAX_TEXTURE_ANISOTROPY)
+		return hw->is_nv4x ? 16.0 : 8.0;
+
+	if (param == PIPE_CAP_MAX_TEXTURE_LOD_BIAS)
+		return hw->is_nv4x ? 16.0 : 4.0;
+
+	NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+	return 0.0;
+}
+/* Format-support query: TRUE when format can be used for the binding requested in tex_usage; multisampling is never supported. */
+static boolean
+nvfx_screen_surface_format_supported(struct pipe_screen *pscreen,
+				     enum pipe_format format,
+				     enum pipe_texture_target target,
+				     unsigned sample_count,
+				     unsigned tex_usage, unsigned geom_flags)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front;
+
+	if (sample_count > 1)
+		return FALSE;
+
+	if (tex_usage & PIPE_BIND_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_B5G6R5_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else
+	if (tex_usage & PIPE_BIND_DEPTH_STENCIL) {
+		switch (format) {
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_X8Z24_UNORM:
+			return TRUE;
+		case PIPE_FORMAT_Z16_UNORM:
+			/* TODO: this nv30 limitation probably does not exist */
+			if (!screen->is_nv4x && front)
+				return (front->format == PIPE_FORMAT_B5G6R5_UNORM);
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		if (tex_usage & PIPE_BIND_SAMPLER_VIEW) { /* kept ahead of the switch: code before a switch's first case label never runs */
+			switch (format) {
+			case PIPE_FORMAT_DXT1_RGB:
+			case PIPE_FORMAT_DXT1_RGBA:
+			case PIPE_FORMAT_DXT3_RGBA:
+			case PIPE_FORMAT_DXT5_RGBA:
+				return util_format_s3tc_enabled;
+			default:
+				break;
+			}
+		}
+		switch (format) {
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_B5G5R5A1_UNORM:
+		case PIPE_FORMAT_B4G4R4A4_UNORM:
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+		case PIPE_FORMAT_L8A8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+			return TRUE;
+		/* TODO: does nv30 support this? */
+		case PIPE_FORMAT_R16_SNORM:
+			return !!screen->is_nv4x;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+
+/* pipe_screen::destroy hook: releases the heaps, notifiers and engine
+ * objects held by the screen, then frees the screen allocation. */
+static void
+nvfx_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nvfx_screen *nvfx = nvfx_screen(pscreen);
+
+	nouveau_resource_destroy(&nvfx->vp_exec_heap);
+	nouveau_resource_destroy(&nvfx->vp_data_heap);
+	nouveau_resource_destroy(&nvfx->query_heap);
+	nouveau_notifier_free(&nvfx->query);
+	nouveau_notifier_free(&nvfx->sync);
+	nouveau_grobj_free(&nvfx->eng3d);
+	nv04_surface_2d_takedown(&nvfx->eng2d);
+	nouveau_screen_fini(&nvfx->base);
+	FREE(pscreen);
+}
+/* Static 3D engine setup emitted by nvfx_screen_create() when !screen->is_nv4x. */
+static void nv30_screen_init(struct nvfx_screen *screen)
+{
+	struct nouveau_channel *chan = screen->base.channel;
+	int i;
+
+	/* TODO: perhaps we should do some of this on nv40 too? */
+	for (i=1; i<8; i++) { /* zero VIEWPORT_CLIP_HORIZ/VERT for indices 1..7; index 0 is not touched */
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1));
+		OUT_RING(chan, 0);
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_VERT(i), 1));
+		OUT_RING(chan, 0);
+	}
+	/* Most writes below use raw method offsets that have no symbolic name here. */
+	OUT_RING(chan, RING_3D(0x220, 1));
+	OUT_RING(chan, 1);
+
+	OUT_RING(chan, RING_3D(0x03b0, 1));
+	OUT_RING(chan, 0x00100000);
+	OUT_RING(chan, RING_3D(0x1454, 1));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, RING_3D(0x1d80, 1));
+	OUT_RING(chan, 3);
+	OUT_RING(chan, RING_3D(0x1450, 1));
+	OUT_RING(chan, 0x00030004);
+
+	/* NEW */
+	OUT_RING(chan, RING_3D(0x1e98, 1));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, RING_3D(0x17e0, 3));
+	OUT_RING(chan, fui(0.0));
+	OUT_RING(chan, fui(0.0));
+	OUT_RING(chan, fui(1.0));
+	OUT_RING(chan, RING_3D(0x1f80, 16));
+	for (i=0; i<16; i++) { /* 16 data words for the method above */
+		OUT_RING(chan, (i==8) ? 0x0000ffff : 0); /* only word 8 is non-zero */
+	}
+
+	OUT_RING(chan, RING_3D(0x120, 3));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, 1);
+	OUT_RING(chan, 2);
+
+	OUT_RING(chan, RING_3D(0x1d88, 1));
+	OUT_RING(chan, 0x00001200);
+
+	OUT_RING(chan, RING_3D(NV34TCL_RC_ENABLE, 1));
+	OUT_RING(chan, 0);
+
+	OUT_RING(chan, RING_3D(NV34TCL_DEPTH_RANGE_NEAR, 2)); /* two words: near = 0.0, far = 1.0 */
+	OUT_RING(chan, fui(0.0));
+	OUT_RING(chan, fui(1.0));
+
+	OUT_RING(chan, RING_3D(NV34TCL_MULTISAMPLE_CONTROL, 1));
+	OUT_RING(chan, 0xffff0000);
+
+	/* enables use of vp rather than fixed-function somehow */
+	OUT_RING(chan, RING_3D(0x1e94, 1));
+	OUT_RING(chan, 0x13);
+}
+/* Static 3D engine setup emitted by nvfx_screen_create() when screen->is_nv4x is set. */
+static void nv40_screen_init(struct nvfx_screen *screen)
+{
+	struct nouveau_channel *chan = screen->base.channel;
+	/* both words written at NV40TCL_DMA_COLOR2 are the channel's VRAM handle */
+	OUT_RING(chan, RING_3D(NV40TCL_DMA_COLOR2, 2));
+	OUT_RING(chan, screen->base.channel->vram->handle);
+	OUT_RING(chan, screen->base.channel->vram->handle);
+
+	OUT_RING(chan, RING_3D(0x1ea4, 3)); /* raw method offset, no symbolic name here */
+	OUT_RING(chan, 0x00000010);
+	OUT_RING(chan, 0x01000100);
+	OUT_RING(chan, 0xff800006);
+
+	/* vtxprog output routing */
+	OUT_RING(chan, RING_3D(0x1fc4, 1));
+	OUT_RING(chan, 0x06144321);
+	OUT_RING(chan, RING_3D(0x1fc8, 2));
+	OUT_RING(chan, 0xedcba987);
+	OUT_RING(chan, 0x00000021);
+	OUT_RING(chan, RING_3D(0x1fd0, 1));
+	OUT_RING(chan, 0x00171615);
+	OUT_RING(chan, RING_3D(0x1fd4, 1));
+	OUT_RING(chan, 0x001b1a19);
+
+	OUT_RING(chan, RING_3D(0x1ef8, 1));
+	OUT_RING(chan, 0x0020ffff);
+	OUT_RING(chan, RING_3D(0x1d64, 1));
+	OUT_RING(chan, 0x00d30000);
+	OUT_RING(chan, RING_3D(0x1e94, 1)); /* same offset nv30_screen_init writes 0x13 to */
+	OUT_RING(chan, 0x00000001);
+}
+
+/* Choose the buffer placement flag: NOUVEAU_BO_VRAM when the "VRAM hack" is
+ * active, NOUVEAU_BO_GART otherwise.  NOUVEAU_VTXIDX_IN_VRAM overrides the default. */
+static unsigned
+nvfx_screen_get_vertex_buffer_flags(struct nvfx_screen* screen)
+{
+	int use_vram, default_on = 0;
+
+	// TODO: this is a bit of a guess; also add other cards that may need this hack.
+	// It may also depend on the specific card or the AGP/PCIe chipset.
+	switch (screen->base.device->chipset) {
+	case 0x47: /* G70 */
+	case 0x49: /* G71 */
+	case 0x46: /* G72 */
+		default_on = 1;
+		break;
+	}
+	use_vram = debug_get_bool_option("NOUVEAU_VTXIDX_IN_VRAM", default_on);
+
+#ifdef DEBUG
+	if (use_vram)
+		fprintf(stderr, "A performance reducing hack is being used to help avoid graphics corruption.\n"
+			"You can try export NOUVEAU_VTXIDX_IN_VRAM=0 to disable it.\n");
+	else
+		fprintf(stderr, "Some systems may experience graphics corruption due to randomly misplaced vertices.\n"
+			"If this is happening, export NOUVEAU_VTXIDX_IN_VRAM=1 may reduce or eliminate the problem\n");
+#endif
+
+	return use_vram ? NOUVEAU_BO_VRAM : NOUVEAU_BO_GART;
+}
+
+/* Create the nvfx pipe_screen for `dev`.  Returns NULL on failure; once the
+ * screen has been allocated, every error path goes through
+ * nvfx_screen_destroy() so a partially built screen is not leaked. */
+struct pipe_screen *
+nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
+{
+	static const unsigned query_sizes[] = {(4096 - 4 * 32) / 32, 3 * 1024 / 32, 2 * 1024 / 32, 1024 / 32};
+	struct nvfx_screen *screen = CALLOC_STRUCT(nvfx_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
+	unsigned eng3d_class = 0;
+	int ret, i;
+
+	if (!screen)
+		return NULL;
+	pscreen = &screen->base.base;
+
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nvfx_screen_destroy;
+	pscreen->get_param = nvfx_screen_get_param;
+	pscreen->get_paramf = nvfx_screen_get_paramf;
+	pscreen->is_format_supported = nvfx_screen_surface_format_supported;
+	pscreen->context_create = nvfx_create;
+
+	switch (dev->chipset & 0xf0) {
+	case 0x30:
+		if (NV30TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0397;
+		else if (NV34TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0697;
+		else if (NV35TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0497;
+		break;
+	case 0x40:
+		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV40TCL;
+		else if (NV4X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV44TCL;
+		screen->is_nv4x = ~0;
+		break;
+	case 0x60:
+		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV44TCL;
+		screen->is_nv4x = ~0;
+		break;
+	}
+
+	if (!eng3d_class) {
+		NOUVEAU_ERR("Unknown nv3x/nv4x chipset: nv%02x\n", dev->chipset);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE);
+	screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen);
+
+	/* surely both nv3x and nv44 support index buffers too: find out how and test that */
+	if(eng3d_class == NV40TCL)
+		screen->index_buffer_reloc_flags = screen->vertex_buffer_reloc_flags;
+	if(!screen->force_swtnl && screen->vertex_buffer_reloc_flags == screen->index_buffer_reloc_flags)
+		screen->base.vertex_buffer_flags = screen->base.index_buffer_flags = screen->vertex_buffer_reloc_flags;
+	nvfx_screen_init_resource_functions(pscreen);
+
+	ret = nouveau_grobj_alloc(chan, 0xbeef3097, eng3d_class, &screen->eng3d);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* 2D engine setup; the result is checked before it is dereferenced */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	if (!screen->eng2d) {
+		NOUVEAU_ERR("Error creating 2D engine\n");
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+	screen->eng2d->buf = nvfx_surface_buffer;
+
+	/* Notifier for sync purposes */
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* Query objects: try each size in query_sizes[] until one allocation succeeds */
+	for(i = 0; i < sizeof(query_sizes) / sizeof(query_sizes[0]); ++i) {
+		ret = nouveau_notifier_alloc(chan, 0xbeef0302, query_sizes[i], &screen->query);
+		if(!ret)
+			break;
+	}
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	ret = nouveau_resource_init(&screen->query_heap, 0, query_sizes[i]);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	LIST_INITHEAD(&screen->query_list);
+	/* Vtxprog resources */
+	if (nouveau_resource_init(&screen->vp_exec_heap, 0, screen->is_nv4x ? 512 : 256) ||
+	    nouveau_resource_init(&screen->vp_data_heap, 0, 256)) {
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	BIND_RING(chan, screen->eng3d, 7);
+	/* Static eng3d initialisation */
+	/* note that we just started using the channel, so we must have space in the pushbuffer */
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_NOTIFY, 1));
+	OUT_RING(chan, screen->sync->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_TEXTURE0, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->gart->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR1, 1));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_VTXBUF0, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->gart->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_FENCE, 2));
+	OUT_RING(chan, 0);
+	OUT_RING(chan, screen->query->handle);
+	OUT_RING(chan, RING_3D(NV34TCL_DMA_IN_MEMORY7, 2));
+	OUT_RING(chan, chan->vram->handle);
+	OUT_RING(chan, chan->vram->handle);
+
+	if(!screen->is_nv4x)
+		nv30_screen_init(screen);
+	else
+		nv40_screen_init(screen);
+
+	return pscreen;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h
new file mode 100644
index 0000000000..5e1c3945ae
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_screen.h
@@ -0,0 +1,43 @@
+#ifndef __NVFX_SCREEN_H__
+#define __NVFX_SCREEN_H__
+
+#include "util/u_double_list.h"
+#include "nouveau/nouveau_screen.h"
+#include "nv04_surface_2d.h"
+
+struct nvfx_context;
+/* Driver-private screen object; nvfx_screen() casts a pipe_screen pointer to this type. */
+struct nvfx_screen {
+	struct nouveau_screen base; /* first member: the nvfx_screen() cast depends on this layout */
+
+	struct nouveau_winsys *nvws;
+
+	struct nvfx_context *cur_ctx;
+
+	unsigned is_nv4x; /* either 0 or ~0 */
+	boolean force_swtnl; /* read from the NOUVEAU_SWTNL debug option at screen creation */
+	unsigned vertex_buffer_reloc_flags; /* NOUVEAU_BO_VRAM or NOUVEAU_BO_GART */
+	unsigned index_buffer_reloc_flags; /* only assigned for the NV40TCL class at screen creation */
+
+	/* HW graphics objects */
+	struct nv04_surface_2d *eng2d;
+	struct nouveau_grobj *eng3d;
+	struct nouveau_notifier *sync;
+
+	/* Query object resources */
+	struct nouveau_notifier *query;
+	struct nouveau_resource *query_heap;
+	struct list_head query_list;
+
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap;
+	struct nouveau_resource *vp_data_heap;
+};
+
+/* Downcast helper: reinterprets a pipe_screen pointer as the nvfx_screen it belongs to. */
+static INLINE struct nvfx_screen *nvfx_screen(struct pipe_screen *screen)
+{
+	return (struct nvfx_screen *)screen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
new file mode 100644
index 0000000000..50830b3916
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -0,0 +1,429 @@
+#ifndef __NVFX_SHADER_H__
+#define __NVFX_SHADER_H__
+
+/* Resolves to NV30_VP_<c> or NV40_VP_<c> without a branch: the NV40-NV30
+ * difference is ANDed with nvfx->is_nv4x (assumed to be 0 or ~0) and added to
+ * the NV30 value.  Requires a variable named `nvfx` in scope at the use site. */
+#define NVFX_VP(c) ((NV30_VP_##c) + (nvfx->is_nv4x & ((NV40_VP_##c) - (NV30_VP_##c))))
+
+#define NVFX_VP_INST_SLOT_VEC 0
+#define NVFX_VP_INST_SLOT_SCA 1
+
+#define NVFX_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */
+#define NVFX_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */
+#define NVFX_VP_INST_IN_NORMAL  2
+#define NVFX_VP_INST_IN_COL0  3    /* Should probably confirm them all though */
+#define NVFX_VP_INST_IN_COL1  4
+#define NVFX_VP_INST_IN_FOGC  5
+#define NVFX_VP_INST_IN_TC0  8
+#define NVFX_VP_INST_IN_TC(n)  (8 + (n))    /* n is parenthesised so that e.g. TC(i << 1) expands correctly */
+
+#define NVFX_VP_INST_SCA_OP_NOP 0x00 /* opcodes for the scalar slot (NVFX_VP_INST_SLOT_SCA) */
+#define NVFX_VP_INST_SCA_OP_MOV 0x01
+#define NVFX_VP_INST_SCA_OP_RCP 0x02
+#define NVFX_VP_INST_SCA_OP_RCC 0x03
+#define NVFX_VP_INST_SCA_OP_RSQ 0x04
+#define NVFX_VP_INST_SCA_OP_EXP 0x05
+#define NVFX_VP_INST_SCA_OP_LOG 0x06
+#define NVFX_VP_INST_SCA_OP_LIT 0x07
+#define NVFX_VP_INST_SCA_OP_BRA 0x09 /* 0x08 has no name defined here */
+#define NVFX_VP_INST_SCA_OP_CAL 0x0B /* 0x0A has no name defined here */
+#define NVFX_VP_INST_SCA_OP_RET 0x0C
+#define NVFX_VP_INST_SCA_OP_LG2 0x0D
+#define NVFX_VP_INST_SCA_OP_EX2 0x0E
+#define NVFX_VP_INST_SCA_OP_SIN 0x0F
+#define NVFX_VP_INST_SCA_OP_COS 0x10
+
+#define NV40_VP_INST_SCA_OP_PUSHA 0x13 /* defined with the NV40_ prefix only */
+#define NV40_VP_INST_SCA_OP_POPA 0x14
+
+#define NVFX_VP_INST_VEC_OP_NOP 0x00 /* opcodes for the vector slot (NVFX_VP_INST_SLOT_VEC) */
+#define NVFX_VP_INST_VEC_OP_MOV 0x01
+#define NVFX_VP_INST_VEC_OP_MUL 0x02
+#define NVFX_VP_INST_VEC_OP_ADD 0x03
+#define NVFX_VP_INST_VEC_OP_MAD 0x04
+#define NVFX_VP_INST_VEC_OP_DP3 0x05
+#define NVFX_VP_INST_VEC_OP_DPH 0x06
+#define NVFX_VP_INST_VEC_OP_DP4 0x07
+#define NVFX_VP_INST_VEC_OP_DST 0x08
+#define NVFX_VP_INST_VEC_OP_MIN 0x09
+#define NVFX_VP_INST_VEC_OP_MAX 0x0A
+#define NVFX_VP_INST_VEC_OP_SLT 0x0B
+#define NVFX_VP_INST_VEC_OP_SGE 0x0C
+#define NVFX_VP_INST_VEC_OP_ARL 0x0D
+#define NVFX_VP_INST_VEC_OP_FRC 0x0E
+#define NVFX_VP_INST_VEC_OP_FLR 0x0F
+#define NVFX_VP_INST_VEC_OP_SEQ 0x10
+#define NVFX_VP_INST_VEC_OP_SFL 0x11
+#define NVFX_VP_INST_VEC_OP_SGT 0x12
+#define NVFX_VP_INST_VEC_OP_SLE 0x13
+#define NVFX_VP_INST_VEC_OP_SNE 0x14
+#define NVFX_VP_INST_VEC_OP_STR 0x15
+#define NVFX_VP_INST_VEC_OP_SSG 0x16
+#define NVFX_VP_INST_VEC_OP_ARR 0x17
+#define NVFX_VP_INST_VEC_OP_ARA 0x18
+
+#define NV40_VP_INST_VEC_OP_TXL 0x19 /* defined with the NV40_ prefix only */
+
+/* DWORD 3 */
+#define NVFX_VP_INST_LAST                           (1 << 0)
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *   0 - Opcode, output reg/mask, ATTRIB source
+ *   1 - Source 0
+ *   2 - Source 1
+ *   3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *     result.color == R0.xyzw
+ *     result.depth == R1.z
+ * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
+ * otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ *
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ *
+ *     ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
+ * is implemented simply by not writing to the relevant components of the destination.
+ *
+ * Conditional execution
+ *   TODO
+ *
+ * Non-native instructions:
+ *   LIT
+ *   LRP - MAD+MAD
+ *   SUB - ADD, negate second source
+ *   RSQ - LG2 + EX2
+ *   POW - LG2 + MUL + EX2
+ *   SCS - COS + SIN
+ *   XPD
+ *
+ * NV40 Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ */
+
+//== Opcode / Destination selection ==
+#define NVFX_FP_OP_PROGRAM_END          (1 << 0)
+#define NVFX_FP_OP_OUT_REG_SHIFT        1
+#define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */
+#define NV40_FP_OP_OUT_REG_MASK          (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NVFX_FP_OP_OUT_REG_HALF          (1 << 7)
+#define NVFX_FP_OP_COND_WRITE_ENABLE        (1 << 8)
+#define NVFX_FP_OP_OUTMASK_SHIFT        9
+#define NVFX_FP_OP_OUTMASK_MASK          (0xF << 9)
+#  define NVFX_FP_OP_OUT_X  (1<<9)
+#  define NVFX_FP_OP_OUT_Y  (1<<10)
+#  define NVFX_FP_OP_OUT_Z  (1<<11)
+#  define NVFX_FP_OP_OUT_W  (1<<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NVFX_FP_OP_INPUT_SRC_SHIFT        13
+#define NVFX_FP_OP_INPUT_SRC_MASK        (15 << 13)
+#  define NVFX_FP_OP_INPUT_SRC_POSITION  0x0
+#  define NVFX_FP_OP_INPUT_SRC_COL0  0x1
+#  define NVFX_FP_OP_INPUT_SRC_COL1  0x2
+#  define NVFX_FP_OP_INPUT_SRC_FOGC  0x3
+#  define NVFX_FP_OP_INPUT_SRC_TC0    0x4
+#  define NVFX_FP_OP_INPUT_SRC_TC(n)  (0x4 + (n))  /* n is parenthesised so that e.g. TC(i << 1) expands correctly */
+#  define NV40_FP_OP_INPUT_SRC_FACING  0xE
+#define NVFX_FP_OP_TEX_UNIT_SHIFT        17
+#define NVFX_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
+#define NVFX_FP_OP_PRECISION_SHIFT        22
+#define NVFX_FP_OP_PRECISION_MASK        (3 << 22)
+#   define NVFX_FP_PRECISION_FP32  0
+#   define NVFX_FP_PRECISION_FP16  1
+#   define NVFX_FP_PRECISION_FX12  2
+#define NVFX_FP_OP_OPCODE_SHIFT          24
+#define NVFX_FP_OP_OPCODE_MASK          (0x3F << 24)
+/* NV30/NV40 fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_NOP 0x00
+#define NVFX_FP_OP_OPCODE_MOV 0x01
+#define NVFX_FP_OP_OPCODE_MUL 0x02
+#define NVFX_FP_OP_OPCODE_ADD 0x03
+#define NVFX_FP_OP_OPCODE_MAD 0x04
+#define NVFX_FP_OP_OPCODE_DP3 0x05
+#define NVFX_FP_OP_OPCODE_DP4 0x06
+#define NVFX_FP_OP_OPCODE_DST 0x07
+#define NVFX_FP_OP_OPCODE_MIN 0x08
+#define NVFX_FP_OP_OPCODE_MAX 0x09
+#define NVFX_FP_OP_OPCODE_SLT 0x0A
+#define NVFX_FP_OP_OPCODE_SGE 0x0B
+#define NVFX_FP_OP_OPCODE_SLE 0x0C
+#define NVFX_FP_OP_OPCODE_SGT 0x0D
+#define NVFX_FP_OP_OPCODE_SNE 0x0E
+#define NVFX_FP_OP_OPCODE_SEQ 0x0F
+#define NVFX_FP_OP_OPCODE_FRC 0x10
+#define NVFX_FP_OP_OPCODE_FLR 0x11
+#define NVFX_FP_OP_OPCODE_KIL 0x12
+#define NVFX_FP_OP_OPCODE_PK4B 0x13
+#define NVFX_FP_OP_OPCODE_UP4B 0x14
+#define NVFX_FP_OP_OPCODE_DDX 0x15 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_DDY 0x16 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_TEX 0x17
+#define NVFX_FP_OP_OPCODE_TXP 0x18
+#define NVFX_FP_OP_OPCODE_TXD 0x19
+#define NVFX_FP_OP_OPCODE_RCP 0x1A
+#define NVFX_FP_OP_OPCODE_EX2 0x1C
+#define NVFX_FP_OP_OPCODE_LG2 0x1D
+#define NVFX_FP_OP_OPCODE_STR 0x20
+#define NVFX_FP_OP_OPCODE_SFL 0x21
+#define NVFX_FP_OP_OPCODE_COS 0x22
+#define NVFX_FP_OP_OPCODE_SIN 0x23
+#define NVFX_FP_OP_OPCODE_PK2H 0x24
+#define NVFX_FP_OP_OPCODE_UP2H 0x25
+#define NVFX_FP_OP_OPCODE_PK4UB 0x27
+#define NVFX_FP_OP_OPCODE_UP4UB 0x28
+#define NVFX_FP_OP_OPCODE_PK2US 0x29
+#define NVFX_FP_OP_OPCODE_UP2US 0x2A
+#define NVFX_FP_OP_OPCODE_DP2A 0x2E
+#define NVFX_FP_OP_OPCODE_TXB 0x31
+#define NVFX_FP_OP_OPCODE_DIV 0x3A
+
+/* NV30 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_RSQ_NV30 0x1B
+#define NVFX_FP_OP_OPCODE_LIT_NV30 0x1E
+#define NVFX_FP_OP_OPCODE_LRP_NV30 0x1F
+#define NVFX_FP_OP_OPCODE_POW_NV30 0x26
+#define NVFX_FP_OP_OPCODE_RFL_NV30 0x36
+
+/* NV40 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_TXL_NV40 0x2F
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+
+#define NVFX_FP_OP_OUT_SAT          (1U << 31) /* unsigned: shifting a signed 1 into bit 31 is undefined */
+
+/* high order bits of SRC0 */
+#define NVFX_FP_OP_OUT_ABS          (1 << 29)
+#define NVFX_FP_OP_COND_SWZ_W_SHIFT        27
+#define NVFX_FP_OP_COND_SWZ_W_MASK        (3 << 27)
+#define NVFX_FP_OP_COND_SWZ_Z_SHIFT        25
+#define NVFX_FP_OP_COND_SWZ_Z_MASK        (3 << 25)
+#define NVFX_FP_OP_COND_SWZ_Y_SHIFT        23
+#define NVFX_FP_OP_COND_SWZ_Y_MASK        (3 << 23)
+#define NVFX_FP_OP_COND_SWZ_X_SHIFT        21
+#define NVFX_FP_OP_COND_SWZ_X_MASK        (3 << 21)
+#define NVFX_FP_OP_COND_SWZ_ALL_SHIFT        21
+#define NVFX_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21) /* spans the four 2-bit X/Y/Z/W fields above */
+#define NVFX_FP_OP_COND_SHIFT          18
+#define NVFX_FP_OP_COND_MASK          (0x07 << 18)
+#  define NVFX_FP_OP_COND_FL  0 /* same values as the NVFX_COND_* constants in this header */
+#  define NVFX_FP_OP_COND_LT  1
+#  define NVFX_FP_OP_COND_EQ  2
+#  define NVFX_FP_OP_COND_LE  3
+#  define NVFX_FP_OP_COND_GT  4
+#  define NVFX_FP_OP_COND_NE  5
+#  define NVFX_FP_OP_COND_GE  6
+#  define NVFX_FP_OP_COND_TR  7
+
+/* high order bits of SRC1 */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                      (1U<<31) /* unsigned: signed 1<<31 is undefined */
+#define NVFX_FP_OP_DST_SCALE_SHIFT        28
+#define NVFX_FP_OP_DST_SCALE_MASK        (3 << 28) /* NOTE(review): 2-bit mask, yet INV_* values below need 3 bits - confirm */
+#define NVFX_FP_OP_DST_SCALE_1X                                                0
+#define NVFX_FP_OP_DST_SCALE_2X                                                1
+#define NVFX_FP_OP_DST_SCALE_4X                                                2
+#define NVFX_FP_OP_DST_SCALE_8X                                                3
+#define NVFX_FP_OP_DST_SCALE_INV_2X                                            5
+#define NVFX_FP_OP_DST_SCALE_INV_4X                                            6
+#define NVFX_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP: three 8-bit fields (count, index, increment) */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF: one 8-bit field at bit 2 */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL: one 8-bit field at bit 2 */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   I have no idea why there are 3 count values here..  but they
+ *   have always been filled with the same value in my tests so
+ *   far..
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+/* high order bits of SRC2 */
+#define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT        19
+#define NV40_FP_OP_ADDR_INDEX_MASK        (0xF << 19)
+
+//== Register selection ==
+#define NVFX_FP_REG_TYPE_SHIFT           0
+#define NVFX_FP_REG_TYPE_MASK           (3 << 0)
+#  define NVFX_FP_REG_TYPE_TEMP   0
+#  define NVFX_FP_REG_TYPE_INPUT  1
+#  define NVFX_FP_REG_TYPE_CONST  2
+#define NVFX_FP_REG_SRC_SHIFT            2
+#define NV30_FP_REG_SRC_MASK              (31 << 2) /* 5-bit register index */
+#define NV40_FP_REG_SRC_MASK              (63 << 2) /* 6-bit register index */
+#define NVFX_FP_REG_SRC_HALF            (1 << 8)
+#define NVFX_FP_REG_SWZ_ALL_SHIFT        9
+#define NVFX_FP_REG_SWZ_ALL_MASK        (255 << 9) /* spans the four 2-bit X/Y/Z/W fields below */
+#define NVFX_FP_REG_SWZ_X_SHIFT          9
+#define NVFX_FP_REG_SWZ_X_MASK          (3 << 9)
+#define NVFX_FP_REG_SWZ_Y_SHIFT          11
+#define NVFX_FP_REG_SWZ_Y_MASK          (3 << 11)
+#define NVFX_FP_REG_SWZ_Z_SHIFT          13
+#define NVFX_FP_REG_SWZ_Z_MASK          (3 << 13)
+#define NVFX_FP_REG_SWZ_W_SHIFT          15
+#define NVFX_FP_REG_SWZ_W_MASK          (3 << 15)
+#  define NVFX_FP_SWIZZLE_X  0
+#  define NVFX_FP_SWIZZLE_Y  1
+#  define NVFX_FP_SWIZZLE_Z  2
+#  define NVFX_FP_SWIZZLE_W  3
+#define NVFX_FP_REG_NEGATE          (1 << 17)
+
+#define NVFXSR_NONE	0
+#define NVFXSR_OUTPUT	1
+#define NVFXSR_INPUT	2
+#define NVFXSR_TEMP	3
+#define NVFXSR_CONST	4
+
+#define NVFX_COND_FL  0
+#define NVFX_COND_LT  1
+#define NVFX_COND_EQ  2
+#define NVFX_COND_LE  3
+#define NVFX_COND_GT  4
+#define NVFX_COND_NE  5
+#define NVFX_COND_GE  6
+#define NVFX_COND_TR  7
+
+/* Yes, these are ordered differently: the VP masks put X in the highest bit, the FP masks in the lowest. */
+
+#define NVFX_VP_MASK_X 8
+#define NVFX_VP_MASK_Y 4
+#define NVFX_VP_MASK_Z 2
+#define NVFX_VP_MASK_W 1
+#define NVFX_VP_MASK_ALL 0xf
+
+#define NVFX_FP_MASK_X 1
+#define NVFX_FP_MASK_Y 2
+#define NVFX_FP_MASK_Z 4
+#define NVFX_FP_MASK_W 8
+#define NVFX_FP_MASK_ALL 0xf
+
+#define NVFX_SWZ_X 0
+#define NVFX_SWZ_Y 1
+#define NVFX_SWZ_Z 2
+#define NVFX_SWZ_W 3
+
+#define swz(s,x,y,z,w) nvfx_sr_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w) /* x..w are the bare letters X, Y, Z or W */
+#define neg(s) nvfx_sr_neg((s))
+#define abs(s) nvfx_sr_abs((s)) /* NOTE(review): replaces any call spelled abs(...) after this header, including the C library abs() */
+#define scale(s,v) nvfx_sr_scale((s), NVFX_FP_OP_DST_SCALE_##v) /* v is a suffix such as 2X or INV_2X */
+
+struct nvfx_sreg { /* register reference plus its modifiers; passed and returned by value by the nvfx_sr_*() helpers */
+	int type; /* presumably one of NVFXSR_* - confirm against callers */
+	int index;
+
+	int dst_scale; /* set by nvfx_sr_scale(); the scale() macro passes NVFX_FP_OP_DST_SCALE_* */
+
+	int negate; /* toggled by nvfx_sr_neg() */
+	int abs; /* set to 1 by nvfx_sr_abs() */
+	int swz[4]; /* per-component source selector, NVFX_SWZ_*; identity is { 0, 1, 2, 3 } */
+
+	int cc_update; /* the cc_* fields below are initialised by nvfx_sr(): 0, 0, NVFX_COND_TR, 0 */
+	int cc_update_reg;
+	int cc_test;
+	int cc_test_reg;
+	int cc_swz[4]; /* initialised to the identity { 0, 1, 2, 3 } by nvfx_sr() */
+};
+
+/* Build a register reference of the given type/index with neutral modifiers:
+ * identity swizzles, no scale/negate/abs, no cc update, cc test NVFX_COND_TR. */
+static INLINE struct nvfx_sreg
+nvfx_sr(int type, int index)
+{
+	struct nvfx_sreg reg;
+	int c;
+
+	reg.type = type;
+	reg.index = index;
+	reg.dst_scale = reg.negate = reg.abs = 0;
+	reg.cc_update = reg.cc_update_reg = reg.cc_test_reg = 0;
+	reg.cc_test = NVFX_COND_TR;
+	for (c = 0; c < 4; c++)
+		reg.swz[c] = reg.cc_swz[c] = c;
+
+	return reg;
+}
+
+/* Compose a swizzle: component c of the result reads src.swz[] at the selector given for c. */
+static INLINE struct nvfx_sreg
+nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
+{
+	const int sel[4] = { x, y, z, w };
+	struct nvfx_sreg out = src;
+	int c;
+	for (c = 0; c < 4; c++)
+		out.swz[c] = src.swz[sel[c]];
+	return out;
+}
+
+static INLINE struct nvfx_sreg nvfx_sr_neg(struct nvfx_sreg src)
+{
+	struct nvfx_sreg out = src; /* returns a copy with the negate flag toggled */
+	out.negate = src.negate ? 0 : 1;
+	return out;
+}
+
+static INLINE struct nvfx_sreg nvfx_sr_abs(struct nvfx_sreg src)
+{
+	struct nvfx_sreg out = src; /* returns a copy with the abs flag set */
+	out.abs = 1;
+	return out;
+}
+
+static INLINE struct nvfx_sreg nvfx_sr_scale(struct nvfx_sreg src, int scale)
+{
+	struct nvfx_sreg out = src; /* returns a copy with dst_scale replaced */
+	out.dst_scale = scale;
+	return out;
+}
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c
new file mode 100644
index 0000000000..30322d46d9
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state.c
@@ -0,0 +1,638 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_context.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "nvfx_tex.h"
+
+/* Pre-build the command words for a blend CSO (render target 0 only) into bso->sb. */
+static void *
+nvfx_blend_state_create(struct pipe_context *pipe, const struct pipe_blend_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_blend_state *bso = CALLOC(1, sizeof(*bso));
+	struct nouveau_statebuf_builder sb = sb_init(bso->sb);
+	unsigned hw_mask = 0;
+
+	if (!cso->rt[0].blend_enable) {
+		sb_method(sb, NV34TCL_BLEND_FUNC_ENABLE, 1);
+		sb_data(sb, 0);
+	} else {
+		sb_method(sb, NV34TCL_BLEND_FUNC_ENABLE, 3);
+		sb_data(sb, 1);
+		sb_data(sb, (nvgl_blend_func(cso->rt[0].alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rt[0].rgb_src_factor));
+		sb_data(sb, nvgl_blend_func(cso->rt[0].alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rt[0].rgb_dst_factor));
+		if(nvfx->screen->base.device->chipset >= 0x40) {
+			sb_method(sb, NV40TCL_BLEND_EQUATION, 1);
+			sb_data(sb, nvgl_blend_eqn(cso->rt[0].alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rt[0].rgb_func));
+		} else {
+			sb_method(sb, NV34TCL_BLEND_EQUATION, 1);
+			sb_data(sb, nvgl_blend_eqn(cso->rt[0].rgb_func));
+		}
+	}
+
+	hw_mask |= (cso->rt[0].colormask & PIPE_MASK_A) ? (0x01 << 24) : 0;
+	hw_mask |= (cso->rt[0].colormask & PIPE_MASK_R) ? (0x01 << 16) : 0;
+	hw_mask |= (cso->rt[0].colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0;
+	hw_mask |= (cso->rt[0].colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0;
+	sb_method(sb, NV34TCL_COLOR_MASK, 1);
+	sb_data(sb, hw_mask);
+	/* TODO: add NV40 MRT color mask */
+
+	if (!cso->logicop_enable) {
+		sb_method(sb, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		sb_data(sb, 0);
+	} else {
+		sb_method(sb, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		sb_data(sb, 1);
+		sb_data(sb, nvgl_logicop_func(cso->logicop_func));
+	}
+
+	sb_method(sb, NV34TCL_DITHER_ENABLE, 1);
+	sb_data(sb, cso->dither ? 1 : 0);
+	bso->sb_len = sb_len(sb, bso->sb);
+	bso->pipe = *cso;
+	return (void *)bso;
+}
+
+/* Make hwcso the context's current blend state and set the NVFX_NEW_BLEND dirty bit. */
+static void nvfx_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->dirty |= NVFX_NEW_BLEND;
+	ctx->blend = hwcso;
+}
+
+/* Free a blend state object; `pipe` is not used. */
+static void
+nvfx_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	/* hwcso is the nvfx_blend_state allocation returned by the create hook */
+	FREE(hwcso);
+}
+
+/* Build the hardware sampler state for `cso`; returns NULL if the allocation fails. */
+static void *
+nvfx_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_state *ps = MALLOC(sizeof(struct nvfx_sampler_state));
+	if (!ps)
+		return NULL;
+
+	/* on nv30, we use this as an internal flag */
+	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
+	ps->en = 0;
+	ps->filt = nvfx_tex_filter(cso);
+	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT) |
+		    nvfx_tex_wrap_compare_mode(cso);
+	ps->bcol = nvfx_tex_border_color(cso->border_color);
+
+	if(nvfx->is_nv4x)
+		nv40_sampler_state_init(pipe, ps, cso);
+	else
+		nv30_sampler_state_init(pipe, ps, cso);
+	return (void *)ps;
+}
+
+/* Bind `nr` sampler states.  Units that were bound before but lie at or
+ * beyond `nr` are cleared to NULL.  Every unit written is flagged in
+ * dirty_samplers, and NVFX_NEW_SAMPLER is always set. */
+static void
+nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+	const unsigned prev = ctx->nr_samplers;
+	const unsigned end = (nr > prev) ? nr : prev;
+	unsigned u;
+
+	for (u = 0; u < end; u++) {
+		ctx->tex_sampler[u] = (u < nr) ? sampler[u] : NULL;
+		ctx->dirty_samplers |= (1 << u);
+	}
+
+	ctx->nr_samplers = nr;
+	ctx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+/* Free a sampler state object; `pipe` is not used. */
+static void nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+/* Bind `nr` fragment sampler views, taking a reference on each.  Units that
+ * were bound before but lie at or beyond `nr` are released.  Every unit
+ * written is flagged in dirty_samplers, and NVFX_NEW_SAMPLER is always set. */
+static void
+nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
+				unsigned nr,
+				struct pipe_sampler_view **views)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+	const unsigned prev = ctx->nr_textures;
+	unsigned u;
+
+	for (u = 0; u < nr || u < prev; u++) {
+		/* NULL releases the reference held for a unit that is no longer bound */
+		struct pipe_sampler_view *view = (u < nr) ? views[u] : NULL;
+
+		pipe_sampler_view_reference(&ctx->fragment_sampler_views[u], view);
+		ctx->dirty_samplers |= (1 << u);
+	}
+
+	ctx->nr_textures = nr;
+	ctx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+
+/* Allocate a sampler view copied from `templ` that holds its own reference on `texture`. */
+static struct pipe_sampler_view *
+nvfx_create_sampler_view(struct pipe_context *pipe,
+			 struct pipe_resource *texture,
+			 const struct pipe_sampler_view *templ)
+{
+	struct pipe_sampler_view *sv = CALLOC_STRUCT(pipe_sampler_view);
+	if (!sv)
+		return NULL;
+	*sv = *templ;
+	sv->reference.count = 1;
+	sv->context = pipe;
+	/* clear the pointer copied from templ before taking our own reference */
+	sv->texture = NULL;
+	pipe_resource_reference(&sv->texture, texture);
+	return sv;
+}
+
+
+/* Drop the view's texture reference, then free the view itself; `pipe` is not used. */
+static void
+nvfx_sampler_view_destroy(struct pipe_context *pipe, struct pipe_sampler_view *view)
+{
+	pipe_resource_reference(&view->texture, NULL);
+	FREE(view);
+}
+
+static void *
+nvfx_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{	/* create_rasterizer_state hook: pre-encode cso as method/data words in rsso->sb */
+	struct nvfx_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));	/* NOTE(review): result is not checked for NULL */
+	struct nouveau_statebuf_builder sb = sb_init(rsso->sb);
+
+	/*XXX: ignored:
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 */
+	/* Each sb_method(sb, M, n) below is followed by exactly n sb_data() words */
+	sb_method(sb, NV34TCL_SHADE_MODEL, 1);
+	sb_data(sb, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT :
+				       NV34TCL_SHADE_MODEL_SMOOTH);
+
+	sb_method(sb, NV34TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	sb_data(sb, cso->light_twoside);
+	/* two words: width (scaled by 8 and truncated to a byte), then the smooth enable */
+	sb_method(sb, NV34TCL_LINE_WIDTH, 2);
+	sb_data(sb, (unsigned char)(cso->line_width * 8.0) & 0xff);
+	sb_data(sb, cso->line_smooth ? 1 : 0);
+	sb_method(sb, NV34TCL_LINE_STIPPLE_ENABLE, 2);
+	sb_data(sb, cso->line_stipple_enable ? 1 : 0);
+	sb_data(sb, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
+
+	sb_method(sb, NV34TCL_POINT_SIZE, 1);
+	sb_data(sb, fui(cso->point_size));
+	/* six words: front mode, back mode, cull face, front face, poly smooth, cull enable */
+	sb_method(sb, NV34TCL_POLYGON_MODE_FRONT, 6);
+        sb_data(sb, nvgl_polygon_mode(cso->fill_front));
+        sb_data(sb, nvgl_polygon_mode(cso->fill_back));
+	switch (cso->cull_face) {
+	case PIPE_FACE_FRONT:
+		sb_data(sb, NV34TCL_CULL_FACE_FRONT);
+		break;
+	case PIPE_FACE_BACK:
+		sb_data(sb, NV34TCL_CULL_FACE_BACK);
+		break;
+	case PIPE_FACE_FRONT_AND_BACK:
+		sb_data(sb, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+		break;
+	default:	/* any other value still needs a word here; the enable word below is what turns culling off */
+		sb_data(sb, NV34TCL_CULL_FACE_BACK);
+		break;
+	}
+	if (cso->front_ccw) {
+		sb_data(sb, NV34TCL_FRONT_FACE_CCW);
+	} else {
+		sb_data(sb, NV34TCL_FRONT_FACE_CW);
+	}
+	sb_data(sb, cso->poly_smooth ? 1 : 0);
+	sb_data(sb, (cso->cull_face != PIPE_FACE_NONE) ? 1 : 0);
+
+	sb_method(sb, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+	sb_data(sb, cso->poly_stipple_enable ? 1 : 0);
+
+	sb_method(sb, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+        sb_data(sb, cso->offset_point);
+        sb_data(sb, cso->offset_line);
+        sb_data(sb, cso->offset_tri);
+	/* factor/units are only encoded when at least one offset enable is set */
+	if (cso->offset_point || cso->offset_line || cso->offset_tri) {
+		sb_method(sb, NV34TCL_POLYGON_OFFSET_FACTOR, 2);
+		sb_data(sb, fui(cso->offset_scale));
+		sb_data(sb, fui(cso->offset_units * 2));
+	}
+
+	sb_method(sb, NV34TCL_POINT_SPRITE, 1);
+	if (cso->point_quad_rasterization) {
+		unsigned psctl = (1 << 0), i;
+		/* bit 0 is always set in this branch; sprite_coord_enable bits 0-7 go to bits 8-15 */
+		for (i = 0; i < 8; i++) {
+			if ((cso->sprite_coord_enable >> i) & 1)
+				psctl |= (1 << (8 + i));
+		}
+
+		sb_data(sb, psctl);
+	} else {
+		sb_data(sb, 0);
+	}
+	/* keep a copy of the gallium state next to the encoded words, and record how many were written */
+	rsso->pipe = *cso;
+	rsso->sb_len = sb_len(sb, rsso->sb);
+	return (void *)rsso;
+}
+
+static void
+nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{	/* bind_rasterizer_state hook: make hwcso current and flag the state that depends on it */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_rasterizer_state *prev = nvfx->rasterizer, *next = hwcso;
+
+	/* Scissor and stipple emission read enable bits from the bound rasterizer, so flag them
+	 * when those bits change; a first bind (no prev) counts as a change. */
+	if(next)
+	{
+		if(!prev || next->pipe.scissor != prev->pipe.scissor)
+		{
+			nvfx->dirty |= NVFX_NEW_SCISSOR;
+			nvfx->draw_dirty |= NVFX_NEW_SCISSOR;
+		}
+		if(!prev || next->pipe.poly_stipple_enable != prev->pipe.poly_stipple_enable)
+		{
+			nvfx->dirty |= NVFX_NEW_STIPPLE;
+			nvfx->draw_dirty |= NVFX_NEW_STIPPLE;
+		}
+	}
+
+	nvfx->rasterizer = hwcso;
+	nvfx->dirty |= NVFX_NEW_RAST;
+	nvfx->draw_dirty |= NVFX_NEW_RAST;
+}
+
+static void
+nvfx_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{	/* delete_rasterizer_state hook; pipe is unused */
+	struct nvfx_rasterizer_state *rsso = hwcso;
+	/* nvfx_rasterizer_state_create makes a single allocation, so one FREE releases it */
+	FREE(rsso);
+}
+
+static void *
+nvfx_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{	/* create_depth_stencil_alpha_state hook: pre-encode cso as method/data words in zsaso->sb */
+	struct nvfx_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));	/* NOTE(review): result is not checked for NULL */
+	struct nouveau_statebuf_builder sb = sb_init(zsaso->sb);
+	/* depth: compare function, write mask, enable */
+	sb_method(sb, NV34TCL_DEPTH_FUNC, 3);
+	sb_data  (sb, nvgl_comparison_op(cso->depth.func));
+	sb_data  (sb, cso->depth.writemask ? 1 : 0);
+	sb_data  (sb, cso->depth.enabled ? 1 : 0);
+	/* alpha test: enable, compare function, reference value converted with float_to_ubyte */
+	sb_method(sb, NV34TCL_ALPHA_FUNC_ENABLE, 3);
+	sb_data  (sb, cso->alpha.enabled ? 1 : 0);
+	sb_data  (sb, nvgl_comparison_op(cso->alpha.func));
+	sb_data  (sb, float_to_ubyte(cso->alpha.ref_value));
+	/* front stencil: full setup when enabled, otherwise only the enable word (0) */
+	if (cso->stencil[0].enabled) {
+		sb_method(sb, NV34TCL_STENCIL_FRONT_ENABLE, 3);
+		sb_data  (sb, cso->stencil[0].enabled ? 1 : 0);
+		sb_data  (sb, cso->stencil[0].writemask);
+		sb_data  (sb, nvgl_comparison_op(cso->stencil[0].func));
+		sb_method(sb, NV34TCL_STENCIL_FRONT_FUNC_MASK, 4);
+		sb_data  (sb, cso->stencil[0].valuemask);
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[0].fail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		sb_method(sb, NV34TCL_STENCIL_FRONT_ENABLE, 1);
+		sb_data  (sb, 0);
+	}
+	/* back stencil: same layout as the front face, taken from stencil[1] */
+	if (cso->stencil[1].enabled) {
+		sb_method(sb, NV34TCL_STENCIL_BACK_ENABLE, 3);
+		sb_data  (sb, cso->stencil[1].enabled ? 1 : 0);
+		sb_data  (sb, cso->stencil[1].writemask);
+		sb_data  (sb, nvgl_comparison_op(cso->stencil[1].func));
+		sb_method(sb, NV34TCL_STENCIL_BACK_FUNC_MASK, 4);
+		sb_data  (sb, cso->stencil[1].valuemask);
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[1].fail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		sb_data  (sb, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		sb_method(sb, NV34TCL_STENCIL_BACK_ENABLE, 1);
+		sb_data  (sb, 0);
+	}
+	/* the encoded length varies with the branches taken above, so it is recorded per object */
+	zsaso->pipe = *cso;
+	zsaso->sb_len = sb_len(sb, zsaso->sb);
+	return (void *)zsaso;
+}
+
+static void
+nvfx_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{	/* bind_depth_stencil_alpha_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* only the pointer is stored; the words are emitted when NVFX_NEW_ZSA is validated */
+	nvfx->zsa = hwcso;
+	nvfx->dirty |= NVFX_NEW_ZSA;
+}
+
+static void
+nvfx_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{	/* delete_depth_stencil_alpha_state hook; pipe is unused */
+	struct nvfx_zsa_state *zsaso = hwcso;
+	/* nvfx_depth_stencil_alpha_state_create makes a single allocation */
+	FREE(zsaso);
+}
+
+static void *
+nvfx_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{	/* create_vs_state hook: own a copy of the tokens plus a draw-module shader; NULL on OOM */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_vertex_program *vp = CALLOC_STRUCT(nvfx_vertex_program);
+
+	if (!vp)	/* allocation failed: report it instead of writing through NULL */
+		return NULL;
+	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);	/* private copy, freed in nvfx_vp_state_delete */
+	vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
+	return (void *)vp;
+}
+
+static void
+nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{	/* bind_vs_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* flagged twice: dirty is read by the common validation, draw_dirty by the swtnl setup */
+	nvfx->vertprog = hwcso;
+	nvfx->dirty |= NVFX_NEW_VERTPROG;
+	nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
+}
+
+static void
+nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{	/* delete_vs_state hook: tear down everything nvfx_vp_state_create set up */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_vertex_program *vp = hwcso;
+	/* draw-module shader first, then driver-side program data, then the token copy and the object */
+	draw_delete_vertex_shader(nvfx->draw, vp->draw);
+	nvfx_vertprog_destroy(nvfx, vp);
+	FREE((void*)vp->pipe.tokens);	/* cast away const: tokens is the tgsi_dup_tokens copy */
+	FREE(vp);
+}
+
+static void *
+nvfx_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{	/* create_fs_state hook: own a copy of the tokens and cache their scan info; NULL on OOM */
+	struct nvfx_fragment_program *fp = CALLOC_STRUCT(nvfx_fragment_program);
+
+	if (!fp)	/* allocation failed: report it instead of writing through NULL */
+		return NULL;
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);	/* private copy, freed in nvfx_fp_state_delete */
+	/* fill fp->info once here so later users need not rescan the tokens */
+	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+	return (void *)fp;
+}
+
+static void
+nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{	/* bind_fs_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* unlike the vertex program bind, draw_dirty is not touched here */
+	nvfx->fragprog = hwcso;
+	nvfx->dirty |= NVFX_NEW_FRAGPROG;
+}
+
+static void
+nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{	/* delete_fs_state hook: tear down everything nvfx_fp_state_create set up */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_fragment_program *fp = hwcso;
+	/* driver-side program data first, then the token copy and the object itself */
+	nvfx_fragprog_destroy(nvfx, fp);
+	FREE((void*)fp->pipe.tokens);	/* cast away const: tokens is the tgsi_dup_tokens copy */
+	FREE(fp);
+}
+
+static void
+nvfx_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{	/* set_blend_color hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* copied by value; emitted when NVFX_NEW_BCOL is validated */
+	nvfx->blend_colour = *bcol;
+	nvfx->dirty |= NVFX_NEW_BCOL;
+}
+
+static void
+nvfx_set_stencil_ref(struct pipe_context *pipe,
+		     const struct pipe_stencil_ref *sr)
+{	/* set_stencil_ref hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* copied by value; emitted when NVFX_NEW_SR is validated */
+	nvfx->stencil_ref = *sr;
+	nvfx->dirty |= NVFX_NEW_SR;
+}
+
+static void
+nvfx_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{	/* set_clip_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* copied by value; flagged for both the common validation and the swtnl setup */
+	nvfx->clip = *clip;
+	nvfx->dirty |= NVFX_NEW_UCP;
+	nvfx->draw_dirty |= NVFX_NEW_UCP;
+}
+
+static void
+nvfx_set_sample_mask(struct pipe_context *pipe,
+		     unsigned sample_mask)
+{	/* set_sample_mask hook: empty body, sample_mask is ignored */
+}
+
+static void
+nvfx_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 struct pipe_resource *buf )
+{	/* set_constant_buffer hook: record buf for the given shader stage and flag its constants */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+
+	nvfx->constbuf[shader] = buf;
+	/* buf may be NULL (unbind): report zero vec4 constants instead of dereferencing it */
+	nvfx->constbuf_nr[shader] = buf ? buf->width0 / (4 * sizeof(float)) : 0;
+
+	/* index is not used: a single buffer is tracked per shader stage */
+	if (shader == PIPE_SHADER_VERTEX)
+		nvfx->dirty |= NVFX_NEW_VERTCONST;
+	else if (shader == PIPE_SHADER_FRAGMENT)
+		nvfx->dirty |= NVFX_NEW_FRAGCONST;
+}
+
+static void
+nvfx_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{	/* set_framebuffer_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* plain struct copy: surface pointers are copied as-is, nothing here takes a reference */
+	nvfx->framebuffer = *fb;
+	nvfx->dirty |= NVFX_NEW_FB;
+}
+
+static void
+nvfx_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{	/* set_polygon_stipple hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* 4 * 32 bytes, i.e. the 32 words that nvfx_state_stipple_validate later emits */
+	memcpy(nvfx->stipple, stipple->stipple, 4 * 32);
+	nvfx->dirty |= NVFX_NEW_STIPPLE;
+}
+
+static void
+nvfx_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{	/* set_scissor_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* copied by value; whether it is used depends on the bound rasterizer's scissor bit */
+	nvfx->scissor = *s;
+	nvfx->dirty |= NVFX_NEW_SCISSOR;
+}
+
+static void
+nvfx_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{	/* set_viewport_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* copied by value; flagged for both the common validation and the swtnl setup */
+	nvfx->viewport = *vpt;
+	nvfx->dirty |= NVFX_NEW_VIEWPORT;
+	nvfx->draw_dirty |= NVFX_NEW_VIEWPORT;
+}
+
+static void
+nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{	/* set_vertex_buffers hook: copy count descriptors into the context */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* NOTE(review): count is not checked against the capacity of nvfx->vtxbuf here */
+	memcpy(nvfx->vtxbuf, vb, sizeof(*vb) * count);
+	nvfx->vtxbuf_nr = count;
+
+	nvfx->dirty |= NVFX_NEW_ARRAYS;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
+
+static void *
+nvfx_vtxelts_state_create(struct pipe_context *pipe,
+			  unsigned num_elements,
+			  const struct pipe_vertex_element *elements)
+{	/* create_vertex_elements_state hook: snapshot the element array; NULL on OOM */
+	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
+
+	assert(num_elements < 16); /* not doing fallbacks yet */
+	if (!cso)	/* allocation failed: report it instead of writing through NULL */
+		return NULL;
+
+	cso->num_elements = num_elements;
+	memcpy(cso->pipe, elements, num_elements * sizeof(*elements));
+	return (void *)cso;
+}
+
+static void
+nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
+{	/* delete_vertex_elements_state hook; pipe is unused */
+	FREE(hwcso);	/* nvfx_vtxelts_state_create makes a single allocation */
+}
+
+static void
+nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
+{	/* bind_vertex_elements_state hook */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* only dirty is flagged; the draw_dirty update below is deliberately left commented out */
+	nvfx->vtxelt = hwcso;
+	nvfx->dirty |= NVFX_NEW_ARRAYS;
+	/*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/
+}
+
+void
+nvfx_init_state_functions(struct nvfx_context *nvfx)
+{	/* Install this file's state callbacks into the context's pipe_context vtable */
+	nvfx->pipe.create_blend_state = nvfx_blend_state_create;
+	nvfx->pipe.bind_blend_state = nvfx_blend_state_bind;
+	nvfx->pipe.delete_blend_state = nvfx_blend_state_delete;
+	/* samplers and sampler views (fragment stage only) */
+	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
+	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
+	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
+	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
+        nvfx->pipe.create_sampler_view = nvfx_create_sampler_view;
+        nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy;
+
+	nvfx->pipe.create_rasterizer_state = nvfx_rasterizer_state_create;
+	nvfx->pipe.bind_rasterizer_state = nvfx_rasterizer_state_bind;
+	nvfx->pipe.delete_rasterizer_state = nvfx_rasterizer_state_delete;
+
+	nvfx->pipe.create_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_create;
+	nvfx->pipe.bind_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_bind;
+	nvfx->pipe.delete_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_delete;
+	/* shaders: the vs/fs hooks map to this driver's vp/fp (vertex/fragment program) functions */
+	nvfx->pipe.create_vs_state = nvfx_vp_state_create;
+	nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
+	nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
+
+	nvfx->pipe.create_fs_state = nvfx_fp_state_create;
+	nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
+	nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
+	/* parameter-style state: these setters copy their argument into the context */
+	nvfx->pipe.set_blend_color = nvfx_set_blend_color;
+        nvfx->pipe.set_stencil_ref = nvfx_set_stencil_ref;
+	nvfx->pipe.set_clip_state = nvfx_set_clip_state;
+	nvfx->pipe.set_sample_mask = nvfx_set_sample_mask;
+	nvfx->pipe.set_constant_buffer = nvfx_set_constant_buffer;
+	nvfx->pipe.set_framebuffer_state = nvfx_set_framebuffer_state;
+	nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple;
+	nvfx->pipe.set_scissor_state = nvfx_set_scissor_state;
+	nvfx->pipe.set_viewport_state = nvfx_set_viewport_state;
+	/* vertex input: element layout objects and the buffers they read from */
+	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
+	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
+	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
+
+	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
new file mode 100644
index 0000000000..9ceb2577ec
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -0,0 +1,77 @@
+#ifndef __NVFX_STATE_H__
+#define __NVFX_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+#include "nouveau/nouveau_statebuf.h"
+
+struct nvfx_vertex_program_exec {	/* NOTE(review): looks like one encoded vertex-program instruction - confirm in the vertprog code */
+	uint32_t data[4];
+	boolean has_branch_offset;
+	int const_index;	/* presumably an index into nvfx_vertex_program.consts; verify against users */
+};
+
+struct nvfx_vertex_program_data {	/* one vec4 constant slot of a vertex program */
+	int index; /* immediates == -1 */
+	float value[4];	/* presumably only meaningful for immediates (index == -1); verify */
+};
+
+struct nvfx_vertex_program {	/* vertex shader CSO handed out by the create_vs_state hook */
+	struct pipe_shader_state pipe;	/* pipe.tokens is a private tgsi_dup_tokens copy */
+
+	struct draw_vertex_shader *draw;	/* draw-module shader built from the same tokens (swtnl path) */
+
+	boolean translated;
+
+	struct pipe_clip_state ucp;
+	/* instruction and constant arrays with their element counts */
+	struct nvfx_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nvfx_vertex_program_data *consts;
+	unsigned nr_consts;
+	/* NOTE(review): exec/data look like allocations for uploaded code and constants - confirm */
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+	uint32_t clip_ctrl;
+};
+
+struct nvfx_fragment_program_data {	/* NOTE(review): presumably ties a constant to a location in the program - confirm */
+	unsigned offset;
+	unsigned index;
+};
+
+struct nvfx_fragment_program_bo {	/* a buffer object plus a 16-byte-aligned trailing byte array */
+	struct nvfx_fragment_program_bo* next;	/* link to another node of the same type */
+	struct nouveau_bo* bo;
+	char insn[] __attribute__((aligned(16)));	/* flexible array member: sized by whoever allocates the node */
+};
+
+struct nvfx_fragment_program {	/* fragment shader CSO handed out by the create_fs_state hook */
+	struct pipe_shader_state pipe;	/* pipe.tokens is a private tgsi_dup_tokens copy */
+	struct tgsi_shader_info info;	/* filled by tgsi_scan_shader when the CSO is created */
+
+	boolean translated;
+	unsigned samplers;
+	/* instruction words and their count */
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nvfx_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	uint32_t fp_control;
+	/* NOTE(review): bookkeeping for program copies kept in the fpbo list - semantics not visible here */
+	unsigned bo_prog_idx;
+	unsigned prog_size;
+	unsigned progs_per_bo;
+	struct nvfx_fragment_program_bo* fpbo;
+};
+
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state_blend.c b/src/gallium/drivers/nvfx/nvfx_state_blend.c
new file mode 100644
index 0000000000..fe34e98364
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_blend.c
@@ -0,0 +1,22 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_blend_validate(struct nvfx_context *nvfx)
+{	/* Emit the bound blend object's pre-encoded words; assumes nvfx->blend is non-NULL */
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len);
+}
+
+void
+nvfx_state_blend_colour_validate(struct nvfx_context *nvfx)
+{	/* Emit the stored blend colour as a single packed word */
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	const float *c = nvfx->blend_colour.color;
+	/* byte layout, high to low: color[3], color[0], color[1], color[2] */
+	unsigned packed = (float_to_ubyte(c[3]) << 24) | (float_to_ubyte(c[0]) << 16) |
+			  (float_to_ubyte(c[1]) <<  8) | (float_to_ubyte(c[2]) <<  0);
+
+	WAIT_RING(chan, 2);
+	OUT_RING(chan, RING_3D(NV34TCL_BLEND_COLOR, 1));
+	OUT_RING(chan, packed);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
new file mode 100644
index 0000000000..f91ae19ecd
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -0,0 +1,180 @@
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "draw/draw_context.h"
+
+static boolean
+nvfx_state_validate_common(struct nvfx_context *nvfx)
+{	/* Emit every state group whose dirty bit is set, then clear nvfx->dirty; FALSE if HW validation fails */
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned dirty = nvfx->dirty;
+	/* a context that is not the screen's current one re-emits everything */
+	if(nvfx != nvfx->screen->cur_ctx)
+		dirty = ~0;
+
+	if(nvfx->render_mode == HW)
+	{
+		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_VERTCONST | NVFX_NEW_UCP))
+		{
+			if(!nvfx_vertprog_validate(nvfx))
+				return FALSE;
+		}
+
+		if(dirty & (NVFX_NEW_ARRAYS))
+		{
+			if(!nvfx_vbo_validate(nvfx))
+				return FALSE;
+		}
+	}
+	else
+	{
+		/* TODO: this looks a bit misdesigned */
+		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+			nvfx_vertprog_validate(nvfx);
+
+		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG))
+			nvfx_vtxfmt_validate(nvfx);
+	}
+	/* from here on the steps are unconditional on render mode and cannot fail */
+	if(dirty & NVFX_NEW_FB)
+		nvfx_state_framebuffer_validate(nvfx);
+
+	if(dirty & NVFX_NEW_RAST)
+		sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len);
+
+	if(dirty & NVFX_NEW_SCISSOR)
+		nvfx_state_scissor_validate(nvfx);
+
+	if(dirty & NVFX_NEW_STIPPLE)
+		nvfx_state_stipple_validate(nvfx);
+
+	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))
+		nvfx_fragprog_validate(nvfx);
+
+	if(dirty & NVFX_NEW_SAMPLER)
+		nvfx_fragtex_validate(nvfx);
+
+	if(dirty & NVFX_NEW_BLEND)
+		sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len);
+
+	if(dirty & NVFX_NEW_BCOL)
+		nvfx_state_blend_colour_validate(nvfx);
+
+	if(dirty & NVFX_NEW_ZSA)
+		sb_emit(chan, nvfx->zsa->sb, nvfx->zsa->sb_len);
+
+	if(dirty & NVFX_NEW_SR)
+		nvfx_state_sr_validate(nvfx);
+
+/* Having this depend on FB looks wrong, but it seems
+   necessary to make this work on nv3x
+   TODO: find the right fix
+*/
+	if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_FB))
+		nvfx_state_viewport_validate(nvfx);
+
+	/* TODO: could nv30 need this or something similar too? */
+	if((dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SAMPLER)) && nvfx->is_nv4x) {
+		WAIT_RING(chan, 4);	/* two method/value pairs: TEX_CACHE_CTL = 2, then TEX_CACHE_CTL = 1 */
+		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+		OUT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+		OUT_RING(chan, 1);
+	}
+	nvfx->dirty = 0;	/* NOTE(review): screen->cur_ctx is not updated in this function */
+	return TRUE;
+}
+
+void
+nvfx_state_emit(struct nvfx_context *nvfx)
+{	/* Reserve ring space for the worst-case relocation count, then re-emit all relocations */
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	/* we need to ensure there is enough space to output relocations in one go */
+	unsigned max_relocs = 0
+	      + 16 /* vertex buffers, incl. dma flag */
+	      + 2 /* index buffer plus format+dma flag */
+	      + 2 * 5 /* 4 cbufs + zsbuf, plus dma objects */
+	      + 2 * 16 /* fragment textures plus format+dma flag */
+	      + 2 * 4 /* vertex textures plus format+dma flag */
+	      + 1 /* fragprog incl dma flag */
+	      ;
+	MARK_RING(chan, max_relocs * 2, max_relocs * 2);	/* NOTE(review): if MARK_RING can fail, that is not handled here */
+	nvfx_state_relocate(nvfx);
+}
+
+void
+nvfx_state_relocate(struct nvfx_context *nvfx)
+{	/* Re-emit relocations for every buffer the current state references */
+	nvfx_framebuffer_relocate(nvfx);
+	nvfx_fragtex_relocate(nvfx);
+	nvfx_fragprog_relocate(nvfx);
+	if (nvfx->render_mode == HW)	/* vertex buffer relocations are emitted only in HW render mode */
+		nvfx_vbo_relocate(nvfx);
+}
+
+boolean
+nvfx_state_validate(struct nvfx_context *nvfx)
+{	/* Validate for hardware TNL; FALSE means the hardware path could not be set up */
+	boolean was_sw = nvfx->fallback_swtnl ? TRUE : FALSE;	/* any fallback reason recorded on entry */
+
+	if (nvfx->render_mode != HW) {
+		/* Don't even bother trying to go back to hw if none
+		 * of the states that caused swtnl previously have changed.
+		 */
+		if ((nvfx->fallback_swtnl & nvfx->dirty)
+				!= nvfx->fallback_swtnl)	/* every fallback reason bit must be dirty to retry */
+			return FALSE;
+
+		/* Attempt to go to hwtnl again */
+		nvfx->dirty |= (NVFX_NEW_VIEWPORT |
+				NVFX_NEW_VERTPROG |
+				NVFX_NEW_ARRAYS);
+		nvfx->render_mode = HW;
+	}
+
+	if(!nvfx_state_validate_common(nvfx))
+		return FALSE;
+	/* logged whenever a fallback reason was set on entry and validation succeeded */
+	if (was_sw)
+		NOUVEAU_ERR("swtnl->hw\n");
+
+	return TRUE;
+}
+
+boolean
+nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
+{	/* Validate for software TNL: feed draw_dirty state to the draw module; always returns TRUE */
+	struct draw_context *draw = nvfx->draw;
+
+	/* Setup for swtnl */
+	if (nvfx->render_mode == HW) {
+		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nvfx->fallback_swtnl);
+		nvfx->pipe.flush(&nvfx->pipe, 0, NULL);	/* flush before switching render mode */
+		nvfx->dirty |= (NVFX_NEW_VIEWPORT |
+				NVFX_NEW_VERTPROG |
+				NVFX_NEW_ARRAYS);
+		nvfx->render_mode = SWTNL;
+	}
+	/* each draw_dirty bit forwards the matching context state to the draw module */
+	if (nvfx->draw_dirty & NVFX_NEW_VERTPROG)
+		draw_bind_vertex_shader(draw, nvfx->vertprog->draw);
+
+	if (nvfx->draw_dirty & NVFX_NEW_RAST)
+           draw_set_rasterizer_state(draw, &nvfx->rasterizer->pipe,
+                                     nvfx->rasterizer);
+
+	if (nvfx->draw_dirty & NVFX_NEW_UCP)
+		draw_set_clip_state(draw, &nvfx->clip);
+
+	if (nvfx->draw_dirty & NVFX_NEW_VIEWPORT)
+		draw_set_viewport_state(draw, &nvfx->viewport);
+
+	if (nvfx->draw_dirty & NVFX_NEW_ARRAYS) {
+		draw_set_vertex_buffers(draw, nvfx->vtxbuf_nr, nvfx->vtxbuf);
+		draw_set_vertex_elements(draw, nvfx->vtxelt->num_elements, nvfx->vtxelt->pipe);
+	}
+	/* result ignored: with render_mode != HW the common path has no FALSE return */
+	nvfx_state_validate_common(nvfx);
+
+	nvfx->draw_dirty = 0;
+	return TRUE;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
new file mode 100644
index 0000000000..360e569f77
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -0,0 +1,250 @@
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+#include "nouveau/nouveau_util.h"
+
+
+
+void
+nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
+{	/* Program render targets, depth buffer, RT format and the framebuffer-sized viewport/clip */
+	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	uint32_t rt_enable = 0, rt_format = 0;
+	int i, colour_format = 0, zeta_format = 0;
+	int depth_only = 0;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+	int colour_bits = 32, zeta_bits = 32;
+	/* asserted limits: at most 2 colour buffers without is_nv4x, at most 4 with it */
+	if(!nvfx->is_nv4x)
+		assert(fb->nr_cbufs <= 2);
+	else
+		assert(fb->nr_cbufs <= 4);
+	/* record bo/offset/pitch per colour buffer; all of them must share one format */
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		if (colour_format)
+			assert(colour_format == fb->cbufs[i]->format);
+		else
+			colour_format = fb->cbufs[i]->format;
+
+		rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
+		nvfx->hw_rt[i].bo = nvfx_surface_buffer(fb->cbufs[i]);
+		nvfx->hw_rt[i].offset = fb->cbufs[i]->offset;
+		nvfx->hw_rt[i].pitch = ((struct nv04_surface *)fb->cbufs[i])->pitch;
+	}
+	for(; i < 4; ++i)	/* unused slots get a null bo so nvfx_framebuffer_relocate skips them */
+		nvfx->hw_rt[i].bo = 0;
+
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 |
+			 NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV34TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		zeta_format = fb->zsbuf->format;
+		nvfx->hw_zeta.bo = nvfx_surface_buffer(fb->zsbuf);
+		nvfx->hw_zeta.offset = fb->zsbuf->offset;
+		nvfx->hw_zeta.pitch = ((struct nv04_surface *)fb->zsbuf)->pitch;
+	}
+	else
+		nvfx->hw_zeta.bo = 0;
+	/* pick the RT layout: swizzled (power-of-two sizes asserted) unless the resource is linear */
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0 | NV34TCL_RT_ENABLE_COLOR1 |
+		NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) {
+		/* Render to at least a colour buffer */
+		if (!(fb->cbufs[0]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+			for (i = 1; i < fb->nr_cbufs; i++)
+				assert(!(fb->cbufs[i]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(fb->cbufs[0]->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(fb->cbufs[0]->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else if (fb->zsbuf) {
+		depth_only = 1;
+		/* the linear bit is tested in ->flags, as in the colour path (this used to test ->usage) */
+		/* Render to depth buffer only */
+		if (!(fb->zsbuf->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(fb->zsbuf->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(fb->zsbuf->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else {
+		return;	/* neither colour nor depth bound: nothing is emitted */
+	}
+	/* format 0 (no buffer of that kind) falls back to A8R8G8B8 / Z24S8 in the switches below */
+	switch (colour_format) {
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+		colour_bits = 16;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+		zeta_bits = 16;
+		break;
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	if ((!nvfx->is_nv4x) && colour_bits > zeta_bits) {
+		/* TODO: does this limitation really exist?
+		   TODO: can it be worked around somehow? */
+		assert(0);
+	}
+	/* COLOR0 slot: without is_nv4x a depth-only setup programs the depth buffer into it */
+	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0)
+		|| ((!nvfx->is_nv4x) && depth_only)) {
+		struct nvfx_render_target *rt0 = (depth_only ? &nvfx->hw_zeta : &nvfx->hw_rt[0]);
+		uint32_t pitch = rt0->pitch;
+		/* without is_nv4x the upper 16 bits carry the depth pitch (or repeat this one) */
+		if(!nvfx->is_nv4x)
+		{
+			if (nvfx->hw_zeta.bo) {
+				pitch |= (nvfx->hw_zeta.pitch << 16);
+			} else {
+				pitch |= (pitch << 16);
+			}
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 1));
+		OUT_RELOC(chan, rt0->bo, 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		OUT_RING(chan, RING_3D(NV34TCL_COLOR0_PITCH, 2));
+		OUT_RING(chan, pitch);
+		OUT_RELOC(chan, rt0->bo,
+			      rt0->offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
+		OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR1, 1));
+		OUT_RELOC(chan, nvfx->hw_rt[1].bo, 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		OUT_RING(chan, RING_3D(NV34TCL_COLOR1_OFFSET, 2));
+		OUT_RELOC(chan, nvfx->hw_rt[1].bo,
+				nvfx->hw_rt[1].offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+		OUT_RING(chan, nvfx->hw_rt[1].pitch);
+	}
+	/* colour buffers 2 and 3 are only programmed when is_nv4x is set */
+	if(nvfx->is_nv4x)
+	{
+		if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+			OUT_RING(chan, RING_3D(NV40TCL_DMA_COLOR2, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[2].bo, 0,
+				      rt_flags | NOUVEAU_BO_OR,
+				      chan->vram->handle, chan->gart->handle);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR2_OFFSET, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[2].bo,
+				      nvfx->hw_rt[2].offset, rt_flags | NOUVEAU_BO_LOW,
+				      0, 0);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR2_PITCH, 1));
+			OUT_RING(chan, nvfx->hw_rt[2].pitch);
+		}
+
+		if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+			OUT_RING(chan, RING_3D(NV40TCL_DMA_COLOR3, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[3].bo, 0,
+				      rt_flags | NOUVEAU_BO_OR,
+				      chan->vram->handle, chan->gart->handle);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR3_OFFSET, 1));
+			OUT_RELOC(chan, nvfx->hw_rt[3].bo,
+					nvfx->hw_rt[3].offset, rt_flags | NOUVEAU_BO_LOW,
+				      0, 0);
+			OUT_RING(chan, RING_3D(NV40TCL_COLOR3_PITCH, 1));
+			OUT_RING(chan, nvfx->hw_rt[3].pitch);
+		}
+	}
+
+	if (zeta_format) {
+		OUT_RING(chan, RING_3D(NV34TCL_DMA_ZETA, 1));
+		OUT_RELOC(chan, nvfx->hw_zeta.bo, 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		OUT_RING(chan, RING_3D(NV34TCL_ZETA_OFFSET, 1));
+		/* TODO: reverse engineer LMA */
+		OUT_RELOC(chan, nvfx->hw_zeta.bo,
+			     nvfx->hw_zeta.offset, rt_flags | NOUVEAU_BO_LOW, 0, 0);
+	        if(nvfx->is_nv4x) {
+			OUT_RING(chan, RING_3D(NV40TCL_ZETA_PITCH, 1));
+			OUT_RING(chan, nvfx->hw_zeta.pitch);
+		}
+	}
+	/* enables, RT size/format, then viewport and clip rectangle sized to the framebuffer */
+	OUT_RING(chan, RING_3D(NV34TCL_RT_ENABLE, 1));
+	OUT_RING(chan, rt_enable);
+	OUT_RING(chan, RING_3D(NV34TCL_RT_HORIZ, 3));
+	OUT_RING(chan, (w << 16) | 0);
+	OUT_RING(chan, (h << 16) | 0);
+	OUT_RING(chan, rt_format);
+	OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_HORIZ, 2));
+	OUT_RING(chan, (w << 16) | 0);
+	OUT_RING(chan, (h << 16) | 0);
+	OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2));
+	OUT_RING(chan, ((w - 1) << 16) | 0);
+	OUT_RING(chan, ((h - 1) << 16) | 0);
+	OUT_RING(chan, RING_3D(0x1d88, 1));	/* unnamed method; receives the height with bit 12 set */
+	OUT_RING(chan, (1 << 12) | h);
+
+	if(!nvfx->is_nv4x) {
+		/* Wonder why this is needed, context should all be set to zero on init */
+		/* TODO: we can most likely remove this, after putting it in context init */
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TX_ORIGIN, 1));
+		OUT_RING(chan, 0);
+	}
+}
+
+void
+nvfx_framebuffer_relocate(struct nvfx_context *nvfx)
+{	/* Re-emit relocations for every bound render target and the depth buffer */
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	rt_flags |= NOUVEAU_BO_DUMMY;
+	MARK_RING(chan, 20, 20);	/* 5 possible buffers x 4 OUT_RELOCs each */
+	/* DO_ emits, for a buffer with a non-null bo: DMA method, DMA handle, OFFSET method, offset */
+#define DO_(var, pfx, name) \
+	if(var.bo) { \
+		OUT_RELOC(chan, var.bo, RING_3D(pfx##TCL_DMA_##name, 1), rt_flags, 0, 0); \
+		OUT_RELOC(chan, var.bo, 0, \
+			rt_flags | NOUVEAU_BO_OR, \
+			chan->vram->handle, chan->gart->handle); \
+		OUT_RELOC(chan, var.bo, RING_3D(pfx##TCL_##name##_OFFSET, 1), rt_flags, 0, 0); \
+		OUT_RELOC(chan, var.bo, \
+			var.offset, rt_flags | NOUVEAU_BO_LOW, \
+			0, 0); \
+	}
+	/* DO selects colour slot num; pfx picks the NV34/NV40 method name family */
+#define DO(pfx, num) DO_(nvfx->hw_rt[num], pfx, COLOR##num)
+	DO(NV34, 0);
+	DO(NV34, 1);
+	DO(NV40, 2);
+	DO(NV40, 3);
+
+	DO_(nvfx->hw_zeta, NV34, ZETA);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c b/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c
new file mode 100644
index 0000000000..7f14ae85d5
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c
@@ -0,0 +1,9 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_rasterizer_validate(struct nvfx_context *nvfx)
+{	/* Emit the bound rasterizer object's pre-encoded words; assumes nvfx->rasterizer is non-NULL */
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len);
+}
+
diff --git a/src/gallium/drivers/nvfx/nvfx_state_scissor.c b/src/gallium/drivers/nvfx/nvfx_state_scissor.c
new file mode 100644
index 0000000000..9077266120
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_scissor.c
@@ -0,0 +1,23 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_scissor_validate(struct nvfx_context *nvfx)
+{	/* Emit the scissor rectangle, or a 4096-wide/high one at the origin when scissoring is off */
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	const struct pipe_scissor_state *rect = &nvfx->scissor;
+	unsigned enable = nvfx->rasterizer->pipe.scissor;
+	unsigned horiz = 4096 << 16, vert = 4096 << 16;
+	/* off both now and at the last emit: nothing to update */
+	if (!enable && nvfx->state.scissor_enabled == 0)
+		return;
+	nvfx->state.scissor_enabled = enable;
+
+	if (enable) {	/* each word: extent in the upper 16 bits, origin in the lower 16 */
+		horiz = ((rect->maxx - rect->minx) << 16) | rect->minx;
+		vert = ((rect->maxy - rect->miny) << 16) | rect->miny;
+	}
+	WAIT_RING(chan, 3);
+	OUT_RING(chan, RING_3D(NV34TCL_SCISSOR_HORIZ, 2));
+	OUT_RING(chan, horiz);
+	OUT_RING(chan, vert);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_stipple.c b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
new file mode 100644
index 0000000000..4da968f093
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
@@ -0,0 +1,26 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_stipple_validate(struct nvfx_context *nvfx)
+{	/* Emit the polygon stipple enable and, when enabled, the 32-word pattern */
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
+	/* off both now and at the last emit: nothing to update */
+	if ((rast->poly_stipple_enable == 0 && nvfx->state.stipple_enabled == 0))
+		return;
+	nvfx->state.stipple_enabled = rast->poly_stipple_enable; /* remember what is emitted, as nvfx_state_scissor_validate does */
+	if (rast->poly_stipple_enable) {
+		unsigned i;
+		/* 35 words: enable method + value, pattern method, 32 pattern words */
+		WAIT_RING(chan, 35);
+		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
+		OUT_RING(chan, 1);
+		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32));
+		for (i = 0; i < 32; i++)
+			OUT_RING(chan, nvfx->stipple[i]);
+	} else {
+		WAIT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
+		OUT_RING(chan, 0);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_viewport.c b/src/gallium/drivers/nvfx/nvfx_state_viewport.c
new file mode 100644
index 0000000000..e983b16f32
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_viewport.c
@@ -0,0 +1,35 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_viewport_validate(struct nvfx_context *nvfx)
+{	/* Emit the viewport transform: the real one in HW mode, an identity one otherwise */
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct pipe_viewport_state *vpt = &nvfx->viewport;
+	/* 11 words either way: method + 8 floats, then method 0x1d78 + value */
+	WAIT_RING(chan, 11);
+	if(nvfx->render_mode == HW) {
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TRANSLATE_X, 8));
+		OUT_RINGf(chan, vpt->translate[0]);
+		OUT_RINGf(chan, vpt->translate[1]);
+		OUT_RINGf(chan, vpt->translate[2]);
+		OUT_RINGf(chan, vpt->translate[3]);
+		OUT_RINGf(chan, vpt->scale[0]);
+		OUT_RINGf(chan, vpt->scale[1]);
+		OUT_RINGf(chan, vpt->scale[2]);
+		OUT_RINGf(chan, vpt->scale[3]);
+		OUT_RING(chan, RING_3D(0x1d78, 1));
+		OUT_RING(chan, 1);
+	} else {	/* presumably because the draw module applies the viewport itself in swtnl - confirm */
+		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TRANSLATE_X, 8));
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 0.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RINGf(chan, 1.0f);
+		OUT_RING(chan, RING_3D(0x1d78, 1));	/* unnamed method; value differs when is_nv4x is set */
+		OUT_RING(chan, nvfx->is_nv4x ? 0x110 : 1);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_zsa.c b/src/gallium/drivers/nvfx/nvfx_state_zsa.c
new file mode 100644
index 0000000000..608605d32b
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_zsa.c
@@ -0,0 +1,21 @@
+#include "nvfx_context.h"
+
+void
+nvfx_state_zsa_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	sb_emit(chan, nvfx->zsa->sb, nvfx->zsa->sb_len);
+}
+
+void
+nvfx_state_sr_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct pipe_stencil_ref *sr = &nvfx->stencil_ref;
+	/* Upload the stencil reference values: ref_value[0] to the front method, ref_value[1] to the back (4 dwords). */
+	WAIT_RING(chan, 4);
+	OUT_RING(chan, RING_3D(NV34TCL_STENCIL_FRONT_FUNC_REF, 1));
+	OUT_RING(chan, sr->ref_value[0]);
+	OUT_RING(chan, RING_3D(NV34TCL_STENCIL_BACK_FUNC_REF, 1));
+	OUT_RING(chan, sr->ref_value[1]);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c
new file mode 100644
index 0000000000..a605d2b754
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_surface.c
@@ -0,0 +1,96 @@
+
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_pack_color.h"
+
+static void
+nvfx_surface_copy(struct pipe_context *pipe,
+		  struct pipe_resource *dest, struct pipe_subresource subdst,
+		  unsigned destx, unsigned desty, unsigned destz,
+		  struct pipe_resource *src, struct pipe_subresource subsrc,
+		  unsigned srcx, unsigned srcy, unsigned srcz,
+		  unsigned width, unsigned height)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
+	struct pipe_surface *ps_dst, *ps_src;
+	/* Wrap the source and destination subresources in temporary surfaces for the 2D engine. */
+	ps_src = nvfx_miptree_surface_new(pipe->screen, src, subsrc.face,
+					  subsrc.level, srcz, 0 /* bind flags */);
+	ps_dst = nvfx_miptree_surface_new(pipe->screen, dest, subdst.face,
+					  subdst.level, destz, 0 /* bindflags */);
+	/* NOTE(review): neither surface is checked for NULL before use; confirm nvfx_miptree_surface_new cannot fail. */
+	eng2d->copy(eng2d, ps_dst, destx, desty, ps_src, srcx, srcy, width, height);
+	/* Release the temporary surfaces created above. */
+	nvfx_miptree_surface_del(ps_src);
+	nvfx_miptree_surface_del(ps_dst);
+}
+
+static void
+nvfx_clear_render_target(struct pipe_context *pipe,
+			 struct pipe_surface *dst,
+			 const float *rgba,
+			 unsigned dstx, unsigned dsty,
+			 unsigned width, unsigned height)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
+	union util_color uc;
+	util_pack_color(rgba, dst->format, &uc); /* pack rgba into the surface's own format */
+	/* Fill the requested rectangle of dst with the packed color using the 2D engine. */
+	eng2d->fill(eng2d, dst, dstx, dsty, width, height, uc.ui);
+}
+
+static void
+nvfx_clear_depth_stencil(struct pipe_context *pipe,
+			 struct pipe_surface *dst,
+			 unsigned clear_flags,
+			 double depth,
+			 unsigned stencil,
+			 unsigned dstx, unsigned dsty,
+			 unsigned width, unsigned height)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
+	/* NOTE(review): clear_flags is never consulted; the packed depth+stencil value is always written to the rect. */
+	eng2d->fill(eng2d, dst, dstx, dsty, width, height,
+		    util_pack_z_stencil(dst->format, depth, stencil));
+}
+
+
+void
+nvfx_init_surface_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.resource_copy_region = nvfx_surface_copy;
+	nvfx->pipe.clear_render_target = nvfx_clear_render_target;
+	nvfx->pipe.clear_depth_stencil = nvfx_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_tex.h b/src/gallium/drivers/nvfx/nvfx_tex.h
new file mode 100644
index 0000000000..69187a79e7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_tex.h
@@ -0,0 +1,133 @@
+#ifndef NVFX_TEX_H_
+#define NVFX_TEX_H_
+
+static inline unsigned
+nvfx_tex_wrap_mode(unsigned wrap) {
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV34TCL_TX_WRAP_S_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
+		break;
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV34TCL_TX_WRAP_S_REPEAT;
+		break;
+	}
+
+	return ret >> NV34TCL_TX_WRAP_S_SHIFT;
+}
+
+static inline unsigned
+nvfx_tex_wrap_compare_mode(const struct pipe_sampler_state* cso)
+{
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			return NV34TCL_TX_WRAP_RCOMP_NEVER;
+		case PIPE_FUNC_GREATER:
+			return NV34TCL_TX_WRAP_RCOMP_GREATER;
+		case PIPE_FUNC_EQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_EQUAL;
+		case PIPE_FUNC_GEQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+		case PIPE_FUNC_LESS:
+			return NV34TCL_TX_WRAP_RCOMP_LESS;
+		case PIPE_FUNC_NOTEQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+		case PIPE_FUNC_LEQUAL:
+			return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+		case PIPE_FUNC_ALWAYS:
+			return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+		default:
+			break;
+		}
+	}
+	return 0;
+}
+
+static inline unsigned nvfx_tex_filter(const struct pipe_sampler_state* cso)
+{
+	unsigned filter = 0;
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+	return filter;
+}
+
+static inline unsigned nvfx_tex_border_color(const float* border_color)
+{ /* Pack four float components into one 32-bit word, 8 bits each, via float_to_ubyte(). */
+	return ((float_to_ubyte(border_color[3]) << 24) | /* component 3 -> bits 31..24 */
+		    (float_to_ubyte(border_color[0]) << 16) | /* component 0 -> bits 23..16 */
+		    (float_to_ubyte(border_color[1]) <<  8) | /* component 1 -> bits 15..8 */
+		    (float_to_ubyte(border_color[2]) <<  0)); /* component 2 -> bits 7..0 */
+}
+
+struct nvfx_sampler_state {
+	uint32_t fmt;
+	uint32_t wrap;
+	uint32_t en;
+	uint32_t filt;
+	uint32_t bcol;
+};
+
+#endif /* NVFX_TEX_H_ */
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c
new file mode 100644
index 0000000000..9ff0a93d30
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.c
@@ -0,0 +1,207 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_state.h"
+#include "nvfx_resource.h"
+#include "nvfx_transfer.h"
+
+struct nvfx_transfer {
+	struct pipe_transfer base;
+	struct pipe_surface *surface;
+	boolean direct;
+};
+
+static void
+nvfx_compatible_transfer_tex(struct pipe_resource *pt, unsigned width, unsigned height,
+			     unsigned bind,
+                             struct pipe_resource *template)
+{
+	memset(template, 0, sizeof(struct pipe_resource));
+	template->target = pt->target;
+	template->format = pt->format;
+	template->width0 = width;
+	template->height0 = height;
+	template->depth0 = 1;
+	template->last_level = 0;
+	template->nr_samples = pt->nr_samples;
+	template->bind = bind;
+	template->usage = PIPE_USAGE_DYNAMIC;
+	template->flags = NVFX_RESOURCE_FLAG_LINEAR;
+}
+
+
+static unsigned nvfx_transfer_bind_flags( unsigned transfer_usage )
+{
+	unsigned bind = 0;
+
+#if 0
+	if (transfer_usage & PIPE_TRANSFER_WRITE)
+		bind |= PIPE_BIND_BLIT_SOURCE;
+
+	if (transfer_usage & PIPE_TRANSFER_READ)
+		bind |= PIPE_BIND_BLIT_DESTINATION;
+#endif
+
+	return bind;
+}
+
+struct pipe_transfer *
+nvfx_miptree_transfer_new(struct pipe_context *pipe,
+			  struct pipe_resource *pt,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box)
+{
+	struct pipe_screen *pscreen = pipe->screen;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	struct nvfx_transfer *tx;
+	struct pipe_resource tx_tex_template, *tx_tex;
+	static int no_transfer = -1;
+	unsigned bind = nvfx_transfer_bind_flags(usage);
+	if(no_transfer < 0)
+		no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", FALSE);
+
+	/* Create a transfer for one 2D box of pt: direct (surface of pt itself) or staged via a linear temporary. */
+	tx = CALLOC_STRUCT(nvfx_transfer);
+	if (!tx)
+		return NULL;
+
+	/* Don't handle 3D transfers yet.
+	 */
+	assert(box->depth == 1);
+	/* The transfer holds a reference on pt; it is dropped at fail: below or in nvfx_miptree_transfer_del(). */
+	pipe_resource_reference(&tx->base.resource, pt);
+	tx->base.sr = sr;
+	tx->base.usage = usage;
+	tx->base.box = *box;
+	tx->base.stride = mt->level[sr.level].pitch;
+
+	/* Direct access to texture */
+	if ((pt->usage == PIPE_USAGE_DYNAMIC ||
+	     no_transfer) &&
+	    pt->flags & NVFX_RESOURCE_FLAG_LINEAR)
+	{
+		tx->direct = true;
+
+		/* XXX: just call the internal nvfx function.
+		 */
+		tx->surface = pscreen->get_tex_surface(pscreen, pt,
+	                                               sr.face, sr.level,
+						       box->z,
+	                                               bind);
+		if (!tx->surface) /* map/unmap dereference tx->surface unconditionally */
+			goto fail;
+		return &tx->base;
+	}
+
+	tx->direct = false;
+	/* Staged path: create a linear temporary texture sized to the box. */
+	nvfx_compatible_transfer_tex(pt, box->width, box->height, bind, &tx_tex_template);
+
+	tx_tex = pscreen->resource_create(pscreen, &tx_tex_template);
+	if (!tx_tex)
+		goto fail;
+
+	tx->base.stride = ((struct nvfx_miptree*)tx_tex)->level[0].pitch;
+
+	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
+	                                       0, 0, 0,
+	                                       bind);
+
+	pipe_resource_reference(&tx_tex, NULL); /* NOTE(review): presumably the surface keeps tx_tex alive; confirm */
+
+	if (!tx->surface)
+		goto fail;
+
+	if (usage & PIPE_TRANSFER_READ) {
+		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
+		struct pipe_surface *src;
+
+		src = pscreen->get_tex_surface(pscreen, pt,
+	                                       sr.face, sr.level, box->z,
+	                                       0 /*PIPE_BIND_BLIT_SOURCE*/);
+
+		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
+		/* TODO: Check if SIFM can un-swizzle */
+		nvscreen->eng2d->copy(nvscreen->eng2d,
+		                      tx->surface, 0, 0,
+		                      src,
+				      box->x, box->y,
+		                      box->width, box->height);
+
+		pipe_surface_reference(&src, NULL);
+	}
+
+	return &tx->base;
+	/* Common error exit: tx->surface is NULL on every path here, so only the pt reference and tx need releasing. */
+fail:
+	pipe_resource_reference(&tx->base.resource, NULL);
+	FREE(tx);
+	return NULL;
+}
+
+void
+nvfx_miptree_transfer_del(struct pipe_context *pipe,
+			  struct pipe_transfer *ptx)
+{
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	/* Staged (non-direct) transfers opened for writing copy the temporary back into the real texture. */
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
+		struct pipe_screen *pscreen = pipe->screen;
+		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
+		struct pipe_surface *dst;
+
+		dst = pscreen->get_tex_surface(pscreen,
+					       ptx->resource,
+	                                       ptx->sr.face,
+					       ptx->sr.level,
+					       ptx->box.z,
+	                                       0 /*PIPE_BIND_BLIT_DESTINATION*/);
+
+		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
+		nvscreen->eng2d->copy(nvscreen->eng2d,
+		                      dst, ptx->box.x, ptx->box.y,
+		                      tx->surface, 0, 0,
+		                      ptx->box.width, ptx->box.height);
+
+		pipe_surface_reference(&dst, NULL);
+	}
+	/* Drop the transfer's surface and its reference on the resource, then free the transfer itself. */
+	pipe_surface_reference(&tx->surface, NULL);
+	pipe_resource_reference(&ptx->resource, NULL);
+	FREE(ptx);
+}
+
+void *
+nvfx_miptree_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+	struct pipe_screen *pscreen = pipe->screen;
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
+	uint8_t *map = nouveau_screen_bo_map(pscreen, mt->base.bo,
+					     nouveau_screen_transfer_flags(ptx->usage));
+	if (!map)
+		return NULL; /* map failed: NULL + offset would look like a valid pointer to the caller */
+	if(!tx->direct)
+		return map + ns->base.offset; /* staging surface was created with just the box's size */
+	/* Direct transfers map the texture's own surface: advance to the box origin. */
+	return map + ns->base.offset + ptx->box.y * ns->pitch +
+	       ptx->box.x * util_format_get_blocksize(ptx->resource->format);
+}
+
+void
+nvfx_miptree_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+	struct pipe_screen *pscreen = pipe->screen;
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
+
+	nouveau_screen_bo_unmap(pscreen, mt->base.bo);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.h b/src/gallium/drivers/nvfx/nvfx_transfer.h
new file mode 100644
index 0000000000..3e3317b2c7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.h
@@ -0,0 +1,26 @@
+
+#ifndef NVFX_TRANSFER_H
+#define NVFX_TRANSFER_H
+
+#include "util/u_transfer.h"
+#include "pipe/p_state.h"
+
+
+struct pipe_transfer *
+nvfx_miptree_transfer_new(struct pipe_context *pcontext,
+			  struct pipe_resource *pt,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box);
+void
+nvfx_miptree_transfer_del(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx);
+void *
+nvfx_miptree_transfer_map(struct pipe_context *pcontext,
+			  struct pipe_transfer *ptx);
+void
+nvfx_miptree_transfer_unmap(struct pipe_context *pcontext,
+			    struct pipe_transfer *ptx);
+
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
new file mode 100644
index 0000000000..520bae5aed
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -0,0 +1,628 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "nvfx_resource.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_class.h"
+#include "nouveau/nouveau_pushbuf.h"
+#include "nouveau/nouveau_util.h"
+
+static INLINE int
+nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+{
+	switch (pipe) {
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT;
+		break;
+	case PIPE_FORMAT_R16_FLOAT:
+	case PIPE_FORMAT_R16G16_FLOAT:
+	case PIPE_FORMAT_R16G16B16_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_FLOAT:
+		*fmt = NV34TCL_VTXFMT_TYPE_HALF;
+		break;
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE;
+		break;
+	case PIPE_FORMAT_R16_SSCALED:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*fmt = NV34TCL_VTXFMT_TYPE_USHORT;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
+		return 1;
+	}
+
+	switch (pipe) {
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R16_FLOAT:
+	case PIPE_FORMAT_R16_SSCALED:
+		*ncomp = 1;
+		break;
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R16G16_FLOAT:
+	case PIPE_FORMAT_R16G16_SSCALED:
+		*ncomp = 2;
+		break;
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R16G16B16_FLOAT:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+		*ncomp = 3;
+		break;
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*ncomp = 4;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
+		return 1;
+	}
+
+	return 0;
+}
+
+static boolean
+nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib,
+		    unsigned ib_size)
+{
+	unsigned type;
+	/* No index buffer: clear the cached binding and report that no hardware index buffer is in use. */
+	if (!ib) {
+		nvfx->idxbuf = NULL;
+		nvfx->idxbuf_format = 0xdeadbeef;
+		return FALSE;
+	}
+	/* Decline when the screen has no index-buffer reloc flags or the indices are 1 byte wide. */
+	if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1)
+		return FALSE;
+
+	switch (ib_size) {
+	case 2:
+		type = NV34TCL_IDXBUF_FORMAT_TYPE_U16;
+		break;
+	case 4:
+		type = NV34TCL_IDXBUF_FORMAT_TYPE_U32;
+		break;
+	default:
+		return FALSE;
+	}
+	/* Only mark the arrays dirty when the buffer or its format actually changed. */
+	if (ib != nvfx->idxbuf ||
+	    type != nvfx->idxbuf_format) {
+		nvfx->dirty |= NVFX_NEW_ARRAYS;
+		nvfx->idxbuf = ib;
+		nvfx->idxbuf_format = type;
+	}
+
+	return TRUE; /* ib is now recorded in nvfx->idxbuf with its hardware format */
+}
+
+/* Emit one vertex attribute as immediate floats read from the buffer; source type must be floating point. */
+static inline void
+nvfx_vbo_static_attrib(struct nvfx_context *nvfx,
+		       int attrib, struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb, unsigned ncomp)
+{
+	struct pipe_transfer *transfer;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	void *map;
+	float *v;
+	map = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer);
+	if (!map) { /* nothing to read: skip the attribute instead of dereferencing NULL + offset */
+		NOUVEAU_ERR("failed mapping vb\n");
+		return;
+	}
+	v = (float *)((uint8_t *) map + vb->buffer_offset + ve->src_offset);
+	switch (ncomp) {
+	case 4:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4));
+		OUT_RING(chan, fui(v[0]));
+		OUT_RING(chan, fui(v[1]));
+		OUT_RING(chan,  fui(v[2]));
+		OUT_RING(chan,  fui(v[3]));
+		break;
+	case 3:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3));
+		OUT_RING(chan,  fui(v[0]));
+		OUT_RING(chan,  fui(v[1]));
+		OUT_RING(chan,  fui(v[2]));
+		break;
+	case 2:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2));
+		OUT_RING(chan,  fui(v[0]));
+		OUT_RING(chan,  fui(v[1]));
+		break;
+	case 1:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1));
+		OUT_RING(chan,  fui(v[0]));
+		break;
+	}
+
+	pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer);
+}
+
+void
+nvfx_draw_arrays(struct pipe_context *pipe,
+		 unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	unsigned restart = 0;
+	/* Non-indexed draw: clear any cached index buffer, then fall back to swtnl if forced or validation fails. */
+	nvfx_vbo_set_idxbuf(nvfx, NULL, 0);
+	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
+		nvfx_draw_elements_swtnl(pipe, NULL, 0, 0,
+                                           mode, start, count);
+                return;
+	}
+	/* Emit the draw in as many pieces as the available ring space requires. */
+	while (count) {
+		unsigned vc, nr, avail;
+
+		nvfx_state_emit(nvfx);
+		/* NOTE(review): if AVAIL_RING() returns less than 16 the unsigned subtraction below wraps; confirm it cannot. */
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(chan); /* split returned no vertices: fire the ring and retry */
+			continue;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+		/* Leading partial batch: the low 8 bits of vc, i.e. the vertices that do not fill a 256-vertex batch. */
+		nr = (vc & 0xff);
+		if (nr) {
+			OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1));
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
+			start += nr;
+		}
+		/* The rest goes out as full 256-vertex batches, at most 2047 batch words per method header. */
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push));
+			while (push--) {
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		count -= vc;
+		start = restart; /* continue from the restart position reported by nouveau_vbuf_split() */
+	}
+
+	pipe->flush(pipe, 0, NULL);
+}
+
+static INLINE void
+nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+
+	while (count) {
+		uint8_t *elts = (uint8_t *)ib + start;
+		unsigned vc, push, restart = 0, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		if (vc & 1) {
+			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+			OUT_RING  (chan, elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+			for (i = 0; i < push; i+=2)
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+
+	while (count) {
+		uint16_t *elts = (uint16_t *)ib + start;
+		unsigned vc, push, restart = 0, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		if (vc & 1) {
+			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+			OUT_RING  (chan, elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+			for (i = 0; i < push; i+=2)
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+
+	while (count) {
+		uint32_t *elts = (uint32_t *)ib + start;
+		unsigned vc, push, restart = 0, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 5, 1,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		while (vc) {
+			push = MIN2(vc, 2047);
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push));
+			OUT_RINGp    (chan, elts, push);
+
+			vc -= push;
+			elts += push;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static void
+nvfx_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_resource *ib,
+			  unsigned ib_size, int ib_bias,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct pipe_transfer *transfer;
+	void *map;
+	/* Map the index buffer for reading; the indices are pushed inline through the ring. */
+	map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer);
+	if (!map) { /* test the mapping result, not ib, which was already used above */
+		NOUVEAU_ERR("failed mapping ib\n");
+		return;
+	}
+	/* Index bias is not handled by the inline emitters below. */
+	assert(ib_bias == 0);
+	/* Dispatch on the index size in bytes. */
+	switch (ib_size) {
+	case 1:
+		nvfx_draw_elements_u08(nvfx, map, mode, start, count);
+		break;
+	case 2:
+		nvfx_draw_elements_u16(nvfx, map, mode, start, count);
+		break;
+	case 4:
+		nvfx_draw_elements_u32(nvfx, map, mode, start, count);
+		break;
+	default:
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	pipe_buffer_unmap(pipe, ib, transfer);
+}
+
+static void
+nvfx_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	unsigned restart = 0;
+
+	while (count) {
+		unsigned nr, vc, avail;
+
+		nvfx_state_emit(nvfx);
+
+		avail = AVAIL_RING(chan);
+		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+
+		vc = nouveau_vbuf_split(avail, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(chan);
+			continue;
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		nr = (vc & 0xff);
+		if (nr) {
+			OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1));
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push));
+			while (push--) {
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+		OUT_RING  (chan, 0);
+
+		count -= vc;
+		start = restart;
+	}
+}
+
+void
+nvfx_draw_elements(struct pipe_context *pipe,
+		   struct pipe_resource *indexBuffer,
+		   unsigned indexSize, int indexBias,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	boolean idxbuf;
+	/* idxbuf is TRUE when nvfx_vbo_set_idxbuf() recorded indexBuffer with a hardware index format. */
+	idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize);
+	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { /* forced swtnl or validation failure */
+		nvfx_draw_elements_swtnl(pipe,
+		                         indexBuffer, indexSize, indexBias,
+		                         mode, start, count);
+		return;
+	}
+	/* Hardware path: draw from the recorded index buffer, or push the indices inline from a CPU mapping. */
+	if (idxbuf) {
+		nvfx_draw_elements_vbo(pipe, mode, start, count);
+	} else {
+		nvfx_draw_elements_inline(pipe,
+		                          indexBuffer, indexSize, indexBias,
+					  mode, start, count);
+	}
+
+	pipe->flush(pipe, 0, NULL);
+}
+
+boolean
+nvfx_vbo_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct pipe_resource *ib = nvfx->idxbuf;
+	unsigned ib_format = nvfx->idxbuf_format;
+	int i;
+	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr);
+	uint32_t vtxfmt[16];
+	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
+
+	if (!elements)
+		return TRUE;
+
+	nvfx->vbo_bo = 0;
+
+	MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2);
+	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+		unsigned type, ncomp;
+
+		ve = &nvfx->vtxelt->pipe[i];
+		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+
+		if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
+			MARK_UNDO(chan);
+			nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS;
+			return FALSE;
+		}
+
+		if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) {
+			nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp);
+			vtxfmt[i] = type;
+		} else {
+			vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) |
+				(ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type);
+			nvfx->vbo_bo |= (1 << i);
+		}
+	}
+
+	for(; i < elements; ++i)
+		vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT;
+
+	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
+	OUT_RINGp(chan, vtxfmt, elements);
+
+	if(nvfx->is_nv4x) {
+		unsigned i;
+		/* seems to be some kind of cache flushing */
+		for(i = 0; i < 3; ++i) {
+			OUT_RING(chan, RING_3D(0x1718, 1));
+			OUT_RING(chan, 0);
+		}
+	}
+
+	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
+	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+
+		ve = &nvfx->vtxelt->pipe[i];
+		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+
+		if (!(nvfx->vbo_bo & (1 << i)))
+			OUT_RING(chan, 0);
+		else
+		{
+			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
+			OUT_RELOC(chan, bo,
+				 vb->buffer_offset + ve->src_offset,
+				 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+				 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		}
+	}
+
+        for (; i < elements; i++)
+		OUT_RING(chan, 0);
+
+	OUT_RING(chan, RING_3D(0x1710, 1));
+	OUT_RING(chan, 0);
+
+	if (ib) {
+		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD;
+		struct nouveau_bo* bo = nvfx_resource(ib)->bo;
+
+		assert(nvfx->screen->index_buffer_reloc_flags);
+
+		OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2));
+		OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0);
+		OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR,
+				  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements;
+	return TRUE;
+}
+
+void
+nvfx_vbo_relocate(struct nvfx_context *nvfx)
+{
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
+	int i;
+
+	MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3);
+	for(i = 0; i < nvfx->vtxelt->num_elements; ++i) {
+		if(nvfx->vbo_bo & (1 << i)) {
+			struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
+			OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1),
+					vb_flags, 0, 0);
+			OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset,
+					vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+					0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		}
+	}
+
+	if(nvfx->idxbuf)
+	{
+		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
+		struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf)->bo;
+
+		assert(nvfx->screen->index_buffer_reloc_flags);
+
+		OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2),
+				ib_flags, 0, 0);
+		OUT_RELOC(chan, bo, 0,
+				ib_flags | NOUVEAU_BO_LOW, 0, 0);
+		OUT_RELOC(chan, bo, nvfx->idxbuf_format,
+				ib_flags | NOUVEAU_BO_OR,
+				0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
new file mode 100644
index 0000000000..80b98b62d3
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -0,0 +1,1066 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#include "nv30_vertprog.h"
+#include "nv40_vertprog.h"
+
+#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n)) /* pseudo output index (-7 + n) for user clip plane n; remapped to a real output in emit_dst() */
+
+struct nvfx_vpc {	/* scratch state used while translating one vertex program */
+	struct nvfx_vertex_program *vp;	/* program being translated */
+
+	struct nvfx_vertex_program_exec *vpi;	/* instruction most recently appended by nvfx_vp_arith() */
+
+	unsigned r_temps;	/* bitmask of hw temps currently allocated */
+	unsigned r_temps_discard;	/* temps that release_temps() will free next */
+	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];	/* TGSI output index -> hw destination */
+	struct nvfx_sreg *r_address;	/* TGSI address registers, each backed by a hw temp */
+	struct nvfx_sreg *r_temp;	/* TGSI temporaries, each backed by a hw temp */
+
+	struct nvfx_sreg *imm;	/* TGSI immediates, stored as program constants */
+	unsigned nr_imm;	/* number of imm[] entries filled so far */
+
+	unsigned hpos_idx;	/* TGSI output index declared with the POSITION semantic */
+};
+
+/* Allocate the lowest-numbered free hw temp and queue it for the next release_temps(). */
+static struct nvfx_sreg
+temp(struct nvfx_vpc *vpc)
+{
+	int idx = ffs(~vpc->r_temps) - 1;	/* ffs() is 1-based; 0 means no free bit */
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_TEMP, 0);	/* builds without assert fall back to temp 0 */
+	}
+	/* unsigned shift: idx can reach 31 and (1 << 31) on a signed int is undefined */
+	vpc->r_temps |= (1u << idx);
+	vpc->r_temps_discard |= (1u << idx);
+	return nvfx_sr(NVFXSR_TEMP, idx);
+}
+
+static INLINE void
+release_temps(struct nvfx_vpc *vpc)
+{
+	vpc->r_temps ^= (vpc->r_temps & vpc->r_temps_discard);	/* clear every bit queued for discard */
+	vpc->r_temps_discard = 0;
+}
+
+/* Return the const-table slot for gallium constant `pipe`, appending a new slot when needed. */
+static struct nvfx_sreg
+constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nvfx_vertex_program *prog = vpc->vp;
+	struct nvfx_vertex_program_data *entry;
+	int slot;
+
+	/* only non-negative indices are looked up; negative ones always get a new slot */
+	for (slot = 0; pipe >= 0 && slot < prog->nr_consts; slot++) {
+		if (prog->consts[slot].index == pipe)
+			return nvfx_sr(NVFXSR_CONST, slot);
+	}
+
+	/* no match: grow the table by one entry and record the initial value */
+	slot = prog->nr_consts++;
+	prog->consts = realloc(prog->consts, sizeof(*entry) * prog->nr_consts);
+	entry = &prog->consts[slot];
+	entry->index = pipe;
+	entry->value[0] = x;
+	entry->value[1] = y;
+	entry->value[2] = z;
+	entry->value[3] = w;
+	return nvfx_sr(NVFXSR_CONST, slot);
+}
+/* Shorthand for nvfx_vp_arith(): needs `nvfx` in the caller's scope; `cc` is forwarded as the nvfx_vpc pointer; s/o are pasted into NVFX_VP_INST_* names. */
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nvfx_vp_arith(nvfx, (cc), NVFX_VP_INST_SLOT_##s, NVFX_VP_INST_##s##_OP_##o, (d), (m), (s0), (s1), (s2))
+
+/* Encode source operand `src` into operand position `pos` (0..2) of the four-dword instruction `hw`. */
+static void
+emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_sreg src)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;	/* operand descriptor; scattered into hw[] by the final switch */
+
+	switch (src.type) {
+	case NVFXSR_TEMP:
+		sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
+		sr |= (src.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
+		break;
+	case NVFXSR_INPUT:
+		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		vp->ir |= (1 << src.index);	/* mark the input as read; vp->ir is emitted in nvfx_vertprog_validate() */
+		hw[1] |= (src.index << NVFX_VP(INST_INPUT_SRC_SHIFT));	/* single input-index field per instruction */
+		break;
+	case NVFXSR_CONST:
+		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);	/* an instruction may reference only one constant slot */
+		vpc->vpi->const_index = src.index;	/* patched into hw[1] later, in nvfx_vertprog_validate() */
+		break;
+	case NVFXSR_NONE:
+		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));	/* unused operand: encoded as INPUT type with no index */
+		break;
+	default:
+		assert(0);
+	}
+	if (src.negate)
+		sr |= NVFX_VP(SRC_NEGATE);
+	/* the abs flag is not part of sr: it is one bit per operand in hw[0], starting at bit 21 */
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+
+	sr |= ((src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT)) |
+	       (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT)) |
+	       (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
+	       (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));
+	/* operands 0 and 2 are split into high/low parts across two dwords; operand 1 goes into hw[2] whole */
+	switch (pos) {
+	case 0:
+		hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
+			  NVFX_VP(SRC0_HIGH_SHIFT)) << NVFX_VP(INST_SRC0H_SHIFT);
+		hw[2] |= (sr & NVFX_VP(SRC0_LOW_MASK)) <<
+			  NVFX_VP(INST_SRC0L_SHIFT);
+		break;
+	case 1:
+		hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
+		break;
+	case 2:
+		hw[2] |= ((sr & NVFX_VP(SRC2_HIGH_MASK)) >>
+			  NVFX_VP(SRC2_HIGH_SHIFT)) << NVFX_VP(INST_SRC2H_SHIFT);
+		hw[3] |= (sr & NVFX_VP(SRC2_LOW_MASK)) <<
+			  NVFX_VP(INST_SRC2L_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/* Encode destination `dst` into instruction `hw`; slot 0 is used as the vector slot, anything else as the scalar slot. */
+static void
+emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_sreg dst)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+	switch (dst.type) {
+	case NVFXSR_TEMP:
+		if(!nvfx->is_nv4x)
+			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+		else {
+			hw[3] |= NV40_VP_INST_DEST_MASK;	/* NOTE(review): presumably "no output register" for a temp write - confirm */
+			if (slot == 0) {
+				hw[0] |= (dst.index <<
+					  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+			} else {
+				hw[3] |= (dst.index <<
+					  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+			}
+		}
+		break;
+	case NVFXSR_OUTPUT:
+		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
+		switch (dst.index) {
+		case NVFX_VP_INST_DEST_CLIP(0):	/* clip planes 0-2 are remapped onto the FOGC output, 3-5 onto PSZ */
+			vp->or |= (1 << 6);	/* vp->or collects one bit per output written; emitted in nvfx_vertprog_validate() */
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(1):
+			vp->or |= (1 << 7);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(2):
+			vp->or |= (1 << 8);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(3):
+			vp->or |= (1 << 9);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(4):
+			vp->or |= (1 << 10);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(5):
+			vp->or |= (1 << 11);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		default:	/* regular outputs: only the vp->or bit is recorded, the index is used as-is */
+			if(!nvfx->is_nv4x) {
+				switch (dst.index) {
+				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+				}
+			} else {
+				switch (dst.index) {
+				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+				case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+				}
+			}
+			break;
+		}
+		/* encode the (possibly remapped) output index; the *_TEMP_MASK / *_RESULT bits presumably select "no temp, write result" - TODO confirm */
+		if(!nvfx->is_nv4x) {
+			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+
+			/*XXX: no way this is entirely correct, someone needs to
+			 *     figure out what exactly it is.
+			 */
+			hw[3] |= 0x800;
+		} else {
+			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+			if (slot == 0) {
+				hw[0] |= NV40_VP_INST_VEC_RESULT;
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+			} else {
+				hw[3] |= NV40_VP_INST_SCA_RESULT;
+				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+			}
+		}
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/* Append one hw instruction (opcode `op` in `slot`, destination, writemask, three sources) to vpc->vp. */
+static void
+nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
+	      struct nvfx_sreg dst, int mask,
+	      struct nvfx_sreg s0, struct nvfx_sreg s1,
+	      struct nvfx_sreg s2)
+{
+	struct nvfx_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+	/* grow the instruction array by one and start from an all-zero encoding */
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));	/* NOTE(review): realloc result is not checked */
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	vpc->vpi->const_index = -1;	/* -1 = no constant referenced yet (see emit_src) */
+	hw = vpc->vpi->data;
+	/* condition field: NVFX_COND_TR (presumably "always true") with an x,y,z,w identity swizzle */
+	hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT));
+	hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+		  (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+		  (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+		  (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	/* opcode and writemask fields are laid out differently on nv30 and nv40 */
+	if(!nvfx->is_nv4x) {
+		hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+//		hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
+//		hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));
+
+		if (dst.type == NVFXSR_OUTPUT) {
+			if (slot)
+				hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+			else
+				hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+		} else {
+			if (slot)
+				hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+			else
+				hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+		}
+	 } else {
+		if (slot == 0) {
+			hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+			hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+	    } else {
+			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+			hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+			hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+		}
+	}
+
+	emit_dst(nvfx, vpc, hw, slot, dst);
+	emit_src(nvfx, vpc, hw, 0, s0);
+	emit_src(nvfx, vpc, hw, 1, s1);
+	emit_src(nvfx, vpc, hw, 2, s2);
+}
+
+/* Map a TGSI source operand to its hw register, then copy over the
+ * abs/negate/swizzle modifiers. */
+static INLINE struct nvfx_sreg
+tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	const int file = fsrc->Register.File;
+	const int index = fsrc->Register.Index;
+	struct nvfx_sreg out = { 0 };
+
+	if (file == TGSI_FILE_INPUT) {
+		out = nvfx_sr(NVFXSR_INPUT, index);
+	} else if (file == TGSI_FILE_CONSTANT) {
+		out = constant(vpc, index, 0, 0, 0, 0);
+	} else if (file == TGSI_FILE_IMMEDIATE) {
+		out = vpc->imm[index];
+	} else if (file == TGSI_FILE_TEMPORARY) {
+		out = vpc->r_temp[index];
+	} else {
+		/* unknown file: report it and leave the register zero-initialised */
+		NOUVEAU_ERR("bad src file\n");
+	}
+
+	/* modifiers come straight from the TGSI operand */
+	out.swz[0] = fsrc->Register.SwizzleX;
+	out.swz[1] = fsrc->Register.SwizzleY;
+	out.swz[2] = fsrc->Register.SwizzleZ;
+	out.swz[3] = fsrc->Register.SwizzleW;
+	out.negate = fsrc->Register.Negate;
+	out.abs = fsrc->Register.Absolute;
+	return out;
+}
+
+/* Look up the hw register that was assigned to a TGSI destination
+ * operand (output, temporary or address register). */
+static INLINE struct nvfx_sreg
+tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	const struct nvfx_sreg unmapped = { 0 };
+	const int file = fdst->Register.File;
+	const int index = fdst->Register.Index;
+
+	if (file == TGSI_FILE_OUTPUT)
+		return vpc->r_result[index];
+
+	if (file == TGSI_FILE_TEMPORARY)
+		return vpc->r_temp[index];
+
+	if (file == TGSI_FILE_ADDRESS)
+		return vpc->r_address[index];
+
+	/* anything else is reported and yields a zeroed register */
+	NOUVEAU_ERR("bad dst file\n");
+	return unmapped;
+}
+
+/* Translate a TGSI_WRITEMASK_* bitmask into the matching NVFX_VP_MASK_* bits. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	const int x = (tgsi & TGSI_WRITEMASK_X) ? NVFX_VP_MASK_X : 0;
+	const int y = (tgsi & TGSI_WRITEMASK_Y) ? NVFX_VP_MASK_Y : 0;
+	const int z = (tgsi & TGSI_WRITEMASK_Z) ? NVFX_VP_MASK_Z : 0;
+	const int w = (tgsi & TGSI_WRITEMASK_W) ? NVFX_VP_MASK_W : 0;
+
+	return x | y | z | w;
+}
+
+/* Translate one TGSI instruction into hw instruction(s); returns FALSE for an unsupported source file or opcode. */
+static boolean
+nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nvfx_sreg src[3], dst, tmp;
+	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	int mask;
+	int ai = -1, ci = -1, ii = -1;	/* input / constant / immediate index already used by this instruction */
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+	/* Pass 1: temporaries can be used as operands directly. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+		if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+	/* Pass 2: one instruction can name only one input and one const/immediate slot; further ones are copied to temps first. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+		switch (fsrc->Register.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->Register.Index) {
+				ai = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->Register.Index) {
+				ci = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->Register.Index) {
+				ii = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->Dst[0]);
+	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, VEC, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, VEC, ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, VEC, ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(vpc, SCA, COS, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, VEC, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, VEC, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, VEC, DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, VEC, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, SCA, EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, SCA, EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, VEC, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, VEC, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, SCA, LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, SCA, LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, SCA, LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LRP:	/* src0*src1 + (src2 - src0*src2) */
+		tmp = temp(vpc);
+		arith(vpc, VEC, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], tmp);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, VEC, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, VEC, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, VEC, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, VEC, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:	/* exp2(log2(src0.x) * src1.x) */
+		tmp = temp(vpc);
+		arith(vpc, SCA, LG2, tmp, NVFX_VP_MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, VEC, MUL, tmp, NVFX_VP_MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, SCA, EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, SCA, RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, SCA, RSQ, dst, mask, none, none, abs(src[0]));
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(vpc, VEC, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(vpc, VEC, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, VEC, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(vpc, VEC, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(vpc, SCA, SIN, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(vpc, VEC, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, VEC, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(vpc, VEC, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SSG:	/* one-source opcode: src[1] is never initialised, so it must not be emitted */
+		arith(vpc, VEC, SSG, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(vpc, VEC, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(vpc, VEC, ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:	/* src0.yzx * src1.zxy - src0.zxy * src1.yzx; w is left unwritten */
+		tmp = temp(vpc);
+		arith(vpc, VEC, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(vpc);
+	return TRUE;
+}
+
+/* Record the hw output register for one TGSI output declaration in
+ * vpc->r_result[]; returns FALSE for semantics the hw path cannot handle. */
+static boolean
+nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	const unsigned out = fdec->Range.First;
+	const unsigned sem = fdec->Semantic.Index;
+	int reg;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		/* remembered so nvfx_vertprog_translate() can find the position output */
+		vpc->hpos_idx = out;
+		reg = NVFX_VP(INST_DEST_POS);
+		break;
+
+	case TGSI_SEMANTIC_COLOR:
+		if (sem != 0 && sem != 1) {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		reg = sem ? NVFX_VP(INST_DEST_COL1) : NVFX_VP(INST_DEST_COL0);
+		break;
+
+	case TGSI_SEMANTIC_BCOLOR:
+		if (sem != 0 && sem != 1) {
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		reg = sem ? NVFX_VP(INST_DEST_BFC1) : NVFX_VP(INST_DEST_BFC0);
+		break;
+
+	case TGSI_SEMANTIC_FOG:
+		reg = NVFX_VP(INST_DEST_FOGC);
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		reg = NVFX_VP(INST_DEST_PSZ);
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* generics go to the TC(0)..TC(7) outputs */
+		if (sem > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		reg = NVFX_VP(INST_DEST_TC(sem));
+		break;
+
+	case TGSI_SEMANTIC_EDGEFLAG:
+		/* not really an error just a fallback */
+		NOUVEAU_ERR("cannot handle edgeflag output\n");
+		return FALSE;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->r_result[out] = nvfx_sr(NVFXSR_OUTPUT, reg);
+	return TRUE;
+}
+
+/* Pre-scan the TGSI tokens: count immediates, find the highest temp/address index, map outputs, then allocate the lookup tables. */
+static boolean
+nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->Range.Last > high_temp) {
+					high_temp =
+						fdec->Range.Last;
+				}
+				break;
+#if 0 /* this would be nice.. except gallium doesn't track it */
+			case TGSI_FILE_ADDRESS:
+				if (fdec->Range.Last > high_addr) {
+					high_addr =
+						fdec->Range.Last;
+				}
+				break;
+#endif
+			case TGSI_FILE_OUTPUT:
+				if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec))
+					return FALSE;	/* NOTE(review): returns without calling tgsi_parse_free(&p) */
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+#if 1 /* yay, parse instructions looking for address regs instead */
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			const struct tgsi_full_dst_register *fdst;
+
+			finst = &p.FullToken.FullInstruction;
+			fdst = &finst->Dst[0];
+
+			if (fdst->Register.File == TGSI_FILE_ADDRESS) {
+				if (fdst->Register.Index > high_addr)
+					high_addr = fdst->Register.Index;
+			}
+		}
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+	/* allocate the tables, sized by what the scan found */
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_sreg));
+		assert(vpc->imm);
+	}
+	/* NOTE(review): unlike imm, the r_temp / r_address allocations below are used without a NULL check */
+	if (++high_temp) {	/* high_temp becomes a count: highest index + 1 */
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		for (i = 0; i < high_temp; i++)
+			vpc->r_temp[i] = temp(vpc);	/* every TGSI temp gets its own hw temp */
+	}
+
+	if (++high_addr) {	/* likewise a count of address registers */
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_sreg));
+		for (i = 0; i < high_addr; i++)
+			vpc->r_address[i] = temp(vpc);	/* address registers are backed by hw temps too */
+	}
+
+	vpc->r_temps_discard = 0;	/* keep the temps allocated above: release_temps() will not free them */
+	return TRUE;
+}
+
+/* Translate vp's TGSI tokens into hw instructions and constants; vp->translated is set to TRUE only on success. */
+static void
+nvfx_vertprog_translate(struct nvfx_context *nvfx,
+			struct nvfx_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nvfx_vpc *vpc = NULL;
+	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	int i;
+
+	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	/* Redirect post-transform vertex position to a temp if user clip
+	 * planes are enabled.  We need to append code to the vtxprog
+	 * to handle clip planes later.
+	 */
+	if (vp->ucp.nr)  {
+		vpc->r_result[vpc->hpos_idx] = temp(vpc);
+		vpc->r_temps_discard = 0;	/* keep this temp alive across release_temps() */
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(imm->Immediate.NrTokens == 4 + 1);
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u[0].Float,
+					 imm->u[1].Float,
+					 imm->u[2].Float,
+					 imm->u[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Write out HPOS if it was redirected to a temp earlier */
+	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
+		struct nvfx_sreg hpos = nvfx_sr(NVFXSR_OUTPUT,
+						NVFX_VP(INST_DEST_POS));
+		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+
+		arith(vpc, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none);
+	}
+
+	/* Insert code to handle user clip planes */
+	for (i = 0; i < vp->ucp.nr; i++) {
+		struct nvfx_sreg cdst = nvfx_sr(NVFXSR_OUTPUT,
+						NVFX_VP_INST_DEST_CLIP(i));
+		struct nvfx_sreg ceqn = constant(vpc, -1,
+						 nvfx->clip.ucp[i][0],
+						 nvfx->clip.ucp[i][1],
+						 nvfx->clip.ucp[i][2],
+						 nvfx->clip.ucp[i][3]);
+		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		unsigned mask;
+		switch (i) {
+		case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+		case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+		case 2: case 5: mask = NVFX_VP_MASK_W; break;
+		default:
+			NOUVEAU_ERR("invalid clip dist #%d\n", i);
+			goto out_err;
+		}
+
+		arith(vpc, VEC, DP4, cdst, mask, htmp, ceqn, none);
+	}
+	/* With no instructions emitted, insns[nr_insns - 1] would index before the array; treat that as a failed translation. */
+	if (!vp->nr_insns)
+		goto out_err;
+	vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	if (vpc->r_temp)
+		FREE(vpc->r_temp);
+	if (vpc->r_address)
+		FREE(vpc->r_address);
+	if (vpc->imm)
+		FREE(vpc->imm);
+	FREE(vpc);
+}
+
+/* Make the active vertex program usable by the hw: translate it if needed, allocate exec/const slots, patch const offsets, upload constants and code, emit start state. Returns FALSE if translation fails. */
+boolean
+nvfx_vertprog_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_context *pipe = &nvfx->pipe;
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nvfx_vertex_program *vp;
+	struct pipe_resource *constbuf;
+	struct pipe_transfer *transfer = NULL;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	if (nvfx->render_mode == HW) {
+		vp = nvfx->vertprog;
+		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
+
+		// TODO: ouch! can't we just use constant slots for these?!
+		if ((nvfx->dirty & NVFX_NEW_UCP) ||
+		    memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) {
+			nvfx_vertprog_destroy(nvfx, vp);	/* clip planes are baked into the program, so force a retranslation */
+			memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp));
+		}
+	} else {
+		vp = nvfx->swtnl.vertprog;
+		constbuf = NULL;
+	}
+
+	/* Translate TGSI shader into hw bytecode */
+	if (!vp->translated)
+	{
+		nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
+		nvfx_vertprog_translate(nvfx, vp);
+		if (!vp->translated) {
+			nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+			return FALSE;
+		}
+	}
+
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
+		uint vplen = vp->nr_insns;
+
+		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
+			while (heap->next && heap->size < vplen) {	/* evict other programs until ours fits */
+				struct nvfx_vertex_program *evict;
+
+				evict = heap->next->priv;
+				nouveau_resource_free(&evict->exec);
+			}
+
+			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nvfx->screen->vp_data_heap;
+
+		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			while (heap->next && heap->size < vp->nr_consts) {	/* same eviction scheme as for exec slots */
+				struct nvfx_vertex_program *evict;
+
+				evict = heap->next->priv;
+				nouveau_resource_free(&evict->data);
+			}
+
+			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;	/* const indices inside the code change with the data offset */
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NVFX_VP(INST_CONST_SRC_SHIFT);
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = pipe_buffer_map(pipe, constbuf,
+					      PIPE_TRANSFER_READ,
+					      &transfer);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
+			/* user constants (index >= 0) are refreshed from the mapped constbuf; with no mapping the stored value is uploaded instead of reading through NULL */
+			if (vpd->index >= 0 && map) {
+				if (!upload_data &&
+				    !memcmp(vpd->value, &map[vpd->index * 4],
+					    4 * sizeof(float)))
+					continue;
+				memcpy(vpd->value, &map[vpd->index * 4],
+				       4 * sizeof(float));
+			}
+
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
+		}
+
+		if (constbuf)
+			pipe_buffer_unmap(pipe, constbuf, transfer);
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
+		}
+	}
+
+	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+	{
+		WAIT_RING(chan, 7);	/* at most 7 dwords are written below */
+		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
+		OUT_RING(chan, vp->exec->start);
+		if(nvfx->is_nv4x) {
+			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+			OUT_RING(chan, vp->ir);
+			OUT_RING(chan, vp->or);
+		}
+		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+		OUT_RING(chan, vp->clip_ctrl);
+	}
+
+	return TRUE;
+}
+
+/* Drop the translated program: free its instruction/constant arrays,
+ * release its hw heap slots and reset the derived state. */
+void
+nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
+{
+	/* an array is only freed when its element count is non-zero */
+	if (vp->nr_consts != 0) {
+		FREE(vp->consts);
+		vp->nr_consts = 0;
+		vp->consts = NULL;
+	}
+	if (vp->nr_insns != 0) {
+		FREE(vp->insns);
+		vp->nr_insns = 0;
+		vp->insns = NULL;
+	}
+	vp->translated = FALSE;
+
+	nouveau_resource_free(&vp->exec);
+	nouveau_resource_free(&vp->data);
+	vp->data_start_min = vp->data_start = vp->exec_start = 0;
+	vp->clip_ctrl = 0;
+	vp->or = 0;
+	vp->ir = 0;
+}
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
new file mode 100644
index 0000000000..dd897f6072
--- /dev/null
+++ b/src/gallium/drivers/r300/Makefile
@@ -0,0 +1,43 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = r300
+# C sources of the r300 gallium driver
+C_SOURCES = \
+	r300_blit.c \
+	r300_chipset.c \
+	r300_context.c \
+	r300_debug.c \
+	r300_emit.c \
+	r300_flush.c \
+	r300_fs.c \
+	r300_hyperz.c \
+	r300_query.c \
+	r300_render.c \
+	r300_render_stencilref.c \
+	r300_render_translate.c \
+	r300_resource.c \
+	r300_screen.c \
+	r300_screen_buffer.c \
+	r300_state.c \
+	r300_state_derived.c \
+	r300_state_invariant.c \
+	r300_vs.c \
+	r300_vs_draw.c \
+	r300_texture.c \
+	r300_tgsi_to_rc.c \
+	r300_transfer.c
+# Header search paths: the r300 shader compiler (under src/mesa) and the radeon winsys core
+LIBRARY_INCLUDES = \
+	-I$(TOP)/src/mesa/drivers/dri/r300/compiler \
+	-I$(TOP)/src/gallium/winsys/drm/radeon/core
+# Shader compiler archive; handed to Makefile.template through EXTRA_OBJECTS (presumably linked into the library - TODO confirm)
+COMPILER_ARCHIVE = $(TOP)/src/mesa/drivers/dri/r300/compiler/libr300compiler.a
+
+EXTRA_OBJECTS = \
+	$(COMPILER_ARCHIVE)
+
+include ../../Makefile.template
+# Build the archive by recursing into the compiler directory. NOTE(review): no prerequisites, so an existing archive is never rebuilt from here.
+$(COMPILER_ARCHIVE):
+	$(MAKE) -C $(TOP)/src/mesa/drivers/dri/r300/compiler
diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript
new file mode 100644
index 0000000000..ee19e9d278
--- /dev/null
+++ b/src/gallium/drivers/r300/SConscript
@@ -0,0 +1,43 @@
+Import('*')
+# r300compiler holds whatever the shader compiler's SConscript returns
+r300compiler = SConscript('#/src/mesa/drivers/dri/r300/compiler/SConscript')
+
+env = env.Clone()
+# add the paths for r300compiler
+env.Append(CPPPATH = [
+    '#/src/mesa/drivers/dri/r300/compiler', 
+    '#/src/gallium/winsys/drm/radeon/core',
+    '#/include', 
+    '#/src/mesa',
+])
+# NOTE(review): r300compiler is added to the source list and also appended to the returned library list - confirm both are intended
+r300 = env.ConvenienceLibrary(
+    target = 'r300',
+    source = [
+        'r300_blit.c',
+        'r300_chipset.c',
+        'r300_context.c',
+        'r300_debug.c',
+        'r300_emit.c',
+        'r300_flush.c',
+        'r300_fs.c',
+        'r300_hyperz.c',
+        'r300_query.c',
+        'r300_render.c',
+        'r300_render_stencilref.c',
+        'r300_render_translate.c',
+        'r300_resource.c',
+        'r300_screen.c',
+        'r300_screen_buffer.c',
+        'r300_state.c',
+        'r300_state_derived.c',
+        'r300_state_invariant.c',
+        'r300_vs.c',
+        'r300_vs_draw.c',
+        'r300_texture.c',
+        'r300_tgsi_to_rc.c',
+        'r300_transfer.c',
+    ] + r300compiler) + r300compiler
+
+Export('r300')
+
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
new file mode 100644
index 0000000000..2a47701291
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_context.h"
+#include "r300_texture.h"
+
+#include "util/u_format.h"
+
+enum r300_blitter_op /* r300_blitter_begin tests these with '&': keep values */
+{
+    R300_CLEAR         = 0,
+    R300_CLEAR_SURFACE = 1,
+    R300_COPY          = 2
+};
+
+static void r300_blitter_begin(struct r300_context* r300, enum r300_blitter_op op)
+{
+    struct r300_textures_state* tex;
+
+    /* Pause the active query, if any; r300_blitter_end resumes it. */
+    if (r300->query_current) {
+        r300->blitter_saved_query = r300->query_current;
+        r300_stop_query(r300);
+    }
+
+    /* Hand every piece of bound state to the blitter so that it can put it
+     * back once the operation is done, keeping the blit transparent. */
+    util_blitter_save_blend(r300->blitter, r300->blend_state.state);
+    util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state.state);
+    util_blitter_save_stencil_ref(r300->blitter, &r300->stencil_ref);
+    util_blitter_save_rasterizer(r300->blitter, r300->rs_state.state);
+    util_blitter_save_fragment_shader(r300->blitter, r300->fs.state);
+    util_blitter_save_vertex_shader(r300->blitter, r300->vs_state.state);
+    util_blitter_save_viewport(r300->blitter, &r300->viewport);
+    util_blitter_save_clip(r300->blitter, (struct pipe_clip_state*)r300->clip_state.state);
+    util_blitter_save_vertex_elements(r300->blitter, r300->velems);
+    util_blitter_save_vertex_buffers(r300->blitter, r300->vertex_buffer_count,
+                                     r300->vertex_buffer);
+
+    /* The framebuffer is saved for surface clears and copies only. */
+    if (op & (R300_CLEAR_SURFACE | R300_COPY))
+        util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
+
+    if (!(op & R300_COPY))
+        return;
+
+    /* Sampler states and sampler views are saved for copies only. */
+    tex = (struct r300_textures_state*)r300->textures_state.state;
+    util_blitter_save_fragment_sampler_states(r300->blitter,
+        tex->sampler_state_count, (void**)tex->sampler_states);
+    util_blitter_save_fragment_sampler_views(r300->blitter,
+        tex->sampler_view_count, (struct pipe_sampler_view**)tex->sampler_views);
+}
+
+static void r300_blitter_end(struct r300_context *r300)
+{
+    if (!r300->blitter_saved_query)
+        return; /* no paused query to resume */
+    r300_resume_query(r300, r300->blitter_saved_query);
+    r300->blitter_saved_query = NULL;
+}
+
+/* Clear the currently bound framebuffer using the blitter. */
+static void r300_clear(struct pipe_context* pipe,
+                       unsigned buffers,
+                       const float* rgba,
+                       double depth,
+                       unsigned stencil)
+{
+    /* XXX Implement fastfill.
+     *
+     * If fastfill is enabled, a few facts should be considered:
+     *
+     * 1) Zbuffer must be micro-tiled and whole microtiles must be
+     *    written.
+     *
+     * 2) ZB_DEPTHCLEARVALUE is used to clear a zbuffer and Z Mask must be
+     *    equal to 0.
+     *
+     * 3) For 16-bit integer buffering, compression causes a hang with one or
+     *    two samples and should not be used.
+     *
+     * 4) Fastfill must not be used if reading of compressed Z data is disabled
+     *    and writing of compressed Z data is enabled (RD/WR_COMP_ENABLE),
+     *    i.e. it cannot be used to compress the zbuffer.
+     *    (what the hell does that mean and how does it fit in clearing
+     *    the buffers?)
+     *
+     * - Marek
+     */
+
+    struct r300_context* r300 = r300_context(pipe);
+    struct pipe_framebuffer_state* bound_fb;
+
+    bound_fb = (struct pipe_framebuffer_state*)r300->fb_state.state;
+
+    /* r300_blitter_begin/end bracket the clear: they save the bound state
+     * and pause/resume the active query. */
+    r300_blitter_begin(r300, R300_CLEAR);
+    util_blitter_clear(r300->blitter, bound_fb->width, bound_fb->height,
+                       bound_fb->nr_cbufs, buffers, rgba, depth, stencil);
+    r300_blitter_end(r300);
+}
+
+/* Clear a rectangle of a color surface to the constant color 'rgba'. */
+static void r300_clear_render_target(struct pipe_context *pipe,
+                                     struct pipe_surface *dst,
+                                     const float *rgba,
+                                     unsigned dstx, unsigned dsty,
+                                     unsigned width, unsigned height)
+{
+    struct r300_context *ctx = r300_context(pipe);
+
+    r300_blitter_begin(ctx, R300_CLEAR_SURFACE);
+    util_blitter_clear_render_target(ctx->blitter, dst, rgba, dstx, dsty,
+                                     width, height);
+    r300_blitter_end(ctx);
+}
+
+/* Clear a rectangle of a depth/stencil surface through the blitter. */
+static void r300_clear_depth_stencil(struct pipe_context *pipe,
+                                     struct pipe_surface *dst,
+                                     unsigned clear_flags,
+                                     double depth,
+                                     unsigned stencil,
+                                     unsigned dstx, unsigned dsty,
+                                     unsigned width, unsigned height)
+{
+    struct r300_context *ctx = r300_context(pipe);
+
+    r300_blitter_begin(ctx, R300_CLEAR_SURFACE);
+    util_blitter_clear_depth_stencil(ctx->blitter, dst, clear_flags,
+                                     depth, stencil, dstx, dsty, width, height);
+    r300_blitter_end(ctx);
+}
+
+/* Copy a block of pixels between two resources with the blitter (HW path). */
+static void r300_hw_copy_region(struct pipe_context* pipe,
+                                struct pipe_resource *dst,
+                                struct pipe_subresource subdst,
+                                unsigned dstx, unsigned dsty, unsigned dstz,
+                                struct pipe_resource *src,
+                                struct pipe_subresource subsrc,
+                                unsigned srcx, unsigned srcy, unsigned srcz,
+                                unsigned width, unsigned height)
+{
+    struct r300_context* ctx = r300_context(pipe);
+
+    r300_blitter_begin(ctx, R300_COPY);
+    util_blitter_copy_region(ctx->blitter,
+                             dst, subdst, dstx, dsty, dstz,
+                             src, subsrc, srcx, srcy, srcz, width, height, TRUE);
+    r300_blitter_end(ctx);
+}
+
+/* Copy a block of pixels from one surface to another.
+ * A plain format that cannot be both rendered to and sampled is replaced,
+ * for the duration of the copy, by a format picked from its block size. */
+static void r300_resource_copy_region(struct pipe_context *pipe,
+                                      struct pipe_resource *dst,
+                                      struct pipe_subresource subdst,
+                                      unsigned dstx, unsigned dsty, unsigned dstz,
+                                      struct pipe_resource *src,
+                                      struct pipe_subresource subsrc,
+                                      unsigned srcx, unsigned srcy, unsigned srcz,
+                                      unsigned width, unsigned height)
+{
+    enum pipe_format old_format = dst->format;
+    enum pipe_format old_src_format = src->format;
+    enum pipe_format new_format = old_format;
+
+    if (dst->format != src->format) {
+        debug_printf("r300: Implementation error: Format mismatch in %s\n"
+            "    : src: %s dst: %s\n", __FUNCTION__,
+            util_format_short_name(src->format),
+            util_format_short_name(dst->format));
+        debug_assert(0);
+    }
+
+    if (!pipe->screen->is_format_supported(pipe->screen,
+                                           old_format, src->target,
+                                           src->nr_samples,
+                                           PIPE_BIND_RENDER_TARGET |
+                                           PIPE_BIND_SAMPLER_VIEW, 0) &&
+        util_format_is_plain(old_format)) {
+        switch (util_format_get_blocksize(old_format)) {
+            case 1:
+                new_format = PIPE_FORMAT_I8_UNORM;
+                break;
+            case 2:
+                new_format = PIPE_FORMAT_B4G4R4A4_UNORM;
+                break;
+            case 4:
+                new_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+                break;
+            case 8:
+                new_format = PIPE_FORMAT_R16G16B16A16_UNORM;
+                break;
+            default:
+                /* NOTE(review): no fallback follows; the HW copy still runs. */
+                debug_printf("r300: surface_copy: Unhandled format: %s. Falling back to software.\n"
+                             "r300: surface_copy: Software fallback doesn't work for tiled textures.\n",
+                             util_format_short_name(old_format));
+        }
+    }
+
+    if (old_format != new_format) {
+        dst->format = new_format;
+        src->format = new_format;
+        r300_texture_reinterpret_format(pipe->screen, dst, new_format);
+        r300_texture_reinterpret_format(pipe->screen, src, new_format);
+    }
+
+    r300_hw_copy_region(pipe, dst, subdst, dstx, dsty, dstz,
+                        src, subsrc, srcx, srcy, srcz, width, height);
+
+    if (old_format != new_format) {
+        /* Each resource gets its own format back: they may differ if the
+         * mismatch check above did not abort. */
+        dst->format = old_format;
+        src->format = old_src_format;
+        r300_texture_reinterpret_format(pipe->screen, dst, old_format);
+        r300_texture_reinterpret_format(pipe->screen, src, old_src_format);
+    }
+}
+
+void r300_init_blit_functions(struct r300_context *r300)
+{   /* Install the clear and copy entry points defined in this file. */
+    r300->context.resource_copy_region = r300_resource_copy_region;
+    r300->context.clear_depth_stencil = r300_clear_depth_stencil;
+    r300->context.clear_render_target = r300_clear_render_target;
+    r300->context.clear = r300_clear;
+}
diff --git a/src/gallium/drivers/r300/r300_cb.h b/src/gallium/drivers/r300/r300_cb.h
new file mode 100644
index 0000000000..6987471244
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_cb.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * This file contains macros for building command buffers in memory.
+ *
+ * Use NEW_CB for buffers with a varying size and it will also allocate
+ * the buffer.
+ * Use BEGIN_CB for arrays with a static size.
+ *
+ * Example:
+ *
+ *     uint32_t cb[3];
+ *     CB_LOCALS;
+ *
+ *     BEGIN_CB(cb, 3);
+ *     OUT_CB_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2);
+ *     OUT_CB(blend_color_red_alpha);
+ *     OUT_CB(blend_color_green_blue);
+ *     END_CB;
+ *
+ * And later:
+ *
+ *     CS_LOCALS;
+ *     WRITE_CS_TABLE(cb, 3);
+ *
+ * Or using a little slower variant:
+ *
+ *     CS_LOCALS;
+ *     BEGIN_CS(cb, 3);
+ *     OUT_CS_TABLE(cb, 3);
+ *     END_CS;
+ */
+
+#ifndef R300_CB_H
+#define R300_CB_H
+
+#include "r300_reg.h"
+
+/* Yes, I know macros are ugly. However, they are much prettier than the code
+ * that they neatly hide away, and don't have the cost of function setup, so
+ * we're going to use them. */
+
+/* CB_DEBUG(x) keeps x in DEBUG builds only; it guards the dword counter. */
+#ifdef DEBUG
+#define CB_DEBUG(x) x
+#else
+#define CB_DEBUG(x)
+#endif
+
+/**
+ * Command buffer setup: CB_LOCALS declares cs_ptr (and cs_count if DEBUG).
+ */
+
+#define CB_LOCALS \
+    CB_DEBUG(int cs_count = 0;) \
+    uint32_t *cs_ptr = NULL; \
+    CB_DEBUG((void) cs_count;) (void) cs_ptr;
+
+#define NEW_CB(ptr, size) do { \
+    assert(sizeof(*(ptr)) == sizeof(uint32_t)); \
+    cs_ptr = (ptr) = (uint32_t*)malloc((size) * sizeof(uint32_t)); \
+    CB_DEBUG(cs_count = (size);) \
+} while (0)
+
+#define BEGIN_CB(ptr, size) do { \
+    assert(sizeof(*(ptr)) == sizeof(uint32_t)); \
+    cs_ptr = (ptr); \
+    CB_DEBUG(cs_count = (size);) \
+} while (0)
+
+/* Uses its own 'size' argument; no local named 'dwords' is required. */
+#define BEGIN_CS_AS_CB(r300, size) \
+    BEGIN_CB((r300)->rws->get_cs_pointer((r300)->rws, (size)), (size))
+
+#define END_CB do { \
+    CB_DEBUG(if (cs_count != 0) \
+        debug_printf("r300: Warning: cs_count off by %d at (%s, %s:%i)\n", \
+                     cs_count, __FUNCTION__, __FILE__, __LINE__);) \
+} while (0)
+
+/**
+ * Storing pure DWORDs. Every macro advances cs_ptr past what it wrote.
+ */
+
+#define OUT_CB(value) do { \
+    *cs_ptr = (value); \
+    cs_ptr++; \
+    CB_DEBUG(cs_count--;) \
+} while (0)
+
+#define OUT_CB_TABLE(values, count) do { \
+    memcpy(cs_ptr, (values), (count) * sizeof(uint32_t)); \
+    cs_ptr += (count); \
+    CB_DEBUG(cs_count -= (count);) \
+} while (0)
+
+#define OUT_CB_32F(value) \
+    OUT_CB(fui(value))
+
+#define OUT_CB_REG(register, value) do { \
+    assert(register); \
+    OUT_CB(CP_PACKET0(register, 0)); \
+    OUT_CB(value); \
+} while (0)
+
+/* Note: This expects count to be the number of registers,
+ * not the actual packet0 count! */
+#define OUT_CB_REG_SEQ(register, count) do { \
+    assert(register); \
+    OUT_CB(CP_PACKET0(register, (count) - 1)); \
+} while (0)
+
+#define OUT_CB_ONE_REG(register, count) do { \
+    assert(register); \
+    OUT_CB(CP_PACKET0(register, (count) - 1) | RADEON_ONE_REG_WR); \
+} while (0)
+
+#define OUT_CB_PKT3(op, count) \
+    OUT_CB(CP_PACKET3(op, count))
+
+#endif /* R300_CB_H */
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
new file mode 100644
index 0000000000..e6dca66d4a
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_chipset.h"
+
+#include "util/u_debug.h"
+
+#include <stdio.h>
+
+/* r300_chipset: A file all to itself for deducing the various properties of
+ * Radeons. */
+
+/* Parse a PCI ID and fill an r300_capabilities struct with information.
+ * An unknown ID only produces a warning and leaves the defaults in place. */
+void r300_parse_chipset(struct r300_capabilities* caps)
+{
+    /* Reasonable defaults. The family starts as the first enumerator so that
+     * it is always defined, even for an unrecognized PCI ID. */
+    caps->family = CHIP_FAMILY_R300;
+    caps->num_vert_fpus = 2;
+    caps->num_tex_units = 16;
+    caps->has_tcl = debug_get_bool_option("RADEON_NO_TCL", FALSE) ? FALSE : TRUE;
+    caps->is_r400 = FALSE;
+    caps->is_r500 = FALSE;
+    caps->high_second_pipe = FALSE;
+
+    /* Note: These are not ordered by PCI ID. I leave that task to GCC,
+     * which will perform the ordering while collating jump tables. Instead,
+     * I've tried to group them according to capabilities and age. */
+    switch (caps->pci_id) {
+        case 0x4144:
+        case 0x4145:
+        case 0x4146:
+        case 0x4147:
+        case 0x4E44:
+        case 0x4E45:
+        case 0x4E46:
+        case 0x4E47:
+            caps->family = CHIP_FAMILY_R300;
+            caps->high_second_pipe = TRUE;
+            caps->num_vert_fpus = 4;
+            break;
+
+        case 0x4150:
+        case 0x4151:
+        case 0x4152:
+        case 0x4153:
+        case 0x4154:
+        case 0x4155:
+        case 0x4156:
+        case 0x4E50:
+        case 0x4E51:
+        case 0x4E52:
+        case 0x4E53:
+        case 0x4E54:
+        case 0x4E56:
+            caps->family = CHIP_FAMILY_RV350;
+            caps->high_second_pipe = TRUE;
+            break;
+
+        case 0x4148:
+        case 0x4149:
+        case 0x414A:
+        case 0x414B:
+        case 0x4E48:
+        case 0x4E49:
+        case 0x4E4B:
+            caps->family = CHIP_FAMILY_R350;
+            caps->high_second_pipe = TRUE;
+            caps->num_vert_fpus = 4;
+            break;
+
+        case 0x4E4A:
+            caps->family = CHIP_FAMILY_R360;
+            caps->high_second_pipe = TRUE;
+            caps->num_vert_fpus = 4;
+            break;
+
+        case 0x5460:
+        case 0x5462:
+        case 0x5464:
+        case 0x5B60:
+        case 0x5B62:
+        case 0x5B63:
+        case 0x5B64:
+        case 0x5B65:
+            caps->family = CHIP_FAMILY_RV370;
+            caps->high_second_pipe = TRUE;
+            break;
+
+        case 0x3150:
+        case 0x3152:
+        case 0x3154:
+        case 0x3155:
+        case 0x3E50:
+        case 0x3E54:
+            caps->family = CHIP_FAMILY_RV380;
+            caps->high_second_pipe = TRUE;
+            break;
+
+        case 0x4A48:
+        case 0x4A49:
+        case 0x4A4A:
+        case 0x4A4B:
+        case 0x4A4C:
+        case 0x4A4D:
+        case 0x4A4E:
+        case 0x4A4F:
+        case 0x4A50:
+        case 0x4A54:
+            caps->family = CHIP_FAMILY_R420;
+            caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x5548:
+        case 0x5549:
+        case 0x554A:
+        case 0x554B:
+        case 0x5550:
+        case 0x5551:
+        case 0x5552:
+        case 0x5554:
+        case 0x5D57:
+            caps->family = CHIP_FAMILY_R423;
+            caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x554C:
+        case 0x554D:
+        case 0x554E:
+        case 0x554F:
+        case 0x5D48:
+        case 0x5D49:
+        case 0x5D4A:
+            caps->family = CHIP_FAMILY_R430;
+            caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x5D4C:
+        case 0x5D4D:
+        case 0x5D4E:
+        case 0x5D4F:
+        case 0x5D50:
+        case 0x5D52:
+            caps->family = CHIP_FAMILY_R480;
+            caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x4B48:
+        case 0x4B49:
+        case 0x4B4A:
+        case 0x4B4B:
+        case 0x4B4C:
+            caps->family = CHIP_FAMILY_R481;
+            caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x5E4C:
+        case 0x5E4F:
+        case 0x564A:
+        case 0x564B:
+        case 0x564F:
+        case 0x5652:
+        case 0x5653:
+        case 0x5657:
+        case 0x5E48:
+        case 0x5E4A:
+        case 0x5E4B:
+        case 0x5E4D:
+            caps->family = CHIP_FAMILY_RV410;
+            caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x5954:
+        case 0x5955:
+            caps->family = CHIP_FAMILY_RS480;
+            caps->has_tcl = FALSE;
+            break;
+
+        case 0x5974:
+        case 0x5975:
+            caps->family = CHIP_FAMILY_RS482;
+            caps->has_tcl = FALSE;
+            break;
+
+        case 0x5A41:
+        case 0x5A42:
+            caps->family = CHIP_FAMILY_RS400;
+            caps->has_tcl = FALSE;
+            break;
+
+        case 0x5A61:
+        case 0x5A62:
+            caps->family = CHIP_FAMILY_RC410;
+            caps->has_tcl = FALSE;
+            break;
+
+        case 0x791E:
+        case 0x791F:
+            caps->family = CHIP_FAMILY_RS690;
+            caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x793F:
+        case 0x7941:
+        case 0x7942:
+            caps->family = CHIP_FAMILY_RS600;
+            caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x796C:
+        case 0x796D:
+        case 0x796E:
+        case 0x796F:
+            caps->family = CHIP_FAMILY_RS740;
+            caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
+            break;
+
+        case 0x7100:
+        case 0x7101:
+        case 0x7102:
+        case 0x7103:
+        case 0x7104:
+        case 0x7105:
+        case 0x7106:
+        case 0x7108:
+        case 0x7109:
+        case 0x710A:
+        case 0x710B:
+        case 0x710C:
+        case 0x710E:
+        case 0x710F:
+            caps->family = CHIP_FAMILY_R520;
+            caps->num_vert_fpus = 8;
+            caps->is_r500 = TRUE;
+            break;
+
+        case 0x7140:
+        case 0x7141:
+        case 0x7142:
+        case 0x7143:
+        case 0x7144:
+        case 0x7145:
+        case 0x7146:
+        case 0x7147:
+        case 0x7149:
+        case 0x714A:
+        case 0x714B:
+        case 0x714C:
+        case 0x714D:
+        case 0x714E:
+        case 0x714F:
+        case 0x7151:
+        case 0x7152:
+        case 0x7153:
+        case 0x715E:
+        case 0x715F:
+        case 0x7180:
+        case 0x7181:
+        case 0x7183:
+        case 0x7186:
+        case 0x7187:
+        case 0x7188:
+        case 0x718A:
+        case 0x718B:
+        case 0x718C:
+        case 0x718D:
+        case 0x718F:
+        case 0x7193:
+        case 0x7196:
+        case 0x719B:
+        case 0x719F:
+        case 0x7200:
+        case 0x7210:
+        case 0x7211:
+            caps->family = CHIP_FAMILY_RV515;
+            caps->num_vert_fpus = 2;
+            caps->is_r500 = TRUE;
+            break;
+
+        case 0x71C0:
+        case 0x71C1:
+        case 0x71C2:
+        case 0x71C3:
+        case 0x71C4:
+        case 0x71C5:
+        case 0x71C6:
+        case 0x71C7:
+        case 0x71CD:
+        case 0x71CE:
+        case 0x71D2:
+        case 0x71D4:
+        case 0x71D5:
+        case 0x71D6:
+        case 0x71DA:
+        case 0x71DE:
+            caps->family = CHIP_FAMILY_RV530;
+            caps->num_vert_fpus = 5;
+            caps->is_r500 = TRUE;
+            break;
+
+        case 0x7240:
+        case 0x7243:
+        case 0x7244:
+        case 0x7245:
+        case 0x7246:
+        case 0x7247:
+        case 0x7248:
+        case 0x7249:
+        case 0x724A:
+        case 0x724B:
+        case 0x724C:
+        case 0x724D:
+        case 0x724E:
+        case 0x724F:
+        case 0x7284:
+            caps->family = CHIP_FAMILY_R580;
+            caps->num_vert_fpus = 8;
+            caps->is_r500 = TRUE;
+            break;
+
+        case 0x7280:
+            caps->family = CHIP_FAMILY_RV570;
+            caps->num_vert_fpus = 8;
+            caps->is_r500 = TRUE;
+            break;
+
+        case 0x7281:
+        case 0x7283:
+        case 0x7287:
+        case 0x7288:
+        case 0x7289:
+        case 0x728B:
+        case 0x728C:
+        case 0x7290:
+        case 0x7291:
+        case 0x7293:
+        case 0x7297:
+            caps->family = CHIP_FAMILY_RV560;
+            caps->num_vert_fpus = 8;
+            caps->is_r500 = TRUE;
+            break;
+
+        default:
+            fprintf(stderr, "r300: Warning: Unknown chipset 0x%x\n",
+                    caps->pci_id);
+            break;
+    }
+
+    /* Relies on the order of the CHIP_FAMILY_* enumerators. */
+    caps->is_rv350 = caps->family >= CHIP_FAMILY_RV350;
+}
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
new file mode 100644
index 0000000000..ab649c3857
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_CHIPSET_H
+#define R300_CHIPSET_H
+
+#include "pipe/p_compiler.h"
+
+/* Structure containing all the possible information about a specific Radeon
+ * in the R3xx, R4xx, and R5xx families. */
+struct r300_capabilities {
+    /* PCI ID; read by r300_parse_chipset to select everything below */
+    uint32_t pci_id;
+    /* Chipset family (one of the CHIP_FAMILY_* values) */
+    int family;
+    /* The number of vertex floating-point units */
+    unsigned num_vert_fpus;
+    /* The number of fragment pipes (not set by r300_parse_chipset) */
+    unsigned num_frag_pipes;
+    /* The number of z pipes (not set by r300_parse_chipset) */
+    unsigned num_z_pipes;
+    /* The number of texture units. */
+    unsigned num_tex_units;
+    /* Whether TCL is present; forced FALSE by the RADEON_NO_TCL debug option */
+    boolean has_tcl;
+    /* Whether or not this is RV350 or newer, including all r400 and r500
+     * chipsets. The differences compared to the oldest r300 chips are:
+     * - Blend LTE/GTE thresholds
+     * - Better MACRO_SWITCH in texture tiling
+     * - Half float vertex
+     * - More HyperZ optimizations */
+    boolean is_rv350;
+    /* Whether or not this is R400. The differences compared to their rv350
+     * cousins are:
+     * - Extended fragment shader registers
+     * - 3DC texture compression (RGTC2) */
+    boolean is_r400;
+    /* Whether or not this is an RV515 or newer; R500s have many differences
+     * that require extra consideration, compared to their rv350 cousins:
+     * - Extra bit of width and height on texture sizes
+     * - Blend color is split across two registers
+     * - Universal Shader (US) block used for fragment shaders
+     * - FP16 blending and multisampling
+     * - Full RGTC texture compression
+     * - 24-bit depth textures
+     * - Stencil back-face reference value
+     * - Ability to render up to 2^24 - 1 vertices with signed index offset */
+    boolean is_r500;
+    /* Whether or not the second pixel pipe is accessed with the high bit */
+    boolean high_second_pipe;
+};
+
+/* Chip families. Order matters: is_rv350 tests family >= CHIP_FAMILY_RV350. */
+enum {
+    CHIP_FAMILY_R300 = 0,
+    CHIP_FAMILY_R350,
+    CHIP_FAMILY_R360,
+    CHIP_FAMILY_RV350,
+    CHIP_FAMILY_RV370,
+    CHIP_FAMILY_RV380,
+    CHIP_FAMILY_R420,
+    CHIP_FAMILY_R423,
+    CHIP_FAMILY_R430,
+    CHIP_FAMILY_R480,
+    CHIP_FAMILY_R481,
+    CHIP_FAMILY_RV410,
+    CHIP_FAMILY_RS400,
+    CHIP_FAMILY_RC410,
+    CHIP_FAMILY_RS480,
+    CHIP_FAMILY_RS482,
+    CHIP_FAMILY_RS600,
+    CHIP_FAMILY_RS690,
+    CHIP_FAMILY_RS740,
+    CHIP_FAMILY_RV515,
+    CHIP_FAMILY_R520,
+    CHIP_FAMILY_RV530,
+    CHIP_FAMILY_R580,
+    CHIP_FAMILY_RV560,
+    CHIP_FAMILY_RV570
+};
+
+void r300_parse_chipset(struct r300_capabilities* caps);
+
+#endif /* R300_CHIPSET_H */
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
new file mode 100644
index 0000000000..16a75aa612
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "draw/draw_context.h"
+
+#include "util/u_memory.h"
+#include "util/u_sampler.h"
+#include "util/u_simple_list.h"
+#include "util/u_upload_mgr.h"
+
+#include "r300_cb.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_screen.h"
+#include "r300_screen_buffer.h"
+#include "r300_state_invariant.h"
+#include "r300_winsys.h"
+
+#include <inttypes.h>
+
+/* Destroy a context: tear down its helpers, then free the state it owns. */
+static void r300_destroy_context(struct pipe_context* context)
+{
+    struct r300_context* r300 = r300_context(context);
+    struct r300_query *query, *temp;
+    struct r300_atom *atom;
+
+    if (r300->texkill_sampler)
+        pipe_sampler_view_reference(
+                (struct pipe_sampler_view**)&r300->texkill_sampler, NULL);
+    /* May be NULL: creation is unchecked, and Draw is only made for SW TCL. */
+    if (r300->blitter)
+        util_blitter_destroy(r300->blitter);
+    if (r300->draw)
+        draw_destroy(r300->draw);
+
+    /* Print stats, if enabled. %p requires a pointer to void. */
+    if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) {
+        fprintf(stderr, "r300: Stats for context %p:\n", (void*)r300);
+        fprintf(stderr, "    : Flushes: %" PRIu64 "\n", r300->flush_counter);
+        foreach(atom, &r300->atom_list) {
+            fprintf(stderr, "    : %s: %" PRIu64 " emits\n",
+                atom->name, atom->counter);
+        }
+    }
+
+    /* If there are any queries pending or not destroyed, remove them now. */
+    foreach_s(query, temp, &r300->query_list) {
+        remove_from_list(query);
+        FREE(query);
+    }
+
+    u_upload_destroy(r300->upload_vb);
+    u_upload_destroy(r300->upload_ib);
+    if (r300->tran.translate_cache)
+        translate_cache_destroy(r300->tran.translate_cache);
+
+    FREE(r300->blend_color_state.state);
+    FREE(r300->clip_state.state);
+    FREE(r300->fb_state.state);
+    FREE(r300->rs_block_state.state);
+    FREE(r300->scissor_state.state);
+    FREE(r300->textures_state.state);
+    FREE(r300->viewport_state.state);
+    FREE(r300->ztop_state.state);
+    FREE(r300->fs_constants.state);
+    FREE(r300->vs_constants.state);
+    if (!r300->screen->caps.has_tcl)
+        FREE(r300->vertex_stream_state.state);
+    FREE(r300);
+}
+
+static void r300_flush_cb(void *data)
+{
+    /* `data` is the r300_context handed to set_flush_cb(). */
+    struct pipe_context *pipe = &((struct r300_context*)data)->context;
+    pipe->flush(pipe, 0, NULL);
+}
+/* Fill in atom `atomname` of the local `r300` and append it to atom_list. */
+#define R300_INIT_ATOM(atomname, atomsize) do { \
+    r300->atomname.name = #atomname; \
+    r300->atomname.state = NULL; \
+    r300->atomname.size = (atomsize); \
+    r300->atomname.emit = r300_emit_##atomname; \
+    r300->atomname.dirty = FALSE; \
+    insert_at_tail(&r300->atom_list, &r300->atomname); } while (0)
+
+/* Set up every state atom of the context and allocate local atom state. */
+static void r300_setup_atoms(struct r300_context* r300)
+{
+    boolean has_tcl = r300->screen->caps.has_tcl;
+    boolean is_r500 = r300->screen->caps.is_r500;
+
+    /* Build the atom list.
+     *
+     * Atoms are examined and emitted in the order they are appended here;
+     * careless reordering can affect performance and conformance.
+     *
+     * A size of 0 marks an atom whose size changes on every emit. */
+    make_empty_list(&r300->atom_list);
+    R300_INIT_ATOM(invariant_state, 71);
+    R300_INIT_ATOM(ztop_state, 2);
+    R300_INIT_ATOM(query_start, 4);
+    R300_INIT_ATOM(blend_state, 8);
+    R300_INIT_ATOM(blend_color_state, is_r500 ? 3 : 2);
+    R300_INIT_ATOM(clip_state, has_tcl ? 5 + (6 * 4) : 2);
+    R300_INIT_ATOM(dsa_state, is_r500 ? 8 : 6);
+    R300_INIT_ATOM(fb_state, 0);
+    R300_INIT_ATOM(rs_state, 0);
+    R300_INIT_ATOM(scissor_state, 3);
+    R300_INIT_ATOM(viewport_state, 9);
+    R300_INIT_ATOM(rs_block_state, 0);
+    R300_INIT_ATOM(vertex_stream_state, 0);
+    R300_INIT_ATOM(pvs_flush, 2);
+    R300_INIT_ATOM(vs_state, 0);
+    R300_INIT_ATOM(vs_constants, 0);
+    R300_INIT_ATOM(texture_cache_inval, 2);
+    R300_INIT_ATOM(textures_state, 0);
+    R300_INIT_ATOM(fs, 0);
+    R300_INIT_ATOM(fs_rc_constant_state, 0);
+    R300_INIT_ATOM(fs_constants, 0);
+
+    /* r500 gets its own emission functions for the fragment shader atoms. */
+    if (is_r500) {
+        r300->fs_constants.emit = r500_emit_fs_constants;
+        r300->fs_rc_constant_state.emit = r500_emit_fs_rc_constant_state;
+        r300->fs.emit = r500_emit_fs;
+    }
+
+    /* Some non-CSO atoms keep their state in storage allocated right here. */
+    r300->fs_constants.state = CALLOC_STRUCT(r300_constant_buffer);
+    r300->vs_constants.state = CALLOC_STRUCT(r300_constant_buffer);
+    r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
+    r300->clip_state.state = CALLOC_STRUCT(r300_clip_state);
+    r300->fb_state.state = CALLOC_STRUCT(pipe_framebuffer_state);
+    r300->rs_block_state.state = CALLOC_STRUCT(r300_rs_block);
+    r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
+    r300->textures_state.state = CALLOC_STRUCT(r300_textures_state);
+    r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
+    r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
+    if (!has_tcl) {
+        r300->vertex_stream_state.state = CALLOC_STRUCT(r300_vertex_stream_state);
+    }
+
+    /* Atoms that never use their state pointer may be emitted with NULL. */
+    r300->fs_rc_constant_state.allow_null_state = TRUE;
+    r300->invariant_state.allow_null_state = TRUE;
+    r300->pvs_flush.allow_null_state = TRUE;
+    r300->query_start.allow_null_state = TRUE;
+    r300->texture_cache_inval.allow_null_state = TRUE;
+}
+
+/* Not every state tracker calls every driver function before the first
+ * draw call, so seed the command buffers with zeroed default state here. */
+static void r300_init_states(struct pipe_context *pipe)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_clip_state *clip = r300->clip_state.state;
+    struct pipe_blend_color zero_blend_color = {{0}};
+    struct pipe_clip_state zero_clip = {{{0}}};
+    struct pipe_scissor_state zero_scissor = {0};
+    CB_LOCALS;
+
+    pipe->set_blend_color(pipe, &zero_blend_color);
+    pipe->set_scissor_state(pipe, &zero_scissor);
+
+    if (!r300->screen->caps.has_tcl) {
+        BEGIN_CB(clip->cb, 2);
+        OUT_CB_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE);
+        END_CB;
+    } else {
+        pipe->set_clip_state(pipe, &zero_clip);
+    }
+}
+
+/* Create an r300 context; returns NULL when a checked allocation fails. */
+struct pipe_context* r300_create_context(struct pipe_screen* screen,
+                                         void *priv)
+{
+    struct r300_context* r300 = CALLOC_STRUCT(r300_context);
+    struct r300_screen* r300screen = r300_screen(screen);
+    struct r300_winsys_screen *rws = r300screen->rws;
+
+    if (!r300)
+        return NULL;
+
+    r300->rws = rws;
+    r300->screen = r300screen;
+    r300->context.winsys = (struct pipe_winsys*)rws;
+    r300->context.screen = screen;
+    r300->context.priv = priv;
+    r300->context.destroy = r300_destroy_context;
+
+    if (!r300screen->caps.has_tcl) {
+        /* Create a Draw. This is used for SW TCL. */
+        r300->draw = draw_create(&r300->context);
+        /* Enable our renderer. */
+        draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300));
+        /* Enable Draw's clipping. */
+        draw_set_driver_clipping(r300->draw, FALSE);
+        /* Disable converting points/lines to triangles. */
+        draw_wide_line_threshold(r300->draw, 10000000.f);
+        draw_wide_point_threshold(r300->draw, 10000000.f);
+    }
+
+    r300_setup_atoms(r300);
+    make_empty_list(&r300->query_list);
+    r300_init_blit_functions(r300);
+    r300_init_flush_functions(r300);
+    r300_init_query_functions(r300);
+    r300_init_render_functions(r300);
+    r300_init_state_functions(r300);
+    r300_init_resource_functions(r300);
+
+    r300->invariant_state.dirty = TRUE;
+    rws->set_flush_cb(r300->rws, r300_flush_cb, r300);
+    r300->dirty_hw++;
+
+    r300->blitter = util_blitter_create(&r300->context);
+    r300->upload_ib = u_upload_create(&r300->context, 32 * 1024, 16,
+                                      PIPE_BIND_INDEX_BUFFER);
+    if (r300->upload_ib == NULL)
+        goto no_upload_ib;
+
+    r300->upload_vb = u_upload_create(&r300->context, 128 * 1024, 16,
+                                      PIPE_BIND_VERTEX_BUFFER);
+    if (r300->upload_vb == NULL)
+        goto no_upload_vb;
+
+    r300->tran.translate_cache = translate_cache_create();
+    if (r300->tran.translate_cache == NULL)
+        goto no_translate_cache;
+
+    r300_init_states(&r300->context);
+
+    /* The KIL opcode needs the first texture unit to be enabled on r3xx-r4xx.
+     * To calm down the CS checker, we bind this dummy texture there. */
+    if (!r300->screen->caps.is_r500) {
+        struct pipe_resource *tex;
+        struct pipe_resource rtempl = {{0}};
+        struct pipe_sampler_view vtempl = {{0}};
+
+        rtempl.target = PIPE_TEXTURE_2D;
+        rtempl.format = PIPE_FORMAT_I8_UNORM;
+        rtempl.bind = PIPE_BIND_SAMPLER_VIEW;
+        rtempl.width0 = 1;
+        rtempl.height0 = 1;
+        rtempl.depth0 = 1;
+        tex = screen->resource_create(screen, &rtempl);
+        if (tex == NULL)
+            goto no_texkill;
+
+        u_sampler_view_default_template(&vtempl, tex, tex->format);
+        r300->texkill_sampler = (struct r300_sampler_view*)
+            r300->context.create_sampler_view(&r300->context, tex, &vtempl);
+        pipe_resource_reference(&tex, NULL);
+        if (r300->texkill_sampler == NULL)
+            goto no_texkill;
+
+        /* Make sure the dummy texture is set up from the beginning even
+         * if an application does not use textures. */
+        r300->textures_state.dirty = TRUE;
+    }
+
+    return &r300->context;
+
+    /* Unwind in reverse order of creation. NOTE(review): Draw, the blitter,
+     * atom state and the winsys flush callback are not undone here yet. */
+ no_texkill:
+    translate_cache_destroy(r300->tran.translate_cache);
+ no_translate_cache:
+    u_upload_destroy(r300->upload_vb);
+ no_upload_vb:
+    u_upload_destroy(r300->upload_ib);
+ no_upload_ib:
+    FREE(r300);
+    return NULL;
+}
+
+boolean r300_check_cs(struct r300_context *r300, unsigned size)
+{
+    return r300->rws->get_cs_free_dwords(r300->rws) >= size;
+}
+
+void r300_finish(struct r300_context *r300)
+{
+    struct pipe_framebuffer_state *fb = r300->fb_state.state;
+    unsigned i;
+
+    /* This is a preliminary implementation of glFinish: wait on the first
+     * colorbuffer that has a texture, else on the depth/stencil buffer.
+     *
+     * The ideal implementation should use something like EmitIrqLocked and
+     * WaitIrq, or better, real fences. */
+    if (!fb)
+        return;
+
+    for (i = 0; i < fb->nr_cbufs; i++) {
+        /* cbufs[i] is NULL-checked just like zsbuf is below. */
+        if (fb->cbufs[i] && fb->cbufs[i]->texture) {
+            r300->rws->buffer_wait(r300->rws,
+                r300_texture(fb->cbufs[i]->texture)->buffer);
+            return;
+        }
+    }
+    if (fb->zsbuf && fb->zsbuf->texture) {
+        r300->rws->buffer_wait(r300->rws,
+            r300_texture(fb->zsbuf->texture)->buffer);
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
new file mode 100644
index 0000000000..8d0b4bb3d3
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_CONTEXT_H
+#define R300_CONTEXT_H
+
+#include "draw/draw_vertex.h"
+
+#include "util/u_blitter.h"
+
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+
+#include "translate/translate_cache.h"
+
+#include "r300_defines.h"
+#include "r300_screen.h"
+
+struct u_upload_mgr;
+struct r300_context;
+struct r300_fragment_shader;
+struct r300_vertex_shader;
+struct r300_stencilref_context;
+
+struct r300_atom {
+    /* List pointers. */
+    struct r300_atom *prev, *next;
+    /* Name, for debugging; R300_INIT_ATOM stores the stringified field name. */
+    const char* name;
+    /* Stat counter; reported as the number of emits in the DBG_STATS dump. */
+    uint64_t counter;
+    /* Opaque state. */
+    void* state;
+    /* Emit the state; args presumably (ctx, size, state) - TODO confirm. */
+    void (*emit)(struct r300_context*, unsigned, void*);
+    /* Upper bound on number of dwords to emit; 0 if it varies per emit. */
+    unsigned size;
+    /* Whether this atom should be emitted. */
+    boolean dirty;
+    /* Whether this atom may be emitted with state == NULL. */
+    boolean allow_null_state;
+};
+
+struct r300_blend_state {
+    uint32_t cb[8];              /* 8 dwords: the blend_state atom size. */
+    uint32_t cb_no_readwrite[8]; /* Presumably reads/writes off - confirm. */
+};
+
+struct r300_blend_color_state {
+    uint32_t cb[3]; /* The atom size is 3 dwords on r500 and 2 otherwise. */
+};
+
+struct r300_clip_state {
+    struct pipe_clip_state clip;
+
+    uint32_t cb[29]; /* 5 + 6 * 4 dwords with HW TCL; 2 are used without. */
+};
+
+struct r300_dsa_state {
+    struct pipe_depth_stencil_alpha_state dsa;
+
+    /* This is actually a command buffer with named dwords. */
+    uint32_t cb_begin;
+    uint32_t alpha_function;    /* R300_FG_ALPHA_FUNC: 0x4bd4 */
+    uint32_t cb_reg_seq;
+    uint32_t z_buffer_control;  /* R300_ZB_CNTL: 0x4f00 */
+    uint32_t z_stencil_control; /* R300_ZB_ZSTENCILCNTL: 0x4f04 */
+    uint32_t stencil_ref_mask;  /* R300_ZB_STENCILREFMASK: 0x4f08 */
+    uint32_t cb_reg;
+    uint32_t stencil_ref_bf;    /* R500_ZB_STENCILREFMASK_BF: 0x4fd4 */
+
+    /* The second command buffer disables zbuffer reads and writes. */
+    uint32_t cb_no_readwrite[8];
+
+    /* Whether a two-sided stencil is enabled. */
+    boolean two_sided;
+    /* Whether a fallback should be used for a two-sided stencil ref value. */
+    boolean two_sided_stencil_ref;
+};
+
+struct r300_rs_state {
+    /* Original rasterizer state. */
+    struct pipe_rasterizer_state rs;
+    /* Draw-specific rasterizer state. */
+    struct pipe_rasterizer_state rs_draw;
+
+    uint32_t vap_control_status;    /* R300_VAP_CNTL_STATUS: 0x2140 */
+    uint32_t multisample_position_0;/* R300_GB_MSPOS0: 0x4010 */
+    uint32_t multisample_position_1;/* R300_GB_MSPOS1: 0x4014 */
+    uint32_t antialiasing_config;   /* R300_GB_AA_CONFIG: 0x4020 */
+    uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */
+    uint32_t point_minmax;          /* R300_GA_POINT_MINMAX: 0x4230 */
+    uint32_t line_control;          /* R300_GA_LINE_CNTL: 0x4234 */
+    float depth_scale;            /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */
+                                  /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */
+    float depth_offset;           /* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */
+                                  /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */
+    uint32_t polygon_offset_enable; /* R300_SU_POLY_OFFSET_ENABLE: 0x42b4 */
+    uint32_t cull_mode;             /* R300_SU_CULL_MODE: 0x42b8 */
+    uint32_t line_stipple_config;   /* R300_GA_LINE_STIPPLE_CONFIG: 0x4328 */
+    uint32_t line_stipple_value;    /* R300_GA_LINE_STIPPLE_VALUE: 0x4260 */
+    uint32_t color_control;         /* R300_GA_COLOR_CONTROL: 0x4278 */
+    uint32_t polygon_mode;          /* R300_GA_POLY_MODE: 0x4288 */
+    uint32_t clip_rule;             /* R300_SC_CLIP_RULE: 0x43D0 */
+
+    /* Specifies top of Raster pipe specific enable controls,
+     * i.e. texture coordinates stuffing for points, lines, triangles */
+    uint32_t stuffing_enable;       /* R300_GB_ENABLE: 0x4008 */
+
+    /* Point sprites texture coordinates, 0: lower left, 1: upper right */
+    float point_texcoord_left;      /* R300_GA_POINT_S0: 0x4200 */
+    float point_texcoord_bottom;    /* R300_GA_POINT_T0: 0x4204 */
+    float point_texcoord_right;     /* R300_GA_POINT_S1: 0x4208 */
+    float point_texcoord_top;       /* R300_GA_POINT_T1: 0x420c */
+};
+
+struct r300_rs_block {
+    uint32_t vap_vtx_state_cntl;  /* R300_VAP_VTX_STATE_CNTL: 0x2180 */
+    uint32_t vap_vsm_vtx_assm;    /* R300_VAP_VSM_VTX_ASSM: 0x2184 */
+    uint32_t vap_out_vtx_fmt[2];  /* R300_VAP_OUTPUT_VTX_FMT_[0-1]: 0x2090 */
+
+    uint32_t ip[8]; /* R300_RS_IP_[0-7], R500_RS_IP_[0-7] */
+    uint32_t count; /* R300_RS_COUNT */
+    uint32_t inst_count; /* R300_RS_INST_COUNT */
+    uint32_t inst[8]; /* R300_RS_INST_[0-7] */
+};
+
+struct r300_sampler_state {
+    struct pipe_sampler_state state;
+
+    uint32_t filter0;      /* R300_TX_FILTER0: 0x4400 */
+    uint32_t filter1;      /* R300_TX_FILTER1: 0x4440 */
+    uint32_t border_color; /* R300_TX_BORDER_COLOR: 0x45c0 */
+
+    /* Min/max LOD must be clamped to [0, last_level], thus
+     * it's dependent on a currently bound texture */
+    unsigned min_lod, max_lod;
+};
+
+struct r300_texture_format_state {
+    uint32_t format0; /* R300_TX_FORMAT0: 0x4480 */
+    uint32_t format1; /* R300_TX_FORMAT1: 0x44c0 */
+    uint32_t format2; /* R300_TX_FORMAT2: 0x4500 */
+    uint32_t tile_config; /* R300_TX_OFFSET (subset thereof) */
+};
+
+struct r300_sampler_view {
+    struct pipe_sampler_view base;
+
+    /* Swizzles in the UTIL_FORMAT_SWIZZLE_* representation,
+     * derived from base. */
+    unsigned char swizzle[4];
+
+    /* Copy of r300_texture::texture_format_state with format-specific bits
+     * added. */
+    struct r300_texture_format_state format;
+
+    /* The texture cache region for this texture. */
+    uint32_t texcache_region;
+};
+
+struct r300_texture_fb_state {
+    uint32_t pitch[R300_MAX_TEXTURE_LEVELS]; /* COLORPITCH or DEPTHPITCH. */
+    uint32_t format; /* US_OUT_FMT or R300_ZB_FORMAT */
+};
+
+struct r300_texture_sampler_state {
+    struct r300_texture_format_state format;
+    uint32_t filter0;      /* R300_TX_FILTER0: 0x4400 */
+    uint32_t filter1;      /* R300_TX_FILTER1: 0x4440 */
+    uint32_t border_color;  /* R300_TX_BORDER_COLOR: 0x45c0 */
+};
+
+struct r300_textures_state {
+    /* Textures. */
+    struct r300_sampler_view *sampler_views[16];
+    int sampler_view_count;
+    /* Sampler states. */
+    struct r300_sampler_state *sampler_states[16];
+    int sampler_state_count;
+
+    /* This is the merge of the texture and sampler states. */
+    unsigned count;
+    uint32_t tx_enable;         /* R300_TX_ENABLE: 0x4104 */
+    struct r300_texture_sampler_state regs[16];
+};
+
+struct r300_vertex_stream_state {
+    /* R300_VAP_PROG_STREAM_CNTL_[0-7] */
+    uint32_t vap_prog_stream_cntl[8];
+    /* R300_VAP_PROG_STREAM_CNTL_EXT_[0-7] */
+    uint32_t vap_prog_stream_cntl_ext[8];
+
+    unsigned count;
+};
+
+struct r300_viewport_state {
+    float xscale;         /* R300_VAP_VPORT_XSCALE:  0x2098 */
+    float xoffset;        /* R300_VAP_VPORT_XOFFSET: 0x209c */
+    float yscale;         /* R300_VAP_VPORT_YSCALE:  0x20a0 */
+    float yoffset;        /* R300_VAP_VPORT_YOFFSET: 0x20a4 */
+    float zscale;         /* R300_VAP_VPORT_ZSCALE:  0x20a8 */
+    float zoffset;        /* R300_VAP_VPORT_ZOFFSET: 0x20ac */
+    uint32_t vte_control; /* R300_VAP_VTE_CNTL:      0x20b0 */
+};
+
+struct r300_ztop_state {
+    uint32_t z_buffer_top;      /* R300_ZB_ZTOP: 0x4f14 */
+};
+
+/* The next several objects are not pure Radeon state; they inherit from
+ * various Gallium classes. */
+
+struct r300_constant_buffer {
+    /* Buffer of constants */
+    uint32_t constants[256][4];
+    /* Total number of constants */
+    unsigned count;
+};
+
+/* Query object.
+ *
+ * This is not a subclass of pipe_query because pipe_query is never
+ * actually fully defined. So, rather than have it as a member, and do
+ * subclass-style casting, we treat pipe_query as an opaque, and just
+ * trust that our state tracker does not ever mess up query objects.
+ */
+struct r300_query {
+    /* The kind of query. Currently only OQ is supported. */
+    unsigned type;
+    /* The number of pipes where query results are stored. */
+    unsigned num_pipes;
+    /* How many results have been written, in dwords. It's incremented
+     * after end_query and flush. */
+    unsigned num_results;
+    /* if we've flushed the query */
+    boolean flushed;
+    /* if begin has been emitted */
+    boolean begin_emitted;
+
+    /* The buffer where query results are stored. */
+    struct r300_winsys_buffer *buffer;
+    /* The size of the buffer. */
+    unsigned buffer_size;
+    /* The domain of the buffer. */
+    enum r300_buffer_domain domain;
+
+    /* Linked list members. */
+    struct r300_query* prev;
+    struct r300_query* next;
+};
+
+/* Fence object.
+ *
+ * This is a fake fence. Instead of syncing with the fence, we sync
+ * with the context, which is inefficient but compliant.
+ *
+ * This is not a subclass of pipe_fence_handle because pipe_fence_handle is
+ * never actually fully defined. So, rather than have it as a member, and do
+ * subclass-style casting, we treat pipe_fence_handle as an opaque, and just
+ * trust that our state tracker does not ever mess up fence objects.
+ */
+struct r300_fence {
+    struct pipe_reference reference;
+    struct r300_context *ctx;
+    boolean signalled;
+};
+
+struct r300_surface {
+    struct pipe_surface base;
+
+    /* Winsys buffer backing the texture. */
+    struct r300_winsys_buffer *buffer;
+
+    enum r300_buffer_domain domain;
+
+    uint32_t offset;
+    uint32_t pitch;     /* COLORPITCH or DEPTHPITCH. */
+    uint32_t format;    /* US_OUT_FMT or R300_ZB_FORMAT. */
+};
+
+struct r300_texture {
+    /* Parent class */
+    struct u_resource b;
+
+    enum r300_buffer_domain domain;
+
+    /* Offsets into the buffer. */
+    unsigned offset[R300_MAX_TEXTURE_LEVELS];
+
+    /* A pitch for each mip-level */
+    unsigned pitch[R300_MAX_TEXTURE_LEVELS];
+
+    /* A pitch multiplied by blockwidth as hardware wants
+     * the number of pixels instead of the number of blocks. */
+    unsigned hwpitch[R300_MAX_TEXTURE_LEVELS];
+
+    /* Size of one zslice or face based on the texture target */
+    unsigned layer_size[R300_MAX_TEXTURE_LEVELS];
+
+    /* Whether the mipmap level is macrotiled. */
+    enum r300_buffer_tiling mip_macrotile[R300_MAX_TEXTURE_LEVELS];
+
+    /**
+     * If non-zero, override the natural texture layout with
+     * a custom stride (in bytes).
+     *
+     * \note Mipmapping fails for textures with a non-natural layout!
+     *
+     * \sa r300_texture_get_stride
+     */
+    unsigned stride_override;
+
+    /* Total size of this texture, in bytes. */
+    unsigned size;
+
+    /* Whether this texture has non-power-of-two dimensions
+     * or a user-specified pitch.
+     * It can be either a regular texture or a rectangle one.
+     */
+    boolean uses_pitch;
+
+    /* Pipe buffer backing this texture. */
+    struct r300_winsys_buffer *buffer;
+
+    /* Registers carrying texture format data. */
+    /* Only format-independent bits should be filled in. */
+    struct r300_texture_format_state tx_format;
+    /* All bits should be filled in. */
+    struct r300_texture_fb_state fb_state;
+
+    /* Buffer tiling */
+    enum r300_buffer_tiling microtile, macrotile;
+};
+
+struct r300_vertex_element_state {
+    unsigned count;
+    struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
+
+    /* If (velem[i].src_format != hw_format[i]), the vertex buffer
+     * referenced by this vertex element cannot be used for rendering and
+     * its vertex data must be translated to hw_format[i]. */
+    enum pipe_format hw_format[PIPE_MAX_ATTRIBS];
+    unsigned hw_format_size[PIPE_MAX_ATTRIBS];
+
+    /* The size of the vertex, in dwords. */
+    unsigned vertex_size_dwords;
+
+    /* This might mean two things:
+     * - src_format != hw_format, as discussed above.
+     * - src_offset % 4 != 0. */
+    boolean incompatible_layout;
+
+    struct r300_vertex_stream_state vertex_stream;
+};
+
+struct r300_translate_context {
+    /* Translate cache for incompatible vertex offset/stride/format fallback. */
+    struct translate_cache *translate_cache;
+
+    /* The vertex buffer slot containing the translated buffer. */
+    unsigned vb_slot;
+
+    /* Saved and new vertex element state. */
+    void *saved_velems, *new_velems;
+};
+
+struct r300_context {
+    /* Parent class */
+    struct pipe_context context;
+
+    /* The interface to the windowing system, etc. */
+    struct r300_winsys_screen *rws;
+    /* Screen. */
+    struct r300_screen *screen;
+    /* Draw module. Used mostly for SW TCL. */
+    struct draw_context* draw;
+    /* Accelerated blit support. */
+    struct blitter_context* blitter;
+    /* Stencil two-sided reference value fallback. */
+    struct r300_stencilref_context *stencilref_fallback;
+    /* For translating vertex buffers having incompatible vertex layout. */
+    struct r300_translate_context tran;
+
+    /* Vertex buffer for rendering. */
+    struct pipe_resource* vbo;
+    /* The KIL opcode needs the first texture unit to be enabled
+     * on r3xx-r4xx. In order to calm down the CS checker, we bind this
+     * dummy texture there. */
+    struct r300_sampler_view *texkill_sampler;
+    /* Offset into the VBO. */
+    size_t vbo_offset;
+
+    /* The currently active query. */
+    struct r300_query *query_current;
+    /* The saved query for blitter operations. */
+    struct r300_query *blitter_saved_query;
+    /* Query list. */
+    struct r300_query query_list;
+
+    /* Various CSO state objects. */
+    /* Beginning of atom list. */
+    struct r300_atom atom_list;
+    /* Blend state. */
+    struct r300_atom blend_state;
+    /* Blend color state. */
+    struct r300_atom blend_color_state;
+    /* User clip planes. */
+    struct r300_atom clip_state;
+    /* Depth, stencil, and alpha state. */
+    struct r300_atom dsa_state;
+    /* Fragment shader. */
+    struct r300_atom fs;
+    /* Fragment shader RC_CONSTANT_STATE variables. */
+    struct r300_atom fs_rc_constant_state;
+    /* Fragment shader constant buffer. */
+    struct r300_atom fs_constants;
+    /* Framebuffer state. */
+    struct r300_atom fb_state;
+    /* Occlusion query. */
+    struct r300_atom query_start;
+    /* Rasterizer state. */
+    struct r300_atom rs_state;
+    /* RS block state + VAP (vertex shader) output mapping state. */
+    struct r300_atom rs_block_state;
+    /* Scissor state. */
+    struct r300_atom scissor_state;
+    /* Textures state. */
+    struct r300_atom textures_state;
+    /* Vertex stream formatting state. */
+    struct r300_atom vertex_stream_state;
+    /* Vertex shader. */
+    struct r300_atom vs_state;
+    /* Vertex shader constant buffer. */
+    struct r300_atom vs_constants;
+    /* Viewport state. */
+    struct r300_atom viewport_state;
+    /* ZTOP state. */
+    struct r300_atom ztop_state;
+    /* PVS flush. */
+    struct r300_atom pvs_flush;
+    /* Texture cache invalidate. */
+    struct r300_atom texture_cache_inval;
+
+    /* Invariant state. This must be emitted to get the engine started. */
+    struct r300_atom invariant_state;
+
+    /* Vertex buffers for Gallium. */
+    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+    int vertex_buffer_count;
+    int vertex_buffer_max_index;
+    /* Vertex elements for Gallium. */
+    struct r300_vertex_element_state *velems;
+    bool any_user_vbs;
+
+    /* Vertex info for Draw. */
+    struct vertex_info vertex_info;
+
+    struct pipe_stencil_ref stencil_ref;
+    struct pipe_viewport_state viewport;
+
+    /* Stream locations for SWTCL. */
+    int stream_loc_notcl[16];
+
+    /* Flag indicating whether or not the HW is dirty. */
+    uint32_t dirty_hw;
+    /* Whether polygon offset is enabled. */
+    boolean polygon_offset_enabled;
+    /* Z buffer bit depth. */
+    uint32_t zbuffer_bpp;
+    /* Whether rendering is conditional and should be skipped. */
+    boolean skip_rendering;
+    /* Point sprites texcoord index,  1 bit per texcoord */
+    int sprite_coord_enable;
+    /* Whether two-sided color selection is enabled (AKA light_twoside). */
+    boolean two_sided_color;
+    /* Incompatible vertex buffer layout? (misaligned stride or buffer_offset) */
+    boolean incompatible_vb_layout;
+
+    /* upload managers */
+    struct u_upload_mgr *upload_vb;
+    struct u_upload_mgr *upload_ib;
+
+    /* Stat counter. */
+    uint64_t flush_counter;
+};
+
+/* Convenience cast wrappers. */
+static INLINE struct r300_query* r300_query(struct pipe_query* q)
+{
+    return (struct r300_query*)q;
+}
+
+static INLINE struct r300_surface* r300_surface(struct pipe_surface* surf)
+{
+    return (struct r300_surface*)surf;
+}
+
+static INLINE struct r300_texture* r300_texture(struct pipe_resource* tex)
+{
+    return (struct r300_texture*)tex;
+}
+
+static INLINE struct r300_context* r300_context(struct pipe_context* context)
+{
+    return (struct r300_context*)context;
+}
+
+static INLINE struct r300_fragment_shader *r300_fs(struct r300_context *r300)
+{
+    return (struct r300_fragment_shader*)r300->fs.state;
+}
+
+struct pipe_context* r300_create_context(struct pipe_screen* screen,
+                                         void *priv);
+
+boolean r300_check_cs(struct r300_context *r300, unsigned size);
+void r300_finish(struct r300_context *r300);
+
+/* Context initialization. */
+struct draw_stage* r300_draw_stage(struct r300_context* r300);
+void r300_init_blit_functions(struct r300_context *r300);
+void r300_init_flush_functions(struct r300_context* r300);
+void r300_init_query_functions(struct r300_context* r300);
+void r300_init_render_functions(struct r300_context *r300);
+void r300_init_state_functions(struct r300_context* r300);
+void r300_init_resource_functions(struct r300_context* r300);
+
+/* r300_query.c */
+void r300_resume_query(struct r300_context *r300,
+                       struct r300_query *query);
+void r300_stop_query(struct r300_context *r300);
+
+/* r300_render_translate.c */
+void r300_begin_vertex_translate(struct r300_context *r300);
+void r300_end_vertex_translate(struct r300_context *r300);
+void r300_translate_index_buffer(struct r300_context *r300,
+                                 struct pipe_resource **index_buffer,
+                                 unsigned *index_size, unsigned index_offset,
+                                 unsigned *start, unsigned count);
+
+/* r300_render_stencilref.c */
+void r300_plug_in_stencil_ref_fallback(struct r300_context *r300);
+
+/* r300_state.c */
+void r300_mark_fs_code_dirty(struct r300_context *r300);
+
+/* r300_debug.c */
+void r500_dump_rs_block(struct r300_rs_block *rs);
+
+
+static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
+{
+    return SCREEN_DBG_ON(ctx->screen, flags);
+}
+
+static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags,
+                       const char * fmt, ...)
+{
+    va_list args;
+    if (!CTX_DBG_ON(ctx, flags))
+        return;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+}
+
+#define DBG_ON  CTX_DBG_ON
+#define DBG     CTX_DBG
+
+#endif /* R300_CONTEXT_H */
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
new file mode 100644
index 0000000000..1db7da642b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * This file contains macros for immediate command submission.
+ */
+
+#ifndef R300_CS_H
+#define R300_CS_H
+
+#include "r300_reg.h"
+#include "r300_context.h"
+#include "r300_winsys.h"
+
+/* Yes, I know macros are ugly. However, they are much prettier than the code
+ * that they neatly hide away, and don't have the cost of function setup, so
+ * we're going to use them. */
+
+#ifdef DEBUG
+#define CS_DEBUG(x) x
+#else
+#define CS_DEBUG(x)
+#endif
+
+/**
+ * Command submission setup. CS_LOCALS declares the locals the macros below
+ * use; under DEBUG, BEGIN_CS/END_CS also check the emitted dword count. */
+
+#define CS_LOCALS(context) \
+    struct r300_context* const cs_context_copy = (context); \
+    struct r300_winsys_screen *cs_winsys = cs_context_copy->rws; \
+    CS_DEBUG(int cs_count = 0; (void) cs_count;)
+
+#define BEGIN_CS(size) do { \
+    assert(r300_check_cs(cs_context_copy, (size))); \
+    CS_DEBUG(cs_count = size;) \
+} while (0)
+
+#ifdef DEBUG
+#define END_CS do { \
+    if (cs_count != 0) \
+        debug_printf("r300: Warning: cs_count off by %d at (%s, %s:%i)\n", \
+                     cs_count, __FUNCTION__, __FILE__, __LINE__); \
+    cs_count = 0; \
+} while (0)
+#else /* !DEBUG */
+#define END_CS
+#endif /* DEBUG */
+
+/**
+ * Writing pure DWORDs.
+ */
+
+#define OUT_CS(value) do { \
+    cs_winsys->write_cs_dword(cs_winsys, (value)); \
+    CS_DEBUG(cs_count--;) \
+} while (0)
+
+#define OUT_CS_32F(value) do { \
+    cs_winsys->write_cs_dword(cs_winsys, fui(value)); \
+    CS_DEBUG(cs_count--;) \
+} while (0)
+
+#define OUT_CS_REG(register, value) do { \
+    assert(register); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0(register, 0)); \
+    cs_winsys->write_cs_dword(cs_winsys, value); \
+    CS_DEBUG(cs_count -= 2;) \
+} while (0)
+
+/* Note: This expects count to be the number of registers,
+ * not the actual packet0 count! */
+#define OUT_CS_REG_SEQ(register, count) do { \
+    assert(register); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1))); \
+    CS_DEBUG(cs_count--;) \
+} while (0)
+
+#define OUT_CS_TABLE(values, count) do { \
+    cs_winsys->write_cs_table(cs_winsys, values, count); \
+    CS_DEBUG(cs_count -= count;) \
+} while (0)
+
+#define OUT_CS_ONE_REG(register, count) do { \
+    assert(register); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1)) | RADEON_ONE_REG_WR); \
+    CS_DEBUG(cs_count--;) \
+} while (0)
+
+#define OUT_CS_PKT3(op, count) do { \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET3(op, count)); \
+    CS_DEBUG(cs_count--;) \
+} while (0)
+
+
+/**
+ * Writing relocations.
+ */
+
+#define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
+    assert(bo); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
+    cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
+    CS_DEBUG(cs_count -= 3;) \
+} while (0)
+
+#define OUT_CS_BUF_RELOC(bo, offset, rd, wd, flags) do { \
+    assert(bo); \
+    OUT_CS_RELOC(r300_buffer(bo)->buf, offset, rd, wd, flags); \
+} while (0)
+
+#define OUT_CS_TEX_RELOC(tex, offset, rd, wd, flags) do { \
+    assert(tex); /* tex: pointer to a texture; may be any expression */ \
+    OUT_CS_RELOC((tex)->buffer, offset, rd, wd, flags); \
+} while (0)
+
+#define OUT_CS_BUF_RELOC_NO_OFFSET(bo, rd, wd, flags) do { \
+    assert(bo); \
+    cs_winsys->write_cs_reloc(cs_winsys, r300_buffer(bo)->buf, rd, wd, flags); \
+    CS_DEBUG(cs_count -= 2;) \
+} while (0)
+
+
+/**
+ * Command buffer emission.
+ */
+
+#define WRITE_CS_TABLE(values, count) do { \
+    CS_DEBUG(assert(cs_count == 0);) \
+    cs_winsys->write_cs_table(cs_winsys, values, count); \
+} while (0)
+
+#endif /* R300_CS_H */
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
new file mode 100644
index 0000000000..a6cd86e392
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2009 Nicolai Haehnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_context.h"
+
+#include "util/u_debug.h"
+
+#include <stdio.h>
+
+static const struct debug_named_value debug_options[] = {
+    { "fp", DBG_FP, "Fragment program handling (for debugging)" },
+    { "vp", DBG_VP, "Vertex program handling (for debugging)" },
+    { "draw", DBG_DRAW, "Draw and emit (for debugging)" },
+    { "tex", DBG_TEX, "Textures (for debugging)" },
+    { "texalloc", DBG_TEXALLOC, "Texture allocation (for debugging)" },
+    { "fall", DBG_FALL, "Fallbacks (for debugging)" },
+    { "rs", DBG_RS, "Rasterizer (for debugging)" },
+    { "fb", DBG_FB, "Framebuffer (for debugging)" },
+    { "anisohq", DBG_ANISOHQ, "High quality anisotropic filtering (for benchmarking)" },
+    { "notiling", DBG_NO_TILING, "Disable tiling (for benchmarking)" },
+    { "noimmd", DBG_NO_IMMD, "Disable immediate mode (for benchmarking)" },
+    { "fakeocc", DBG_FAKE_OCC, "Use fake occlusion queries (for lulz)" },
+    { "stats", DBG_STATS, "Gather statistics (for lulz)" },
+
+    /* must be last */
+    DEBUG_NAMED_VALUE_END
+};
+
+void r300_init_debug(struct r300_screen * screen)
+{
+    screen->debug = debug_get_flags_option("RADEON_DEBUG", debug_options, 0);
+}
+
+/* Debug helper: print a decoded view of an RS block (texcoord/color counts
+ * and the per-instruction texture and color routing) to stderr. */
+void r500_dump_rs_block(struct r300_rs_block *rs)
+{
+    unsigned count, ip, it_count, ic_count, i, j;
+    unsigned tex_ptr, col_ptr, col_fmt;
+
+    count = (rs->inst_count & 0xf) + 1;
+
+    it_count = rs->count & 0x7f;
+    ic_count = (rs->count >> 7) & 0xf;
+
+    fprintf(stderr, "RS Block: %d texcoords (linear), %d colors (perspective)\n",
+        it_count, ic_count);
+    fprintf(stderr, "%d instructions\n", count);
+
+    for (i = 0; i < count; i++) {
+        if (rs->inst[i] & 0x10) {
+            ip = rs->inst[i] & 0xf;
+            fprintf(stderr, "texture: ip %d to psf %d\n",
+                ip, (rs->inst[i] >> 5) & 0x7f);
+
+            tex_ptr = rs->ip[ip] & 0xffffff;
+            fprintf(stderr, "       : ");
+            /* Walk the four 6-bit selectors in the 24-bit pointer, lowest first. */
+            for (j = 0; j < 4; j++, tex_ptr >>= 6) {
+                if (j) fprintf(stderr, "/");
+                if ((tex_ptr & 0x3f) == 63) {
+                    fprintf(stderr, "1.0");
+                } else if ((tex_ptr & 0x3f) == 62) {
+                    fprintf(stderr, "0.0");
+                } else {
+                    fprintf(stderr, "[%d]", tex_ptr & 0x3f);
+                }
+            }
+            fprintf(stderr, "\n");
+        }
+
+        if (rs->inst[i] & 0x10000) {
+            ip = (rs->inst[i] >> 12) & 0xf;
+            fprintf(stderr, "color: ip %d to psf %d\n",
+                ip, (rs->inst[i] >> 18) & 0x7f);
+
+            col_ptr = (rs->ip[ip] >> 24) & 0x7;
+            col_fmt = (rs->ip[ip] >> 27) & 0xf;
+            fprintf(stderr, "     : offset %d ", col_ptr);
+
+            switch (col_fmt) {
+                case 0:
+                    fprintf(stderr, "(R/G/B/A)");
+                    break;
+                case 1:
+                    fprintf(stderr, "(R/G/B/0)");
+                    break;
+                case 2:
+                    fprintf(stderr, "(R/G/B/1)");
+                    break;
+                case 4:
+                    fprintf(stderr, "(0/0/0/A)");
+                    break;
+                case 5:
+                    fprintf(stderr, "(0/0/0/0)");
+                    break;
+                case 6:
+                    fprintf(stderr, "(0/0/0/1)");
+                    break;
+                case 8:
+                    fprintf(stderr, "(1/1/1/A)");
+                    break;
+                case 9:
+                    fprintf(stderr, "(1/1/1/0)");
+                    break;
+                case 10:
+                    fprintf(stderr, "(1/1/1/1)");
+                    break;
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_defines.h b/src/gallium/drivers/r300/r300_defines.h
new file mode 100644
index 0000000000..d510d80a7b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_defines.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_DEFINES_H
+#define R300_DEFINES_H
+
+#include "pipe/p_defines.h"
+
+#define R300_MAX_TEXTURE_LEVELS         13
+#define R300_MAX_DRAW_VBO_SIZE          (1024 * 1024)
+
+#define R300_RESOURCE_FLAG_TRANSFER     PIPE_RESOURCE_FLAG_DRV_PRIV
+
+#define R300_INVALID_FORMAT 0xffff
+
+/* Tiling flags. */
+enum r300_buffer_tiling {
+    R300_BUFFER_LINEAR = 0,
+    R300_BUFFER_TILED,
+    R300_BUFFER_SQUARETILED
+};
+
+enum r300_buffer_domain { /* bitfield */
+    R300_DOMAIN_GTT  = 1,
+    R300_DOMAIN_VRAM = 2
+};
+
+#endif
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
new file mode 100644
index 0000000000..e2c40d823d
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -0,0 +1,1042 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/* r300_emit: Functions for emitting state. */
+
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_simple_list.h"
+
+#include "r300_context.h"
+#include "r300_cs.h"
+#include "r300_emit.h"
+#include "r300_fs.h"
+#include "r300_screen.h"
+#include "r300_screen_buffer.h"
+#include "r300_vs.h"
+
+/* Emit blend state; use the no-readwrite table if no colorbuffer is bound. */
+void r300_emit_blend_state(struct r300_context* r300,
+                           unsigned size, void* state)
+{
+    struct pipe_framebuffer_state* fb = r300->fb_state.state;
+    struct r300_blend_state* blend = state;
+    CS_LOCALS(r300);
+
+    if (!fb->nr_cbufs) {
+        WRITE_CS_TABLE(blend->cb_no_readwrite, size);
+        return;
+    }
+    WRITE_CS_TABLE(blend->cb, size);
+}
+
+void r300_emit_blend_color_state(struct r300_context* r300,
+                                 unsigned size, void* state)
+{
+    struct r300_blend_color_state* blend_color = state;
+    CS_LOCALS(r300);
+
+    WRITE_CS_TABLE(blend_color->cb, size); /* emit size entries from cb */
+}
+
+void r300_emit_clip_state(struct r300_context* r300,
+                          unsigned size, void* state)
+{
+    struct r300_clip_state* clip_state = state;
+    CS_LOCALS(r300);
+
+    WRITE_CS_TABLE(clip_state->cb, size); /* emit size entries from cb */
+}
+
+/* Emit DSA state; use the no-readwrite table if no zsbuf is bound. */
+void r300_emit_dsa_state(struct r300_context* r300, unsigned size, void* state)
+{
+    struct pipe_framebuffer_state* fb = r300->fb_state.state;
+    struct r300_dsa_state* dsa = state;
+    CS_LOCALS(r300);
+
+    if (!fb->zsbuf) {
+        WRITE_CS_TABLE(dsa->cb_no_readwrite, size);
+        return;
+    }
+    WRITE_CS_TABLE(&dsa->cb_begin, size);
+}
+
+/* Resolve an RC_CONSTANT_STATE constant to its current four-float value.
+ * The result is held in a static buffer that the next call overwrites. */
+static const float * get_rc_constant_state(
+    struct r300_context * r300,
+    struct rc_constant * constant)
+{
+    struct r300_textures_state* texstate = r300->textures_state.state;
+    static float vec[4];
+    struct pipe_resource *tex;
+
+    assert(constant->Type == RC_CONSTANT_STATE);
+
+    /* Start every call from (0, 0, 0, 1), a relatively safe RGBA or STRQ
+     * value, so components set by an earlier call cannot leak through. */
+    vec[0] = vec[1] = vec[2] = 0.0;
+    vec[3] = 1.0;
+
+    switch (constant->u.State[0]) {
+        /* Factor for converting rectangle coords to
+         * normalized coords. Should only show up on non-r500. */
+        case RC_STATE_R300_TEXRECT_FACTOR:
+            tex = texstate->sampler_views[constant->u.State[1]]->base.texture;
+            vec[0] = 1.0 / tex->width0;
+            vec[1] = 1.0 / tex->height0;
+            break;
+        case RC_STATE_R300_VIEWPORT_SCALE:
+            vec[0] = r300->viewport.scale[0];
+            vec[1] = r300->viewport.scale[1];
+            vec[2] = r300->viewport.scale[2];
+            break;
+        case RC_STATE_R300_VIEWPORT_OFFSET:
+            vec[0] = r300->viewport.translate[0];
+            vec[1] = r300->viewport.translate[1];
+            vec[2] = r300->viewport.translate[2];
+            break;
+        default:
+            fprintf(stderr, "r300: Implementation error: "
+                "Unknown RC_CONSTANT type %d\n", constant->u.State[0]);
+    }
+    return vec;
+}
+
+/* Convert a single-precision float into the 7.16 format used by the R300
+ * fragment shader: sign in bit 23, 7-bit exponent (bias 63) in bits 16-22
+ * and the 16 most significant mantissa bits in bits 0-15. */
+uint32_t pack_float24(float f)
+{
+    union {
+        float fl;
+        uint32_t u;
+    } u;
+    uint32_t float24 = 0;
+    int exponent;
+
+    if (f == 0.0)
+        return 0;
+
+    u.fl = f;
+
+    /* Handle -ve */
+    if (u.u & 0x80000000)
+        float24 |= (1 << 23);
+
+    /* Rebias the IEEE-754 exponent (bias 127) to a bias of 63. */
+    exponent = (int)((u.u >> 23) & 0xff) - 127 + 63;
+
+    /* Below the 7-bit exponent range (incl. denormals): flush to zero. */
+    if (exponent < 0)
+        return 0;
+    /* Above it (incl. inf/NaN): saturate rather than spill into bit 23. */
+    if (exponent > 127)
+        return float24 | (127 << 16) | 0xffff;
+    /* Kill 7 LSB of mantissa */
+    return float24 | (exponent << 16) | ((u.u & 0x7FFFFF) >> 7);
+}
+
+void r300_emit_fs(struct r300_context* r300, unsigned size, void *state)
+{
+    struct r300_fragment_shader *fs = r300_fs(r300);
+    CS_LOCALS(r300);
+
+    WRITE_CS_TABLE(fs->shader->cb_code, fs->shader->cb_code_size);
+}
+
+void r300_emit_fs_constants(struct r300_context* r300, unsigned size, void *state)
+{
+    struct r300_fragment_shader *fs = r300_fs(r300);
+    struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state;
+    unsigned count = fs->shader->externals_count * 4;
+    CS_LOCALS(r300);
+
+    if (count == 0)
+        return;
+
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X, count);
+    OUT_CS_TABLE(buf->constants, count);
+    END_CS;
+}
+
+void r300_emit_fs_rc_constant_state(struct r300_context* r300, unsigned size, void *state)
+{
+    struct r300_fragment_shader *fs = r300_fs(r300);
+    struct rc_constant_list *constants = &fs->shader->code.constants;
+    unsigned i;
+    unsigned count = fs->shader->rc_state_count;
+    unsigned first = fs->shader->externals_count;
+    unsigned end = constants->Count;
+    uint32_t cdata[4];
+    unsigned j;
+    CS_LOCALS(r300);
+
+    if (count == 0)
+        return;
+
+    BEGIN_CS(size);
+    for(i = first; i < end; ++i) {
+        if (constants->Constants[i].Type == RC_CONSTANT_STATE) {
+            const float *data =
+                    get_rc_constant_state(r300, &constants->Constants[i]);
+
+            for (j = 0; j < 4; j++)
+                cdata[j] = pack_float24(data[j]);
+
+            OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X + i * 16, 4);
+            OUT_CS_TABLE(cdata, 4);
+        }
+    }
+    END_CS;
+}
+
+void r500_emit_fs(struct r300_context* r300, unsigned size, void *state)
+{
+    struct r300_fragment_shader *fs = r300_fs(r300);
+    CS_LOCALS(r300);
+
+    WRITE_CS_TABLE(fs->shader->cb_code, fs->shader->cb_code_size);
+}
+
+void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *state)
+{
+    struct r300_fragment_shader *fs = r300_fs(r300);
+    struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state;
+    unsigned count = fs->shader->externals_count * 4;
+    CS_LOCALS(r300);
+
+    if (count == 0)
+        return;
+
+    BEGIN_CS(size);
+    OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST);
+    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count);
+    OUT_CS_TABLE(buf->constants, count);
+    END_CS;
+}
+
+void r500_emit_fs_rc_constant_state(struct r300_context* r300, unsigned size, void *state)
+{
+    struct r300_fragment_shader *fs = r300_fs(r300);
+    struct rc_constant_list *constants = &fs->shader->code.constants;
+    unsigned i;
+    unsigned count = fs->shader->rc_state_count;
+    unsigned first = fs->shader->externals_count;
+    unsigned end = constants->Count;
+    CS_LOCALS(r300);
+
+    if (count == 0)
+        return;
+
+    BEGIN_CS(size);
+    for(i = first; i < end; ++i) {
+        if (constants->Constants[i].Type == RC_CONSTANT_STATE) {
+            const float *data =
+                    get_rc_constant_state(r300, &constants->Constants[i]);
+
+            OUT_CS_REG(R500_GA_US_VECTOR_INDEX,
+                       R500_GA_US_VECTOR_INDEX_TYPE_CONST |
+                       (i & R500_GA_US_VECTOR_INDEX_MASK));
+            OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, 4);
+            OUT_CS_TABLE(data, 4);
+        }
+    }
+    END_CS;
+}
+
+void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
+{
+    struct pipe_framebuffer_state* fb = (struct pipe_framebuffer_state*)state;
+    struct r300_surface* surf;
+    unsigned i;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+
+    /* Set up scissors.
+     * By writing to the SC registers, SC & US assert idle. */
+    OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
+    if (r300->screen->caps.is_r500) {
+        OUT_CS(0);
+        OUT_CS(((fb->width  - 1) << R300_SCISSORS_X_SHIFT) |
+               ((fb->height - 1) << R300_SCISSORS_Y_SHIFT));
+    } else {
+        OUT_CS((1440 << R300_SCISSORS_X_SHIFT) |
+               (1440 << R300_SCISSORS_Y_SHIFT));
+        OUT_CS(((fb->width  + 1440-1) << R300_SCISSORS_X_SHIFT) |
+               ((fb->height + 1440-1) << R300_SCISSORS_Y_SHIFT));
+    }
+
+    /* Flush and free renderbuffer caches. */
+    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
+        R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+        R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+    OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT,
+        R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+        R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+
+    /* Wait until the GPU is idle.
+     * This fixes random pixels sometimes appearing probably caused
+     * by incomplete rendering. */
+    OUT_CS_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+
+    /* NUM_MULTIWRITES replicates COLOR[0] to all colorbuffers, which is not
+     * what we usually want. */
+    if (r300->screen->caps.is_r500) {
+        OUT_CS_REG(R300_RB3D_CCTL,
+            R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE);
+    } else {
+        OUT_CS_REG(R300_RB3D_CCTL, 0);
+    }
+
+    /* Set up colorbuffers. */
+    for (i = 0; i < fb->nr_cbufs; i++) {
+        surf = r300_surface(fb->cbufs[i]);
+
+        OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
+        OUT_CS_RELOC(surf->buffer, surf->offset, 0, surf->domain, 0);
+
+        OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
+        OUT_CS_RELOC(surf->buffer, surf->pitch, 0, surf->domain, 0);
+
+        OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), surf->format);
+    }
+    for (; i < 4; i++) {
+        OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), R300_US_OUT_FMT_UNUSED);
+    }
+
+    /* Set up a zbuffer. */
+    if (fb->zsbuf) {
+        surf = r300_surface(fb->zsbuf);
+
+        OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
+        OUT_CS_RELOC(surf->buffer, surf->offset, 0, surf->domain, 0);
+
+        OUT_CS_REG(R300_ZB_FORMAT, surf->format);
+
+        OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
+        OUT_CS_RELOC(surf->buffer, surf->pitch, 0, surf->domain, 0);
+    }
+    END_CS;
+}
+
+/* Emit the start-of-query registers for r300->query_current, if set. */
+void r300_emit_query_start(struct r300_context *r300, unsigned size, void*state)
+{
+    struct r300_query *query = r300->query_current;
+    CS_LOCALS(r300);
+    if (!query)
+	return;
+
+    BEGIN_CS(size);
+    if (r300->screen->caps.family == CHIP_FAMILY_RV530) {
+        OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+    } else {
+        OUT_CS_REG(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_ALL);
+    }
+    OUT_CS_REG(R300_ZB_ZPASS_DATA, 0);
+    END_CS;
+    query->begin_emitted = TRUE; /* checked by r300_emit_query_end() */
+    query->flushed = FALSE;
+}
+
+/* Emit the end-of-query ZPASS_ADDR writes, one per fragment pipe. */
+static void r300_emit_query_end_frag_pipes(struct r300_context *r300,
+                                           struct r300_query *query)
+{
+    struct r300_capabilities* caps = &r300->screen->caps;
+    struct r300_winsys_buffer *buf = r300->query_current->buffer;
+    CS_LOCALS(r300);
+
+    assert(caps->num_frag_pipes);
+
+    BEGIN_CS(6 * caps->num_frag_pipes + 2); /* 6 dwords per pipe + 2 to reset */
+    /* I'm not so sure I like this switch, but it's hard to be elegant
+     * when there's so many special cases...
+     * So here's the basic idea. For each pipe, enable writes to it only,
+     * then put out the relocation for ZPASS_ADDR, taking into account a
+     * 4-byte offset for each pipe. RV380 and older are special; they have
+     * only two pipes, and the second pipe's enable is on bit 3, not bit 1,
+     * so there's a chipset cap for that. */
+    switch (caps->num_frag_pipes) {
+        case 4:
+            /* pipe 3 only */
+            OUT_CS_REG(R300_SU_REG_DEST, 1 << 3);
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(buf, (query->num_results + 3) * 4,
+                    0, query->domain, 0); /* fall through */
+        case 3:
+            /* pipe 2 only */
+            OUT_CS_REG(R300_SU_REG_DEST, 1 << 2);
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(buf, (query->num_results + 2) * 4,
+                    0, query->domain, 0); /* fall through */
+        case 2:
+            /* pipe 1 only */
+            /* As mentioned above, accommodate RV380 and older. */
+            OUT_CS_REG(R300_SU_REG_DEST,
+                    1 << (caps->high_second_pipe ? 3 : 1));
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(buf, (query->num_results + 1) * 4,
+                    0, query->domain, 0); /* fall through */
+        case 1:
+            /* pipe 0 only */
+            OUT_CS_REG(R300_SU_REG_DEST, 1 << 0);
+            OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+            OUT_CS_RELOC(buf, (query->num_results + 0) * 4,
+                    0, query->domain, 0);
+            break;
+        default:
+            fprintf(stderr, "r300: Implementation error: Chipset reports %d"
+                    " pixel pipes!\n", caps->num_frag_pipes);
+            abort();
+    }
+
+    /* And, finally, reset it to normal... */
+    OUT_CS_REG(R300_SU_REG_DEST, 0xF);
+    END_CS;
+}
+
+static void rv530_emit_query_end_single_z(struct r300_context *r300,
+                                          struct r300_query *query)
+{
+    struct r300_winsys_buffer *buf = r300->query_current->buffer;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(8);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
+    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+    OUT_CS_RELOC(buf, query->num_results * 4, 0, query->domain, 0);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+    END_CS;
+}
+
+static void rv530_emit_query_end_double_z(struct r300_context *r300,
+                                          struct r300_query *query)
+{
+    struct r300_winsys_buffer *buf = r300->query_current->buffer;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(14);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
+    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+    OUT_CS_RELOC(buf, (query->num_results + 0) * 4, 0, query->domain, 0);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_1);
+    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+    OUT_CS_RELOC(buf, (query->num_results + 1) * 4, 0, query->domain, 0);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+    END_CS;
+}
+
+/* End the current query, if its begin was emitted, and advance its slot. */
+void r300_emit_query_end(struct r300_context* r300)
+{
+    struct r300_query *query = r300->query_current;
+    struct r300_capabilities *caps = &r300->screen->caps;
+
+    /* Nothing to end without a query whose begin packet went out. */
+    if (!query || !query->begin_emitted)
+        return;
+
+    /* RV530 has its own single-/double-Z paths; others go per frag pipe. */
+    if (caps->family != CHIP_FAMILY_RV530) {
+        r300_emit_query_end_frag_pipes(r300, query);
+    } else if (caps->num_z_pipes == 2) {
+        rv530_emit_query_end_double_z(r300, query);
+    } else {
+        rv530_emit_query_end_single_z(r300, query);
+    }
+
+    query->begin_emitted = FALSE;
+    query->num_results += query->num_pipes;
+
+    /* XXX grab all the results and reset the counter. */
+    if (query->num_results >= query->buffer_size / 4 - 4) {
+        query->num_results = (query->buffer_size / 4) / 2;
+        fprintf(stderr, "r300: Rewinding OQBO...\n");
+    }
+}
+
+void r300_emit_rs_state(struct r300_context* r300, unsigned size, void* state)
+{
+    struct r300_rs_state* rs = state;
+    struct pipe_framebuffer_state* fb = r300->fb_state.state;
+    float scale, offset;
+    unsigned mspos0, mspos1, aa_config;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status);
+
+    /* Multisampling. Depends on framebuffer sample count. */
+    if (r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0)) {
+        if (fb->nr_cbufs && fb->cbufs[0]->texture->nr_samples > 1) {
+            aa_config = R300_GB_AA_CONFIG_AA_ENABLE;
+            /* Subsample placement. These may not be optimal. */
+            switch (fb->cbufs[0]->texture->nr_samples) {
+                case 2:
+                    aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2;
+                    mspos0 = 0x33996633;
+                    mspos1 = 0x6666663;
+                    break;
+                case 3:
+                    aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3;
+                    mspos0 = 0x33936933;
+                    mspos1 = 0x6666663;
+                    break;
+                case 4:
+                    aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4;
+                    mspos0 = 0x33939933;
+                    mspos1 = 0x3966663;
+                    break;
+                case 6:
+                    aa_config |= R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6;
+                    mspos0 = 0x22a2aa22;
+                    mspos1 = 0x2a65672;
+                    break;
+                default:
+                    debug_printf("r300: Bad number of multisamples!\n");
+                    mspos0 = rs->multisample_position_0;
+                    mspos1 = rs->multisample_position_1;
+                    break;
+            }
+
+            OUT_CS_REG_SEQ(R300_GB_MSPOS0, 2);
+            OUT_CS(mspos0);
+            OUT_CS(mspos1);
+
+            OUT_CS_REG(R300_GB_AA_CONFIG, aa_config);
+        } else {
+            OUT_CS_REG_SEQ(R300_GB_MSPOS0, 2);
+            OUT_CS(rs->multisample_position_0);
+            OUT_CS(rs->multisample_position_1);
+
+            OUT_CS_REG(R300_GB_AA_CONFIG, rs->antialiasing_config);
+        }
+    }
+
+    OUT_CS_REG(R300_GA_POINT_SIZE, rs->point_size);
+    OUT_CS_REG_SEQ(R300_GA_POINT_MINMAX, 2);
+    OUT_CS(rs->point_minmax);
+    OUT_CS(rs->line_control);
+
+    if (rs->polygon_offset_enable) {
+        scale = rs->depth_scale * 12;
+        offset = rs->depth_offset;
+
+        switch (r300->zbuffer_bpp) {
+            case 16:
+                offset *= 4;
+                break;
+            case 24:
+                offset *= 2;
+                break;
+        }
+
+        OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+        OUT_CS_32F(scale);
+        OUT_CS_32F(offset);
+        OUT_CS_32F(scale);
+        OUT_CS_32F(offset);
+    }
+
+    OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_ENABLE, 2);
+    OUT_CS(rs->polygon_offset_enable);
+    OUT_CS(rs->cull_mode);
+    OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, rs->line_stipple_config);
+    OUT_CS_REG(R300_GA_LINE_STIPPLE_VALUE, rs->line_stipple_value);
+    OUT_CS_REG(R300_GA_POLY_MODE, rs->polygon_mode);
+    OUT_CS_REG(R300_SC_CLIP_RULE, rs->clip_rule);
+    OUT_CS_REG(R300_GB_ENABLE, rs->stuffing_enable);
+    OUT_CS_REG_SEQ(R300_GA_POINT_S0, 4);
+    OUT_CS_32F(rs->point_texcoord_left);
+    OUT_CS_32F(rs->point_texcoord_bottom);
+    OUT_CS_32F(rs->point_texcoord_right);
+    OUT_CS_32F(rs->point_texcoord_top);
+    END_CS;
+}
+
+/* Emit the rasterizer block: VAP vertex state/format words, the RS IP and
+ * INST tables (R300 or R500 register bank) and the RS count registers. */
+void r300_emit_rs_block_state(struct r300_context* r300,
+                              unsigned size, void* state)
+{
+    struct r300_rs_block* block = (struct r300_rs_block*)state;
+    /* The IP and INST tables always carry the same number of entries. */
+    const unsigned entries = (block->inst_count & R300_RS_INST_COUNT_MASK) + 1;
+    /* Only the first register of each table differs between chip families. */
+    const unsigned ip_base =
+        r300->screen->caps.is_r500 ? R500_RS_IP_0 : R300_RS_IP_0;
+    const unsigned inst_base =
+        r300->screen->caps.is_r500 ? R500_RS_INST_0 : R300_RS_INST_0;
+    unsigned n;
+    CS_LOCALS(r300);
+
+    if (SCREEN_DBG_ON(r300->screen, DBG_DRAW)) {
+        r500_dump_rs_block(block);
+    }
+    DBG(r300, DBG_DRAW, "r300: RS emit:\n");
+
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_VAP_VTX_STATE_CNTL, 2);
+    OUT_CS(block->vap_vtx_state_cntl);
+    OUT_CS(block->vap_vsm_vtx_assm);
+    OUT_CS_REG_SEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+    OUT_CS(block->vap_out_vtx_fmt[0]);
+    OUT_CS(block->vap_out_vtx_fmt[1]);
+
+    /* IP table, followed by a DBG_DRAW dump of its entries. */
+    OUT_CS_REG_SEQ(ip_base, entries);
+    OUT_CS_TABLE(block->ip, entries);
+    for (n = 0; n < entries; n++) {
+        DBG(r300, DBG_DRAW, "    : ip %d: 0x%08x\n", n, block->ip[n]);
+    }
+
+    OUT_CS_REG_SEQ(R300_RS_COUNT, 2);
+    OUT_CS(block->count);
+    OUT_CS(block->inst_count);
+
+    /* INST table, same length as the IP table. */
+    OUT_CS_REG_SEQ(inst_base, entries);
+    OUT_CS_TABLE(block->inst, entries);
+    for (n = 0; n < entries; n++) {
+        DBG(r300, DBG_DRAW, "    : inst %d: 0x%08x\n", n, block->inst[n]);
+    }
+
+    DBG(r300, DBG_DRAW, "    : count: 0x%08x inst_count: 0x%08x\n",
+        block->count, block->inst_count);
+
+    END_CS;
+}
+
+/* Emit clip rectangle 0 from a pipe_scissor_state. Non-R500 chips get every
+ * coordinate biased by 1440; the max corner is emitted minus one. */
+void r300_emit_scissor_state(struct r300_context* r300,
+                             unsigned size, void* state)
+{
+    struct pipe_scissor_state* rect = (struct pipe_scissor_state*)state;
+    /* R500 takes the coordinates unchanged, everything else adds 1440. */
+    const int bias = r300->screen->caps.is_r500 ? 0 : 1440;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_SC_CLIPRECT_TL_0, 2);
+
+    /* First dword: the min corner. */
+    OUT_CS(((rect->minx + bias) << R300_CLIPRECT_X_SHIFT) |
+           ((rect->miny + bias) << R300_CLIPRECT_Y_SHIFT));
+    /* Second dword: the max corner, minus one in both directions. */
+    OUT_CS(((rect->maxx + bias - 1) << R300_CLIPRECT_X_SHIFT) |
+           ((rect->maxy + bias - 1) << R300_CLIPRECT_Y_SHIFT));
+    END_CS;
+}
+
+/* Emit TX_ENABLE, then the filter, border color, format and offset
+ * registers of every texture unit whose bit is set in tx_enable. */
+void r300_emit_textures_state(struct r300_context *r300,
+                              unsigned size, void *state)
+{
+    struct r300_textures_state *textures = (struct r300_textures_state*)state;
+    struct r300_texture_sampler_state *regs;
+    struct r300_texture *tex;
+    unsigned unit;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_TX_ENABLE, textures->tx_enable);
+
+    for (unit = 0; unit < textures->count; unit++) {
+        /* Disabled units emit nothing. */
+        if (!(textures->tx_enable & (1 << unit)))
+            continue;
+        regs = &textures->regs[unit];
+        tex = r300_texture(textures->sampler_views[unit]->base.texture);
+
+        OUT_CS_REG(R300_TX_FILTER0_0 + (unit * 4), regs->filter0);
+        OUT_CS_REG(R300_TX_FILTER1_0 + (unit * 4), regs->filter1);
+        OUT_CS_REG(R300_TX_BORDER_COLOR_0 + (unit * 4), regs->border_color);
+        OUT_CS_REG(R300_TX_FORMAT0_0 + (unit * 4), regs->format.format0);
+        OUT_CS_REG(R300_TX_FORMAT1_0 + (unit * 4), regs->format.format1);
+        OUT_CS_REG(R300_TX_FORMAT2_0 + (unit * 4), regs->format.format2);
+        /* The offset register is filled in through a texture relocation. */
+        OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (unit * 4), 1);
+        OUT_CS_TEX_RELOC(tex, regs->format.tile_config, tex->domain, 0, 0);
+    }
+    END_CS;
+}
+
+/* Emit 3D_LOAD_VBPNTR for all vertex elements: paired entries, an odd tail,
+ * then one relocation each. "offset" is a vertex index scaled by stride. */
+void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed)
+{
+    struct pipe_vertex_buffer *vb1, *vb2, *vbuf = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->velems->velem;
+    struct r300_buffer *buf;
+    unsigned *hw_format_size = r300->velems->hw_format_size;
+    unsigned i, size1, size2, aos_count = r300->velems->count;
+    unsigned packet_size = (aos_count * 3 + 1) / 2;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(2 + packet_size + aos_count * 2);
+    OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, packet_size);
+    OUT_CS(aos_count | (!indexed ? R300_VC_FORCE_PREFETCH : 0));
+
+    /* "i + 1 < aos_count" cannot wrap the way "aos_count - 1" does at 0. */
+    for (i = 0; i + 1 < aos_count; i += 2) {
+        vb1 = &vbuf[velem[i].vertex_buffer_index];
+        vb2 = &vbuf[velem[i+1].vertex_buffer_index];
+        size1 = hw_format_size[i];
+        size2 = hw_format_size[i+1];
+        OUT_CS(R300_VBPNTR_SIZE0(size1) | R300_VBPNTR_STRIDE0(vb1->stride) |
+               R300_VBPNTR_SIZE1(size2) | R300_VBPNTR_STRIDE1(vb2->stride));
+        OUT_CS(vb1->buffer_offset + velem[i].src_offset   + offset * vb1->stride);
+        OUT_CS(vb2->buffer_offset + velem[i+1].src_offset + offset * vb2->stride);
+    }
+
+    if (aos_count & 1) {
+        vb1 = &vbuf[velem[i].vertex_buffer_index];
+        size1 = hw_format_size[i];
+        OUT_CS(R300_VBPNTR_SIZE0(size1) | R300_VBPNTR_STRIDE0(vb1->stride));
+        OUT_CS(vb1->buffer_offset + velem[i].src_offset + offset * vb1->stride);
+    }
+
+    for (i = 0; i < aos_count; i++) {
+        buf = r300_buffer(vbuf[velem[i].vertex_buffer_index].buffer);
+        OUT_CS_BUF_RELOC_NO_OFFSET(&buf->b.b, buf->domain, 0, 0);
+    }
+    END_CS;
+}
+
+/* Emit a one-array 3D_LOAD_VBPNTR packet for the SWTCL buffer, r300->vbo. */
+void r300_emit_aos_swtcl(struct r300_context *r300, boolean indexed)
+{
+    CS_LOCALS(r300);
+
+    DBG(r300, DBG_DRAW,
+        "r300: Preparing vertex buffer %p for render, vertex size %d\n",
+        r300->vbo, r300->vertex_info.size);
+
+    /* Contents of the packet:
+     *   PACKET3 [3D_LOAD_VBPNTR]
+     *   COUNT   [1, with FORCE_PREFETCH unless the draw is indexed]
+     *   FORMAT  [vertex size | vertex size << 8]
+     *   OFFSET  [r300->vbo_offset], then the relocation of r300->vbo
+     */
+    BEGIN_CS(7);
+    OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, 3);
+    OUT_CS(1 | (indexed ? 0 : R300_VC_FORCE_PREFETCH));
+    OUT_CS(r300->vertex_info.size | (r300->vertex_info.size << 8));
+    OUT_CS(r300->vbo_offset);
+    OUT_CS_BUF_RELOC(r300->vbo, 0, r300_buffer(r300->vbo)->domain, 0, 0);
+    END_CS;
+}
+
+/* Emit the VAP_PROG_STREAM_CNTL and _EXT tables (streams->count dwords
+ * each) and log every entry under DBG_DRAW. */
+void r300_emit_vertex_stream_state(struct r300_context* r300,
+                                   unsigned size, void* state)
+{
+    struct r300_vertex_stream_state *psc = state;
+    unsigned k;
+    CS_LOCALS(r300);
+
+    DBG(r300, DBG_DRAW, "r300: PSC emit:\n");
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_0, psc->count);
+    OUT_CS_TABLE(psc->vap_prog_stream_cntl, psc->count);
+    for (k = 0; k < psc->count; k++) {
+        DBG(r300, DBG_DRAW, "    : prog_stream_cntl%d: 0x%08x\n", k,
+               psc->vap_prog_stream_cntl[k]);
+    }
+    OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_EXT_0, psc->count);
+    OUT_CS_TABLE(psc->vap_prog_stream_cntl_ext, psc->count);
+    for (k = 0; k < psc->count; k++) {
+        DBG(r300, DBG_DRAW, "    : prog_stream_cntl_ext%d: 0x%08x\n", k,
+               psc->vap_prog_stream_cntl_ext[k]);
+    }
+    END_CS;
+}
+
+/* Write 0 to VAP_PVS_STATE_FLUSH_REG; "state" is not used. */
+void r300_emit_pvs_flush(struct r300_context* r300, unsigned size, void* state)
+{
+    CS_LOCALS(r300);
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+    END_CS;
+}
+
+/* Emit the vertex shader: PVS control words, the program body, VAP_CNTL
+ * sized from the shader's resource usage, and finally its immediates. */
+void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
+{
+    struct r300_vertex_shader* vs = (struct r300_vertex_shader*)state;
+    struct r300_vertex_program_code* code = &vs->code;
+    struct r300_screen* r300screen = r300->screen;
+    /* The body holds four dwords per instruction. */
+    unsigned last_inst = code->length / 4 - 1;
+    unsigned k;
+
+    /* Slot and controller counts are limited by the vertex memory size. */
+    unsigned vtx_mem_size = r300screen->caps.is_r500 ? 128 : 72;
+    unsigned inputs = MAX2(util_bitcount(code->InputsRead), 1);
+    unsigned outputs = MAX2(util_bitcount(code->OutputsWritten), 1);
+    unsigned temps = MAX2(code->num_temporaries, 1);
+    unsigned slots = MIN3(vtx_mem_size / inputs, vtx_mem_size / outputs, 10);
+    unsigned controllers = MIN2(vtx_mem_size / temps, 6);
+
+    /* Immediates occupy constants [externals_count, constants.Count). */
+    unsigned imm_begin = vs->externals_count;
+    unsigned imm_stop = code->constants.Count;
+    unsigned imm_total = vs->immediates_count;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+    /* R300_VAP_PVS_CODE_CNTL_0, R300_VAP_PVS_CONST_CNTL and
+     * R300_VAP_PVS_CODE_CNTL_1; the r5xx docs explain how to use these. */
+    OUT_CS_REG_SEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+    OUT_CS(R300_PVS_FIRST_INST(0) |
+            R300_PVS_XYZW_VALID_INST(last_inst) |
+            R300_PVS_LAST_INST(last_inst));
+    OUT_CS(R300_PVS_MAX_CONST_ADDR(code->constants.Count - 1));
+    OUT_CS(last_inst);
+
+    OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0);
+    OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->length);
+    OUT_CS_TABLE(code->body.d, code->length);
+
+    OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(slots) |
+            R300_PVS_NUM_CNTLRS(controllers) |
+            R300_PVS_NUM_FPUS(r300screen->caps.num_vert_fpus) |
+            R300_PVS_VF_MAX_VTX_NUM(12) |
+            (r300screen->caps.is_r500 ? R500_TCL_STATE_OPTIMIZATION : 0));
+
+    /* Emit immediates. */
+    if (imm_total) {
+        OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
+                   (r300screen->caps.is_r500 ?
+                   R500_PVS_CONST_START : R300_PVS_CONST_START) +
+                   imm_begin);
+        OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, imm_total * 4);
+        for (k = imm_begin; k < imm_stop; k++) {
+            const float *imm = code->constants.Constants[k].u.Immediate;
+            OUT_CS_TABLE(imm, 4);
+        }
+    }
+    END_CS;
+}
+
+/* Upload the externals_count * 4 external constant dwords of the bound
+ * vertex shader from the constant buffer; emits nothing when there are none. */
+void r300_emit_vs_constants(struct r300_context* r300,
+                            unsigned size, void *state)
+{
+    struct r300_vertex_shader *vs = r300->vs_state.state;
+    struct r300_constant_buffer *cbuf = (struct r300_constant_buffer*)state;
+    unsigned externals = vs->externals_count;
+    CS_LOCALS(r300);
+
+    if (externals == 0)
+        return;
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, (r300->screen->caps.is_r500 ?
+               R500_PVS_CONST_START : R300_PVS_CONST_START));
+    OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, externals * 4);
+    OUT_CS_TABLE(cbuf->constants, externals * 4);
+    END_CS;
+}
+
+/* Emit the six viewport dwords starting at xscale, then VAP_VTE_CNTL. */
+void r300_emit_viewport_state(struct r300_context* r300,
+                              unsigned size, void* state)
+{
+    struct r300_viewport_state* vp = state;
+    CS_LOCALS(r300);
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
+    OUT_CS_TABLE(&vp->xscale, 6);
+    OUT_CS_REG(R300_VAP_VTE_CNTL, vp->vte_control);
+    END_CS;
+}
+
+/* Emit the ZB_ZTOP register from the ztop state atom. */
+void r300_emit_ztop_state(struct r300_context* r300,
+                          unsigned size, void* state)
+{
+    struct r300_ztop_state* zs = state;
+    CS_LOCALS(r300);
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_ZB_ZTOP, zs->z_buffer_top);
+    END_CS;
+}
+
+/* Write 0 to TX_INVALTAGS; "state" is not used. */
+void r300_emit_texture_cache_inval(struct r300_context* r300, unsigned size, void* state)
+{
+    CS_LOCALS(r300);
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_TX_INVALTAGS, 0);
+    END_CS;
+}
+
+/* Register every buffer the next draw uses with the winsys and validate the
+ * resulting list. Whenever an add fails, or validation fails for the first
+ * time, the context is flushed and the walk restarts at "validate". */
+void r300_emit_buffer_validate(struct r300_context *r300,
+                               boolean do_validate_vertex_buffers,
+                               struct pipe_resource *index_buffer)
+{
+    struct pipe_framebuffer_state* fb =
+        (struct pipe_framebuffer_state*)r300->fb_state.state;
+    struct r300_textures_state *texstate =
+        (struct r300_textures_state*)r300->textures_state.state;
+    struct r300_texture* tex;
+    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->velems->velem;
+    struct pipe_resource *pbuf;
+    unsigned i;
+    boolean invalid = FALSE;
+
+    /* Upload user vertex buffers first (HWTCL only) and clear the flag. */
+    if (r300->screen->caps.has_tcl && r300->any_user_vbs) {
+        r300_upload_user_buffers(r300);
+        r300->any_user_vbs = false;
+    }
+    /* Clean out BOs. */
+    r300->rws->reset_bos(r300->rws);
+validate:
+    /* Color buffers... */
+    for (i = 0; i < fb->nr_cbufs; i++) {
+        tex = r300_texture(fb->cbufs[i]->texture);
+        assert(tex && tex->buffer && "cbuf is marked, but NULL!");
+        if (!r300_add_texture(r300->rws, tex, 0, tex->domain)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+    /* ...depth buffer... */
+    if (fb->zsbuf) {
+        tex = r300_texture(fb->zsbuf->texture);
+        assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
+        if (!r300_add_texture(r300->rws, tex,
+			      0, tex->domain)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+    /* ...textures... */
+    for (i = 0; i < texstate->count; i++) {
+        if (!(texstate->tx_enable & (1 << i))) {
+            continue;
+        }
+        tex = r300_texture(texstate->sampler_views[i]->base.texture);
+        if (!r300_add_texture(r300->rws, tex, tex->domain, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+    /* ...occlusion query buffer... */
+    if (r300->query_current) {
+        if (!r300->rws->add_buffer(r300->rws, r300->query_current->buffer,
+                                   0, r300->query_current->domain)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+    /* ...vertex buffer for SWTCL path... */
+    if (r300->vbo) {
+        if (!r300_add_buffer(r300->rws, r300->vbo,
+			     r300_buffer(r300->vbo)->domain, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+    /* ...vertex buffers for HWTCL path... */
+    if (do_validate_vertex_buffers) {
+        for (i = 0; i < r300->velems->count; i++) {
+            pbuf = vbuf[velem[i].vertex_buffer_index].buffer;
+            if (!r300_add_buffer(r300->rws, pbuf,
+				 r300_buffer(pbuf)->domain, 0)) {
+		r300->context.flush(&r300->context, 0, NULL);
+                goto validate;
+            }
+        }
+    }
+    /* ...and index buffer for HWTCL path. */
+    if (index_buffer) {
+        if (!r300_add_buffer(r300->rws, index_buffer,
+			     r300_buffer(index_buffer)->domain, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+    /* A failed validate() is retried once after a flush; a repeat aborts. */
+    if (!r300->rws->validate(r300->rws)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        if (invalid) {
+            /* Well, hell. */
+            fprintf(stderr, "r300: Stuck in validation loop, gonna quit now.\n");
+            abort();
+        }
+        invalid = TRUE;
+        goto validate;
+    }
+}
+
+/* Return the number of dwords needed to emit all dirty atoms: the sum of
+ * their sizes plus a fixed reserve of 32. */
+unsigned r300_get_num_dirty_dwords(struct r300_context *r300)
+{
+    struct r300_atom* atom;
+    /* Start at 32: let's reserve some more, just in case. */
+    unsigned total = 32;
+
+    foreach(atom, &r300->atom_list) {
+        if (!atom->dirty)
+            continue;
+        total += atom->size;
+    }
+
+    return total;
+}
+
+/* Emit every dirty atom through its emit callback and mark it clean;
+ * the per-atom counter is only bumped when DBG_STATS is enabled. */
+void r300_emit_dirty_state(struct r300_context* r300)
+{
+    struct r300_atom* atom;
+
+    foreach(atom, &r300->atom_list) {
+        if (!atom->dirty)
+            continue;
+        atom->emit(r300, atom->size, atom->state);
+        if (SCREEN_DBG_ON(r300->screen, DBG_STATS))
+            atom->counter++;
+        atom->dirty = FALSE;
+    }
+
+    r300->dirty_hw++;
+}
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
new file mode 100644
index 0000000000..36a29894d0
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_EMIT_H
+#define R300_EMIT_H
+
+#include "r300_context.h"
+#include "radeon_code.h"
+
+struct rX00_fragment_program_code;
+struct r300_vertex_program_code;
+
+uint32_t pack_float24(float f);
+/* Emit the vertex array pointers (3D_LOAD_VBPNTR) for all vertex elements. */
+void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed);
+/* (r300, size, state) emitters match the atom->emit() callback signature. */
+void r300_emit_blend_state(struct r300_context* r300,
+                           unsigned size, void* state);
+
+void r300_emit_blend_color_state(struct r300_context* r300,
+                                 unsigned size, void* state);
+
+void r300_emit_clip_state(struct r300_context* r300,
+                          unsigned size, void* state);
+
+void r300_emit_dsa_state(struct r300_context* r300,
+                         unsigned size, void* state);
+
+void r300_emit_fs(struct r300_context* r300, unsigned size, void *state);
+
+void r300_emit_fs_constants(struct r300_context* r300, unsigned size, void *state);
+
+void r300_emit_fs_rc_constant_state(struct r300_context* r300, unsigned size, void *state);
+
+void r500_emit_fs(struct r300_context* r300, unsigned size, void *state);
+
+void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *state);
+
+void r500_emit_fs_rc_constant_state(struct r300_context* r300, unsigned size, void *state);
+
+void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state);
+
+void r300_emit_query_start(struct r300_context *r300, unsigned size, void *state);
+
+void r300_emit_query_end(struct r300_context* r300);
+
+void r300_emit_rs_state(struct r300_context* r300, unsigned size, void* state);
+
+void r300_emit_rs_block_state(struct r300_context* r300,
+                              unsigned size, void* state);
+
+void r300_emit_scissor_state(struct r300_context* r300,
+                             unsigned size, void* state);
+
+void r300_emit_textures_state(struct r300_context *r300,
+                              unsigned size, void *state);
+/* Emit the vertex array pointer for the SWTCL vertex buffer (r300->vbo). */
+void r300_emit_aos_swtcl(struct r300_context *r300, boolean indexed);
+
+void r300_emit_vertex_stream_state(struct r300_context* r300,
+                                   unsigned size, void* state);
+
+void r300_emit_vs_constants(struct r300_context* r300,
+                            unsigned size, void *state);
+
+void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state);
+
+void r300_emit_viewport_state(struct r300_context* r300,
+                              unsigned size, void* state);
+
+void r300_emit_ztop_state(struct r300_context* r300,
+                          unsigned size, void* state);
+
+void r300_emit_pvs_flush(struct r300_context* r300, unsigned size, void* state);
+
+void r300_emit_texture_cache_inval(struct r300_context* r300, unsigned size, void* state);
+/* Sum of the sizes of all dirty atoms plus a reserve of 32 dwords. */
+unsigned r300_get_num_dirty_dwords(struct r300_context *r300);
+
+/* Emit all dirty state atoms and mark them clean. */
+void r300_emit_dirty_state(struct r300_context* r300);
+/* Add all buffers used by the next draw to the winsys and validate them. */
+void r300_emit_buffer_validate(struct r300_context *r300,
+                               boolean do_validate_vertex_buffers,
+                               struct pipe_resource *index_buffer);
+
+#endif /* R300_EMIT_H */
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
new file mode 100644
index 0000000000..ba840bfff8
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+#include "util/u_simple_list.h"
+
+#include "r300_context.h"
+#include "r300_cs.h"
+#include "r300_emit.h"
+
+/* Flush hook: flush Draw if needed, submit pending commands, re-dirty all
+ * state atoms, mark queries as flushed and optionally create a fence. */
+static void r300_flush(struct pipe_context* pipe,
+                       unsigned flags,
+                       struct pipe_fence_handle** fence)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_query *query;
+    struct r300_atom *atom;
+    struct r300_fence **rfence = (struct r300_fence**)fence;
+
+    /* We probably need to flush Draw, but we may have been called from
+     * within Draw. This feels kludgy, but it might be the best thing.
+     *
+     * Of course, the best thing is to kill Draw with fire. :3 */
+    if (r300->draw && !r300->draw->flushing) {
+        draw_flush(r300->draw);
+    }
+
+    if (r300->dirty_hw) {
+        r300_emit_query_end(r300);
+        if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) {
+            r300->flush_counter++;
+        }
+        r300->rws->flush_cs(r300->rws);
+        r300->dirty_hw = 0;
+
+        /* New kitchen sink, baby. */
+        foreach(atom, &r300->atom_list) {
+            if (atom->state || atom->allow_null_state) {
+                atom->dirty = TRUE;
+            }
+        }
+
+        /* Unmark HWTCL state for SWTCL. */
+        if (!r300->screen->caps.has_tcl) {
+            r300->vs_state.dirty = FALSE;
+            r300->vs_constants.dirty = FALSE;
+        }
+    }
+
+    /* reset flushed query */
+    foreach(query, &r300->query_list) {
+        query->flushed = TRUE;
+    }
+
+    /* Create a new fence; *fence stays NULL if the allocation fails. */
+    if (rfence && (*rfence = CALLOC_STRUCT(r300_fence)) != NULL) {
+        pipe_reference_init(&(*rfence)->reference, 1);
+        (*rfence)->ctx = r300;
+    }
+}
+
+void r300_init_flush_functions(struct r300_context* r300)
+{
+    r300->context.flush = r300_flush; /* install the context's flush hook */
+}
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
new file mode 100644
index 0000000000..e585394304
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *                Joakim Sindholt <opensource@zhasha.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "r300_cb.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_screen.h"
+#include "r300_fs.h"
+#include "r300_reg.h"
+#include "r300_tgsi_to_rc.h"
+
+#include "radeon_code.h"
+#include "radeon_compiler.h"
+
+/* Record, for each input semantic the fragment shader reads, the index of
+ * the TGSI input that carries it; everything else keeps its reset value. */
+void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
+                                struct r300_shader_semantics* fs_inputs)
+{
+    unsigned index;
+    int input;
+
+    r300_shader_semantics_reset(fs_inputs);
+
+    for (input = 0; input < info->num_inputs; input++) {
+        index = info->input_semantic_index[input];
+
+        switch (info->input_semantic_name[input]) {
+            case TGSI_SEMANTIC_COLOR:
+                assert(index < ATTR_COLOR_COUNT);
+                fs_inputs->color[index] = input;
+                break;
+            case TGSI_SEMANTIC_GENERIC:
+                assert(index < ATTR_GENERIC_COUNT);
+                fs_inputs->generic[index] = input;
+                break;
+            /* Fog and position exist only once, so their index must be 0. */
+            case TGSI_SEMANTIC_FOG:
+                assert(index == 0);
+                fs_inputs->fog = input;
+                break;
+            case TGSI_SEMANTIC_POSITION:
+                assert(index == 0);
+                fs_inputs->wpos = input;
+                break;
+            default:
+                /* Unknown semantics are reported and otherwise ignored. */
+                fprintf(stderr, "r300: FP: Unknown input semantic: %i\n",
+                        info->input_semantic_name[input]);
+                break;
+        }
+    }
+}
+
+/* Locate the color and depth outputs; num_outputs means "not present". */
+static void find_output_registers(struct r300_fragment_program_compiler * compiler,
+                                  struct r300_fragment_shader_code *shader)
+{
+    const unsigned max_colors =
+        sizeof(compiler->OutputColor) / sizeof(compiler->OutputColor[0]);
+    unsigned i, colorbuf_count = 0;
+
+    /* Mark the outputs as not present initially */
+    for (i = 0; i < max_colors; i++)
+        compiler->OutputColor[i] = shader->info.num_outputs;
+    compiler->OutputDepth = shader->info.num_outputs;
+    /* Now see where they really are. */
+    for(i = 0; i < shader->info.num_outputs; ++i) {
+        switch(shader->info.output_semantic_name[i]) {
+            case TGSI_SEMANTIC_COLOR:
+                if (colorbuf_count < max_colors) /* stay inside the array */
+                    compiler->OutputColor[colorbuf_count++] = i;
+                break;
+            case TGSI_SEMANTIC_POSITION:
+                compiler->OutputDepth = i;
+                break;
+        }
+    }
+}
+
+/* Assign consecutive hardware registers to the inputs that are in use, in
+ * the order colors, generics, fog, wpos, via the "allocate" callback. */
+static void allocate_hardware_inputs(
+    struct r300_fragment_program_compiler * c,
+    void (*allocate)(void * data, unsigned input, unsigned hwreg),
+    void * mydata)
+{
+    struct r300_shader_semantics* sem =
+        (struct r300_shader_semantics*)c->UserData;
+    int n, next_reg = 0;
+
+    /* Array-valued semantics first. */
+    for (n = 0; n < ATTR_COLOR_COUNT; n++) {
+        if (sem->color[n] != ATTR_UNUSED)
+            allocate(mydata, sem->color[n], next_reg++);
+    }
+    for (n = 0; n < ATTR_GENERIC_COUNT; n++) {
+        if (sem->generic[n] != ATTR_UNUSED)
+            allocate(mydata, sem->generic[n], next_reg++);
+    }
+
+    /* Fog and wpos are single values and come last. */
+    if (sem->fog != ATTR_UNUSED)
+        allocate(mydata, sem->fog, next_reg++);
+    if (sem->wpos != ATTR_UNUSED)
+        allocate(mydata, sem->wpos, next_reg++);
+}
+
+/* Fill in the compiler's per-unit external state from the bound sampler
+ * states and views: shadow compare setup, non-normalized coordinates and
+ * wrap-mode info for uses_pitch textures. NOTE(review): fields are only
+ * ever set here, so *state is presumably zeroed by the caller - confirm. */
+static void get_external_state(
+    struct r300_context* r300,
+    struct r300_fragment_program_external_state* state)
+{
+    struct r300_textures_state *texstate = r300->textures_state.state;
+    unsigned i;
+    unsigned char *swizzle;
+
+    for (i = 0; i < texstate->sampler_state_count; i++) {
+        struct r300_sampler_state* s = texstate->sampler_states[i];
+        /* Units without a sampler state are skipped entirely. */
+        if (!s) {
+            continue;
+        }
+        /* Shadow compare: enable it, pass on swizzle and compare function. */
+        if (s->state.compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+            state->unit[i].compare_mode_enabled = 1;
+            /* Pass depth texture swizzling to the compiler. */
+            if (texstate->sampler_views[i]) {
+                swizzle = texstate->sampler_views[i]->swizzle;
+
+                state->unit[i].depth_texture_swizzle =
+                    RC_MAKE_SWIZZLE(swizzle[0], swizzle[1],
+                                    swizzle[2], swizzle[3]);
+            } else {
+                state->unit[i].depth_texture_swizzle = RC_SWIZZLE_XYZW;
+            }
+
+            /* Fortunately, no need to translate this. */
+            state->unit[i].texture_compare_func = s->state.compare_func;
+        }
+
+        state->unit[i].non_normalized_coords = !s->state.normalized_coords;
+        /* For textures with uses_pitch set, tell the compiler the S wrap
+         * mode; every mode except the default case also sets fake_npot. */
+        if (texstate->sampler_views[i]) {
+            struct r300_texture *t;
+            t = (struct r300_texture*)texstate->sampler_views[i]->base.texture;
+            /* XXX this should probably take into account STR, not just S. */
+            if (t->uses_pitch) {
+                switch (s->state.wrap_s) {
+                    case PIPE_TEX_WRAP_REPEAT:
+                        state->unit[i].wrap_mode = RC_WRAP_REPEAT;
+                        state->unit[i].fake_npot = TRUE;
+                        break;
+                    case PIPE_TEX_WRAP_MIRROR_REPEAT:
+                        state->unit[i].wrap_mode = RC_WRAP_MIRRORED_REPEAT;
+                        state->unit[i].fake_npot = TRUE;
+                        break;
+                    case PIPE_TEX_WRAP_MIRROR_CLAMP:
+                    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+                    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+                        state->unit[i].wrap_mode = RC_WRAP_MIRRORED_CLAMP;
+                        state->unit[i].fake_npot = TRUE;
+                        break;
+                    default:
+                        state->unit[i].wrap_mode = RC_WRAP_NONE;
+                        break;
+                }
+            }
+        }
+    }
+}
+
+static void r300_translate_fragment_shader(
+    struct r300_context* r300,
+    struct r300_fragment_shader_code* shader,
+    const struct tgsi_token *tokens);
+
+/* Dummy shader: build a one-instruction TGSI program with ureg, flag the
+ * shader as a dummy and push the program through the regular translation
+ * path. */
+static void r300_dummy_fragment_shader(
+    struct r300_context* r300,
+    struct r300_fragment_shader_code* shader)
+{
+    /* Make a simple fragment shader which outputs (0, 0, 0, 1) */
+    struct ureg_program *program = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+    struct ureg_dst color = ureg_DECL_output(program, TGSI_SEMANTIC_COLOR, 0);
+    struct ureg_src constant = ureg_imm4f(program, 0, 0, 0, 1);
+    const struct tgsi_token *tokens;
+
+    ureg_MOV(program, color, constant);
+    ureg_END(program);
+    tokens = ureg_finalize(program);
+
+    shader->dummy = TRUE;
+    r300_translate_fragment_shader(r300, shader, tokens);
+
+    /* The program is destroyed only after translation has used the tokens
+     * obtained from ureg_finalize(). */
+    ureg_destroy(program);
+}
+
+/**
+ * Build the command buffer (shader->cb_code / shader->cb_code_size) that
+ * programs the compiled fragment shader held in shader->code.
+ *
+ * Two layouts exist, selected by r300->screen->caps.is_r500. Both start with
+ * the chip-specific program setup registers and instructions, followed by
+ * the shader's immediate constants, and both end with the depth-source and
+ * W-format registers taken from shader->fg_depth_src / shader->us_out_w.
+ *
+ * \param r300    context; only the screen capabilities are read here
+ * \param shader  compiled shader whose command buffer is (re)built
+ */
+static void r300_emit_fs_code_to_buffer(
+    struct r300_context *r300,
+    struct r300_fragment_shader_code *shader)
+{
+    struct rX00_fragment_program_code *generic_code = &shader->code;
+    /* The immediate loops below scan constant slots [imm_first, imm_end)
+     * and emit only the entries typed RC_CONSTANT_IMMEDIATE; imm_count is
+     * used to size the buffer and to skip the scan when it is zero. */
+    unsigned imm_count = shader->immediates_count;
+    unsigned imm_first = shader->externals_count;
+    unsigned imm_end = generic_code->constants.Count;
+    struct rc_constant *constants = generic_code->constants.Constants;
+    unsigned i;
+    CB_LOCALS;
+
+    if (r300->screen->caps.is_r500) {
+        struct r500_fragment_program_code *code = &generic_code->code.r500;
+
+        /* Size budget: 17 fixed + 6 per instruction + 7 per immediate.
+         * NOTE(review): this has to equal what the OUT_CB* calls below
+         * emit. The macros are defined elsewhere; the constants presumably
+         * assume 2 units per OUT_CB_REG and 1 per OUT_CB_ONE_REG header --
+         * confirm before changing any emit below. */
+        shader->cb_code_size = 17 +
+                               ((code->inst_end + 1) * 6) +
+                               imm_count * 7;
+
+        NEW_CB(shader->cb_code, shader->cb_code_size);
+        OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+        OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx);
+        OUT_CB_REG(R500_US_CODE_RANGE,
+                   R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
+        OUT_CB_REG(R500_US_CODE_OFFSET, 0);
+        OUT_CB_REG(R500_US_CODE_ADDR,
+                   R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(code->inst_end));
+
+        /* Instructions 0..inst_end inclusive, six words each, written
+         * through the vector index/data register pair. */
+        OUT_CB_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_INSTR);
+        OUT_CB_ONE_REG(R500_GA_US_VECTOR_DATA, (code->inst_end + 1) * 6);
+        for (i = 0; i <= code->inst_end; i++) {
+            OUT_CB(code->inst[i].inst0);
+            OUT_CB(code->inst[i].inst1);
+            OUT_CB(code->inst[i].inst2);
+            OUT_CB(code->inst[i].inst3);
+            OUT_CB(code->inst[i].inst4);
+            OUT_CB(code->inst[i].inst5);
+        }
+
+        /* Emit immediates. */
+        if (imm_count) {
+            for(i = imm_first; i < imm_end; ++i) {
+                if (constants[i].Type == RC_CONSTANT_IMMEDIATE) {
+                    const float *data = constants[i].u.Immediate;
+
+                    /* Each immediate is addressed by its own constant
+                     * index i and uploaded as four values. */
+                    OUT_CB_REG(R500_GA_US_VECTOR_INDEX,
+                               R500_GA_US_VECTOR_INDEX_TYPE_CONST |
+                               (i & R500_GA_US_VECTOR_INDEX_MASK));
+                    OUT_CB_ONE_REG(R500_GA_US_VECTOR_DATA, 4);
+                    OUT_CB_TABLE(data, 4);
+                }
+            }
+        }
+    } else { /* r300 */
+        struct r300_fragment_program_code *code = &generic_code->code.r300;
+
+        /* Size budget: 19 fixed + 4 per ALU instruction + (1 + n) for the
+         * n texture instructions when there are any + 5 per immediate.
+         * NOTE(review): presumably 2 units per OUT_CB_REG and 1 per
+         * OUT_CB_REG_SEQ header -- confirm against the macro definitions. */
+        shader->cb_code_size = 19 +
+                               code->alu.length * 4 +
+                               (code->tex.length ? (1 + code->tex.length) : 0) +
+                               imm_count * 5;
+
+        NEW_CB(shader->cb_code, shader->cb_code_size);
+        OUT_CB_REG(R300_US_CONFIG, code->config);
+        OUT_CB_REG(R300_US_PIXSIZE, code->pixsize);
+        OUT_CB_REG(R300_US_CODE_OFFSET, code->code_offset);
+
+        OUT_CB_REG_SEQ(R300_US_CODE_ADDR_0, 4);
+        OUT_CB_TABLE(code->code_addr, 4);
+
+        /* The four ALU instruction fields are written as four separate
+         * register sequences, each alu.length entries long. */
+        OUT_CB_REG_SEQ(R300_US_ALU_RGB_INST_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].rgb_inst);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].rgb_addr);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].alpha_inst);
+
+        OUT_CB_REG_SEQ(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+        for (i = 0; i < code->alu.length; i++)
+            OUT_CB(code->alu.inst[i].alpha_addr);
+
+        /* The texture instruction block is omitted entirely when empty,
+         * matching the conditional term in the size computation above. */
+        if (code->tex.length) {
+            OUT_CB_REG_SEQ(R300_US_TEX_INST_0, code->tex.length);
+            OUT_CB_TABLE(code->tex.inst, code->tex.length);
+        }
+
+        /* Emit immediates. */
+        if (imm_count) {
+            for(i = imm_first; i < imm_end; ++i) {
+                if (constants[i].Type == RC_CONSTANT_IMMEDIATE) {
+                    const float *data = constants[i].u.Immediate;
+
+                    /* Constant slot i sits 16 bytes per slot past
+                     * R300_PFS_PARAM_0_X; each component goes through
+                     * pack_float24() before being written. */
+                    OUT_CB_REG_SEQ(R300_PFS_PARAM_0_X + i * 16, 4);
+                    OUT_CB(pack_float24(data[0]));
+                    OUT_CB(pack_float24(data[1]));
+                    OUT_CB(pack_float24(data[2]));
+                    OUT_CB(pack_float24(data[3]));
+                }
+            }
+        }
+    }
+
+    /* Common tail for both chip classes. */
+    OUT_CB_REG(R300_FG_DEPTH_SRC, shader->fg_depth_src);
+    OUT_CB_REG(R300_US_W_FMT, shader->us_out_w);
+    END_CB;
+}
+
+/**
+ * Compile a TGSI fragment program into hardware code for \p shader and
+ * build its command buffer.
+ *
+ * Steps: scan the tokens, set up the compiler from the context and from
+ * shader->compare_state, convert TGSI to the compiler's representation,
+ * apply the WPOS transformation when the WPOS input is used, compile, count
+ * the constants by type, choose the depth-output registers and finally call
+ * r300_emit_fs_code_to_buffer().
+ *
+ * If the result has no instructions, or the compiler reports an error, the
+ * shader is replaced through r300_dummy_fragment_shader(), which re-enters
+ * this function. A compiler error while shader->dummy is already set
+ * aborts the process.
+ *
+ * \param r300    context supplying debug flags and chip capabilities
+ * \param shader  variant to fill in; its compare_state must already be set
+ * \param tokens  TGSI token stream to compile
+ */
+static void r300_translate_fragment_shader(
+    struct r300_context* r300,
+    struct r300_fragment_shader_code* shader,
+    const struct tgsi_token *tokens)
+{
+    struct r300_fragment_program_compiler compiler;
+    struct tgsi_to_rc ttr;
+    int wpos;
+    unsigned i;
+
+    tgsi_scan_shader(tokens, &shader->info);
+    r300_shader_read_fs_inputs(&shader->info, &shader->inputs);
+
+    wpos = shader->inputs.wpos;
+
+    /* Setup the compiler. */
+    memset(&compiler, 0, sizeof(compiler));
+    rc_init(&compiler.Base);
+    compiler.Base.Debug = DBG_ON(r300, DBG_FP);
+
+    compiler.code = &shader->code;
+    compiler.state = shader->compare_state;
+    compiler.Base.is_r500 = r300->screen->caps.is_r500;
+    compiler.Base.max_temp_regs = compiler.Base.is_r500 ? 128 : 32;
+    compiler.AllocateHwInputs = &allocate_hardware_inputs;
+    compiler.UserData = &shader->inputs;
+
+    find_output_registers(&compiler, shader);
+
+    if (compiler.Base.Debug) {
+        debug_printf("r300: Initial fragment program\n");
+        tgsi_dump(tokens, 0);
+    }
+
+    /* Translate TGSI to our internal representation */
+    /* NOTE(review): ttr is not zero-initialised; only the three fields
+     * below are set here. ttr.immediate_offset is read further down and is
+     * presumably filled in by r300_tgsi_to_rc() -- confirm. */
+    ttr.compiler = &compiler.Base;
+    ttr.info = &shader->info;
+    ttr.use_half_swizzles = TRUE;
+
+    r300_tgsi_to_rc(&ttr, tokens);
+
+    /**
+     * Transform the program to support WPOS.
+     *
+     * Introduce a small fragment at the start of the program that will be
+     * the only code that directly reads the WPOS input.
+     * All other code pieces that reference that input will be rewritten
+     * to read from a newly allocated temporary. */
+    if (wpos != ATTR_UNUSED) {
+        /* Moving the input to some other reg is not really necessary. */
+        rc_transform_fragment_wpos(&compiler.Base, wpos, wpos, TRUE);
+    }
+
+    /* Invoke the compiler */
+    r3xx_compile_fragment_program(&compiler);
+
+    /* Shaders with zero instructions are invalid,
+     * use the dummy shader instead. */
+    /* NOTE(review): this reads code.r500.inst_end regardless of
+     * caps.is_r500, and runs before the compiler error check below --
+     * confirm both are intended. */
+    if (shader->code.code.r500.inst_end == -1) {
+        rc_destroy(&compiler.Base);
+        r300_dummy_fragment_shader(r300, shader);
+        return;
+    }
+
+    if (compiler.Base.Error) {
+        fprintf(stderr, "r300 FP: Compiler Error:\n%sUsing a dummy shader"
+                " instead.\nIf there's an 'unknown opcode' message, please"
+                " file a bug report and attach this log.\n", compiler.Base.ErrorMsg);
+
+        /* The fallback itself failed to compile: there is nothing left to
+         * fall back to. */
+        if (shader->dummy) {
+            fprintf(stderr, "r300 FP: Cannot compile the dummy shader! "
+                    "Giving up...\n");
+            abort();
+        }
+
+        rc_destroy(&compiler.Base);
+        r300_dummy_fragment_shader(r300, shader);
+        return;
+    }
+
+    /* Initialize numbers of constants for each type. */
+    shader->externals_count = ttr.immediate_offset;
+    shader->immediates_count = 0;
+    shader->rc_state_count = 0;
+
+    /* Everything after the externals is expected to be either an immediate
+     * or a state constant; any other type trips the assert. */
+    for (i = shader->externals_count; i < shader->code.constants.Count; i++) {
+        switch (shader->code.constants.Constants[i].Type) {
+            case RC_CONSTANT_IMMEDIATE:
+                ++shader->immediates_count;
+                break;
+            case RC_CONSTANT_STATE:
+                ++shader->rc_state_count;
+                break;
+            default:
+                assert(0);
+        }
+    }
+
+    /* Setup shader depth output. */
+    if (shader->code.writes_depth) {
+        shader->fg_depth_src = R300_FG_DEPTH_SRC_SHADER;
+        shader->us_out_w = R300_W_FMT_W24 | R300_W_SRC_US;
+    } else {
+        shader->fg_depth_src = R300_FG_DEPTH_SRC_SCAN;
+        shader->us_out_w = R300_W_FMT_W0 | R300_W_SRC_US;
+    }
+
+    /* And, finally... */
+    rc_destroy(&compiler.Base);
+
+    /* Build the command buffer. */
+    r300_emit_fs_code_to_buffer(r300, shader);
+}
+
+/**
+ * Make fs->shader point at the variant of the bound fragment shader that was
+ * compiled for the current external state, compiling a new variant when
+ * none of the existing ones matches.
+ *
+ * Variants are kept in the singly-linked list starting at fs->first and are
+ * matched by a bytewise comparison of their compare_state. Because the key
+ * is compared with memcmp(), it is cleared with memset() and copied with
+ * memcpy() everywhere, so padding bytes and unused bits can never make two
+ * equal states look different.
+ *
+ * \return TRUE if fs->shader was changed (and should be re-emitted), FALSE
+ *         if the bound variant is still the right one or if a new variant
+ *         could not be allocated.
+ */
+boolean r300_pick_fragment_shader(struct r300_context* r300)
+{
+    struct r300_fragment_shader* fs = r300_fs(r300);
+    struct r300_fragment_program_external_state state;
+    struct r300_fragment_shader_code* ptr;
+
+    /* Zero every byte of the key before it is filled in. */
+    memset(&state, 0, sizeof(state));
+    get_external_state(r300, &state);
+
+    if (!fs->first) {
+        /* Build the fragment shader for the first time. */
+        ptr = CALLOC_STRUCT(r300_fragment_shader_code);
+        if (!ptr) {
+            /* Out of memory: leave the (empty) list untouched so that a
+             * later call can retry. */
+            return FALSE;
+        }
+
+        fs->first = fs->shader = ptr;
+
+        memcpy(&ptr->compare_state, &state, sizeof(state));
+        r300_translate_fragment_shader(r300, ptr, fs->state.tokens);
+        return TRUE;
+    }
+
+    /* Check if the currently-bound shader has been compiled
+     * with the texture-compare state we need. */
+    if (memcmp(&fs->shader->compare_state, &state, sizeof(state)) == 0)
+        return FALSE;
+
+    /* Search for the right shader. The bound variant was just rejected, so
+     * any match found here is necessarily a different one. */
+    for (ptr = fs->first; ptr; ptr = ptr->next) {
+        if (memcmp(&ptr->compare_state, &state, sizeof(state)) == 0) {
+            fs->shader = ptr;
+            return TRUE;
+        }
+    }
+
+    /* Not found, gotta compile a new one. */
+    ptr = CALLOC_STRUCT(r300_fragment_shader_code);
+    if (!ptr) {
+        /* Out of memory: keep the currently-bound variant rather than
+         * dereferencing a NULL pointer. */
+        return FALSE;
+    }
+
+    /* New variants are pushed at the head of the list. */
+    ptr->next = fs->first;
+    fs->first = fs->shader = ptr;
+
+    memcpy(&ptr->compare_state, &state, sizeof(state));
+    r300_translate_fragment_shader(r300, ptr, fs->state.tokens);
+    return TRUE;
+}
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
new file mode 100644
index 0000000000..51bfa88c5e
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *                Joakim Sindholt <opensource@zhasha.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_FS_H
+#define R300_FS_H
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+#include "radeon_code.h"
+#include "r300_shader_semantics.h"
+
+/* One compiled variant of a fragment shader, i.e. the result of compiling
+ * the shader's tokens for one particular external state. */
+struct r300_fragment_shader_code {
+    /* Scan results for the tokens and the input semantics derived from
+     * them. */
+    struct tgsi_shader_info info;
+    struct r300_shader_semantics inputs;
+
+    /* Whether the shader was replaced by a dummy one due to a shader
+     * compilation failure. */
+    boolean dummy;
+
+    /* Numbers of constants for each type. */
+    unsigned externals_count;
+    unsigned immediates_count;
+    unsigned rc_state_count;
+
+    /* Registers for fragment depth output setup. */
+    uint32_t fg_depth_src;      /* R300_FG_DEPTH_SRC: 0x4bd8 */
+    uint32_t us_out_w;          /* R300_US_W_FMT:     0x46b4 */
+
+    /* External state this variant was compiled for; it is the key used to
+     * find a matching variant. */
+    struct r300_fragment_program_external_state compare_state;
+    /* Hardware code produced by the compiler. */
+    struct rX00_fragment_program_code code;
+
+    /* Command buffer that programs this variant, and its size. */
+    unsigned cb_code_size;
+    uint32_t *cb_code;
+
+    /* Next variant of the same shader, or NULL at the end of the list. */
+    struct r300_fragment_shader_code* next;
+};
+
+/* A fragment shader object: the state it was created from plus all the
+ * variants compiled from it so far. */
+struct r300_fragment_shader {
+    /* Parent class */
+    struct pipe_shader_state state;
+
+    /* Currently-bound fragment shader. */
+    struct r300_fragment_shader_code* shader;
+
+    /* List of the same shaders compiled with different texture-compare
+     * states. */
+    struct r300_fragment_shader_code* first;
+};
+
+/* Fill fs_inputs from the scanned shader info.
+ * NOTE(review): presumably records which input slot carries each semantic
+ * (e.g. fs_inputs->wpos) -- confirm against the definition. */
+void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
+                                struct r300_shader_semantics* fs_inputs);
+
+/* Return TRUE if the shader was switched and should be re-emitted. */
+boolean r300_pick_fragment_shader(struct r300_context* r300);
+
+/* Return TRUE if the currently-selected variant of fs writes fragment depth.
+ *
+ * A NULL fs, or an fs for which no variant has been selected yet
+ * (fs->shader == NULL), is reported as not writing depth instead of being
+ * dereferenced. */
+static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
+{
+    if (!fs || !fs->shader)
+        return FALSE;
+    return (fs->shader->code.writes_depth) ? TRUE : FALSE;
+}
+
+#endif /* R300_FS_H */
diff --git a/src/gallium/drivers/r300/r300_hyperz.c b/src/gallium/drivers/r300/r300_hyperz.c
new file mode 100644
index 0000000000..e5c7658952
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_hyperz.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+
+#include "r300_hyperz.h"
+#include "r300_context.h"
+#include "r300_reg.h"
+#include "r300_fs.h"
+
+/*****************************************************************************/
+/* The ZTOP state                                                            */
+/*****************************************************************************/
+
+/* Tell whether one stencil face can modify the stencil buffer: it has to be
+ * enabled, have a non-zero write mask, and use at least one operation other
+ * than KEEP. */
+static boolean r300_dsa_writes_stencil(
+        struct pipe_stencil_state *s)
+{
+    if (!s->enabled || !s->writemask)
+        return FALSE;
+
+    if (s->fail_op != PIPE_STENCIL_OP_KEEP)
+        return TRUE;
+    if (s->zfail_op != PIPE_STENCIL_OP_KEEP)
+        return TRUE;
+
+    return (s->zpass_op != PIPE_STENCIL_OP_KEEP) ? TRUE : FALSE;
+}
+
+/* Tell whether the given DSA state can change a depth or a stencil value. */
+static boolean r300_dsa_writes_depth_stencil(
+        struct pipe_depth_stencil_alpha_state *dsa)
+{
+    /* Depth is written when the test is on, the write mask is set and the
+     * compare function is anything but NEVER. */
+    const boolean depth_written = (dsa->depth.enabled &&
+                                   dsa->depth.writemask &&
+                                   dsa->depth.func != PIPE_FUNC_NEVER)
+                                  ? TRUE : FALSE;
+
+    if (depth_written)
+        return TRUE;
+
+    /* Otherwise a write through either stencil face is enough. */
+    return (r300_dsa_writes_stencil(&dsa->stencil[0]) ||
+            r300_dsa_writes_stencil(&dsa->stencil[1])) ? TRUE : FALSE;
+}
+
+/* Tell whether alpha testing is able to kill a fragment: the test must be
+ * enabled and its function must not be ALWAYS. */
+static boolean r300_dsa_alpha_test_enabled(
+        struct pipe_depth_stencil_alpha_state *dsa)
+{
+    if (!dsa->alpha.enabled)
+        return FALSE;
+
+    return (dsa->alpha.func != PIPE_FUNC_ALWAYS) ? TRUE : FALSE;
+}
+
+/* Recompute the ZTOP setting and mark the ZTOP state dirty if it changed. */
+static void r300_update_ztop(struct r300_context* r300)
+{
+    struct r300_ztop_state* ztop =
+        (struct r300_ztop_state*)r300->ztop_state.state;
+    const uint32_t previous = ztop->z_buffer_top;
+    uint32_t wanted = R300_ZTOP_ENABLE;
+
+    /* This is important enough that I felt it warranted a comment.
+     *
+     * According to the docs, ZTOP must be disabled when any of these holds:
+     * 1) Alpha testing enabled
+     * 2) Texture kill instructions in fragment shader
+     * 3) Chroma key culling enabled
+     * 4) W-buffering enabled
+     *
+     * The docs claim that for the first three cases ZTOP can still be used
+     * as long as no ZS writes happen.
+     *
+     * (3) will never apply since we do not support chroma-keyed operations.
+     * (4) will need to be re-examined (and this comment updated) if/when
+     * Hyper-Z becomes supported.
+     *
+     * Additionally, the following conditions require disabled ZTOP:
+     * 5) Depth writes in fragment shader
+     * 6) Outstanding occlusion queries
+     *
+     * This register causes stalls all the way from SC to CB when changed,
+     * but it is buffered on-chip so it does not hurt to write it if it has
+     * not changed.
+     *
+     * ~C.
+     */
+
+    /* The tests are evaluated left to right and stop at the first hit:
+     * ZS writes combined with (1) or (2), then (5), then (6). */
+    if ((r300_dsa_writes_depth_stencil(r300->dsa_state.state) &&
+         (r300_dsa_alpha_test_enabled(r300->dsa_state.state) ||
+          r300_fs(r300)->shader->info.uses_kill)) ||
+        r300_fragment_shader_writes_depth(r300_fs(r300)) ||
+        r300->query_current) {
+        wanted = R300_ZTOP_DISABLE;
+    }
+
+    ztop->z_buffer_top = wanted;
+
+    if (wanted != previous)
+        r300->ztop_state.dirty = TRUE;
+}
+
+/* Public entry point for refreshing the state handled by this file.
+ * At present that is only the ZTOP setting. */
+void r300_update_hyperz_state(struct r300_context* r300)
+{
+    r300_update_ztop(r300);
+}
diff --git a/src/gallium/drivers/r300/r300_hyperz.h b/src/gallium/drivers/r300/r300_hyperz.h
new file mode 100644
index 0000000000..3df5053b89
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_hyperz.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_HYPERZ_H
+#define R300_HYPERZ_H
+
+struct r300_context;
+
+/* Re-derive the HyperZ-related state of the given context. */
+void r300_update_hyperz_state(struct r300_context* r300);
+
+#endif
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
new file mode 100644
index 0000000000..10cb468dfc
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "r300_context.h"
+#include "r300_screen.h"
+#include "r300_emit.h"
+#include "r300_winsys.h"
+
+#include <stdio.h>
+
+/**
+ * Create an occlusion query object together with its result buffer.
+ *
+ * Only PIPE_QUERY_OCCLUSION_COUNTER is supported (asserted). The number of
+ * pipes is taken from the Z pipe count on RV530 and from the fragment pipe
+ * count on every other chip.
+ *
+ * The query is linked into r300->query_list only after its buffer has been
+ * created successfully, so a failed creation never leaves a half-built
+ * query on the list.
+ *
+ * \return the new query, or NULL if the query or its buffer could not be
+ *         allocated.
+ */
+static struct pipe_query *r300_create_query(struct pipe_context *pipe,
+                                            unsigned query_type)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_screen *r300screen = r300->screen;
+    struct r300_query *q;
+
+    assert(query_type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+    q = CALLOC_STRUCT(r300_query);
+    if (!q)
+        return NULL;
+
+    q->type = query_type;
+    q->domain = R300_DOMAIN_GTT;
+    q->buffer_size = 4096;
+
+    if (r300screen->caps.family == CHIP_FAMILY_RV530)
+        q->num_pipes = r300screen->caps.num_z_pipes;
+    else
+        q->num_pipes = r300screen->caps.num_frag_pipes;
+
+    /* Open up the occlusion query buffer. */
+    q->buffer = r300->rws->buffer_create(r300->rws, 4096, 0, q->domain, q->buffer_size);
+    if (!q->buffer) {
+        /* Without a buffer the query is unusable; do not hand it out. */
+        FREE(q);
+        return NULL;
+    }
+
+    insert_at_tail(&r300->query_list, q);
+
+    return (struct pipe_query*)q;
+}
+
+/* Tear down a query: release its result buffer, unlink it from the list it
+ * is on and free the object. */
+static void r300_destroy_query(struct pipe_context* pipe,
+                               struct pipe_query* query)
+{
+    struct r300_query* doomed = r300_query(query);
+    struct r300_context *ctx = r300_context(pipe);
+
+    /* Drop the reference held on the result buffer. */
+    ctx->rws->buffer_reference(ctx->rws, &doomed->buffer, NULL);
+
+    remove_from_list(doomed);
+
+    FREE(query);
+}
+
+/* Make \p query the active query of the context and flag the query-start
+ * state as dirty. */
+void r300_resume_query(struct r300_context *r300,
+                       struct r300_query *query)
+{
+    r300->query_start.dirty = TRUE;
+    r300->query_current = query;
+}
+
+/* Start \p query. Only one query may be active at a time; starting a second
+ * one is reported on stderr, trips an assert and is otherwise ignored. */
+static void r300_begin_query(struct pipe_context* pipe,
+                             struct pipe_query* query)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_query* q = r300_query(query);
+
+    if (r300->query_current == NULL) {
+        /* Start counting from scratch. */
+        q->num_results = 0;
+        r300_resume_query(r300, q);
+        return;
+    }
+
+    fprintf(stderr, "r300: begin_query: "
+            "Some other query has already been started.\n");
+    assert(0);
+}
+
+/* Emit the end-of-query commands and leave the context with no active
+ * query. The emit call comes first, while query_current is still set. */
+void r300_stop_query(struct r300_context *r300)
+{
+    r300_emit_query_end(r300);
+    r300->query_current = NULL;
+}
+
+/* End \p query, which has to be the query that is currently active.
+ * Anything else is reported on stderr, trips an assert and is ignored. */
+static void r300_end_query(struct pipe_context* pipe,
+                           struct pipe_query* query)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_query *q = r300_query(query);
+
+    if (q == r300->query_current) {
+        r300_stop_query(r300);
+        return;
+    }
+
+    fprintf(stderr, "r300: end_query: Got invalid query.\n");
+    assert(0);
+}
+
+/**
+ * Read back the result of a query.
+ *
+ * The context is flushed first if the query has not been flushed yet. The
+ * result buffer is then mapped for reading (without blocking when \p wait
+ * is FALSE), its first q->num_results 32-bit words are summed, and the sum
+ * is stored as a 64-bit value through \p vresult.
+ *
+ * \return TRUE if a result was written, FALSE if the buffer could not be
+ *         mapped.
+ */
+static boolean r300_get_query_result(struct pipe_context* pipe,
+                                     struct pipe_query* query,
+                                     boolean wait,
+                                     void* vresult)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_query *q = r300_query(query);
+    uint64_t *result = (uint64_t*)vresult;
+    unsigned flags = PIPE_TRANSFER_READ;
+    uint32_t sum = 0;
+    uint32_t *map;
+    unsigned i;
+
+    if (!q->flushed)
+        pipe->flush(pipe, 0, NULL);
+
+    if (!wait)
+        flags |= PIPE_TRANSFER_DONTBLOCK;
+
+    map = r300->rws->buffer_map(r300->rws, q->buffer, flags);
+    if (!map)
+        return FALSE;
+
+    /* Sum up the results. */
+    for (i = 0; i < q->num_results; i++)
+        sum += map[i];
+
+    r300->rws->buffer_unmap(r300->rws, q->buffer);
+
+    *result = sum;
+    return TRUE;
+}
+
+/**
+ * Set up conditional rendering.
+ *
+ * With a query, rendering is skipped when the query result is zero. The
+ * result is waited for only in the two *_WAIT modes. If the result cannot
+ * be obtained, rendering is NOT skipped; the result variable is never read
+ * in that case. Without a query, conditional rendering is turned off.
+ *
+ * \param pipe   context
+ * \param query  query whose result controls rendering, or NULL
+ * \param mode   one of the PIPE_RENDER_COND_* modes
+ */
+static void r300_render_condition(struct pipe_context *pipe,
+                                  struct pipe_query *query,
+                                  uint mode)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    uint64_t result = 0;
+    boolean skip = FALSE;
+    boolean wait;
+
+    if (query) {
+        wait = mode == PIPE_RENDER_COND_WAIT ||
+               mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+        /* Only trust the result if it was actually read back. */
+        if (r300_get_query_result(pipe, query, wait, &result))
+            skip = (result == 0) ? TRUE : FALSE;
+    }
+
+    /* Stored once, after the query has been read, so the flag does not
+     * change while the result is being fetched. */
+    r300->skip_rendering = skip;
+}
+
+/***************************************************************************
+ * Fake occlusion queries (for debugging)
+ ***************************************************************************/
+
+/* Single dummy object whose address is handed out as every fake query. */
+static unsigned r300_fake_query;
+
+/* Return the shared dummy object; nothing is allocated and both arguments
+ * are ignored. */
+static struct pipe_query *r300_fake_create_query(struct pipe_context *pipe,
+                                                 unsigned query_type)
+{
+    return (struct pipe_query*)&r300_fake_query;
+}
+
+/* Nothing to free: fake queries are not allocated. */
+static void r300_fake_destroy_query(struct pipe_context* pipe,
+                                    struct pipe_query* query)
+{
+}
+
+/* No-op. */
+static void r300_fake_begin_query(struct pipe_context* pipe,
+                                  struct pipe_query* query)
+{
+}
+
+/* No-op. */
+static void r300_fake_end_query(struct pipe_context* pipe,
+                                struct pipe_query* query)
+{
+}
+
+/* Always succeed and report a fixed result of 1000000, stored as a 64-bit
+ * value through vresult. */
+static boolean r300_fake_get_query_result(struct pipe_context* pipe,
+                                          struct pipe_query* query,
+                                          boolean wait, void* vresult)
+{
+    uint64_t *result = (uint64_t*)vresult;
+    *result = 1000000;
+    return TRUE;
+}
+
+/* No-op: this function changes no context state. */
+static void r300_fake_render_condition(struct pipe_context *pipe,
+                                       struct pipe_query *query, uint mode)
+{
+}
+
+/* Install the query entry points of the context: the fake (debugging)
+ * implementations when DBG_FAKE_OCC is set, the real ones otherwise. */
+void r300_init_query_functions(struct r300_context* r300) {
+    const boolean fake = DBG_ON(r300, DBG_FAKE_OCC) ? TRUE : FALSE;
+
+    r300->context.create_query =
+        fake ? r300_fake_create_query : r300_create_query;
+    r300->context.destroy_query =
+        fake ? r300_fake_destroy_query : r300_destroy_query;
+    r300->context.begin_query =
+        fake ? r300_fake_begin_query : r300_begin_query;
+    r300->context.end_query =
+        fake ? r300_fake_end_query : r300_end_query;
+    r300->context.get_query_result =
+        fake ? r300_fake_get_query_result : r300_get_query_result;
+    r300->context.render_condition =
+        fake ? r300_fake_render_condition : r300_render_condition;
+}
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
new file mode 100644
index 0000000000..c783998c78
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -0,0 +1,3491 @@
+/**************************************************************************
+
+Copyright (C) 2004-2005 Nicolai Haehnle et al.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/* *INDENT-OFF* */
+
+#ifndef _R300_REG_H
+#define _R300_REG_H
+
+#define R300_MC_INIT_MISC_LAT_TIMER	0x180
+#	define R300_MC_MISC__MC_CPR_INIT_LAT_SHIFT	0
+#	define R300_MC_MISC__MC_VF_INIT_LAT_SHIFT	4
+#	define R300_MC_MISC__MC_DISP0R_INIT_LAT_SHIFT	8
+#	define R300_MC_MISC__MC_DISP1R_INIT_LAT_SHIFT	12
+#	define R300_MC_MISC__MC_FIXED_INIT_LAT_SHIFT	16
+#	define R300_MC_MISC__MC_E2R_INIT_LAT_SHIFT	20
+#	define R300_MC_MISC__MC_SAME_PAGE_PRIO_SHIFT	24
+#	define R300_MC_MISC__MC_GLOBW_INIT_LAT_SHIFT	28
+
+
+#define R300_MC_INIT_GFX_LAT_TIMER	0x154
+#	define R300_MC_MISC__MC_G3D0R_INIT_LAT_SHIFT	0
+#	define R300_MC_MISC__MC_G3D1R_INIT_LAT_SHIFT	4
+#	define R300_MC_MISC__MC_G3D2R_INIT_LAT_SHIFT	8
+#	define R300_MC_MISC__MC_G3D3R_INIT_LAT_SHIFT	12
+#	define R300_MC_MISC__MC_TX0R_INIT_LAT_SHIFT	16
+#	define R300_MC_MISC__MC_TX1R_INIT_LAT_SHIFT	20
+#	define R300_MC_MISC__MC_GLOBR_INIT_LAT_SHIFT	24
+#	define R300_MC_MISC__MC_GLOBW_FULL_LAT_SHIFT	28
+
+/*
+ * This file contains registers and constants for the R300. They have been
+ * found mostly by examining command buffers captured using glxtest, as well
+ * as by extrapolating some known registers and constants from the R200.
+ * I am fairly certain that they are correct unless stated otherwise
+ * in comments.
+ */
+
+#define R300_SE_VPORT_XSCALE                0x1D98
+#define R300_SE_VPORT_XOFFSET               0x1D9C
+#define R300_SE_VPORT_YSCALE                0x1DA0
+#define R300_SE_VPORT_YOFFSET               0x1DA4
+#define R300_SE_VPORT_ZSCALE                0x1DA8
+#define R300_SE_VPORT_ZOFFSET               0x1DAC
+
+#define R300_VAP_PORT_IDX0		    0x2040
+/*
+ * Vertex Array Processing (VAP) Control
+ */
+#define R300_VAP_CNTL	0x2080
+#       define R300_PVS_NUM_SLOTS_SHIFT                 0
+#       define R300_PVS_NUM_CNTLRS_SHIFT                4
+#       define R300_PVS_NUM_FPUS_SHIFT                  8
+#       define R300_VF_MAX_VTX_NUM_SHIFT                18
+#       define R300_PVS_NUM_SLOTS(x)                    ((x) << 0)
+#       define R300_PVS_NUM_CNTLRS(x)                   ((x) << 4)
+#       define R300_PVS_NUM_FPUS(x)                     ((x) << 8)
+#       define R300_PVS_VF_MAX_VTX_NUM(x)               ((x) << 18)
+#       define R300_GL_CLIP_SPACE_DEF                   (0 << 22)
+#       define R300_DX_CLIP_SPACE_DEF                   (1 << 22)
+#       define R500_TCL_STATE_OPTIMIZATION              (1 << 23)
+
+/* This register is written directly and also starts data section
+ * in many 3d CP_PACKET3's
+ */
+#define R300_VAP_VF_CNTL	0x2084
+#	define	R300_VAP_VF_CNTL__PRIM_TYPE__SHIFT              0
+#	define  R300_VAP_VF_CNTL__PRIM_NONE                     (0<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_POINTS                   (1<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINES                    (2<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINE_STRIP               (3<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLES                (4<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN             (5<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP           (6<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINE_LOOP                (12<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_QUADS                    (13<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_QUAD_STRIP               (14<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_POLYGON                  (15<<0)
+
+#	define	R300_VAP_VF_CNTL__PRIM_WALK__SHIFT              4
+	/* State based - direct writes to registers trigger vertex
+           generation */
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_STATE_BASED         (0<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_INDICES             (1<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST         (2<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED     (3<<4)
+
+	/* I don't think I saw these three used.. */
+#	define	R300_VAP_VF_CNTL__COLOR_ORDER__SHIFT            6
+#	define	R300_VAP_VF_CNTL__TCL_OUTPUT_CTL_ENA__SHIFT     9
+#	define	R300_VAP_VF_CNTL__PROG_STREAM_ENA__SHIFT        10
+
+	/* index size - when not set the indices are assumed to be 16 bit */
+#	define	R300_VAP_VF_CNTL__INDEX_SIZE_32bit              (1<<11)
+#       define R500_VAP_VF_CNTL__USE_ALT_NUM_VERTS          (1<<14)
+	/* number of vertices */
+#	define	R300_VAP_VF_CNTL__NUM_VERTICES__SHIFT           16
+
+#define R500_VAP_INDEX_OFFSET		    0x208c
+
+#define R500_VAP_ALT_NUM_VERTICES                           0x2088
+
+#define R300_VAP_OUTPUT_VTX_FMT_0           0x2090
+#       define R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT     (1<<0)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT (1<<1)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT (1<<2)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT (1<<3)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT (1<<4)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT (1<<16)
+
+#define R300_VAP_OUTPUT_VTX_FMT_1           0x2094
+	/* each of the following is 3 bits wide, specifies number
+	   of components */
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21
+#	define R300_VAP_OUTPUT_VTX_FMT_1__NOT_PRESENT  0
+#	define R300_VAP_OUTPUT_VTX_FMT_1__1_COMPONENT  1
+#	define R300_VAP_OUTPUT_VTX_FMT_1__2_COMPONENTS 2
+#	define R300_VAP_OUTPUT_VTX_FMT_1__3_COMPONENTS 3
+#	define R300_VAP_OUTPUT_VTX_FMT_1__4_COMPONENTS 4
+
+#define R300_VAP_VPORT_XSCALE                     0x2098
+#define R300_VAP_VPORT_XOFFSET                    0x209c
+#define R300_VAP_VPORT_YSCALE                     0x20a0
+#define R300_VAP_VPORT_YOFFSET                    0x20a4
+#define R300_VAP_VPORT_ZSCALE                     0x20a8
+#define R300_VAP_VPORT_ZOFFSET                    0x20ac
+
+#define R300_VAP_VTE_CNTL                         0x20b0
+#define R300_SE_VTE_CNTL R300_VAP_VTE_CNTL
+#   define R300_VPORT_X_SCALE_ENA                           (1 << 0)
+#   define R300_VPORT_X_OFFSET_ENA                          (1 << 1)
+#   define R300_VPORT_Y_SCALE_ENA                           (1 << 2)
+#   define R300_VPORT_Y_OFFSET_ENA                          (1 << 3)
+#   define R300_VPORT_Z_SCALE_ENA                           (1 << 4)
+#   define R300_VPORT_Z_OFFSET_ENA                          (1 << 5)
+#   define R300_VTX_XY_FMT                                  (1 << 8)
+#   define R300_VTX_Z_FMT                                   (1 << 9)
+#   define R300_VTX_W0_FMT                                  (1 << 10)
+#   define R300_SERIAL_PROC_ENA                             (1 << 11)
+
+#define R300_VAP_VTX_SIZE               0x20b4
+
+/* BEGIN: Vertex data assembly - lots of uncertainties */
+
+/* gap */
+
+/* Maximum Vertex Indx Clamp */
+#define R300_VAP_VF_MAX_VTX_INDX         0x2134
+/* Minimum Vertex Indx Clamp */
+#define R300_VAP_VF_MIN_VTX_INDX         0x2138
+
+/** Vertex assembler/processor control status */
+#define R300_VAP_CNTL_STATUS              0x2140
+/* No swap at all (default) */
+#	define R300_VC_NO_SWAP                  (0 << 0)
+/* 16-bit swap: 0xAABBCCDD becomes 0xBBAADDCC */
+#	define R300_VC_16BIT_SWAP               (1 << 0)
+/* 32-bit swap: 0xAABBCCDD becomes 0xDDCCBBAA */
+#	define R300_VC_32BIT_SWAP               (2 << 0)
+/* Half-dword swap: 0xAABBCCDD becomes 0xCCDDAABB */
+#	define R300_VC_HALF_DWORD_SWAP          (3 << 0)
+/* The TCL engine will not be used (as it is logically or even physically removed) */
+#	define R300_VAP_TCL_BYPASS		(1 << 8)
+/* Read only flag if TCL engine is busy. */
+#	define R300_VAP_PVS_BUSY                (1 << 11)
+/* TODO: gap for MAX_MPS */
+/* Read only flag if the vertex store is busy. */
+#	define R300_VAP_VS_BUSY                 (1 << 24)
+/* Read only flag if the reciprocal engine is busy. */
+#	define R300_VAP_RCP_BUSY                (1 << 25)
+/* Read only flag if the viewport transform engine is busy. */
+#	define R300_VAP_VTE_BUSY                (1 << 26)
+/* Read only flag if the memory interface unit is busy. */
+#	define R300_VAP_MUI_BUSY                (1 << 27)
+/* Read only flag if the vertex cache is busy. */
+#	define R300_VAP_VC_BUSY                 (1 << 28)
+/* Read only flag if the vertex fetcher is busy. */
+#	define R300_VAP_VF_BUSY                 (1 << 29)
+/* Read only flag if the register pipeline is busy. */
+#	define R300_VAP_REGPIPE_BUSY            (1 << 30)
+/* Read only flag if the VAP engine is busy. Bit 31 needs an unsigned 1: (1 << 31) overflows int. */
+#	define R300_VAP_VAP_BUSY                (1u << 31)
+
+/* gap */
+
+/* Where do we get our vertex data?
+ *
+ * Vertex data comes either from immediate mode registers or from
+ * vertex arrays.
+ * There appears to be no mixed mode (though we can force the pitch of
+ * vertex arrays to 0, effectively reusing the same element over and over
+ * again).
+ *
+ * Immediate mode is controlled by the INPUT_CNTL registers. I am not sure
+ * if these registers influence vertex array processing.
+ *
+ * Vertex arrays are controlled via the 3D_LOAD_VBPNTR packet3.
+ *
+ * In both cases, vertex attributes are then passed through INPUT_ROUTE.
+ *
+ * Beginning with INPUT_ROUTE_0_0 is a list of WORDs that route vertex data
+ * into the vertex processor's input registers.
+ * The first word routes the first input, the second word the second, etc.
+ * The corresponding input is routed into the register with the given index.
+ * The list is ended by a word with INPUT_ROUTE_END set.
+ *
+ * Always set COMPONENTS_4 in immediate mode.
+ */
+
+#define R300_VAP_PROG_STREAM_CNTL_0                     0x2150
+#       define R300_DATA_TYPE_0_SHIFT                   0
+#       define R300_DATA_TYPE_FLOAT_1                   0
+#       define R300_DATA_TYPE_FLOAT_2                   1
+#       define R300_DATA_TYPE_FLOAT_3                   2
+#       define R300_DATA_TYPE_FLOAT_4                   3
+#       define R300_DATA_TYPE_BYTE                      4
+#       define R300_DATA_TYPE_D3DCOLOR                  5
+#       define R300_DATA_TYPE_SHORT_2                   6
+#       define R300_DATA_TYPE_SHORT_4                   7
+#       define R300_DATA_TYPE_VECTOR_3_TTT              8
+#       define R300_DATA_TYPE_VECTOR_3_EET              9
+#       define R300_DATA_TYPE_FLOAT_8                   10
+#       define R300_DATA_TYPE_FLT16_2                   11
+#       define R300_DATA_TYPE_FLT16_4                   12
+#       define R300_SKIP_DWORDS_SHIFT                   4
+#       define R300_DST_VEC_LOC_SHIFT                   8
+#       define R300_LAST_VEC                            (1 << 13)
+#       define R300_SIGNED                              (1 << 14)
+#       define R300_NORMALIZE                           (1 << 15)
+#       define R300_DATA_TYPE_1_SHIFT                   16
+#define R300_VAP_PROG_STREAM_CNTL_1                     0x2154
+#define R300_VAP_PROG_STREAM_CNTL_2                     0x2158
+#define R300_VAP_PROG_STREAM_CNTL_3                     0x215C
+#define R300_VAP_PROG_STREAM_CNTL_4                     0x2160
+#define R300_VAP_PROG_STREAM_CNTL_5                     0x2164
+#define R300_VAP_PROG_STREAM_CNTL_6                     0x2168
+#define R300_VAP_PROG_STREAM_CNTL_7                     0x216C
+/* gap */
+
+/* Notes:
+ *  - always set up to produce at least two attributes:
+ *    if vertex program uses only position, fglrx will set normal, too
+ *  - INPUT_CNTL_0_COLOR and INPUT_CNTL_COLOR bits are always equal.
+ */
+#define R300_VAP_VTX_STATE_CNTL               0x2180
+#       define R300_COLOR_0_ASSEMBLY_SHIFT    0
+#       define R300_SEL_COLOR                 0
+#       define R300_SEL_USER_COLOR_0          1
+#       define R300_SEL_USER_COLOR_1          2
+#       define R300_COLOR_1_ASSEMBLY_SHIFT    2
+#       define R300_COLOR_2_ASSEMBLY_SHIFT    4
+#       define R300_COLOR_3_ASSEMBLY_SHIFT    6
+#       define R300_COLOR_4_ASSEMBLY_SHIFT    8
+#       define R300_COLOR_5_ASSEMBLY_SHIFT    10
+#       define R300_COLOR_6_ASSEMBLY_SHIFT    12
+#       define R300_COLOR_7_ASSEMBLY_SHIFT    14
+#       define R300_UPDATE_USER_COLOR_0_ENA   (1 << 16)
+
+/*
+ * Each bit in this field applies to the corresponding vector in the VSM
+ * memory (i.e. Bit 0 applies to VECTOR_0 (POSITION), etc.). If the bit
+ * is set, then the corresponding 4-Dword Vector is output into the Vertex Stream.
+ */
+#define R300_VAP_VSM_VTX_ASSM               0x2184
+#       define R300_INPUT_CNTL_POS               0x00000001
+#       define R300_INPUT_CNTL_NORMAL            0x00000002
+#       define R300_INPUT_CNTL_COLOR             0x00000004
+#       define R300_INPUT_CNTL_TC0               0x00000400
+#       define R300_INPUT_CNTL_TC1               0x00000800
+#       define R300_INPUT_CNTL_TC2               0x00001000 /* GUESS */
+#       define R300_INPUT_CNTL_TC3               0x00002000 /* GUESS */
+#       define R300_INPUT_CNTL_TC4               0x00004000 /* GUESS */
+#       define R300_INPUT_CNTL_TC5               0x00008000 /* GUESS */
+#       define R300_INPUT_CNTL_TC6               0x00010000 /* GUESS */
+#       define R300_INPUT_CNTL_TC7               0x00020000 /* GUESS */
+
+/* Programmable Stream Control Signed Normalize Control (sixteen 2-bit fields) */
+#define R300_VAP_PSC_SGN_NORM_CNTL                0x21dc
+#   define SGN_NORM_ZERO                                    0u
+#   define SGN_NORM_ZERO_CLAMP_MINUS_ONE                    1u
+#   define SGN_NORM_NO_ZERO                                 2u /* unsigned so that << 30 below cannot overflow a signed int */
+#   define R300_SGN_NORM_NO_ZERO (SGN_NORM_NO_ZERO | \
+        (SGN_NORM_NO_ZERO << 2) | (SGN_NORM_NO_ZERO << 4) | \
+        (SGN_NORM_NO_ZERO << 6) | (SGN_NORM_NO_ZERO << 8) | \
+        (SGN_NORM_NO_ZERO << 10) | (SGN_NORM_NO_ZERO << 12) | \
+        (SGN_NORM_NO_ZERO << 14) | (SGN_NORM_NO_ZERO << 16) | \
+        (SGN_NORM_NO_ZERO << 18) | (SGN_NORM_NO_ZERO << 20) | \
+        (SGN_NORM_NO_ZERO << 22) | (SGN_NORM_NO_ZERO << 24) | \
+        (SGN_NORM_NO_ZERO << 26) | (SGN_NORM_NO_ZERO << 28) | \
+        (SGN_NORM_NO_ZERO << 30))
+
+/* gap */
+
+/* Words parallel to INPUT_ROUTE_0; All words that are active in INPUT_ROUTE_0
+ * are set to a swizzling bit pattern, other words are 0.
+ *
+ * In immediate mode, the pattern is always set to xyzw. In vertex array
+ * mode, the swizzling pattern is e.g. used to set zw components in texture
+ * coordinates with only two components.
+ */
+#define R300_VAP_PROG_STREAM_CNTL_EXT_0                 0x21e0
+#       define R300_SWIZZLE0_SHIFT                      0
+#       define R300_SWIZZLE_SELECT_X_SHIFT              0
+#       define R300_SWIZZLE_SELECT_Y_SHIFT              3
+#       define R300_SWIZZLE_SELECT_Z_SHIFT              6
+#       define R300_SWIZZLE_SELECT_W_SHIFT              9
+
+#       define R300_SWIZZLE_SELECT_X                    0
+#       define R300_SWIZZLE_SELECT_Y                    1
+#       define R300_SWIZZLE_SELECT_Z                    2
+#       define R300_SWIZZLE_SELECT_W                    3
+#       define R300_SWIZZLE_SELECT_FP_ZERO              4
+#       define R300_SWIZZLE_SELECT_FP_ONE               5
+/* alternate forms for r300_emit.c */
+#       define R300_INPUT_ROUTE_SELECT_X    0
+#       define R300_INPUT_ROUTE_SELECT_Y    1
+#       define R300_INPUT_ROUTE_SELECT_Z    2
+#       define R300_INPUT_ROUTE_SELECT_W    3
+#       define R300_INPUT_ROUTE_SELECT_ZERO 4
+#       define R300_INPUT_ROUTE_SELECT_ONE  5
+
+#       define R300_WRITE_ENA_SHIFT                     12
+#       define R300_WRITE_ENA_X                         1
+#       define R300_WRITE_ENA_Y                         2
+#       define R300_WRITE_ENA_Z                         4
+#       define R300_WRITE_ENA_W                         8
+#       define R300_SWIZZLE1_SHIFT                      16
+
+#       define R300_VAP_SWIZZLE_X001 \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
+#       define R300_VAP_SWIZZLE_XY01 \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
+#       define R300_VAP_SWIZZLE_XYZ1 \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
+#       define R300_VAP_SWIZZLE_XYZW \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
+#define R300_VAP_PROG_STREAM_CNTL_EXT_1                 0x21e4
+#define R300_VAP_PROG_STREAM_CNTL_EXT_2                 0x21e8
+#define R300_VAP_PROG_STREAM_CNTL_EXT_3                 0x21ec
+#define R300_VAP_PROG_STREAM_CNTL_EXT_4                 0x21f0
+#define R300_VAP_PROG_STREAM_CNTL_EXT_5                 0x21f4
+#define R300_VAP_PROG_STREAM_CNTL_EXT_6                 0x21f8
+#define R300_VAP_PROG_STREAM_CNTL_EXT_7                 0x21fc
+
+/* END: Vertex data assembly */
+
+/* gap */
+
+/* BEGIN: Upload vertex program and data */
+
+/*
+ * The programmable vertex shader unit has a memory bank of unknown size
+ * that can be written to in 16 byte units by writing the address into
+ * UPLOAD_ADDRESS, followed by data in UPLOAD_DATA (multiples of 4 DWORDs).
+ *
+ * Pointers into the memory bank are always in multiples of 16 bytes.
+ *
+ * The memory bank is divided into areas with fixed meaning.
+ *
+ * Starting at address UPLOAD_PROGRAM: Vertex program instructions.
+ * Native limits reported by drivers from ATI suggest size 256 (i.e. 4KB),
+ * whereas the difference between known addresses suggests size 512.
+ *
+ * Starting at address UPLOAD_PARAMETERS: Vertex program parameters.
+ * Native reported limits and the VPI layout suggest size 256, whereas
+ * difference between known addresses suggests size 512.
+ *
+ * At address UPLOAD_POINTSIZE is a vector (0, 0, ps, 0), where ps is the
+ * floating point pointsize. The exact purpose of this state is uncertain,
+ * as there is also the R300_RE_POINTSIZE register.
+ *
+ * Multiple vertex programs and parameter sets can be loaded at once,
+ * which could explain the size discrepancy.
+ */
+#define R300_VAP_PVS_VECTOR_INDX_REG         0x2200
+#       define R300_PVS_CODE_START           0
+#       define R300_MAX_PVS_CODE_LINES       256
+#       define R500_MAX_PVS_CODE_LINES       1024
+#       define R300_PVS_CONST_START          512
+#       define R500_PVS_CONST_START          1024
+#       define R300_MAX_PVS_CONST_VECS       256
+#       define R500_MAX_PVS_CONST_VECS       1024
+#       define R300_PVS_UCP_START            1024
+#       define R500_PVS_UCP_START            1536
+#       define R300_POINT_VPORT_SCALE_OFFSET 1030
+#       define R500_POINT_VPORT_SCALE_OFFSET 1542
+#       define R300_POINT_GEN_TEX_OFFSET     1031
+#       define R500_POINT_GEN_TEX_OFFSET     1543
+
+/*
+ * These are obsolete defines from r300_context.h, but they might give some
+ * clues when investigating the addresses further...
+ */
+#if 0
+#define VSF_DEST_PROGRAM        0x0
+#define VSF_DEST_MATRIX0        0x200
+#define VSF_DEST_MATRIX1        0x204
+#define VSF_DEST_MATRIX2        0x208
+#define VSF_DEST_VECTOR0        0x20c
+#define VSF_DEST_VECTOR1        0x20d
+#define VSF_DEST_UNKNOWN1       0x400
+#define VSF_DEST_UNKNOWN2       0x406
+#endif
+
+/* gap */
+
+#define R300_VAP_PVS_UPLOAD_DATA            0x2208
+
+/* END: Upload vertex program and data */
+
+/* gap */
+
+/* I do not know the purpose of this register. However, I do know that
+ * it is set to 221C_CLEAR for clear operations and to 221C_NORMAL
+ * for normal rendering.
+ *
+ * 2007-11-05: This register is the user clip plane control register, but there
+ * also seems to be a rendering mode control; the NORMAL/CLEAR defines.
+ *
+ * See bug #9871. http://bugs.freedesktop.org/attachment.cgi?id=10672&action=view
+ */
+#define R300_VAP_CLIP_CNTL                       0x221C
+#       define R300_VAP_UCP_ENABLE_0             (1 << 0)
+#       define R300_VAP_UCP_ENABLE_1             (1 << 1)
+#       define R300_VAP_UCP_ENABLE_2             (1 << 2)
+#       define R300_VAP_UCP_ENABLE_3             (1 << 3)
+#       define R300_VAP_UCP_ENABLE_4             (1 << 4)
+#       define R300_VAP_UCP_ENABLE_5             (1 << 5)
+#       define R300_PS_UCP_MODE_DIST_COP         (0 << 14)
+#       define R300_PS_UCP_MODE_RADIUS_COP       (1 << 14)
+#       define R300_PS_UCP_MODE_RADIUS_COP_CLIP  (2 << 14)
+#       define R300_PS_UCP_MODE_CLIP_AS_TRIFAN   (3 << 14)
+#       define R300_CLIP_DISABLE                 (1 << 16)
+#       define R300_UCP_CULL_ONLY_ENABLE         (1 << 17)
+#       define R300_BOUNDARY_EDGE_FLAG_ENABLE    (1 << 18)
+#       define R500_COLOR2_IS_TEXTURE            (1 << 20)
+#       define R500_COLOR3_IS_TEXTURE            (1 << 21)
+
+/* These seem to be per-pixel and per-vertex X and Y clipping planes. The first
+ * plane is per-pixel and the second plane is per-vertex.
+ *
+ * This was determined by experimentation alone but I believe it is correct.
+ *
+ * These registers are called X_QUAD0_1_FL to X_QUAD0_4_FL by glxtest.
+ */
+#define R300_VAP_GB_VERT_CLIP_ADJ                   0x2220
+#define R300_VAP_GB_VERT_DISC_ADJ                   0x2224
+#define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
+#define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
+
+/* gap */
+
+/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
+ * rendering commands and overwriting vertex program parameters.
+ * Therefore, I suspect writing zero to 0x2284 synchronizes the engine and
+ * avoids bugs caused by still running shaders reading bad data from memory.
+ */
+#define R300_VAP_PVS_STATE_FLUSH_REG        0x2284
+
+/* This register is used to define the number of core clocks to wait for a
+ * vertex to be received by the VAP input controller (while the primitive
+ * path is backed up) before forcing any accumulated vertices to be submitted
+ * to the vertex processing path.
+ */
+#define VAP_PVS_VTX_TIMEOUT_REG             0x2288
+#       define R300_2288_R300                    0x00750000 /* -- nh */
+#       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
+
+/* gap */
+
+/* Addresses are relative to the vertex program instruction area of the
+ * memory bank. PROGRAM_END points to the last instruction of the active
+ * program
+ *
+ * The meaning of the two UNKNOWN fields is obviously not known. However,
+ * experiments so far have shown that both *must* point to an instruction
+ * inside the vertex program, otherwise the GPU locks up.
+ *
+ * fglrx usually sets CNTL_3_UNKNOWN to the end of the program and
+ * R300_PVS_CNTL_1_POS_END_SHIFT points to instruction where last write to
+ * position takes place.
+ *
+ * Most likely this is used to ignore the rest of the program in cases
+ * where a group of verts isn't visible. For some reason this "section"
+ * sometimes also accepts other instructions that have no relationship with
+ * position calculations.
+ */
+#define R300_VAP_PVS_CODE_CNTL_0            0x22D0
+#       define R300_PVS_FIRST_INST_SHIFT         0
+#       define R300_PVS_XYZW_VALID_INST_SHIFT    10
+#       define R300_PVS_LAST_INST_SHIFT          20
+#       define R300_PVS_FIRST_INST(x)            ((x) << 0)
+#       define R300_PVS_XYZW_VALID_INST(x)       ((x) << 10)
+#       define R300_PVS_LAST_INST(x)             ((x) << 20)
+/* Addresses are relative to the vertex program parameters area. */
+#define R300_VAP_PVS_CONST_CNTL             0x22D4
+#       define R300_PVS_CONST_BASE_OFFSET_SHIFT  0
+#       define R300_PVS_MAX_CONST_ADDR_SHIFT     16
+#       define R300_PVS_MAX_CONST_ADDR(x)        ((x) << 16)
+#define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
+#       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
+#define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+
+/* The entire range from 0x2300 to 0x24AC inclusive seems to be used for
+ * immediate vertices
+ */
+#define R300_VAP_VTX_COLOR_R                0x2464
+#define R300_VAP_VTX_COLOR_G                0x2468
+#define R300_VAP_VTX_COLOR_B                0x246C
+#define R300_VAP_VTX_POS_0_X_1              0x2490 /* used for glVertex2*() */
+#define R300_VAP_VTX_POS_0_Y_1              0x2494
+#define R300_VAP_VTX_COLOR_PKD              0x249C /* RGBA */
+#define R300_VAP_VTX_POS_0_X_2              0x24A0 /* used for glVertex3*() */
+#define R300_VAP_VTX_POS_0_Y_2              0x24A4
+#define R300_VAP_VTX_POS_0_Z_2              0x24A8
+/* write 0 to indicate end of packet? */
+#define R300_VAP_VTX_END_OF_PKT             0x24AC
+
+/* gap */
+
+/* These are values from r300_reg/r300_reg.h - they are known to be correct
+ * and are here so we can use one register file instead of several
+ * - Vladimir
+ */
+#define R300_GB_VAP_RASTER_VTX_FMT_0	0x4000
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__POS_PRESENT	(1<<0)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_0_PRESENT	(1<<1)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_1_PRESENT	(1<<2)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_2_PRESENT	(1<<3)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_3_PRESENT	(1<<4)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_SPACE	(0xf<<5)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__PT_SIZE_PRESENT	(0x1<<16)
+
+#define R300_GB_VAP_RASTER_VTX_FMT_1	0x4004
+	/* each of the following is 3 bits wide, specifies number
+	   of components */
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT	0
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT	3
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT	6
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT	9
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT	12
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT	15
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT	18
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT	21
+
+/* UNK30 seems to enable point to quad transformation on textures
+ * (or something closely related to that).
+ * This bit is rather fatal at the time being due to shortcomings on the
+ * pixel shader side.
+ * Specifies top of Raster pipe specific enable controls.
+ */
+#define R300_GB_ENABLE	0x4008
+#	define R300_GB_POINT_STUFF_DISABLE     (0 << 0)
+#	define R300_GB_POINT_STUFF_ENABLE      (1 << 0) /* Specifies if points will have stuffed texture coordinates. */
+#	define R300_GB_LINE_STUFF_DISABLE      (0 << 1)
+#	define R300_GB_LINE_STUFF_ENABLE       (1 << 1) /* Specifies if lines will have stuffed texture coordinates. */
+#	define R300_GB_TRIANGLE_STUFF_DISABLE  (0 << 2)
+#	define R300_GB_TRIANGLE_STUFF_ENABLE   (1 << 2) /* Specifies if triangles will have stuffed texture coordinates. */
+#	define R300_GB_STENCIL_AUTO_DISABLE    (0 << 4)
+#	define R300_GB_STENCIL_AUTO_ENABLE     (1 << 4) /* Enable stencil auto inc/dec based on triangle cw/ccw, force into dzy low bit. */
+#	define R300_GB_STENCIL_AUTO_FORCE      (2 << 4) /* Force 0 into dzy low bit. */
+
+	/* each of the following is 2 bits wide */
+#define R300_GB_TEX_REPLICATE	0 /* Replicate VAP source texture coordinates (S,T,[R,Q]). */
+#define R300_GB_TEX_ST		1 /* Stuff with source texture coordinates (S,T). */
+#define R300_GB_TEX_STR		2 /* Stuff with source texture coordinates (S,T,R). */
+#	define R300_GB_TEX0_SOURCE_SHIFT	16
+#	define R300_GB_TEX1_SOURCE_SHIFT	18
+#	define R300_GB_TEX2_SOURCE_SHIFT	20
+#	define R300_GB_TEX3_SOURCE_SHIFT	22
+#	define R300_GB_TEX4_SOURCE_SHIFT	24
+#	define R300_GB_TEX5_SOURCE_SHIFT	26
+#	define R300_GB_TEX6_SOURCE_SHIFT	28
+#	define R300_GB_TEX7_SOURCE_SHIFT	30
+
+/* MSPOS - positions for multisample antialiasing (?) */
+#define R300_GB_MSPOS0                           0x4010
+	/* shifts - each of the fields is 4 bits */
+#	define R300_GB_MSPOS0__MS_X0_SHIFT	0
+#	define R300_GB_MSPOS0__MS_Y0_SHIFT	4
+#	define R300_GB_MSPOS0__MS_X1_SHIFT	8
+#	define R300_GB_MSPOS0__MS_Y1_SHIFT	12
+#	define R300_GB_MSPOS0__MS_X2_SHIFT	16
+#	define R300_GB_MSPOS0__MS_Y2_SHIFT	20
+#	define R300_GB_MSPOS0__MSBD0_Y		24
+#	define R300_GB_MSPOS0__MSBD0_X		28
+
+#define R300_GB_MSPOS1                           0x4014
+#	define R300_GB_MSPOS1__MS_X3_SHIFT	0
+#	define R300_GB_MSPOS1__MS_Y3_SHIFT	4
+#	define R300_GB_MSPOS1__MS_X4_SHIFT	8
+#	define R300_GB_MSPOS1__MS_Y4_SHIFT	12
+#	define R300_GB_MSPOS1__MS_X5_SHIFT	16
+#	define R300_GB_MSPOS1__MS_Y5_SHIFT	20
+#	define R300_GB_MSPOS1__MSBD1		24
+
+/* Specifies the graphics pipeline configuration for rasterization. Values are pre-shifted to their bit position; the *_SHIFT entries are bit offsets only. */
+#define R300_GB_TILE_CONFIG                      0x4018
+#	define R300_GB_TILE_DISABLE             (0 << 0)
+#	define R300_GB_TILE_ENABLE              (1 << 0)
+#	define R300_GB_TILE_PIPE_COUNT_RV300	(0 << 1) /* RV350 (1 pipe, 1 ctx) */
+#	define R300_GB_TILE_PIPE_COUNT_R300	(3 << 1) /* R300 (2 pipes, 1 ctx) */
+#	define R300_GB_TILE_PIPE_COUNT_R420_3P  (6 << 1) /* R420-3P (3 pipes, 1 ctx) */
+#	define R300_GB_TILE_PIPE_COUNT_R420	(7 << 1) /* R420 (4 pipes, 1 ctx) */
+#	define R300_GB_TILE_SIZE_8		(0 << 4) /* tile size values start at bit 4 */
+#	define R300_GB_TILE_SIZE_16		(1 << 4)
+#	define R300_GB_TILE_SIZE_32		(2 << 4)
+#	define R300_GB_SUPER_SIZE_1		(0 << 6) /* super size values start at bit 6 */
+#	define R300_GB_SUPER_SIZE_2		(1 << 6)
+#	define R300_GB_SUPER_SIZE_4		(2 << 6)
+#	define R300_GB_SUPER_SIZE_8		(3 << 6)
+#	define R300_GB_SUPER_SIZE_16		(4 << 6)
+#	define R300_GB_SUPER_SIZE_32		(5 << 6)
+#	define R300_GB_SUPER_SIZE_64		(6 << 6)
+#	define R300_GB_SUPER_SIZE_128		(7 << 6)
+#	define R300_GB_SUPER_X_SHIFT		9	/* 3 bits wide */
+#	define R300_GB_SUPER_Y_SHIFT		12	/* 3 bits wide */
+#	define R300_GB_SUPER_TILE_A		(0 << 15)
+#	define R300_GB_SUPER_TILE_B		(1 << 15)
+#	define R300_GB_SUBPIXEL_1_12		(0 << 16)
+#	define R300_GB_SUBPIXEL_1_16		(1 << 16)
+#	define R300_GB_TILE_CONFIG_QUADS_PER_RAS_4   (0 << 17)
+#	define R300_GB_TILE_CONFIG_QUADS_PER_RAS_8   (1 << 17)
+#	define R300_GB_TILE_CONFIG_QUADS_PER_RAS_16  (2 << 17)
+#	define R300_GB_TILE_CONFIG_QUADS_PER_RAS_32  (3 << 17)
+#	define R300_GB_TILE_CONFIG_BB_SCAN_INTERCEPT (0 << 19)
+#	define R300_GB_TILE_CONFIG_BB_SCAN_BOUND_BOX (1 << 19)
+#	define R300_GB_TILE_CONFIG_ALT_SCAN_EN_LR    (0 << 20)
+#	define R300_GB_TILE_CONFIG_ALT_SCAN_EN_LRL   (1 << 20)
+#	define R300_GB_TILE_CONFIG_ALT_OFFSET        (0 << 21) /* only the zero encoding is defined here */
+#	define R300_GB_TILE_CONFIG_SUBPRECISION      (0 << 22) /* only the zero encoding is defined here */
+#	define R300_GB_TILE_CONFIG_ALT_TILING_DEF    (0 << 23)
+#	define R300_GB_TILE_CONFIG_ALT_TILING_3_2    (1 << 23)
+#	define R300_GB_TILE_CONFIG_Z_EXTENDED_24_1   (0 << 24)
+#	define R300_GB_TILE_CONFIG_Z_EXTENDED_S25_1  (1 << 24)
+
+/* Specifies the sizes of the various FIFOs in the sc/rs/us. This register must be the first one written. */
+#define R300_GB_FIFO_SIZE	0x4024
+	/* size encodings; each size field below is 2 bits wide */
+#define R300_GB_FIFO_SIZE_32	0
+#define R300_GB_FIFO_SIZE_64	1
+#define R300_GB_FIFO_SIZE_128	2
+#define R300_GB_FIFO_SIZE_256	3
+#	define R300_SC_IFIFO_SIZE_SHIFT	0
+#	define R300_SC_TZFIFO_SIZE_SHIFT	2
+#	define R300_SC_BFIFO_SIZE_SHIFT	4
+
+#	define R300_US_OFIFO_SIZE_SHIFT	12
+#	define R300_US_WFIFO_SIZE_SHIFT	14
+	/* the following use the same constants as above, but the meaning
+	   is doubled (i.e. instead of 32 words it means 64) */
+#	define R300_RS_TFIFO_SIZE_SHIFT	6
+#	define R300_RS_CFIFO_SIZE_SHIFT	8
+#	define R300_US_RAM_SIZE_SHIFT		10
+	/* watermarks, 3 bits wide */
+#	define R300_RS_HIGHWATER_COL_SHIFT	16
+#	define R300_RS_HIGHWATER_TEX_SHIFT	19
+#	define R300_OFIFO_HIGHWATER_SHIFT	22	/* two bits only */
+#	define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT	24
+
+#define R300_GB_Z_PEQ_CONFIG                          0x4028 /* one selector, Z_PEQ_SIZE, at bit 0 */
+#	define R300_GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_4_4    (0 << 0)
+#	define R300_GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8    (1 << 0)
+
+/* Specifies various polygon specific selects (fog, depth, perspective). Values are pre-shifted to their bit position. */
+#define R300_GB_SELECT                           0x401c
+#	define R300_GB_FOG_SELECT_C0A		(0 << 0)
+#	define R300_GB_FOG_SELECT_C1A           (1 << 0)
+#	define R300_GB_FOG_SELECT_C2A           (2 << 0)
+#	define R300_GB_FOG_SELECT_C3A           (3 << 0)
+#	define R300_GB_FOG_SELECT_1_1_W         (4 << 0)
+#	define R300_GB_FOG_SELECT_Z		(5 << 0)
+#	define R300_GB_DEPTH_SELECT_Z		(0 << 3)
+#	define R300_GB_DEPTH_SELECT_1_1_W	(1 << 3)
+#	define R300_GB_W_SELECT_1_W		(0 << 4)
+#	define R300_GB_W_SELECT_1		(1 << 4)
+#	define R300_GB_FOG_STUFF_DISABLE        (0 << 5)
+#	define R300_GB_FOG_STUFF_ENABLE         (1 << 5)
+#	define R300_GB_FOG_STUFF_TEX_SHIFT      6
+#	define R300_GB_FOG_STUFF_TEX_MASK       0x000003c0 /* bits 9:6 */
+#	define R300_GB_FOG_STUFF_COMP_SHIFT     10
+#	define R300_GB_FOG_STUFF_COMP_MASK      0x00000c00 /* bits 11:10 */
+
+/* Specifies the graphics pipeline configuration for antialiasing: enable at bit 0, subsample count selector starting at bit 1. */
+#define R300_GB_AA_CONFIG                         0x4020
+#	define R300_GB_AA_CONFIG_AA_DISABLE           (0 << 0)
+#	define R300_GB_AA_CONFIG_AA_ENABLE            (1 << 0)
+#	define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2  (0 << 1)
+#	define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3  (1 << 1)
+#	define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4  (2 << 1)
+#	define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6  (3 << 1)
+
+/* Selects which of 4 pipes are active. The PIPEn_ID fields sit 2 bits apart. */
+#define R300_GB_PIPE_SELECT                           0x402c
+#	define R300_GB_PIPE_SELECT_PIPE0_ID_SHIFT  0
+#	define R300_GB_PIPE_SELECT_PIPE1_ID_SHIFT  2
+#	define R300_GB_PIPE_SELECT_PIPE2_ID_SHIFT  4
+#	define R300_GB_PIPE_SELECT_PIPE3_ID_SHIFT  6
+#	define R300_GB_PIPE_SELECT_PIPE_MASK_SHIFT 8
+#	define R300_GB_PIPE_SELECT_MAX_PIPE        12 /* presumably a bit offset (name lacks _SHIFT) - TODO confirm */
+#	define R300_GB_PIPE_SELECT_BAD_PIPES       14 /* presumably a bit offset (name lacks _SHIFT) - TODO confirm */
+#	define R300_GB_PIPE_SELECT_CONFIG_PIPES    18 /* presumably a bit offset (name lacks _SHIFT) - TODO confirm */
+
+
+/* Specifies the sizes of the various FIFOs in the sc/rs. Four 6-bit high water mark fields. */
+#define R300_GB_FIFO_SIZE1                            0x4070
+/* High water mark for SC input fifo */
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT 0
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK  0x0000003f
+/* High water mark for SC input fifo (B) */
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT 6
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK  0x00000fc0
+/* High water mark for RS colors' fifo (the macro name nevertheless says SC) */
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT   12
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK    0x0003f000
+/* High water mark for RS textures' fifo (the macro name nevertheless says SC) */
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT   18
+#	define R300_GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK    0x00fc0000
+
+/* This table specifies the source location and format for up to 16 texture
+ * addresses (i[0]:i[15]) and four colors (c[0]:c[3]). One register per entry;
+ * the *_SHIFT values and the R500_RS_SEL_x()/COL_x() helpers place fields in it.
+ */
+#define R500_RS_IP_0					0x4074
+#define R500_RS_IP_1					0x4078
+#define R500_RS_IP_2					0x407C
+#define R500_RS_IP_3					0x4080
+#define R500_RS_IP_4					0x4084
+#define R500_RS_IP_5					0x4088
+#define R500_RS_IP_6					0x408C
+#define R500_RS_IP_7					0x4090
+#define R500_RS_IP_8					0x4094
+#define R500_RS_IP_9					0x4098
+#define R500_RS_IP_10					0x409C
+#define R500_RS_IP_11					0x40A0
+#define R500_RS_IP_12					0x40A4
+#define R500_RS_IP_13					0x40A8
+#define R500_RS_IP_14					0x40AC
+#define R500_RS_IP_15					0x40B0
+#define R500_RS_IP_PTR_K0                               62
+#define R500_RS_IP_PTR_K1                               63
+#define R500_RS_IP_TEX_PTR_S_SHIFT 			0
+#define R500_RS_IP_TEX_PTR_T_SHIFT 			6
+#define R500_RS_IP_TEX_PTR_R_SHIFT 			12
+#define R500_RS_IP_TEX_PTR_Q_SHIFT 			18
+#define R500_RS_IP_COL_PTR_SHIFT 			24
+#define R500_RS_IP_COL_FMT_SHIFT 			27
+#       define R500_RS_SEL_S(x)                         ((x) << 0)
+#       define R500_RS_SEL_T(x)                         ((x) << 6)
+#       define R500_RS_SEL_R(x)                         ((x) << 12)
+#       define R500_RS_SEL_Q(x)                         ((x) << 18)
+#	define R500_RS_COL_PTR(x)		        ((x) << 24)
+#       define R500_RS_COL_FMT(x)                       ((x) << 27)
+/* gap; bit 31 uses unsigned literals because 1 << 31 overflows a 32-bit signed int */
+#define R500_RS_IP_OFFSET_DIS 				(0U << 31)
+#define R500_RS_IP_OFFSET_EN 				(1U << 31)
+
+/* gap */
+
+/* Zero to flush caches: R300_TX_FLUSH is the value to write to R300_TX_INVALTAGS. */
+#define R300_TX_INVALTAGS                   0x4100
+#define R300_TX_FLUSH                       0x0
+
+/* One enable bit per unit, bit n for unit n (0-15). The upper enable bits are guessed, based on fglrx reported limits. */
+#define R300_TX_ENABLE                      0x4104
+#       define R300_TX_ENABLE_0                  (1 << 0)
+#       define R300_TX_ENABLE_1                  (1 << 1)
+#       define R300_TX_ENABLE_2                  (1 << 2)
+#       define R300_TX_ENABLE_3                  (1 << 3)
+#       define R300_TX_ENABLE_4                  (1 << 4)
+#       define R300_TX_ENABLE_5                  (1 << 5)
+#       define R300_TX_ENABLE_6                  (1 << 6)
+#       define R300_TX_ENABLE_7                  (1 << 7)
+#       define R300_TX_ENABLE_8                  (1 << 8)
+#       define R300_TX_ENABLE_9                  (1 << 9)
+#       define R300_TX_ENABLE_10                 (1 << 10)
+#       define R300_TX_ENABLE_11                 (1 << 11)
+#       define R300_TX_ENABLE_12                 (1 << 12)
+#       define R300_TX_ENABLE_13                 (1 << 13)
+#       define R300_TX_ENABLE_14                 (1 << 14)
+#       define R300_TX_ENABLE_15                 (1 << 15)
+
+#define R500_TX_FILTER_4		    0x4110 /* weight fields at bits 0 and 11, pair flag at 22, phase at 23, direction at 27 */
+#	define R500_TX_WEIGHT_1_SHIFT            (0)
+#	define R500_TX_WEIGHT_0_SHIFT            (11)
+#	define R500_TX_WEIGHT_PAIR               (1<<22)
+#	define R500_TX_PHASE_SHIFT               (23)
+#	define R500_TX_DIRECTION_HORIZONTAL	 (0<<27)
+#	define R500_TX_DIRECTION_VERITCAL	 (1<<27) /* sic: "VERTICAL" is misspelled; kept because the name is part of the interface */
+
+/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC - presumably lower-left corner) */
+#define R300_GA_POINT_S0                              0x4200
+
+/* T Texture Coordinate of Vertex 0 for Point texture stuffing (LLC - presumably lower-left corner) */
+#define R300_GA_POINT_T0                              0x4204
+
+/* S Texture Coordinate of Vertex 2 for Point texture stuffing (URC - presumably upper-right corner) */
+#define R300_GA_POINT_S1                              0x4208
+
+/* T Texture Coordinate of Vertex 2 for Point texture stuffing (URC - presumably upper-right corner) */
+#define R300_GA_POINT_T1                              0x420c
+
+/* Specifies amount to shift integer position of vertex (screen space) before
+ * converting to float for triangle stipple. Separate 4-bit X and Y shift fields.
+ */
+#define R300_GA_TRIANGLE_STIPPLE            0x4214
+#	define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_SHIFT 0
+#	define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_MASK  0x0000000f /* bits 3:0 */
+#	define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT 16
+#	define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_MASK  0x000f0000 /* bits 19:16 */
+
+/* The pointsize is given in multiples of 6. The pointsize can be enormous:
+ * Clear() renders a single point that fills the entire framebuffer.
+ * 1/2 Height of point; fixed (16.0), subpixel format (1/12 or 1/16, even if in
+ * 8b precision). Y occupies the low 16 bits, X the high 16 bits.
+ */
+#define R300_GA_POINT_SIZE                   0x421C
+#       define R300_POINTSIZE_Y_SHIFT         0
+#       define R300_POINTSIZE_Y_MASK          0x0000ffff
+#       define R300_POINTSIZE_X_SHIFT         16
+#       define R300_POINTSIZE_X_MASK          0xffff0000
+#       define R300_POINTSIZE_MAX             (R300_POINTSIZE_Y_MASK / 6) /* largest 16-bit field value divided by the multiples-of-6 unit */
+
+/* Red component of the fill color; one register per component (R, G, B, A) */
+#define R500_GA_FILL_R                                0x4220
+
+/* Green component of the fill color */
+#define R500_GA_FILL_G                                0x4224
+
+/* Blue component of the fill color */
+#define R500_GA_FILL_B                                0x4228
+
+/* Alpha component of the fill color */
+#define R500_GA_FILL_A                                0x422c
+
+
+/* Specifies maximum and minimum point & sprite sizes for per vertex size
+ * specification. The lower part (15:0) is MIN and (31:16) is MAX; the MAX mask is unsigned so the shift cannot overflow int.
+ */
+#define R300_GA_POINT_MINMAX                0x4230
+#       define R300_GA_POINT_MINMAX_MIN_SHIFT          0
+#       define R300_GA_POINT_MINMAX_MIN_MASK           (0xFFFF << 0)
+#       define R300_GA_POINT_MINMAX_MAX_SHIFT          16
+#       define R300_GA_POINT_MINMAX_MAX_MASK           (0xFFFFU << 16)
+
+/* 1/2 width of line, in subpixels (1/12 or 1/16 only, even in 8b
+ * subprecision); (16.0) fixed format.
+ *
+ * The line width is given in multiples of 6.
+ * In default mode lines are classified as vertical lines.
+ * HO: horizontal
+ * VE: vertical or horizontal
+ * HO & VE: no classification
+ */
+#define R300_GA_LINE_CNTL                             0x4234
+#       define R300_GA_LINE_CNTL_WIDTH_SHIFT       0
+#       define R300_GA_LINE_CNTL_WIDTH_MASK        0x0000ffff
+#	define R300_GA_LINE_CNTL_END_TYPE_HOR      (0 << 16)
+#	define R300_GA_LINE_CNTL_END_TYPE_VER      (1 << 16)
+#	define R300_GA_LINE_CNTL_END_TYPE_SQR      (2 << 16) /* horizontal or vertical depending upon slope */
+#	define R300_GA_LINE_CNTL_END_TYPE_COMP     (3 << 16) /* Computed (perpendicular to slope) */
+#	define R500_GA_LINE_CNTL_SORT_NO           (0 << 18)
+#	define R500_GA_LINE_CNTL_SORT_MINX_MINY    (1 << 18)
+/** TODO: looks wrong - width mask divided by the multiples-of-6 unit */
+#       define R300_LINESIZE_MAX              (R300_GA_LINE_CNTL_WIDTH_MASK / 6)
+/** TODO: looks wrong - same value as R300_GA_LINE_CNTL_END_TYPE_VER */
+#       define R300_LINE_CNT_HO               (1 << 16)
+/** TODO: looks wrong - same value as R300_GA_LINE_CNTL_END_TYPE_SQR */
+#       define R300_LINE_CNT_VE               (1 << 17)
+
+/* Line Stipple configuration information: reset mode in bits 1:0, stipple scale in the remaining bits. */
+#define R300_GA_LINE_STIPPLE_CONFIG                   0x4238
+#	define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_NO     (0 << 0)
+#	define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE   (1 << 0)
+#	define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_PACKET (2 << 0)
+#	define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_SHIFT 2
+#	define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK  0xfffffffc /* bits 31:2 */
+
+/* Used to load US instructions and constants: 8-bit index, type select at bit 16, clamp select at bit 17. */
+#define R500_GA_US_VECTOR_INDEX               0x4250
+#	define R500_GA_US_VECTOR_INDEX_SHIFT       0
+#	define R500_GA_US_VECTOR_INDEX_MASK        0x000000ff
+#	define R500_GA_US_VECTOR_INDEX_TYPE_INSTR  (0 << 16)
+#	define R500_GA_US_VECTOR_INDEX_TYPE_CONST  (1 << 16)
+#	define R500_GA_US_VECTOR_INDEX_CLAMP_NO    (0 << 17)
+#	define R500_GA_US_VECTOR_INDEX_CLAMP_CONST (1 << 17)
+
+/* Data register for loading US instructions and constants (companion of R500_GA_US_VECTOR_INDEX) */
+#define R500_GA_US_VECTOR_DATA                0x4254
+
+/* Specifies color properties and mappings of textures. TEXn shading values sit at bit 2*n (n = 0..10); COLOR0/COLOR1 overrides start at bits 22 and 26. */
+#define R500_GA_COLOR_CONTROL_PS3                     0x4258
+#	define R500_TEX0_SHADING_PS3_SOLID       (0 << 0)
+#	define R500_TEX0_SHADING_PS3_FLAT        (1 << 0)
+#	define R500_TEX0_SHADING_PS3_GOURAUD     (2 << 0)
+#	define R500_TEX1_SHADING_PS3_SOLID       (0 << 2)
+#	define R500_TEX1_SHADING_PS3_FLAT        (1 << 2)
+#	define R500_TEX1_SHADING_PS3_GOURAUD     (2 << 2)
+#	define R500_TEX2_SHADING_PS3_SOLID       (0 << 4)
+#	define R500_TEX2_SHADING_PS3_FLAT        (1 << 4)
+#	define R500_TEX2_SHADING_PS3_GOURAUD     (2 << 4)
+#	define R500_TEX3_SHADING_PS3_SOLID       (0 << 6)
+#	define R500_TEX3_SHADING_PS3_FLAT        (1 << 6)
+#	define R500_TEX3_SHADING_PS3_GOURAUD     (2 << 6)
+#	define R500_TEX4_SHADING_PS3_SOLID       (0 << 8)
+#	define R500_TEX4_SHADING_PS3_FLAT        (1 << 8)
+#	define R500_TEX4_SHADING_PS3_GOURAUD     (2 << 8)
+#	define R500_TEX5_SHADING_PS3_SOLID       (0 << 10)
+#	define R500_TEX5_SHADING_PS3_FLAT        (1 << 10)
+#	define R500_TEX5_SHADING_PS3_GOURAUD     (2 << 10)
+#	define R500_TEX6_SHADING_PS3_SOLID       (0 << 12)
+#	define R500_TEX6_SHADING_PS3_FLAT        (1 << 12)
+#	define R500_TEX6_SHADING_PS3_GOURAUD     (2 << 12)
+#	define R500_TEX7_SHADING_PS3_SOLID       (0 << 14)
+#	define R500_TEX7_SHADING_PS3_FLAT        (1 << 14)
+#	define R500_TEX7_SHADING_PS3_GOURAUD     (2 << 14)
+#	define R500_TEX8_SHADING_PS3_SOLID       (0 << 16)
+#	define R500_TEX8_SHADING_PS3_FLAT        (1 << 16)
+#	define R500_TEX8_SHADING_PS3_GOURAUD     (2 << 16)
+#	define R500_TEX9_SHADING_PS3_SOLID       (0 << 18)
+#	define R500_TEX9_SHADING_PS3_FLAT        (1 << 18)
+#	define R500_TEX9_SHADING_PS3_GOURAUD     (2 << 18)
+#	define R500_TEX10_SHADING_PS3_SOLID      (0 << 20)
+#	define R500_TEX10_SHADING_PS3_FLAT       (1 << 20)
+#	define R500_TEX10_SHADING_PS3_GOURAUD    (2 << 20)
+#	define R500_COLOR0_TEX_OVERRIDE_NO       (0 << 22) /* COLOR0 override selector, values 0-10 */
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_0    (1 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_1    (2 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_2    (3 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_3    (4 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_4    (5 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_5    (6 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_6    (7 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_7    (8 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_8_C2 (9 << 22)
+#	define R500_COLOR0_TEX_OVERRIDE_TEX_9_C3 (10 << 22)
+#	define R500_COLOR1_TEX_OVERRIDE_NO       (0 << 26) /* COLOR1 override selector, values 0-10 */
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_0    (1 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_1    (2 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_2    (3 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_3    (4 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_4    (5 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_5    (6 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_6    (7 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_7    (8 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_8_C2 (9 << 26)
+#	define R500_COLOR1_TEX_OVERRIDE_TEX_9_C3 (10 << 26)
+
+/* Returns idle status of various G3D block, captured when GA_IDLE written or
+ * when hard or soft reset asserted. Each macro is a single-bit mask (1 << n) for testing the value read back.
+ */
+#define R500_GA_IDLE                                  0x425c
+#	define R500_GA_IDLE_PIPE3_Z_IDLE  (1 << 0)
+#	define R500_GA_IDLE_PIPE2_Z_IDLE  (1 << 1)
+#	define R500_GA_IDLE_PIPE3_CD_IDLE (1 << 2)
+#	define R500_GA_IDLE_PIPE2_CD_IDLE (1 << 3)
+#	define R500_GA_IDLE_PIPE3_FG_IDLE (1 << 4)
+#	define R500_GA_IDLE_PIPE2_FG_IDLE (1 << 5)
+#	define R500_GA_IDLE_PIPE3_US_IDLE (1 << 6)
+#	define R500_GA_IDLE_PIPE2_US_IDLE (1 << 7)
+#	define R500_GA_IDLE_PIPE3_SC_IDLE (1 << 8)
+#	define R500_GA_IDLE_PIPE2_SC_IDLE (1 << 9)
+#	define R500_GA_IDLE_PIPE3_RS_IDLE (1 << 10)
+#	define R500_GA_IDLE_PIPE2_RS_IDLE (1 << 11)
+#	define R500_GA_IDLE_PIPE1_Z_IDLE  (1 << 12)
+#	define R500_GA_IDLE_PIPE0_Z_IDLE  (1 << 13)
+#	define R500_GA_IDLE_PIPE1_CD_IDLE (1 << 14)
+#	define R500_GA_IDLE_PIPE0_CD_IDLE (1 << 15)
+#	define R500_GA_IDLE_PIPE1_FG_IDLE (1 << 16)
+#	define R500_GA_IDLE_PIPE0_FG_IDLE (1 << 17)
+#	define R500_GA_IDLE_PIPE1_US_IDLE (1 << 18)
+#	define R500_GA_IDLE_PIPE0_US_IDLE (1 << 19)
+#	define R500_GA_IDLE_PIPE1_SC_IDLE (1 << 20)
+#	define R500_GA_IDLE_PIPE0_SC_IDLE (1 << 21)
+#	define R500_GA_IDLE_PIPE1_RS_IDLE (1 << 22)
+#	define R500_GA_IDLE_PIPE0_RS_IDLE (1 << 23)
+#	define R500_GA_IDLE_SU_IDLE       (1 << 24)
+#	define R500_GA_IDLE_GA_IDLE       (1 << 25)
+#	define R500_GA_IDLE_GA_UNIT2_IDLE (1 << 26)
+
+/* Current value of the line stipple accumulator. */
+#define R300_GA_LINE_STIPPLE_VALUE            0x4260
+
+/* S Texture Coordinate Value for Vertex 0 of Line (stuff textures -- i.e. AA) */
+#define R300_GA_LINE_S0                               0x4264
+/* S Texture Coordinate Value for Vertex 1 of Line (V2 of parallelogram -- stuff textures -- i.e. AA) */
+#define R300_GA_LINE_S1                               0x4268
+
+/* GA Input fifo high water marks: three fields, each given as a mask plus its bit offset. */
+#define R500_GA_FIFO_CNTL                             0x4270
+#	define R500_GA_FIFO_CNTL_VERTEX_FIFO_MASK   0x00000007 /* bits 2:0 */
+#	define R500_GA_FIFO_CNTL_VERTEX_FIFO_SHIFT  0
+#	define R500_GA_FIFO_CNTL_VERTEX_INDEX_MASK  0x00000038 /* bits 5:3 */
+#	define R500_GA_FIFO_CNTL_VERTEX_INDEX_SHIFT 3
+#	define R500_GA_FIFO_CNTL_VERTEX_REG_MASK    0x00003fc0 /* bits 13:6 */
+#	define R500_GA_FIFO_CNTL_VERTEX_REG_SHIFT   6
+
+/* GA enhance/tweaks: four independent single-bit switches at bits 0-3. */
+#define R300_GA_ENHANCE                               0x4274
+#	define R300_GA_ENHANCE_DEADLOCK_CNTL_NO_EFFECT   (0 << 0)
+#	define R300_GA_ENHANCE_DEADLOCK_CNTL_PREVENT_TCL (1 << 0) /* Prevents TCL interface from deadlocking on GA side. */
+#	define R300_GA_ENHANCE_FASTSYNC_CNTL_NO_EFFECT   (0 << 1)
+#	define R300_GA_ENHANCE_FASTSYNC_CNTL_ENABLE      (1 << 1) /* Enables high-performance register/primitive switching. */
+#	define R500_GA_ENHANCE_REG_READWRITE_NO_EFFECT   (0 << 2) /* R520+ only */
+#	define R500_GA_ENHANCE_REG_READWRITE_ENABLE      (1 << 2) /* R520+ only, Enables GA support of simultaneous register reads and writes. */
+#	define R500_GA_ENHANCE_REG_NOSTALL_NO_EFFECT     (0 << 3)
+#	define R500_GA_ENHANCE_REG_NOSTALL_ENABLE        (1 << 3) /* Enables GA support of no-stall reads for register read back. */
+
+#define R300_GA_COLOR_CONTROL                   0x4278 /* shading mode per RGBn/ALPHAn channel at 2-bit steps (bits 0-15), provoking vertex at bit 16; R300_SHADE_MODEL_* OR one mode across all eight channels */
+#	define R300_GA_COLOR_CONTROL_RGB0_SHADING_SOLID      (0 << 0)
+#	define R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT       (1 << 0)
+#	define R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD    (2 << 0)
+#	define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_SOLID    (0 << 2)
+#	define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT     (1 << 2)
+#	define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD  (2 << 2)
+#	define R300_GA_COLOR_CONTROL_RGB1_SHADING_SOLID      (0 << 4)
+#	define R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT       (1 << 4)
+#	define R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD    (2 << 4)
+#	define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_SOLID    (0 << 6)
+#	define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_FLAT     (1 << 6)
+#	define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD  (2 << 6)
+#	define R300_GA_COLOR_CONTROL_RGB2_SHADING_SOLID      (0 << 8)
+#	define R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT       (1 << 8)
+#	define R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD    (2 << 8)
+#	define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_SOLID    (0 << 10)
+#	define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT     (1 << 10)
+#	define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD  (2 << 10)
+#	define R300_GA_COLOR_CONTROL_RGB3_SHADING_SOLID      (0 << 12)
+#	define R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT       (1 << 12)
+#	define R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD    (2 << 12)
+#	define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_SOLID    (0 << 14)
+#	define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_FLAT     (1 << 14)
+#	define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD  (2 << 14)
+#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_FIRST  (0 << 16)
+#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND (1 << 16)
+#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_THIRD  (2 << 16)
+#	define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST   (3 << 16)
+
+#       define R300_SHADE_MODEL_FLAT ( \
+        R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_ALPHA1_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | \
+        R300_GA_COLOR_CONTROL_ALPHA3_SHADING_FLAT )
+
+#       define R300_SHADE_MODEL_SMOOTH ( \
+        R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | \
+        R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD )
+
+/* Specifies red & green components of fill color -- S312 format -- Backwards comp. Green in the low 16 bits, red in the high 16. */
+#define R300_GA_SOLID_RG                         0x427c
+#	define GA_SOLID_RG_COLOR_GREEN_SHIFT 0
+#	define GA_SOLID_RG_COLOR_GREEN_MASK  0x0000ffff
+#	define GA_SOLID_RG_COLOR_RED_SHIFT   16
+#	define GA_SOLID_RG_COLOR_RED_MASK    0xffff0000
+/* Specifies blue & alpha components of fill color -- S312 format -- Backwards comp. Alpha in the low 16 bits, blue in the high 16. */
+#define R300_GA_SOLID_BA                         0x4280
+#	define GA_SOLID_BA_COLOR_ALPHA_SHIFT 0
+#	define GA_SOLID_BA_COLOR_ALPHA_MASK  0x0000ffff
+#	define GA_SOLID_BA_COLOR_BLUE_SHIFT  16
+#	define GA_SOLID_BA_COLOR_BLUE_MASK   0xffff0000
+
+/* Polygon Mode: mode select at bit 0, front primitive type at bit 4, back primitive type at bit 7.
+ * Dangerous
+ */
+#define R300_GA_POLY_MODE                             0x4288
+#	define R300_GA_POLY_MODE_DISABLE           (0 << 0)
+#	define R300_GA_POLY_MODE_DUAL              (1 << 0) /* send 2 sets of 3 polys with specified poly type */
+/* reserved */
+#	define R300_GA_POLY_MODE_FRONT_PTYPE_POINT (0 << 4)
+#	define R300_GA_POLY_MODE_FRONT_PTYPE_LINE  (1 << 4)
+#	define R300_GA_POLY_MODE_FRONT_PTYPE_TRI   (2 << 4)
+/* reserved */
+#	define R300_GA_POLY_MODE_BACK_PTYPE_POINT  (0 << 7)
+#	define R300_GA_POLY_MODE_BACK_PTYPE_LINE   (1 << 7)
+#	define R300_GA_POLY_MODE_BACK_PTYPE_TRI    (2 << 7)
+/* reserved */
+
+/* Specifies the rounding mode for geometry & color SPFP to FP conversions. */
+#define R300_GA_ROUND_MODE                            0x428c
+#	define R300_GA_ROUND_MODE_GEOMETRY_ROUND_TRUNC   (0 << 0)
+#	define R300_GA_ROUND_MODE_GEOMETRY_ROUND_NEAREST (1 << 0)
+#	define R300_GA_ROUND_MODE_COLOR_ROUND_TRUNC      (0 << 2)
+#	define R300_GA_ROUND_MODE_COLOR_ROUND_NEAREST    (1 << 2)
+#	define R300_GA_ROUND_MODE_RGB_CLAMP_RGB          (0 << 4)
+#	define R300_GA_ROUND_MODE_RGB_CLAMP_FP20         (1 << 4)
+#	define R300_GA_ROUND_MODE_ALPHA_CLAMP_RGB        (0 << 5)
+#	define R300_GA_ROUND_MODE_ALPHA_CLAMP_FP20       (1 << 5)
+#	define R500_GA_ROUND_MODE_GEOMETRY_MASK_SHIFT    6
+#	define R500_GA_ROUND_MODE_GEOMETRY_MASK_MASK     0x000003c0 /* bits 9:6 */
+
+/* Specifies x & y offsets for vertex data after conversion to FP.
+ * Offsets are in S15 format (subpixels -- 1/12 or 1/16, even in 8b
+ * subprecision). X occupies the low 16 bits, Y the high 16 bits.
+ */
+#define R300_GA_OFFSET                                0x4290
+#	define R300_GA_OFFSET_X_OFFSET_SHIFT 0
+#	define R300_GA_OFFSET_X_OFFSET_MASK  0x0000ffff
+#	define R300_GA_OFFSET_Y_OFFSET_SHIFT 16
+#	define R300_GA_OFFSET_Y_OFFSET_MASK  0xffff0000
+
+/* Specifies the scale to apply to fog (no sub-fields defined here). */
+#define R300_GA_FOG_SCALE                     0x4294
+/* Specifies the offset to apply to fog (no sub-fields defined here). */
+#define R300_GA_FOG_OFFSET                    0x4298
+/* Specifies the number of cycles to assert reset, and also causes RB3D soft reset to assert. */
+#define R300_GA_SOFT_RESET                    0x429c
+
+/* Not sure why there are duplicates of the factor and constant values.
+ * My best guess so far is that there are separate zbiases for test and write.
+ * Ordering might be wrong.
+ * Some of the tests indicate that fgl has a fallback implementation of zbias
+ * via pixel shaders.
+ */
+#define R300_SU_TEX_WRAP                      0x42A0
+#define R300_SU_POLY_OFFSET_FRONT_SCALE       0x42A4
+#define R300_SU_POLY_OFFSET_FRONT_OFFSET      0x42A8
+#define R300_SU_POLY_OFFSET_BACK_SCALE        0x42AC
+#define R300_SU_POLY_OFFSET_BACK_OFFSET       0x42B0
+
+/* This register needs to be set to (1<<1) for RV350 to correctly
+ * perform depth test (see --vb-triangles in r300_demo)
+ * Don't know about other chips. - Vladimir
+ * This is set to 3 when GL_POLYGON_OFFSET_FILL is on.
+ * My guess is that there are two bits for each zbias primitive
+ * (FILL, LINE, POINT).
+ *  One to enable depth test and one for depth write.
+ * Yet this doesn't explain why depth writes work ...
+ */
+#define R300_SU_POLY_OFFSET_ENABLE	       0x42B4
+#	define R300_FRONT_ENABLE	       (1 << 0)
+#	define R300_BACK_ENABLE 	       (1 << 1)
+#	define R300_PARA_ENABLE 	       (1 << 2)
+
+#define R300_SU_CULL_MODE                      0x42B8 /* cull-front and cull-back flags at bits 0-1, front-face winding at bit 2 */
+#       define R300_CULL_FRONT                   (1 << 0)
+#       define R300_CULL_BACK                    (1 << 1)
+#       define R300_FRONT_FACE_CCW               (0 << 2)
+#       define R300_FRONT_FACE_CW                (1 << 2)
+
+/* SU Depth Scale value (no sub-fields defined here) */
+#define R300_SU_DEPTH_SCALE                 0x42c0
+/* SU Depth Offset value (no sub-fields defined here) */
+#define R300_SU_DEPTH_OFFSET                0x42c4
+
+#define R300_SU_REG_DEST		    0x42c8 /* one select bit per raster pipe, bits 0-3 */
+#	define R300_RASTER_PIPE_SELECT_0	(1 << 0)
+#	define R300_RASTER_PIPE_SELECT_1	(1 << 1)
+#	define R300_RASTER_PIPE_SELECT_2	(1 << 2)
+#	define R300_RASTER_PIPE_SELECT_3	(1 << 3)
+#	define R300_RASTER_PIPE_SELECT_ALL	0xf /* the four select bits above OR'ed together */
+
+
+/* BEGIN: Rasterization / Interpolators - many guesses */
+
+/*
+ * TC_CNT is the number of incoming texture coordinate sets (i.e. it depends
+ * on the vertex program, *not* the fragment program).
+ * NOTE(review): no TC_CNT macro exists below; presumably this refers to IT_COUNT - confirm.
+ */
+#define R300_RS_COUNT                      0x4300
+#       define R300_IT_COUNT_SHIFT               0
+#       define R300_IT_COUNT_MASK                0x0000007f
+#       define R300_IC_COUNT_SHIFT               7
+#       define R300_IC_COUNT_MASK                0x00000780
+#       define R300_W_ADDR_SHIFT                 12
+#       define R300_W_ADDR_MASK                  0x0003f000
+#       define R300_HIRES_DIS                    (0 << 18)
+#       define R300_HIRES_EN                     (1 << 18)
+#       define R300_IT_COUNT(x)                  ((x) << 0)
+#       define R300_IC_COUNT(x)                  ((x) << 7)
+#       define R300_W_COUNT(x)                   ((x) << 12) /* shifts by R300_W_ADDR_SHIFT despite the COUNT name */
+
+#define R300_RS_INST_COUNT                       0x4304 /* instruction count in bits 3:0, TX offset in bits 7:5 */
+#       define R300_RS_INST_COUNT_SHIFT          0
+#       define R300_RS_INST_COUNT_MASK           0x0000000f
+#       define R300_RS_TX_OFFSET_SHIFT           5
+#	define R300_RS_TX_OFFSET_MASK            0x000000e0
+#       define R300_RS_TX_OFFSET(x)              ((x) << 5) /* places x at R300_RS_TX_OFFSET_SHIFT */
+
+/* gap */
+
+/* Only used for texture coordinates.
+ * Use the source field to route texture coordinate input from the
+ * vertex program to the desired interpolator. Note that the source
+ * field is relative to the outputs the vertex program *actually*
+ * writes. If a vertex program only writes texcoord[1], this will
+ * be source index 0.
+ * Set INTERP_USED on all interpolators that produce data used by
+ * the fragment program. INTERP_USED looks like a swizzling mask,
+ * but I haven't seen it used that way.
+ *
+ * Note: The _UNKNOWN constants are always set in their respective
+ * register. I don't know if this is necessary.
+ */
+#define R300_RS_IP_0				        0x4310
+#define R300_RS_IP_1				        0x4314
+#define R300_RS_IP_2				        0x4318
+#define R300_RS_IP_3				        0x431C
+#       define R300_RS_INTERP_SRC_SHIFT          2 /* TODO: check for removal */
+#       define R300_RS_INTERP_SRC_MASK           (7 << 2) /* TODO: check for removal */
+#	define R300_RS_TEX_PTR(x)		        ((x) << 0) /* argument parenthesised like the sibling macros */
+#	define R300_RS_COL_PTR(x)		        ((x) << 6)
+#	define R300_RS_COL_FMT(x)		        ((x) << 9)
+#	define R300_RS_COL_FMT_RGBA		        0
+#	define R300_RS_COL_FMT_RGB0		        1
+#	define R300_RS_COL_FMT_RGB1		        2
+#	define R300_RS_COL_FMT_000A		        4
+#	define R300_RS_COL_FMT_0000		        5
+#	define R300_RS_COL_FMT_0001		        6
+#	define R300_RS_COL_FMT_111A		        8
+#	define R300_RS_COL_FMT_1110		        9
+#	define R300_RS_COL_FMT_1111		        10
+#	define R300_RS_SEL_S(x)		                ((x) << 13)
+#	define R300_RS_SEL_T(x)		                ((x) << 16)
+#	define R300_RS_SEL_R(x)		                ((x) << 19)
+#	define R300_RS_SEL_Q(x)		                ((x) << 22)
+#	define R300_RS_SEL_C0		                0
+#	define R300_RS_SEL_C1		                1
+#	define R300_RS_SEL_C2		                2
+#	define R300_RS_SEL_C3		                3
+#	define R300_RS_SEL_K0		                4
+#	define R300_RS_SEL_K1		                5
+
+
+/* Sixteen R500 RS_INST registers at 0x4320-0x435c, followed by the bit positions of their fields. */
+#define R500_RS_INST_0					0x4320
+#define R500_RS_INST_1					0x4324
+#define R500_RS_INST_2					0x4328
+#define R500_RS_INST_3					0x432c
+#define R500_RS_INST_4					0x4330
+#define R500_RS_INST_5					0x4334
+#define R500_RS_INST_6					0x4338
+#define R500_RS_INST_7					0x433c
+#define R500_RS_INST_8					0x4340
+#define R500_RS_INST_9					0x4344
+#define R500_RS_INST_10					0x4348
+#define R500_RS_INST_11					0x434c
+#define R500_RS_INST_12					0x4350
+#define R500_RS_INST_13					0x4354
+#define R500_RS_INST_14					0x4358
+#define R500_RS_INST_15					0x435c
+#define R500_RS_INST_TEX_ID_SHIFT			0
+#        define R500_RS_INST_TEX_ID(x)                  ((x) << 0)
+#define R500_RS_INST_TEX_CN_WRITE			(1 << 4)
+#define R500_RS_INST_TEX_ADDR_SHIFT			5
+#        define R500_RS_INST_TEX_ADDR(x)                ((x) << 5)
+#define R500_RS_INST_COL_ID_SHIFT			12
+#        define R500_RS_INST_COL_ID(x)                  ((x) << 12)
+#define R500_RS_INST_COL_CN_NO_WRITE			(0 << 16) /* COL_CN selector: values 0-3 at bit 16 */
+#define R500_RS_INST_COL_CN_WRITE			(1 << 16)
+#define R500_RS_INST_COL_CN_WRITE_FBUFFER		(2 << 16)
+#define R500_RS_INST_COL_CN_WRITE_BACKFACE		(3 << 16)
+#define R500_RS_INST_COL_ADDR_SHIFT			18
+#        define R500_RS_INST_COL_ADDR(x)                ((x) << 18)
+#define R500_RS_INST_TEX_ADJ				(1 << 25)
+#define R500_RS_INST_W_CN				(1 << 26)
+
+/* These DWORDs control how vertex data is routed into fragment program
+ * registers, after interpolators. Eight registers at 0x4330-0x434C, then their field helpers.
+ */
+#define R300_RS_INST_0                     0x4330
+#define R300_RS_INST_1                     0x4334
+#define R300_RS_INST_2                     0x4338
+#define R300_RS_INST_3                     0x433C
+#define R300_RS_INST_4                     0x4340
+#define R300_RS_INST_5                     0x4344
+#define R300_RS_INST_6                     0x4348
+#define R300_RS_INST_7                     0x434C
+#	define R300_RS_INST_TEX_ID(x)  		((x) << 0)
+#	define R300_RS_INST_TEX_CN_WRITE 	(1 << 3)
+#	define R300_RS_INST_TEX_ADDR(x)		((x) << 6)
+#	define R300_RS_INST_TEX_ADDR_SHIFT 	6
+#	define R300_RS_INST_COL_ID(x)		((x) << 11)
+#	define R300_RS_INST_COL_CN_WRITE	(1 << 14)
+#	define R300_RS_INST_COL_ADDR(x)		((x) << 17)
+#	define R300_RS_INST_COL_ADDR_SHIFT	17
+#	define R300_RS_INST_TEX_ADJ		(1 << 22)
+#	define R300_RS_COL_BIAS_UNUSED_SHIFT    23
+
+/* END: Rasterization / Interpolators - many guesses */
+
+/* Hierarchical Z Enable: enable at bit 0, MIN/MAX select at bit 1, ADJ selector at bits 4:2, Z0MIN/Z0MAX flags at bits 5 and 6. */
+#define R300_SC_HYPERZ                   0x43a4
+#	define R300_SC_HYPERZ_DISABLE     (0 << 0)
+#	define R300_SC_HYPERZ_ENABLE      (1 << 0)
+#	define R300_SC_HYPERZ_MIN         (0 << 1)
+#	define R300_SC_HYPERZ_MAX         (1 << 1)
+#	define R300_SC_HYPERZ_ADJ_256     (0 << 2)
+#	define R300_SC_HYPERZ_ADJ_128     (1 << 2)
+#	define R300_SC_HYPERZ_ADJ_64      (2 << 2)
+#	define R300_SC_HYPERZ_ADJ_32      (3 << 2)
+#	define R300_SC_HYPERZ_ADJ_16      (4 << 2)
+#	define R300_SC_HYPERZ_ADJ_8       (5 << 2)
+#	define R300_SC_HYPERZ_ADJ_4       (6 << 2)
+#	define R300_SC_HYPERZ_ADJ_2       (7 << 2)
+#	define R300_SC_HYPERZ_HZ_Z0MIN_NO (0 << 5)
+#	define R300_SC_HYPERZ_HZ_Z0MIN    (1 << 5)
+#	define R300_SC_HYPERZ_HZ_Z0MAX_NO (0 << 6)
+#	define R300_SC_HYPERZ_HZ_Z0MAX    (1 << 6)
+
+#define R300_SC_EDGERULE                 0x43a8
+
+/* BEGIN: Scissors and cliprects */
+
+/* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ * Iff the bit corresponding to the pixel's number in RE_CLIPRECT_CNTL is set,
+ * the pixel is rasterized.
+ *
+ * In addition to this, there is a scissors rectangle. Only pixels inside the
+ * scissors rectangle are drawn. (coordinates are inclusive)
+ *
+ * For some reason, the top-left corner of the framebuffer is at (1440, 1440)
+ * for the purpose of clipping and scissors.
+ */
+#define R300_SC_CLIPRECT_TL_0               0x43B0
+#define R300_SC_CLIPRECT_BR_0               0x43B4
+#define R300_SC_CLIPRECT_TL_1               0x43B8
+#define R300_SC_CLIPRECT_BR_1               0x43BC
+#define R300_SC_CLIPRECT_TL_2               0x43C0
+#define R300_SC_CLIPRECT_BR_2               0x43C4
+#define R300_SC_CLIPRECT_TL_3               0x43C8
+#define R300_SC_CLIPRECT_BR_3               0x43CC
+#       define R300_CLIPRECT_OFFSET              1440
+#       define R300_CLIPRECT_MASK                0x1FFF
+#       define R300_CLIPRECT_X_SHIFT             0
+#       define R300_CLIPRECT_X_MASK              (0x1FFF << 0)
+#       define R300_CLIPRECT_Y_SHIFT             13
+#       define R300_CLIPRECT_Y_MASK              (0x1FFF << 13)
+#define R300_SC_CLIP_RULE                   0x43D0
+#       define R300_CLIP_OUT                     (1 << 0)
+#       define R300_CLIP_0                       (1 << 1)
+#       define R300_CLIP_1                       (1 << 2)
+#       define R300_CLIP_10                      (1 << 3)
+#       define R300_CLIP_2                       (1 << 4)
+#       define R300_CLIP_20                      (1 << 5)
+#       define R300_CLIP_21                      (1 << 6)
+#       define R300_CLIP_210                     (1 << 7)
+#       define R300_CLIP_3                       (1 << 8)
+#       define R300_CLIP_30                      (1 << 9)
+#       define R300_CLIP_31                      (1 << 10)
+#       define R300_CLIP_310                     (1 << 11)
+#       define R300_CLIP_32                      (1 << 12)
+#       define R300_CLIP_320                     (1 << 13)
+#       define R300_CLIP_321                     (1 << 14)
+#       define R300_CLIP_3210                    (1 << 15)
+
+/* gap */
+
+#define R300_SC_SCISSORS_TL                 0x43E0
+#define R300_SC_SCISSORS_BR                 0x43E4
+#       define R300_SCISSORS_OFFSET              1440
+#       define R300_SCISSORS_X_SHIFT             0
+#       define R300_SCISSORS_X_MASK              (0x1FFF << 0)
+#       define R300_SCISSORS_Y_SHIFT             13
+#       define R300_SCISSORS_Y_MASK              (0x1FFF << 13)
+
+/* Screen door sample mask */
+#define R300_SC_SCREENDOOR                 0x43e8
+
+/* END: Scissors and cliprects */
+
+/* BEGIN: Texture specification */
+
+/*
+ * The texture specification dwords are grouped by meaning and not by texture
+ * unit. This means that e.g. the offset for texture image unit N is found in
+ * register TX_OFFSET_0 + (4*N)
+ */
+#define R300_TX_FILTER0_0                        0x4400
+#define R300_TX_FILTER0_1                        0x4404
+#define R300_TX_FILTER0_2                        0x4408
+#define R300_TX_FILTER0_3                        0x440c
+#define R300_TX_FILTER0_4                        0x4410
+#define R300_TX_FILTER0_5                        0x4414
+#define R300_TX_FILTER0_6                        0x4418
+#define R300_TX_FILTER0_7                        0x441c
+#define R300_TX_FILTER0_8                        0x4420
+#define R300_TX_FILTER0_9                        0x4424
+#define R300_TX_FILTER0_10                       0x4428
+#define R300_TX_FILTER0_11                       0x442c
+#define R300_TX_FILTER0_12                       0x4430
+#define R300_TX_FILTER0_13                       0x4434
+#define R300_TX_FILTER0_14                       0x4438
+#define R300_TX_FILTER0_15                       0x443c
+#       define R300_TX_REPEAT                    0
+#       define R300_TX_MIRRORED                  1
+#       define R300_TX_CLAMP_TO_EDGE             2
+#	define R300_TX_MIRROR_ONCE_TO_EDGE       3
+#       define R300_TX_CLAMP                     4
+#	define R300_TX_MIRROR_ONCE               5
+#       define R300_TX_CLAMP_TO_BORDER           6
+#	define R300_TX_MIRROR_ONCE_TO_BORDER     7
+#       define R300_TX_WRAP_S_SHIFT              0
+#       define R300_TX_WRAP_S_MASK               (7 << 0)
+#       define R300_TX_WRAP_T_SHIFT              3
+#       define R300_TX_WRAP_T_MASK               (7 << 3)
+#       define R300_TX_WRAP_R_SHIFT              6
+#       define R300_TX_WRAP_R_MASK               (7 << 6)
+#	define R300_TX_MAG_FILTER_4              (0 << 9)
+#       define R300_TX_MAG_FILTER_NEAREST        (1 << 9)
+#       define R300_TX_MAG_FILTER_LINEAR         (2 << 9)
+#       define R300_TX_MAG_FILTER_ANISO          (3 << 9)
+#       define R300_TX_MAG_FILTER_MASK           (3 << 9)
+#       define R300_TX_MIN_FILTER_NEAREST        (1 << 11)
+#       define R300_TX_MIN_FILTER_LINEAR         (2 << 11)
+#	define R300_TX_MIN_FILTER_ANISO          (3 << 11)
+#	define R300_TX_MIN_FILTER_MASK           (3 << 11)
+#	define R300_TX_MIN_FILTER_MIP_NONE       (0 << 13)
+#	define R300_TX_MIN_FILTER_MIP_NEAREST    (1 << 13)
+#	define R300_TX_MIN_FILTER_MIP_LINEAR     (2 << 13)
+#	define R300_TX_MIN_FILTER_MIP_MASK       (3 << 13)
+#       define R300_TX_MAX_MIP_LEVEL_SHIFT       17
+#       define R300_TX_MAX_MIP_LEVEL_MASK        (0xf << 17)
+#	define R300_TX_MAX_ANISO_1_TO_1          (0 << 21)
+#	define R300_TX_MAX_ANISO_2_TO_1          (1 << 21)
+#	define R300_TX_MAX_ANISO_4_TO_1          (2 << 21)
+#	define R300_TX_MAX_ANISO_8_TO_1          (3 << 21)
+#	define R300_TX_MAX_ANISO_16_TO_1         (4 << 21)
+#	define R300_TX_MAX_ANISO_MASK            (7 << 21)
+#       define R300_TX_WRAP_S(x)                 ((x) << 0)
+#       define R300_TX_WRAP_T(x)                 ((x) << 3)
+#       define R300_TX_MAX_MIP_LEVEL(x)          ((x) << 17)
+
+#define R300_TX_FILTER1_0                      0x4440
+#	define R300_CHROMA_KEY_MODE_DISABLE    0
+#	define R300_CHROMA_KEY_FORCE	       1
+#	define R300_CHROMA_KEY_BLEND           2
+#	define R300_MC_ROUND_NORMAL            (0<<2)
+#	define R300_MC_ROUND_MPEG4             (1<<2)
+#	define R300_LOD_BIAS_SHIFT             3
+#	define R300_LOD_BIAS_MASK	       0x1ff8
+#	define R300_EDGE_ANISO_EDGE_DIAG       (0<<13)
+#	define R300_EDGE_ANISO_EDGE_ONLY       (1<<13)
+#	define R300_MC_COORD_TRUNCATE_DISABLE  (0<<14)
+#	define R300_MC_COORD_TRUNCATE_MPEG     (1<<14)
+#	define R300_TX_TRI_PERF_0_8            (0<<15)
+#	define R300_TX_TRI_PERF_1_8            (1<<15)
+#	define R300_TX_TRI_PERF_1_4            (2<<15)
+#	define R300_TX_TRI_PERF_3_8            (3<<15)
+#	define R300_ANISO_THRESHOLD_MASK       (7<<17)
+
+#	define R500_MACRO_SWITCH               (1<<22)
+#       define R500_TX_MAX_ANISO(x)            ((x) << 23)
+#       define R500_TX_MAX_ANISO_MASK          (63 << 23)
+#       define R500_TX_ANISO_HIGH_QUALITY      (1 << 30)
+
+#	define R500_BORDER_FIX                 (1u<<31) /* unsigned: 1<<31 overflows a 32-bit int */
+
+#define R300_TX_FORMAT0_0                   0x4480
+#       define R300_TX_WIDTHMASK_SHIFT           0
+#       define R300_TX_WIDTHMASK_MASK            (2047 << 0)
+#       define R300_TX_HEIGHTMASK_SHIFT          11
+#       define R300_TX_HEIGHTMASK_MASK           (2047 << 11)
+#	define R300_TX_DEPTHMASK_SHIFT           22
+#	define R300_TX_DEPTHMASK_MASK            (0xf << 22)
+#       define R300_TX_SIZE_PROJECTED            (1 << 30)
+#       define R300_TX_PITCH_EN                  (1u << 31) /* unsigned: 1 << 31 overflows a 32-bit int */
+#       define R300_TX_WIDTH(x)                  ((x) << 0)
+#       define R300_TX_HEIGHT(x)                 ((x) << 11)
+#       define R300_TX_DEPTH(x)                  ((x) << 22)
+#       define R300_TX_NUM_LEVELS(x)             ((x) << 26)
+
+#define R300_TX_FORMAT1_0                   0x44C0
+	/* The interpretation of the format word by Wladimir van der Laan */
+	/* The X, Y, Z and W refer to the layout of the components.
+	   They are given meanings as R, G, B and Alpha by the swizzle
+	   specification */
+#	define R300_TX_FORMAT_X8		    0x0
+#	define R300_TX_FORMAT_X16		    0x1
+#	define R300_TX_FORMAT_Y4X4		    0x2
+#	define R300_TX_FORMAT_Y8X8		    0x3
+#	define R300_TX_FORMAT_Y16X16		    0x4
+#	define R300_TX_FORMAT_Z3Y3X2		    0x5
+#	define R300_TX_FORMAT_Z5Y6X5		    0x6
+#	define R300_TX_FORMAT_Z6Y5X5		    0x7
+#	define R300_TX_FORMAT_Z11Y11X10		    0x8
+#	define R300_TX_FORMAT_Z10Y11X11		    0x9
+#	define R300_TX_FORMAT_W4Z4Y4X4		    0xA
+#	define R300_TX_FORMAT_W1Z5Y5X5		    0xB
+#	define R300_TX_FORMAT_W8Z8Y8X8		    0xC
+#	define R300_TX_FORMAT_W2Z10Y10X10	    0xD
+#	define R300_TX_FORMAT_W16Z16Y16X16	    0xE
+#	define R300_TX_FORMAT_DXT1	    	    0xF
+#	define R300_TX_FORMAT_DXT3	    	    0x10
+#	define R300_TX_FORMAT_DXT5	    	    0x11
+#	define R300_TX_FORMAT_CxV8U8           	    0x12
+#	define R300_TX_FORMAT_AVYU444 	    	    0x13
+#	define R300_TX_FORMAT_VYUY422  	    	    0x14
+#	define R300_TX_FORMAT_YVYU422  	    	    0x15
+#	define R300_TX_FORMAT_16_MPEG  	    	    0x16
+#	define R300_TX_FORMAT_16_16_MPEG    	    0x17
+#	define R300_TX_FORMAT_16F     	    	    0x18
+#	define R300_TX_FORMAT_16F_16F 	    	    0x19
+#	define R300_TX_FORMAT_16F_16F_16F_16F  	    0x1A
+#	define R300_TX_FORMAT_32F     	    	    0x1B
+#	define R300_TX_FORMAT_32F_32F 	    	    0x1C
+#	define R300_TX_FORMAT_32F_32F_32F_32F  	    0x1D
+#       define R300_TX_FORMAT_W24_FP                0x1E
+#       define R400_TX_FORMAT_ATI2N                 0x1F
+
+/* These need TX_FORMAT2_[0-15].TXFORMAT_MSB set.
+
+   My guess is the 10-bit formats are the 8-bit ones but with filtering being
+   performed with the precision of 10 bits per channel. This makes sense
+   with sRGB textures since the conversion to linear space reduces the precision
+   significantly so the shader gets approximately the 8-bit precision
+   in the end. It might also improve the quality of HDR rendering where
+   high-precision filtering is desirable.
+
+   Again, this is guessed, the formats might mean something entirely else.
+   The others should be fine. */
+#       define R500_TX_FORMAT_X1                    0x0
+#       define R500_TX_FORMAT_X1_REV                0x1
+#       define R500_TX_FORMAT_X10                   0x2
+#       define R500_TX_FORMAT_Y10X10                0x3
+#       define R500_TX_FORMAT_W10Z10Y10X10          0x4
+#       define R500_TX_FORMAT_ATI1N                 0x5
+#       define R500_TX_FORMAT_Y8X24                 0x6
+
+
+#       define R300_TX_FORMAT_SIGNED_W             (1 << 5)
+#       define R300_TX_FORMAT_SIGNED_Z             (1 << 6)
+#       define R300_TX_FORMAT_SIGNED_Y             (1 << 7)
+#       define R300_TX_FORMAT_SIGNED_X             (1 << 8)
+#       define R300_TX_FORMAT_SIGNED               (0xf << 5)
+
+#	define R300_TX_FORMAT_3D		   (1 << 25)
+#	define R300_TX_FORMAT_CUBIC_MAP		   (2 << 25)
+
+	/* alpha modes, convenience mostly */
+	/* if you have alpha, pick constant appropriate to the
+	   number of channels (1 for I8, 2 for I8A8, 4 for R8G8B8A8, etc.) */
+# 	define R300_TX_FORMAT_ALPHA_1CH		    0x000
+# 	define R300_TX_FORMAT_ALPHA_2CH		    0x200
+# 	define R300_TX_FORMAT_ALPHA_4CH		    0x600
+# 	define R300_TX_FORMAT_ALPHA_NONE	    0xA00
+	/* Swizzling */
+	/* constants */
+#	define R300_TX_FORMAT_X		0
+#	define R300_TX_FORMAT_Y		1
+#	define R300_TX_FORMAT_Z		2
+#	define R300_TX_FORMAT_W		3
+#	define R300_TX_FORMAT_ZERO	4
+#	define R300_TX_FORMAT_ONE	5
+	/* 2.0*Z, everything above 1.0 is set to 0.0 */
+#	define R300_TX_FORMAT_CUT_Z	6
+	/* 2.0*W, everything above 1.0 is set to 0.0 */
+#	define R300_TX_FORMAT_CUT_W	7
+
+#	define R300_TX_FORMAT_B_SHIFT	18
+#	define R300_TX_FORMAT_G_SHIFT	15
+#	define R300_TX_FORMAT_R_SHIFT	12
+#	define R300_TX_FORMAT_A_SHIFT	9
+	/* Convenience macro for layout and swizzling; arguments are suffixes pasted onto R300_TX_FORMAT_ (e.g. ZERO, W8Z8Y8X8) */
+#	define R300_EASY_TX_FORMAT(B, G, R, A, FMT)	(		\
+		((R300_TX_FORMAT_##B)<<R300_TX_FORMAT_B_SHIFT)		\
+		| ((R300_TX_FORMAT_##G)<<R300_TX_FORMAT_G_SHIFT)	\
+		| ((R300_TX_FORMAT_##R)<<R300_TX_FORMAT_R_SHIFT)	\
+		| ((R300_TX_FORMAT_##A)<<R300_TX_FORMAT_A_SHIFT)	\
+		| (R300_TX_FORMAT_##FMT)				\
+		)
+	/* These can be ORed with result of R300_EASY_TX_FORMAT()
+	   We don't really know what they do. Take values from a
+           constant color ? */
+#	define R300_TX_FORMAT_CONST_X		(1<<5)
+#	define R300_TX_FORMAT_CONST_Y		(2<<5)
+#	define R300_TX_FORMAT_CONST_Z		(4<<5)
+#	define R300_TX_FORMAT_CONST_W		(8<<5)
+
+#       define R300_TX_FORMAT_GAMMA               (1 << 21)
+#       define R300_TX_FORMAT_YUV_TO_RGB          (1 << 22)
+
+#       define R300_TX_CACHE(x)                 ((unsigned)(x) << 27) /* unsigned: selectors >= 16 would overflow int */
+#       define R300_TX_CACHE_WHOLE              0
+/* reserved */
+#       define R300_TX_CACHE_HALF_0             2
+#       define R300_TX_CACHE_HALF_1             3
+#       define R300_TX_CACHE_FOURTH_0           4
+#       define R300_TX_CACHE_FOURTH_1           5
+#       define R300_TX_CACHE_FOURTH_2           6
+#       define R300_TX_CACHE_FOURTH_3           7
+#       define R300_TX_CACHE_EIGHTH_0           8
+#       define R300_TX_CACHE_EIGHTH_1           9
+#       define R300_TX_CACHE_EIGHTH_2           10
+#       define R300_TX_CACHE_EIGHTH_3           11
+#       define R300_TX_CACHE_EIGHTH_4           12
+#       define R300_TX_CACHE_EIGHTH_5           13
+#       define R300_TX_CACHE_EIGHTH_6           14
+#       define R300_TX_CACHE_EIGHTH_7           15
+#       define R300_TX_CACHE_SIXTEENTH_0        16
+#       define R300_TX_CACHE_SIXTEENTH_1        17
+#       define R300_TX_CACHE_SIXTEENTH_2        18
+#       define R300_TX_CACHE_SIXTEENTH_3        19
+#       define R300_TX_CACHE_SIXTEENTH_4        20
+#       define R300_TX_CACHE_SIXTEENTH_5        21
+#       define R300_TX_CACHE_SIXTEENTH_6        22
+#       define R300_TX_CACHE_SIXTEENTH_7        23
+#       define R300_TX_CACHE_SIXTEENTH_8        24
+#       define R300_TX_CACHE_SIXTEENTH_9        25
+#       define R300_TX_CACHE_SIXTEENTH_10       26
+#       define R300_TX_CACHE_SIXTEENTH_11       27
+#       define R300_TX_CACHE_SIXTEENTH_12       28
+#       define R300_TX_CACHE_SIXTEENTH_13       29
+#       define R300_TX_CACHE_SIXTEENTH_14       30
+#       define R300_TX_CACHE_SIXTEENTH_15       31
+
+#define R300_TX_FORMAT2_0		    0x4500 /* obvious missing in gap */
+#       define R300_TX_PITCHMASK_SHIFT           0
+#       define R300_TX_PITCHMASK_MASK            (2047 << 0)
+#	define R500_TXFORMAT_MSB		 (1 << 14)
+#	define R500_TXWIDTH_BIT11	         (1 << 15)
+#	define R500_TXHEIGHT_BIT11	         (1 << 16)
+#	define R500_POW2FIX2FLT			 (1 << 17)
+#	define R500_SEL_FILTER4_TC0		 (0 << 18)
+#	define R500_SEL_FILTER4_TC1		 (1 << 18)
+#	define R500_SEL_FILTER4_TC2		 (2 << 18)
+#	define R500_SEL_FILTER4_TC3		 (3 << 18)
+
+#define R300_TX_OFFSET_0                    0x4540
+#define R300_TX_OFFSET_1                    0x4544
+#define R300_TX_OFFSET_2                    0x4548
+#define R300_TX_OFFSET_3                    0x454C
+#define R300_TX_OFFSET_4                    0x4550
+#define R300_TX_OFFSET_5                    0x4554
+#define R300_TX_OFFSET_6                    0x4558
+#define R300_TX_OFFSET_7                    0x455C
+
+#       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
+#       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
+#       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0)
+#       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0)
+#       define R300_TXO_MACRO_TILE_LINEAR        (0 << 2)
+#       define R300_TXO_MACRO_TILE_TILED         (1 << 2)
+#       define R300_TXO_MACRO_TILE(x)            ((x) << 2)
+#       define R300_TXO_MICRO_TILE_LINEAR        (0 << 3)
+#       define R300_TXO_MICRO_TILE_TILED         (1 << 3)
+#       define R300_TXO_MICRO_TILE_TILED_SQUARE  (2 << 3)
+#       define R300_TXO_MICRO_TILE(x)            ((x) << 3)
+#       define R300_TXO_OFFSET_MASK              0xffffffe0
+#       define R300_TXO_OFFSET_SHIFT             5
+
+/* 32 bit chroma key */
+#define R300_TX_CHROMA_KEY_0                      0x4580
+#define R300_TX_CHROMA_KEY_1                      0x4584
+#define R300_TX_CHROMA_KEY_2                      0x4588
+#define R300_TX_CHROMA_KEY_3                      0x458c
+#define R300_TX_CHROMA_KEY_4                      0x4590
+#define R300_TX_CHROMA_KEY_5                      0x4594
+#define R300_TX_CHROMA_KEY_6                      0x4598
+#define R300_TX_CHROMA_KEY_7                      0x459c
+#define R300_TX_CHROMA_KEY_8                      0x45a0
+#define R300_TX_CHROMA_KEY_9                      0x45a4
+#define R300_TX_CHROMA_KEY_10                     0x45a8
+#define R300_TX_CHROMA_KEY_11                     0x45ac
+#define R300_TX_CHROMA_KEY_12                     0x45b0
+#define R300_TX_CHROMA_KEY_13                     0x45b4
+#define R300_TX_CHROMA_KEY_14                     0x45b8
+#define R300_TX_CHROMA_KEY_15                     0x45bc
+/* ff00ff00 == { 0, 1.0, 0, 1.0 } */
+
+/* Border Color */
+#define R300_TX_BORDER_COLOR_0              0x45c0
+#define R300_TX_BORDER_COLOR_1              0x45c4
+#define R300_TX_BORDER_COLOR_2              0x45c8
+#define R300_TX_BORDER_COLOR_3              0x45cc
+#define R300_TX_BORDER_COLOR_4              0x45d0
+#define R300_TX_BORDER_COLOR_5              0x45d4
+#define R300_TX_BORDER_COLOR_6              0x45d8
+#define R300_TX_BORDER_COLOR_7              0x45dc
+#define R300_TX_BORDER_COLOR_8              0x45e0
+#define R300_TX_BORDER_COLOR_9              0x45e4
+#define R300_TX_BORDER_COLOR_10             0x45e8
+#define R300_TX_BORDER_COLOR_11             0x45ec
+#define R300_TX_BORDER_COLOR_12             0x45f0
+#define R300_TX_BORDER_COLOR_13             0x45f4
+#define R300_TX_BORDER_COLOR_14             0x45f8
+#define R300_TX_BORDER_COLOR_15             0x45fc
+
+
+/* END: Texture specification */
+
+/* BEGIN: Fragment program instruction set */
+
+/* Fragment programs are written directly into register space.
+ * There are separate instruction streams for texture instructions and ALU
+ * instructions.
+ * In order to synchronize these streams, the program is divided into up
+ * to 4 nodes. Each node begins with a number of TEX operations, followed
+ * by a number of ALU operations.
+ * The first node can have zero TEX ops;
+ * all subsequent nodes must have at least
+ * one TEX op.
+ * All nodes must have at least one ALU op.
+ *
+ * The index of the last node is stored in PFS_CNTL_0: A value of 0 means
+ * 1 node, a value of 3 means 4 nodes.
+ * The total amount of instructions is defined in PFS_CNTL_2. The offsets are
+ * offsets into the respective instruction streams, while *_END points to the
+ * last instruction relative to this offset.
+ */
+#define R300_US_CONFIG                      0x4600
+#       define R300_PFS_CNTL_LAST_NODES_SHIFT    0
+#       define R300_PFS_CNTL_LAST_NODES_MASK     (3 << 0)
+#       define R300_PFS_CNTL_FIRST_NODE_HAS_TEX  (1 << 3)
+#define R300_US_PIXSIZE                     0x4604
+/* There is an unshifted value here which has so far always been equal to the
+ * index of the highest used temporary register.
+ */
+#define R300_US_CODE_OFFSET                 0x4608
+#       define R300_PFS_CNTL_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_CNTL_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_CNTL_ALU_END_SHIFT       6
+#       define R300_PFS_CNTL_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_CNTL_TEX_OFFSET_SHIFT    13
+#       define R300_PFS_CNTL_TEX_OFFSET_MASK     (31 << 13)
+#       define R300_PFS_CNTL_TEX_END_SHIFT       18
+#       define R300_PFS_CNTL_TEX_END_MASK        (31 << 18)
+#       define R400_PFS_CNTL_TEX_OFFSET_MSB_SHIFT 24
+#       define R400_PFS_CNTL_TEX_OFFSET_MSB_MASK (0xfu << 24)
+#       define R400_PFS_CNTL_TEX_END_MSB_SHIFT   28
+#       define R400_PFS_CNTL_TEX_END_MSB_MASK    (0xfu << 28) /* unsigned: 0xf << 28 overflows a 32-bit int */
+
+/* gap */
+
+/* Nodes are stored backwards. The last active node is always stored in
+ * PFS_NODE_3.
+ * Example: In a 2-node program, NODE_0 and NODE_1 are set to 0. The
+ * first node is stored in NODE_2, the second node is stored in NODE_3.
+ *
+ * Offsets are relative to the master offset from PFS_CNTL_2.
+ */
+#define R300_US_CODE_ADDR_0                 0x4610
+#define R300_US_CODE_ADDR_1                 0x4614
+#define R300_US_CODE_ADDR_2                 0x4618
+#define R300_US_CODE_ADDR_3                 0x461C
+#       define R300_ALU_START_SHIFT         0
+#       define R300_ALU_START_MASK          (63 << 0)
+#       define R300_ALU_SIZE_SHIFT          6
+#       define R300_ALU_SIZE_MASK           (63 << 6)
+#       define R300_TEX_START_SHIFT         12
+#       define R300_TEX_START_MASK          (31 << 12)
+#       define R300_TEX_SIZE_SHIFT          17
+#       define R300_TEX_SIZE_MASK           (31 << 17)
+#	define R300_RGBA_OUT                (1 << 22)
+#	define R300_W_OUT                   (1 << 23)
+#       define R400_TEX_START_MSB_SHIFT     24
+#       define R400_TEX_START_MSG_MASK      (0xfu << 24) /* sic "MSG": mask for the field at START_MSB_SHIFT */
+#       define R400_TEX_SIZE_MSB_SHIFT      28
+#       define R400_TEX_SIZE_MSG_MASK       (0xfu << 28) /* sic "MSG"; unsigned: 0xf << 28 overflows int */
+
+/* TEX
+ * As far as I can tell, texture instructions cannot write into output
+ * registers directly. A subsequent ALU instruction is always necessary,
+ * even if it's just MAD o0, r0, 1, 0
+ */
+#define R300_US_TEX_INST_0                  0x4620
+#	define R300_SRC_ADDR_SHIFT          0
+#	define R300_SRC_ADDR_MASK           (31 << 0)
+#	define R300_DST_ADDR_SHIFT          6
+#	define R300_DST_ADDR_MASK           (31 << 6)
+#	define R300_TEX_ID_SHIFT            11
+#       define R300_TEX_ID_MASK             (15 << 11)
+#	define R300_TEX_INST_SHIFT		15
+#		define R300_TEX_OP_NOP	        0
+#		define R300_TEX_OP_LD	        1
+#		define R300_TEX_OP_KIL	        2
+#		define R300_TEX_OP_TXP	        3
+#		define R300_TEX_OP_TXB	        4
+#	define R300_TEX_INST_MASK               (7 << 15)
+#      define R400_SRC_ADDR_EXT_BIT         (1 << 19)
+#      define R400_DST_ADDR_EXT_BIT         (1 << 20)
+
+/* Output format from the unified shader */
+#define R300_US_OUT_FMT_0                   0x46A4
+#	define R300_US_OUT_FMT_C4_8         (0 << 0)
+#	define R300_US_OUT_FMT_C4_10        (1 << 0)
+#	define R300_US_OUT_FMT_C4_10_GAMMA  (2 << 0)
+#	define R300_US_OUT_FMT_C_16         (3 << 0)
+#	define R300_US_OUT_FMT_C2_16        (4 << 0)
+#	define R300_US_OUT_FMT_C4_16        (5 << 0)
+#	define R300_US_OUT_FMT_C_16_MPEG    (6 << 0)
+#	define R300_US_OUT_FMT_C2_16_MPEG   (7 << 0)
+#	define R300_US_OUT_FMT_C2_4         (8 << 0)
+#	define R300_US_OUT_FMT_C_3_3_2      (9 << 0)
+#	define R300_US_OUT_FMT_C_6_5_6      (10 << 0)
+#	define R300_US_OUT_FMT_C_11_11_10   (11 << 0)
+#	define R300_US_OUT_FMT_C_10_11_11   (12 << 0)
+#	define R300_US_OUT_FMT_C_2_10_10_10 (13 << 0)
+/* reserved */
+#	define R300_US_OUT_FMT_UNUSED       (15 << 0)
+#	define R300_US_OUT_FMT_C_16_FP      (16 << 0)
+#	define R300_US_OUT_FMT_C2_16_FP     (17 << 0)
+#	define R300_US_OUT_FMT_C4_16_FP     (18 << 0)
+#	define R300_US_OUT_FMT_C_32_FP      (19 << 0)
+#	define R300_US_OUT_FMT_C2_32_FP     (20 << 0)
+#	define R300_US_OUT_FMT_C4_32_FP     (21 << 0)
+#   define R300_C0_SEL_A				(0 << 8)
+#   define R300_C0_SEL_R				(1 << 8)
+#   define R300_C0_SEL_G				(2 << 8)
+#   define R300_C0_SEL_B				(3 << 8)
+#   define R300_C1_SEL_A				(0 << 10)
+#   define R300_C1_SEL_R				(1 << 10)
+#   define R300_C1_SEL_G				(2 << 10)
+#   define R300_C1_SEL_B				(3 << 10)
+#   define R300_C2_SEL_A				(0 << 12)
+#   define R300_C2_SEL_R				(1 << 12)
+#   define R300_C2_SEL_G				(2 << 12)
+#   define R300_C2_SEL_B				(3 << 12)
+#   define R300_C3_SEL_A				(0 << 14)
+#   define R300_C3_SEL_R				(1 << 14)
+#   define R300_C3_SEL_G				(2 << 14)
+#   define R300_C3_SEL_B				(3 << 14)
+#   define R300_OUT_SIGN(x)				((x) << 16)
+#   define R500_ROUND_ADJ				(1 << 20)
+
+/* ALU
+ * The ALU instruction register blocks are enumerated according to the order
+ * in which fglrx writes them. I assume there is space for 64 instructions, since
+ * each block has space for a maximum of 64 DWORDs, and this matches reported
+ * native limits.
+ *
+ * The basic functional block seems to be one MAD for each color and alpha,
+ * and an adder that adds all components after the MUL.
+ *  - ADD, MUL, MAD etc.: use MAD with appropriate neutral operands
+ *  - DP4: Use OUTC_DP4, OUTA_DP4
+ *  - DP3: Use OUTC_DP3, OUTA_DP4, appropriate alpha operands
+ *  - DPH: Use OUTC_DP4, OUTA_DP4, appropriate alpha operands
+ *  - CMPH: If ARG2 > 0.5, return ARG0, else return ARG1
+ *  - CMP: If ARG2 < 0, return ARG1, else return ARG0
+ *  - FLR: use FRC+MAD
+ *  - XPD: use MAD+MAD
+ *  - SGE, SLT: use MAD+CMP
+ *  - RSQ: use ABS modifier for argument
+ *  - Use OUTC_REPL_ALPHA to write results of an alpha-only operation
+ *    (e.g. RCP) into color register
+ *  - apparently, there's no quick DST operation
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAD fragment.color, tmp0, tmp1, tmp2"
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAX r2, r1, c0"
+ *  - fglrx once set FPI0_UNKNOWN_31 on a "FRC r1, r1"
+ *
+ * Operand selection
+ * First stage selects three sources from the available registers and
+ * constant parameters. This is defined in INSTR1 (color) and INSTR3 (alpha).
+ * fglrx sorts the three source fields: Registers before constants,
+ * lower indices before higher indices; I do not know whether this is
+ * necessary.
+ *
+ * fglrx fills unused sources with "read constant 0"
+ * According to specs, you cannot select more than two different constants.
+ *
+ * Second stage selects the operands from the sources. This is defined in
+ * INSTR0 (color) and INSTR2 (alpha). You can also select the special constants
+ * zero and one.
+ * Swizzling and negation happens in this stage, as well.
+ *
+ * Important: Color and alpha seem to be mostly separate, i.e. their sources
+ * selection appears to be fully independent (the register storage is probably
+ * physically split into a color and an alpha section).
+ * However (because of the apparent physical split), there is some interaction
+ * WRT swizzling. If, for example, you want to load an R component into an
+ * Alpha operand, this R component is taken from a *color* source, not from
+ * an alpha source. The corresponding register doesn't even have to appear in
+ * the alpha sources list. (I hope this all makes sense to you)
+ *
+ * Destination selection
+ * The destination register index is in FPI1 (color) and FPI3 (alpha)
+ * together with enable bits.
+ * There are separate enable bits for writing into temporary registers
+ * (DSTC_REG_* /DSTA_REG) and program output registers (DSTC_OUTPUT_*
+ * /DSTA_OUTPUT). You can write to both at once, or not write at all (the
+ * same index must be used for both).
+ *
+ * Note: There is a special form for LRP
+ *  - Argument order is the same as in ARB_fragment_program.
+ *  - Operation is MAD
+ *  - ARG1 is set to ARGC_SRC1C_LRP/ARGC_SRC1A_LRP
+ *  - Set FPI0/FPI2_SPECIAL_LRP
+ * Arbitrary LRP (including support for swizzling) requires vanilla MAD+MAD
+ */
+#define R300_US_ALU_RGB_ADDR_0                   0x46C0
+#       define R300_ALU_SRC0C_SHIFT             0
+#       define R300_ALU_SRC0C_MASK              (31 << 0)
+#       define R300_ALU_SRC0C_CONST             (1 << 5)
+#       define R300_ALU_SRC1C_SHIFT             6
+#       define R300_ALU_SRC1C_MASK              (31 << 6)
+#       define R300_ALU_SRC1C_CONST             (1 << 11)
+#       define R300_ALU_SRC2C_SHIFT             12
+#       define R300_ALU_SRC2C_MASK              (31 << 12)
+#       define R300_ALU_SRC2C_CONST             (1 << 17)
+#       define R300_ALU_SRC_MASK                0x0003ffff
+#       define R300_ALU_DSTC_SHIFT              18
+#       define R300_ALU_DSTC_MASK               (31 << 18)
+#		define R300_ALU_DSTC_REG_MASK_SHIFT     23
+#       define R300_ALU_DSTC_REG_X              (1 << 23)
+#       define R300_ALU_DSTC_REG_Y              (1 << 24)
+#       define R300_ALU_DSTC_REG_Z              (1 << 25)
+#		define R300_ALU_DSTC_OUTPUT_MASK_SHIFT  26
+#       define R300_ALU_DSTC_OUTPUT_X           (1 << 26)
+#       define R300_ALU_DSTC_OUTPUT_Y           (1 << 27)
+#       define R300_ALU_DSTC_OUTPUT_Z           (1 << 28)
+#       define R300_ALU_DSTC_OUTPUT_XYZ         (7 << 26)
+#       define R300_RGB_ADDR0(x)                ((x) << 0)
+#       define R300_RGB_ADDR1(x)                ((x) << 6)
+#       define R300_RGB_ADDR2(x)                ((x) << 12)
+#       define R300_RGB_TARGET(x)               ((x) << 29)
+
+#define R300_US_ALU_ALPHA_ADDR_0                 0x47C0
+#       define R300_ALU_SRC0A_SHIFT             0
+#       define R300_ALU_SRC0A_MASK              (31 << 0)
+#       define R300_ALU_SRC0A_CONST             (1 << 5)
+#       define R300_ALU_SRC1A_SHIFT             6
+#       define R300_ALU_SRC1A_MASK              (31 << 6)
+#       define R300_ALU_SRC1A_CONST             (1 << 11)
+#       define R300_ALU_SRC2A_SHIFT             12
+#       define R300_ALU_SRC2A_MASK              (31 << 12)
+#       define R300_ALU_SRC2A_CONST             (1 << 17)
+#       define R300_ALU_SRC_MASK                0x0003ffff /* identical redefinition of the value given under RGB_ADDR */
+#       define R300_ALU_DSTA_SHIFT              18
+#       define R300_ALU_DSTA_MASK               (31 << 18)
+#       define R300_ALU_DSTA_REG                (1 << 23)
+#       define R300_ALU_DSTA_OUTPUT             (1 << 24)
+#		define R300_ALU_DSTA_DEPTH              (1 << 27)
+#       define R300_ALPHA_ADDR0(x)              ((x) << 0)
+#       define R300_ALPHA_ADDR1(x)              ((x) << 6)
+#       define R300_ALPHA_ADDR2(x)              ((x) << 12)
+#       define R300_ALPHA_TARGET(x)             ((x) << 25)
+
+#define R300_US_ALU_RGB_INST_0             0x48C0
+#   define R300_ALU_ARGC_SRC0C_XYZ         0   /* argument selector codes, 0..31 */
+#   define R300_ALU_ARGC_SRC0C_XXX         1
+#   define R300_ALU_ARGC_SRC0C_YYY         2
+#   define R300_ALU_ARGC_SRC0C_ZZZ         3
+#   define R300_ALU_ARGC_SRC1C_XYZ         4
+#   define R300_ALU_ARGC_SRC1C_XXX         5
+#   define R300_ALU_ARGC_SRC1C_YYY         6
+#   define R300_ALU_ARGC_SRC1C_ZZZ         7
+#   define R300_ALU_ARGC_SRC2C_XYZ         8
+#   define R300_ALU_ARGC_SRC2C_XXX         9
+#   define R300_ALU_ARGC_SRC2C_YYY         10
+#   define R300_ALU_ARGC_SRC2C_ZZZ         11
+#   define R300_ALU_ARGC_SRC0A             12
+#   define R300_ALU_ARGC_SRC1A             13
+#   define R300_ALU_ARGC_SRC2A             14
+#   define R300_ALU_ARGC_SRCP_XYZ          15
+#   define R300_ALU_ARGC_SRCP_XXX          16
+#   define R300_ALU_ARGC_SRCP_YYY          17
+#   define R300_ALU_ARGC_SRCP_ZZZ          18
+#   define R300_ALU_ARGC_SRCP_WWW          19
+#   define R300_ALU_ARGC_ZERO              20
+#   define R300_ALU_ARGC_ONE               21
+#   define R300_ALU_ARGC_HALF              22
+#   define R300_ALU_ARGC_SRC0C_YZX         23
+#   define R300_ALU_ARGC_SRC1C_YZX         24
+#   define R300_ALU_ARGC_SRC2C_YZX         25
+#   define R300_ALU_ARGC_SRC0C_ZXY         26
+#   define R300_ALU_ARGC_SRC1C_ZXY         27
+#   define R300_ALU_ARGC_SRC2C_ZXY         28
+#   define R300_ALU_ARGC_SRC0CA_WZY        29
+#   define R300_ALU_ARGC_SRC1CA_WZY        30
+#   define R300_ALU_ARGC_SRC2CA_WZY        31
+#   define R300_RGB_SWIZA(x)               ((x) << 0)   /* shifts x by 0, 7 and 14, */
+#   define R300_RGB_SWIZB(x)               ((x) << 7)   /* the same shifts as the   */
+#   define R300_RGB_SWIZC(x)               ((x) << 14)  /* R300_ALU_ARGnC_SHIFT set */
+
+#   define R300_ALU_ARG0C_SHIFT            0           /* argument 0 selector: bits 0-4 */
+#   define R300_ALU_ARG0C_MASK             (31 << 0)
+#   define R300_ALU_ARG0C_NOP              (0 << 5)    /* argument 0 modifier: bits 5-6 */
+#   define R300_ALU_ARG0C_NEG              (1 << 5)
+#   define R300_ALU_ARG0C_ABS              (2 << 5)
+#   define R300_ALU_ARG0C_NAB              (3 << 5)
+#   define R300_ALU_ARG1C_SHIFT            7           /* argument 1 selector: bits 7-11 */
+#   define R300_ALU_ARG1C_MASK             (31 << 7)
+#   define R300_ALU_ARG1C_NOP              (0 << 12)   /* argument 1 modifier: bits 12-13 */
+#   define R300_ALU_ARG1C_NEG              (1 << 12)
+#   define R300_ALU_ARG1C_ABS              (2 << 12)
+#   define R300_ALU_ARG1C_NAB              (3 << 12)
+#   define R300_ALU_ARG2C_SHIFT            14          /* argument 2 selector: bits 14-18 */
+#   define R300_ALU_ARG2C_MASK             (31 << 14)
+#   define R300_ALU_ARG2C_NOP              (0 << 19)   /* argument 2 modifier: bits 19-20 */
+#   define R300_ALU_ARG2C_NEG              (1 << 19)
+#   define R300_ALU_ARG2C_ABS              (2 << 19)
+#   define R300_ALU_ARG2C_NAB              (3 << 19)
+#   define R300_ALU_SRCP_1_MINUS_2_SRC0    (0 << 21)   /* SRCP variant: bits 21-22 */
+#   define R300_ALU_SRCP_SRC1_MINUS_SRC0   (1 << 21)
+#   define R300_ALU_SRCP_SRC1_PLUS_SRC0    (2 << 21)
+#   define R300_ALU_SRCP_1_MINUS_SRC0      (3 << 21)
+
+#   define R300_ALU_OUTC_MAD               (0 << 23)   /* RGB output operation codes, shifted by 23 */
+#   define R300_ALU_OUTC_DP3               (1 << 23)
+#   define R300_ALU_OUTC_DP4               (2 << 23)
+#   define R300_ALU_OUTC_D2A               (3 << 23)
+#   define R300_ALU_OUTC_MIN               (4 << 23)
+#   define R300_ALU_OUTC_MAX               (5 << 23)
+#   define R300_ALU_OUTC_CMPH              (7 << 23)   /* code 6 is not defined here */
+#   define R300_ALU_OUTC_CMP               (8 << 23)
+#   define R300_ALU_OUTC_FRC               (9 << 23)
+#   define R300_ALU_OUTC_REPL_ALPHA        (10 << 23)
+
+#   define R300_ALU_OUTC_MOD_NOP           (0 << 27)   /* RGB output modifier codes, shifted by 27 */
+#   define R300_ALU_OUTC_MOD_MUL2          (1 << 27)
+#   define R300_ALU_OUTC_MOD_MUL4          (2 << 27)
+#   define R300_ALU_OUTC_MOD_MUL8          (3 << 27)
+#   define R300_ALU_OUTC_MOD_DIV2          (4 << 27)
+#   define R300_ALU_OUTC_MOD_DIV4          (5 << 27)
+#   define R300_ALU_OUTC_MOD_DIV8          (6 << 27)
+
+#       define R300_ALU_OUTC_CLAMP              (1 << 30)
+#       define R300_ALU_INSERT_NOP              (1u << 31) /* unsigned: 1 << 31 overflows a 32-bit int */
+
+#define R300_US_ALU_ALPHA_INST_0           0x49C0
+#   define R300_ALU_ARGA_SRC0C_X           0   /* argument selector codes, 0..18 */
+#   define R300_ALU_ARGA_SRC0C_Y           1
+#   define R300_ALU_ARGA_SRC0C_Z           2
+#   define R300_ALU_ARGA_SRC1C_X           3
+#   define R300_ALU_ARGA_SRC1C_Y           4
+#   define R300_ALU_ARGA_SRC1C_Z           5
+#   define R300_ALU_ARGA_SRC2C_X           6
+#   define R300_ALU_ARGA_SRC2C_Y           7
+#   define R300_ALU_ARGA_SRC2C_Z           8
+#   define R300_ALU_ARGA_SRC0A             9
+#   define R300_ALU_ARGA_SRC1A             10
+#   define R300_ALU_ARGA_SRC2A             11
+#   define R300_ALU_ARGA_SRCP_X            12
+#   define R300_ALU_ARGA_SRCP_Y            13
+#   define R300_ALU_ARGA_SRCP_Z            14
+#   define R300_ALU_ARGA_SRCP_W            15
+#   define R300_ALU_ARGA_ZERO              16
+#   define R300_ALU_ARGA_ONE               17
+#   define R300_ALU_ARGA_HALF              18
+#   define R300_ALPHA_SWIZA(x)             ((x) << 0)   /* shifts x by 0, 7 and 14, */
+#   define R300_ALPHA_SWIZB(x)             ((x) << 7)   /* the same shifts as the   */
+#   define R300_ALPHA_SWIZC(x)             ((x) << 14)  /* R300_ALU_ARGnA_SHIFT set */
+
+#   define R300_ALU_ARG0A_SHIFT            0           /* argument 0 selector: bits 0-4 */
+#   define R300_ALU_ARG0A_MASK             (31 << 0)
+#   define R300_ALU_ARG0A_NOP              (0 << 5)    /* argument 0 modifier: bits 5-6 */
+#   define R300_ALU_ARG0A_NEG              (1 << 5)
+#   define R300_ALU_ARG0A_ABS              (2 << 5)
+#   define R300_ALU_ARG0A_NAB              (3 << 5)
+#   define R300_ALU_ARG1A_SHIFT            7           /* argument 1 selector: bits 7-11 */
+#   define R300_ALU_ARG1A_MASK             (31 << 7)
+#   define R300_ALU_ARG1A_NOP              (0 << 12)   /* argument 1 modifier: bits 12-13 */
+#   define R300_ALU_ARG1A_NEG              (1 << 12)
+#   define R300_ALU_ARG1A_ABS              (2 << 12)
+#   define R300_ALU_ARG1A_NAB              (3 << 12)
+#   define R300_ALU_ARG2A_SHIFT            14          /* argument 2 selector: bits 14-18 */
+#   define R300_ALU_ARG2A_MASK             (31 << 14)
+#   define R300_ALU_ARG2A_NOP              (0 << 19)   /* argument 2 modifier: bits 19-20 */
+#   define R300_ALU_ARG2A_NEG              (1 << 19)
+#   define R300_ALU_ARG2A_ABS              (2 << 19)
+#   define R300_ALU_ARG2A_NAB              (3 << 19)
+#   define R300_ALU_SRCP_1_MINUS_2_SRC0    (0 << 21)   /* same values as in the RGB instruction word */
+#   define R300_ALU_SRCP_SRC1_MINUS_SRC0   (1 << 21)
+#   define R300_ALU_SRCP_SRC1_PLUS_SRC0    (2 << 21)
+#   define R300_ALU_SRCP_1_MINUS_SRC0      (3 << 21)
+
+#   define R300_ALU_OUTA_MAD               (0 << 23)   /* alpha output operation codes, shifted by 23 */
+#   define R300_ALU_OUTA_DP4               (1 << 23)
+#   define R300_ALU_OUTA_MIN               (2 << 23)
+#   define R300_ALU_OUTA_MAX               (3 << 23)
+#   define R300_ALU_OUTA_CND               (5 << 23)   /* code 4 is not defined here */
+#   define R300_ALU_OUTA_CMP               (6 << 23)
+#   define R300_ALU_OUTA_FRC               (7 << 23)
+#   define R300_ALU_OUTA_EX2               (8 << 23)
+#   define R300_ALU_OUTA_LG2               (9 << 23)
+#   define R300_ALU_OUTA_RCP               (10 << 23)
+#   define R300_ALU_OUTA_RSQ               (11 << 23)
+
+#   define R300_ALU_OUTA_MOD_NOP           (0 << 27)   /* alpha output modifier codes, shifted by 27 */
+#   define R300_ALU_OUTA_MOD_MUL2          (1 << 27)
+#   define R300_ALU_OUTA_MOD_MUL4          (2 << 27)
+#   define R300_ALU_OUTA_MOD_MUL8          (3 << 27)
+#   define R300_ALU_OUTA_MOD_DIV2          (4 << 27)
+#   define R300_ALU_OUTA_MOD_DIV4          (5 << 27)
+#   define R300_ALU_OUTA_MOD_DIV8          (6 << 27)
+
+#   define R300_ALU_OUTA_CLAMP             (1 << 30)
+/* END: Fragment program instruction set */
+
+/* BEGIN: R4xx extended fragment shader registers. */
+#define R400_US_ALU_EXT_ADDR_0                  0x4ac0 /* up to 63 (0x4bbc) */
+#       define R400_ADDR0_EXT_RGB_MSB_BIT       0x01   /* bits 0-3: RGB MSB flags */
+#       define R400_ADDR1_EXT_RGB_MSB_BIT       0x02
+#       define R400_ADDR2_EXT_RGB_MSB_BIT       0x04
+#       define R400_ADDRD_EXT_RGB_MSB_BIT       0x08
+#       define R400_ADDR0_EXT_A_MSB_BIT         0x10   /* bits 4-7: alpha MSB flags */
+#       define R400_ADDR1_EXT_A_MSB_BIT         0x20
+#       define R400_ADDR2_EXT_A_MSB_BIT         0x40
+#       define R400_ADDRD_EXT_A_MSB_BIT         0x80
+#define R400_US_CODE_BANK                       0x46b8
+#       define R400_BANK_SHIFT                  0         /* bank field: bits 0-3 */
+#       define R400_BANK_MASK                   0xf
+#       define R400_R390_MODE_ENABLE            (1 << 4)
+#define R400_US_CODE_EXT                        0x46bc    /* ten 3-bit fields, every 3 bits from bit 0 */
+#       define R400_ALU_OFFSET_MSB_SHIFT        0
+#       define R400_ALU_OFFSET_MSB_MASK         (0x7 << 0)
+#       define R400_ALU_SIZE_MSB_SHIFT          3
+#       define R400_ALU_SIZE_MSB_MASK           (0x7 << 3)
+#       define R400_ALU_START0_MSB_SHIFT        6
+#       define R400_ALU_START0_MSB_MASK         (0x7 << 6)
+#       define R400_ALU_SIZE0_MSB_SHIFT         9
+#       define R400_ALU_SIZE0_MSB_MASK          (0x7 << 9)
+#       define R400_ALU_START1_MSB_SHIFT        12
+#       define R400_ALU_START1_MSB_MASK         (0x7 << 12)
+#       define R400_ALU_SIZE1_MSB_SHIFT         15
+#       define R400_ALU_SIZE1_MSB_MASK          (0x7 << 15)
+#       define R400_ALU_START2_MSB_SHIFT        18
+#       define R400_ALU_START2_MSB_MASK         (0x7 << 18)
+#       define R400_ALU_SIZE2_MSB_SHIFT         21
+#       define R400_ALU_SIZE2_MSB_MASK          (0x7 << 21)
+#       define R400_ALU_START3_MSB_SHIFT        24
+#       define R400_ALU_START3_MSB_MASK         (0x7 << 24)
+#       define R400_ALU_SIZE3_MSB_SHIFT         27
+#       define R400_ALU_SIZE3_MSB_MASK          (0x7 << 27)
+/* END: R4xx extended fragment shader registers. */
+
+/* Fog: blending enable (bit 0) and blend function (bits 1-2) */
+#define R300_FG_FOG_BLEND                       0x4bc0
+#   define R300_FG_FOG_BLEND_DISABLE            (0 << 0)
+#   define R300_FG_FOG_BLEND_ENABLE             (1 << 0)
+#   define R300_FG_FOG_BLEND_FN_LINEAR          (0 << 1)
+#   define R300_FG_FOG_BLEND_FN_EXP             (1 << 1)
+#   define R300_FG_FOG_BLEND_FN_EXP2            (2 << 1)
+#   define R300_FG_FOG_BLEND_FN_CONSTANT        (3 << 1)
+#   define R300_FG_FOG_BLEND_FN_MASK            (3 << 1)
+
+/* Fog: red component of the fog color */
+#define R300_FG_FOG_COLOR_R                     0x4bc8
+/* Fog: green component of the fog color */
+#define R300_FG_FOG_COLOR_G                     0x4bcc
+/* Fog: blue component of the fog color */
+#define R300_FG_FOG_COLOR_B                     0x4bd0
+#   define R300_FG_FOG_COLOR_MASK               0x000003ff /* low 10 bits */
+
+/* Fog: constant factor for fog blending */
+#define R300_FG_FOG_FACTOR                      0x4bc4
+#   define FG_FOG_FACTOR_MASK                   0x000003ff /* NOTE(review): no R300_ prefix, unlike its siblings */
+
+/* Fog: alpha function */
+#define R300_FG_ALPHA_FUNC                              0x4bd4
+#   define R300_FG_ALPHA_FUNC_VAL_MASK                  0x000000ff  /* bits 0-7 */
+#   define R300_FG_ALPHA_FUNC_NEVER                     (0 << 8)    /* comparison codes: bits 8-10 */
+#   define R300_FG_ALPHA_FUNC_LESS                      (1 << 8)
+#   define R300_FG_ALPHA_FUNC_EQUAL                     (2 << 8)
+#   define R300_FG_ALPHA_FUNC_LE                        (3 << 8)
+#   define R300_FG_ALPHA_FUNC_GREATER                   (4 << 8)
+#   define R300_FG_ALPHA_FUNC_NOTEQUAL                  (5 << 8)
+#   define R300_FG_ALPHA_FUNC_GE                        (6 << 8)
+#   define R300_FG_ALPHA_FUNC_ALWAYS                    (7 << 8)
+#   define R300_ALPHA_TEST_OP_MASK                      (7 << 8)
+#   define R300_FG_ALPHA_FUNC_DISABLE                   (0 << 11)
+#   define R300_FG_ALPHA_FUNC_ENABLE                    (1 << 11)
+
+#   define R500_FG_ALPHA_FUNC_10BIT                     (0 << 12)
+#   define R500_FG_ALPHA_FUNC_8BIT                      (1 << 12)
+
+#   define R300_FG_ALPHA_FUNC_MASK_DISABLE              (0 << 16)
+#   define R300_FG_ALPHA_FUNC_MASK_ENABLE               (1 << 16)
+#   define R300_FG_ALPHA_FUNC_CFG_2_OF_4                (0 << 17)
+#   define R300_FG_ALPHA_FUNC_CFG_3_OF_6                (1 << 17)
+
+#   define R300_FG_ALPHA_FUNC_DITH_DISABLE              (0 << 20)
+#   define R300_FG_ALPHA_FUNC_DITH_ENABLE               (1 << 20)
+
+#   define R500_FG_ALPHA_FUNC_OFFSET_DISABLE            (0 << 24)
+#   define R500_FG_ALPHA_FUNC_OFFSET_ENABLE             (1 << 24) /* Not supported in R520 */
+#   define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_DISABLE    (0 << 25)
+#   define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_ENABLE     (1 << 25)
+
+#   define R500_FG_ALPHA_FUNC_FP16_DISABLE              (0 << 28)
+#   define R500_FG_ALPHA_FUNC_FP16_ENABLE               (1 << 28)
+
+
+/* Fog: source of the depth value (scan or shader) */
+#define R300_FG_DEPTH_SRC                               0x4bd8
+#   define R300_FG_DEPTH_SRC_SCAN                       (0 << 0)
+#   define R300_FG_DEPTH_SRC_SHADER                     (1 << 0)
+
+/* Fog: alpha compare value */
+#define R500_FG_ALPHA_VALUE                             0x4be0
+#   define R500_FG_ALPHA_VALUE_MASK                     0x0000ffff /* low 16 bits */
+
+#define RV530_FG_ZBREG_DEST                             0x4be8
+#   define RV530_FG_ZBREG_DEST_PIPE_SELECT_0            (1 << 0)
+#   define RV530_FG_ZBREG_DEST_PIPE_SELECT_1            (1 << 1)
+#   define RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL          (3 << 0)   /* both pipe bits */
+/* gap */
+
+/* Fragment program parameters, 7.16 floating point; first constant: */
+#define R300_PFS_PARAM_0_X                      0x4C00
+#define R300_PFS_PARAM_0_Y                      0x4C04
+#define R300_PFS_PARAM_0_Z                      0x4C08
+#define R300_PFS_PARAM_0_W                      0x4C0C
+/* last constant: */
+#define R300_PFS_PARAM_31_X                     0x4DF0
+#define R300_PFS_PARAM_31_Y                     0x4DF4
+#define R300_PFS_PARAM_31_Z                     0x4DF8
+#define R300_PFS_PARAM_31_W                     0x4DFC
+
+/* Unpipelined. */
+#define R300_RB3D_CCTL                                                0x4e00
+#   define R300_RB3D_CCTL_NUM_MULTIWRITES(x)                          (MAX2(((x)-1), 0) << 5) /* buffer count minus one, floored at 0 */
+#   define R300_RB3D_CCTL_NUM_MULTIWRITES_1_BUFFER                    (0 << 5)
+#   define R300_RB3D_CCTL_NUM_MULTIWRITES_2_BUFFERS                   (1 << 5)
+#   define R300_RB3D_CCTL_NUM_MULTIWRITES_3_BUFFERS                   (2 << 5)
+#   define R300_RB3D_CCTL_NUM_MULTIWRITES_4_BUFFERS                   (3 << 5)
+#   define R300_RB3D_CCTL_CLRCMP_FLIPE_DISABLE                        (0 << 7)
+#   define R300_RB3D_CCTL_CLRCMP_FLIPE_ENABLE                         (1 << 7)
+#   define R300_RB3D_CCTL_AA_COMPRESSION_DISABLE                      (0 << 9)
+#   define R300_RB3D_CCTL_AA_COMPRESSION_ENABLE                       (1 << 9)
+#   define R300_RB3D_CCTL_CMASK_DISABLE                               (0 << 10)
+#   define R300_RB3D_CCTL_CMASK_ENABLE                                (1 << 10)
+/* reserved */
+#   define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_DISABLE      (0 << 12)
+#   define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_ENABLE       (1 << 12)
+#   define R300_RB3D_CCTL_WRITE_COMPRESSION_ENABLE                    (0 << 13) /* note: ENABLE is the 0 value */
+#   define R300_RB3D_CCTL_WRITE_COMPRESSION_DISABLE                   (1 << 13)
+#   define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_DISABLE      (0 << 14)
+#   define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE       (1 << 14)
+
+
+/* Notes:
+ * - AFAIK fglrx always sets BLEND_UNKNOWN when blending is used in
+ *   the application
+ * - AFAIK fglrx always sets BLEND_NO_SEPARATE when CBLEND and ABLEND
+ *   are set to the same function
+ *   (both registers are always set up completely in any case)
+ * - Most blend flags are simply copied from R200 and not tested yet
+ */
+#define R300_RB3D_CBLEND                    0x4E04
+#define R300_RB3D_ABLEND                    0x4E08
+/* the following only appear in CBLEND */
+#       define R300_ALPHA_BLEND_ENABLE         (1 << 0)
+#       define R300_SEPARATE_ALPHA_ENABLE      (1 << 1)
+#       define R300_READ_ENABLE                (1 << 2)
+#       define R300_DISCARD_SRC_PIXELS_DIS     (0 << 3) /* discard mode: bits 3-5 */
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0     (1 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_0     (2 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0     (3 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1     (4 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1     (5 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1     (6 << 3)
+#       define R500_SRC_ALPHA_0_NO_READ                (1 << 30)
+#       define R500_SRC_ALPHA_1_NO_READ                (1u << 31) /* unsigned: 1 << 31 overflows a 32-bit int */
+
+/* the following are shared between CBLEND and ABLEND */
+#       define R300_FCN_MASK                         (7  << 12) /* covers all codes 0..7 below (was 3 << 12) */
+#       define R300_COMB_FCN_ADD_CLAMP               (0  << 12)
+#       define R300_COMB_FCN_ADD_NOCLAMP             (1  << 12)
+#       define R300_COMB_FCN_SUB_CLAMP               (2  << 12)
+#       define R300_COMB_FCN_SUB_NOCLAMP             (3  << 12)
+#       define R300_COMB_FCN_MIN                     (4  << 12)
+#       define R300_COMB_FCN_MAX                     (5  << 12)
+#       define R300_COMB_FCN_RSUB_CLAMP              (6  << 12)
+#       define R300_COMB_FCN_RSUB_NOCLAMP            (7  << 12)
+#       define R300_BLEND_GL_ZERO                    (32) /* blend factor codes, unshifted */
+#       define R300_BLEND_GL_ONE                     (33)
+#       define R300_BLEND_GL_SRC_COLOR               (34)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_COLOR     (35)
+#       define R300_BLEND_GL_DST_COLOR               (36)
+#       define R300_BLEND_GL_ONE_MINUS_DST_COLOR     (37)
+#       define R300_BLEND_GL_SRC_ALPHA               (38)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_ALPHA     (39)
+#       define R300_BLEND_GL_DST_ALPHA               (40)
+#       define R300_BLEND_GL_ONE_MINUS_DST_ALPHA     (41)
+#       define R300_BLEND_GL_SRC_ALPHA_SATURATE      (42)
+#       define R300_BLEND_GL_CONST_COLOR             (43)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_COLOR   (44)
+#       define R300_BLEND_GL_CONST_ALPHA             (45)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_ALPHA   (46)
+#       define R300_BLEND_MASK                       (63) /* unshifted 6-bit mask for one factor */
+#       define R300_SRC_BLEND_SHIFT                  (16)
+#       define R300_DST_BLEND_SHIFT                  (24)
+
+/* Blender constant color; pipelined through the blender.
+ * Note: R520 ignores this field -- use RB3D_CONSTANT_COLOR_GB__BLUE,
+ * RB3D_CONSTANT_COLOR_GB__GREEN, etc. instead.
+ */
+#define R300_RB3D_BLEND_COLOR                   0x4E10
+
+
+/* 3D color channel mask. When every channel used by the current color format
+ * is disabled, the cb discards all the incoming quads. Pipelined through
+ * the blender. One B/G/R/A bit group per buffer 0..3.
+ */
+#define RB3D_COLOR_CHANNEL_MASK                 0x4E0C
+#   define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0   (1 << 0)
+#   define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0  (1 << 1)
+#   define RB3D_COLOR_CHANNEL_MASK_RED_MASK0    (1 << 2)
+#   define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0  (1 << 3)
+#   define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK1   (1 << 4)
+#   define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK1  (1 << 5)
+#   define RB3D_COLOR_CHANNEL_MASK_RED_MASK1    (1 << 6)
+#   define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK1  (1 << 7)
+#   define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK2   (1 << 8)
+#   define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK2  (1 << 9)
+#   define RB3D_COLOR_CHANNEL_MASK_RED_MASK2    (1 << 10)
+#   define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK2  (1 << 11)
+#   define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK3   (1 << 12)
+#   define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK3  (1 << 13)
+#   define RB3D_COLOR_CHANNEL_MASK_RED_MASK3    (1 << 14)
+#   define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK3  (1 << 15)
+
+/* Clear color used when the color mask is set to 00. Unpipelined.
+ * Write a 32-bit value in ARGB8888 or ARGB2101010 format to this
+ * register, ignoring the fields.
+ */
+#define RB3D_COLOR_CLEAR_VALUE                  0x4e14
+
+/* gap */
+
+/* Color compare color. Stalls the 2d/3d datapath until it is idle. */
+#define RB3D_CLRCMP_CLR                         0x4e20
+
+/* Color compare mask. Stalls the 2d/3d datapath until it is idle. */
+#define RB3D_CLRCMP_MSK                         0x4e24
+
+/* Color buffer address offset, multibuffer 0. Unpipelined. */
+#define R300_RB3D_COLOROFFSET0                  0x4E28
+#   define R300_COLOROFFSET_MASK                0xFFFFFFE0 /* low 5 bits clear */
+/* Color buffer address offset, multibuffer 1. Unpipelined. */
+#define R300_RB3D_COLOROFFSET1                  0x4E2C
+/* Color buffer address offset, multibuffer 2. Unpipelined. */
+#define R300_RB3D_COLOROFFSET2                  0x4E30
+/* Color buffer address offset, multibuffer 3. Unpipelined. */
+#define R300_RB3D_COLOROFFSET3                  0x4E34
+
+/* Color buffer format and tiling control shared by all the multibuffers,
+ * plus the pitch of multibuffers 0 to 3. Unpipelined. The cache must be
+ * empty before any of these registers is changed.
+ *
+ * Bit 16: Larger tiles
+ * Bit 17: 4x2 tiles
+ * Bit 18: Extremely weird tile like, but some pixels duplicated?
+ */
+#define R300_RB3D_COLORPITCH0                   0x4E38
+#   define R300_COLORPITCH_MASK                 0x00003FFE
+#   define R300_COLOR_TILE_DISABLE              (0 << 16)
+#   define R300_COLOR_TILE_ENABLE               (1 << 16)
+#   define R300_COLOR_TILE(x)                   ((x) << 16)
+#   define R300_COLOR_MICROTILE_DISABLE         (0 << 17)
+#   define R300_COLOR_MICROTILE_ENABLE          (1 << 17)
+#   define R300_COLOR_MICROTILE_ENABLE_SQUARE   (2 << 17) /* Only available in 16-bit */
+#   define R300_COLOR_MICROTILE(x)              ((x) << 17)
+#   define R300_COLOR_ENDIAN_NO_SWAP            (0 << 19)
+#   define R300_COLOR_ENDIAN_WORD_SWAP          (1 << 19)
+#   define R300_COLOR_ENDIAN_DWORD_SWAP         (2 << 19)
+#   define R300_COLOR_ENDIAN_HALF_DWORD_SWAP    (3 << 19)
+#   define R500_COLOR_FORMAT_ARGB10101010       (0 << 21)
+#   define R500_COLOR_FORMAT_UV1010             (1 << 21)
+#   define R500_COLOR_FORMAT_CI8                (2 << 21) /* 2D only */
+#   define R300_COLOR_FORMAT_ARGB1555           (3 << 21)
+#   define R300_COLOR_FORMAT_RGB565             (4 << 21)
+#   define R500_COLOR_FORMAT_ARGB2101010        (5 << 21)
+#   define R300_COLOR_FORMAT_ARGB8888           (6 << 21)
+#   define R300_COLOR_FORMAT_ARGB32323232       (7 << 21)
+/* reserved */
+#   define R300_COLOR_FORMAT_I8                 (9 << 21)
+#   define R300_COLOR_FORMAT_ARGB16161616       (10 << 21)
+#   define R300_COLOR_FORMAT_VYUY               (11 << 21)
+#   define R300_COLOR_FORMAT_YVYU               (12 << 21)
+#   define R300_COLOR_FORMAT_UV88               (13 << 21)
+#   define R500_COLOR_FORMAT_I10                (14 << 21)
+#   define R300_COLOR_FORMAT_ARGB4444           (15 << 21)
+#define R300_RB3D_COLORPITCH1                   0x4E3C
+#define R300_RB3D_COLORPITCH2                   0x4E40
+#define R300_RB3D_COLORPITCH3                   0x4E44
+
+/* gap */
+
+/* Destination color buffer cache control/status. In e2 mode a write to this
+ * register does not trigger a flush or free, but a requested sync is sent
+ * immediately. If both DC_FLUSH and DC_FREE are zero but DC_FINISH is one,
+ * then a sync will be sent immediately -- the cb will not wait for all the
+ * previous operations to complete before sending the sync. Unpipelined
+ * except when DC_FINISH and DC_FREE are both set to
+ * zero.
+ *
+ * Set to 0A before 3D operations, set to 02 afterwards.
+ */
+#define R300_RB3D_DSTCACHE_CTLSTAT                              0x4e4c
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT        (0 << 0)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT_1      (1 << 0)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D   (2 << 0)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D_1 (3 << 0)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT         (0 << 2)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT_1       (1 << 2)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS      (2 << 2)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS_1    (3 << 2)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_NO_SIGNAL       (0 << 4)
+#   define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL          (1 << 4)
+
+#define R300_RB3D_DITHER_CTL                                    0x4E50
+#   define R300_RB3D_DITHER_CTL_DITHER_MODE_TRUNCATE            (0 << 0) /* color dither mode: bits 0-1 */
+#   define R300_RB3D_DITHER_CTL_DITHER_MODE_ROUND               (1 << 0)
+#   define R300_RB3D_DITHER_CTL_DITHER_MODE_LUT                 (2 << 0)
+/* reserved */
+#   define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_TRUNCATE      (0 << 2) /* alpha dither mode: bits 2-3 */
+#   define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_ROUND         (1 << 2)
+#   define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT           (2 << 2)
+/* reserved */
+
+/* Resolve buffer destination address. If the cb is in resolve mode, the
+ * cache must be empty before this register is changed. Unpipelined
+ */
+#define R300_RB3D_AARESOLVE_OFFSET                              0x4e80
+#   define R300_RB3D_AARESOLVE_OFFSET_SHIFT                     5
+#   define R300_RB3D_AARESOLVE_OFFSET_MASK                      0xffffffe0 /* At least according to the calculations of Christoph Brill */
+
+/* Resolve buffer pitch and tiling control. If the cb is in resolve mode,
+ * the cache must be empty before this register is changed. Unpipelined
+ */
+#define R300_RB3D_AARESOLVE_PITCH                               0x4e84
+#   define R300_RB3D_AARESOLVE_PITCH_SHIFT                      1
+#   define R300_RB3D_AARESOLVE_PITCH_MASK                       0x00003ffe /* At least according to the calculations of Christoph Brill */
+
+/* Resolve buffer control. Unpipelined */
+#define R300_RB3D_AARESOLVE_CTL                                 0x4e88
+#   define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_NORMAL        (0 << 0)
+#   define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_RESOLVE       (1 << 0)
+#   define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_10           (0 << 1)
+#   define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_22           (1 << 1)
+#   define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_SAMPLE0      (0 << 2)
+#   define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE      (1 << 2)
+
+
+/* Source pixels less than or equal to this threshold are discarded. */
+#define R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD               0x4ea0
+/* Source pixels greater than or equal to this threshold are discarded. */
+#define R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD               0x4ea4
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_SHIFT     0
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_MASK      0x000000ff
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_SHIFT    8
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_MASK     0x0000ff00
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_SHIFT      16
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_MASK       0x00ff0000
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_SHIFT    24
+#   define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_MASK     0xff000000
+
+/* 3D ROP control. Stalls the 2d/3d datapath until it is idle. */
+#define R300_RB3D_ROPCNTL                       0x4e18
+#   define R300_RB3D_ROPCNTL_ROP_ENABLE         0x00000004
+#   define R300_RB3D_ROPCNTL_ROP_MASK           (15 << 8)  /* ROP code: bits 8-11 */
+#   define R300_RB3D_ROPCNTL_ROP_SHIFT          8
+
+/* Color compare flip. Stalls the 2d/3d datapath until it is idle. */
+#define R300_RB3D_CLRCMP_FLIPE                  0x4e1c
+
+/* FIFO size selection */
+#define R500_RB3D_FIFO_SIZE                             0x4ef4
+#   define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_FULL        (0 << 0)
+#   define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_HALF        (1 << 0)
+#   define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_QUATER      (2 << 0) /* (sic) spelling kept for existing users */
+#   define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_EIGTHS      (3 << 0) /* (sic) spelling kept for existing users */
+
+/* Blender constant color, alpha/red half. Pipelined through the blender. */
+#define R500_RB3D_CONSTANT_COLOR_AR                     0x4ef8
+#   define R500_RB3D_CONSTANT_COLOR_AR_RED_MASK         0x0000ffff
+#   define R500_RB3D_CONSTANT_COLOR_AR_RED_SHIFT        0
+#   define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_MASK       0xffff0000
+#   define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_SHIFT      16
+
+/* Blender constant color, green/blue half. Pipelined through the blender. */
+#define R500_RB3D_CONSTANT_COLOR_GB                     0x4efc
+#   define R500_RB3D_CONSTANT_COLOR_AR_BLUE_MASK        0x0000ffff /* NOTE(review): listed under the GB register */
+#   define R500_RB3D_CONSTANT_COLOR_AR_BLUE_SHIFT       0          /* but named _AR_; presumably a copy-paste */
+#   define R500_RB3D_CONSTANT_COLOR_AR_GREEN_MASK       0xffff0000 /* slip -- names kept so users still build */
+#   define R500_RB3D_CONSTANT_COLOR_AR_GREEN_SHIFT      16
+
+/* gap */
+/* No "write only" setting seems to exist, so Z-test = ALWAYS is used
+ * for that purpose.
+ * Bit (1<<8) is the "test" bit. so plain write is 6  - vd
+ */
+#define R300_ZB_CNTL                            0x4F00
+#   define R300_STENCIL_ENABLE                  (1 << 0)
+#   define R300_Z_ENABLE                        (1 << 1)
+#   define R300_Z_WRITE_ENABLE                  (1 << 2)
+#   define R300_Z_SIGNED_COMPARE                (1 << 3)
+#   define R300_STENCIL_FRONT_BACK              (1 << 4)
+#   define R500_STENCIL_ZSIGNED_MAGNITUDE       (1 << 5)
+#   define R500_STENCIL_REFMASK_FRONT_BACK      (1 << 6)
+
+#define R300_ZB_ZSTENCILCNTL                    0x4f04
+/* comparison function codes (3 bits) */
+#   define R300_ZS_NEVER                        0
+#   define R300_ZS_LESS                         1
+#   define R300_ZS_LEQUAL                       2
+#   define R300_ZS_EQUAL                        3
+#   define R300_ZS_GEQUAL                       4
+#   define R300_ZS_GREATER                      5
+#   define R300_ZS_NOTEQUAL                     6
+#   define R300_ZS_ALWAYS                       7
+#   define R300_ZS_MASK                         7
+/* stencil operation codes (3 bits) */
+#   define R300_ZS_KEEP                         0
+#   define R300_ZS_ZERO                         1
+#   define R300_ZS_REPLACE                      2
+#   define R300_ZS_INCR                         3
+#   define R300_ZS_DECR                         4
+#   define R300_ZS_INVERT                       5
+#   define R300_ZS_INCR_WRAP                    6
+#   define R300_ZS_DECR_WRAP                    7
+#   define R300_Z_FUNC_SHIFT                    0
+/* "front" and "back" name the operations applied to front and back
+ * faces respectively, i.e. separate stencil function support */
+#   define R300_S_FRONT_FUNC_SHIFT              3
+#   define R300_S_FRONT_SFAIL_OP_SHIFT          6
+#   define R300_S_FRONT_ZPASS_OP_SHIFT          9
+#   define R300_S_FRONT_ZFAIL_OP_SHIFT          12
+#   define R300_S_BACK_FUNC_SHIFT               15
+#   define R300_S_BACK_SFAIL_OP_SHIFT           18
+#   define R300_S_BACK_ZPASS_OP_SHIFT           21
+#   define R300_S_BACK_ZFAIL_OP_SHIFT           24
+
+#define R300_ZB_STENCILREFMASK                  0x4f08   /* three 8-bit fields */
+#   define R300_STENCILREF_SHIFT                0
+#   define R300_STENCILREF_MASK                 0x000000ff
+#   define R300_STENCILMASK_SHIFT               8
+#   define R300_STENCILMASK_MASK                0x0000ff00
+#   define R300_STENCILWRITEMASK_SHIFT          16
+#   define R300_STENCILWRITEMASK_MASK           0x00ff0000
+
+/* gap */
+
+#define R300_ZB_FORMAT                                          0x4f10
+#   define R300_DEPTHFORMAT_16BIT_INT_Z                         (0 << 0)
+#   define R300_DEPTHFORMAT_16BIT_13E3                          (1 << 0)
+#   define R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL            (2 << 0)
+/* values up to (15 << 0) are reserved */
+#   define R300_INVERT_13E3_LEADING_ONES                        (0 << 4)
+#   define R300_INVERT_13E3_LEADING_ZEROS                       (1 << 4)
+
+#define R300_ZB_ZTOP                                            0x4F14
+#   define R300_ZTOP_DISABLE                                    (0 << 0)
+#   define R300_ZTOP_ENABLE                                     (1 << 0)
+
+/* gap */
+
+#define R300_ZB_ZCACHE_CTLSTAT            0x4f18
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_NO_EFFECT      (0 << 0)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE (1 << 0)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_NO_EFFECT       (0 << 1)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE            (1 << 1)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_IDLE            (0u << 31)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_BUSY            (1u << 31) /* unsigned: 1 << 31 overflows a 32-bit int */
+
+#define R300_ZB_BW_CNTL                     0x4f1c
+#	define R300_HIZ_DISABLE                              (0 << 0)
+#	define R300_HIZ_ENABLE                               (1 << 0)
+#	define R300_HIZ_MIN                                  (0 << 1)
+#	define R300_HIZ_MAX                                  (1 << 1)
+#	define R300_FAST_FILL_DISABLE                        (0 << 2)
+#	define R300_FAST_FILL_ENABLE                         (1 << 2)
+#	define R300_RD_COMP_DISABLE                          (0 << 3)
+#	define R300_RD_COMP_ENABLE                           (1 << 3)
+#	define R300_WR_COMP_DISABLE                          (0 << 4)
+#	define R300_WR_COMP_ENABLE                           (1 << 4)
+#	define R300_ZB_CB_CLEAR_RMW                          (0 << 5)
+#	define R300_ZB_CB_CLEAR_CACHE_LINEAR                 (1 << 5)
+#	define R300_FORCE_COMPRESSED_STENCIL_VALUE_DISABLE   (0 << 6)
+#	define R300_FORCE_COMPRESSED_STENCIL_VALUE_ENABLE    (1 << 6)
+
+#	define R500_ZEQUAL_OPTIMIZE_ENABLE                   (0 << 7) /* note: ENABLE is the 0 value */
+#	define R500_ZEQUAL_OPTIMIZE_DISABLE                  (1 << 7)
+#	define R500_SEQUAL_OPTIMIZE_ENABLE                   (0 << 8)
+#	define R500_SEQUAL_OPTIMIZE_DISABLE                  (1 << 8)
+
+#	define R500_BMASK_ENABLE                             (0 << 10)
+#	define R500_BMASK_DISABLE                            (1 << 10)
+#	define R500_HIZ_EQUAL_REJECT_DISABLE                 (0 << 11)
+#	define R500_HIZ_EQUAL_REJECT_ENABLE                  (1 << 11)
+#	define R500_HIZ_FP_EXP_BITS_DISABLE                  (0 << 12)
+#	define R500_HIZ_FP_EXP_BITS_1                        (1 << 12)
+#	define R500_HIZ_FP_EXP_BITS_2                        (2 << 12)
+#	define R500_HIZ_FP_EXP_BITS_3                        (3 << 12)
+#	define R500_HIZ_FP_EXP_BITS_4                        (4 << 12)
+#	define R500_HIZ_FP_EXP_BITS_5                        (5 << 12)
+#	define R500_HIZ_FP_INVERT_LEADING_ONES               (0 << 15)
+#	define R500_HIZ_FP_INVERT_LEADING_ZEROS              (1 << 15)
+#	define R500_TILE_OVERWRITE_RECOMPRESSION_ENABLE      (0 << 16)
+#	define R500_TILE_OVERWRITE_RECOMPRESSION_DISABLE     (1 << 16)
+#	define R500_CONTIGUOUS_6XAA_SAMPLES_ENABLE           (0 << 17)
+#	define R500_CONTIGUOUS_6XAA_SAMPLES_DISABLE          (1 << 17)
+#	define R500_PEQ_PACKING_DISABLE                      (0 << 18)
+#	define R500_PEQ_PACKING_ENABLE                       (1 << 18)
+#	define R500_COVERED_PTR_MASKING_DISABLE              (0 << 19) /* bit 19: was 18, aliasing PEQ_PACKING */
+#	define R500_COVERED_PTR_MASKING_ENABLE               (1 << 19)
+
+
+/* gap */
+
+/* Z Buffer Address Offset.
+ * Bits 31 to 5 are used for aligned Z buffer address offset for macro tiles.
+ */
+#define R300_ZB_DEPTHOFFSET               0x4f20
+
+/* Z Buffer Pitch and Endian Control */
+#define R300_ZB_DEPTHPITCH                0x4f24
+#       define R300_DEPTHPITCH_MASK              0x00003FFC
+#       define R300_DEPTHMACROTILE_DISABLE      (0 << 16)
+#       define R300_DEPTHMACROTILE_ENABLE       (1 << 16)
+#       define R300_DEPTHMACROTILE(x)           ((x) << 16)
+#       define R300_DEPTHMICROTILE_LINEAR       (0 << 17)
+#       define R300_DEPTHMICROTILE_TILED        (1 << 17)
+#       define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17) /* microtile field spans bits 17..18 */
+#       define R300_DEPTHMICROTILE(x)           ((x) << 17)
+#       define R300_DEPTHENDIAN_NO_SWAP         (0 << 19) /* endian field moved off bit 18 so it no longer aliases microtile */
+#       define R300_DEPTHENDIAN_WORD_SWAP       (1 << 19)
+#       define R300_DEPTHENDIAN_DWORD_SWAP      (2 << 19)
+#       define R300_DEPTHENDIAN_HALF_DWORD_SWAP (3 << 19)
+
+/* Z Buffer Clear Value */
+#define R300_ZB_DEPTHCLEARVALUE                  0x4f28
+
+/* Hierarchical Z Memory Offset */
+#define R300_ZB_HIZ_OFFSET                       0x4f44
+
+/* Hierarchical Z Write Index */
+#define R300_ZB_HIZ_WRINDEX                      0x4f48
+
+/* Hierarchical Z Data */
+#define R300_ZB_HIZ_DWORD                        0x4f4c
+
+/* Hierarchical Z Read Index */
+#define R300_ZB_HIZ_RDINDEX                      0x4f50
+
+/* Hierarchical Z Pitch */
+#define R300_ZB_HIZ_PITCH                        0x4f54
+
+/* Z Buffer Z Pass Counter Data */
+#define R300_ZB_ZPASS_DATA                       0x4f58
+
+/* Z Buffer Z Pass Counter Address */
+#define R300_ZB_ZPASS_ADDR                       0x4f5c
+
+/* Depth buffer X and Y coordinate offset */
+#define R300_ZB_DEPTHXY_OFFSET                   0x4f60
+#	define R300_DEPTHX_OFFSET_SHIFT  1
+#	define R300_DEPTHX_OFFSET_MASK   0x000007FE
+#	define R300_DEPTHY_OFFSET_SHIFT  17
+#	define R300_DEPTHY_OFFSET_MASK   0x07FE0000
+
+/* Sets the fifo sizes; the OP_FIFO_SIZE selector values below are consecutive (0..3). */
+#define R500_ZB_FIFO_SIZE                        0x4fd0
+#	define R500_OP_FIFO_SIZE_FULL   (0 << 0)
+#	define R500_OP_FIFO_SIZE_HALF   (1 << 0)
+#	define R500_OP_FIFO_SIZE_QUATER (2 << 0)
+#	define R500_OP_FIFO_SIZE_EIGTHS (3 << 0) /* was 4, which sets a bit outside a two-bit selector */
+
+/* Stencil Reference Value and Mask for backfacing quads */
+/* R300_ZB_STENCILREFMASK handles front face */
+#define R500_ZB_STENCILREFMASK_BF                0x4fd4
+#	define R500_STENCILREF_SHIFT       0
+#	define R500_STENCILREF_MASK        0x000000ff
+#	define R500_STENCILMASK_SHIFT      8
+#	define R500_STENCILMASK_MASK       0x0000ff00
+#	define R500_STENCILWRITEMASK_SHIFT 16
+#	define R500_STENCILWRITEMASK_MASK  0x00ff0000
+
+/**
+ * \defgroup R3XX_R5XX_PROGRAMMABLE_VERTEX_SHADER_DESCRIPTION R3XX-R5XX PROGRAMMABLE VERTEX SHADER DESCRIPTION
+ *
+ * The PVS_DST_MATH_INST is used to identify whether the instruction is a Vector
+ * Engine instruction or a Math Engine instruction.
+ */
+
+/*\{*/
+
+enum {
+	/* Vector Engine (VE) opcodes -- R3XX group */
+	VECTOR_NO_OP			= 0,
+	VE_DOT_PRODUCT			= 1,
+	VE_MULTIPLY			= 2,
+	VE_ADD				= 3,
+	VE_MULTIPLY_ADD			= 4,
+	VE_DISTANCE_VECTOR		= 5,
+	VE_FRACTION			= 6,
+	VE_MAXIMUM			= 7,
+	VE_MINIMUM			= 8,
+	VE_SET_GREATER_THAN_EQUAL	= 9,
+	VE_SET_LESS_THAN		= 10,
+	VE_MULTIPLYX2_ADD		= 11,
+	VE_MULTIPLY_CLAMP		= 12,
+	VE_FLT2FIX_DX			= 13,
+	VE_FLT2FIX_DX_RND		= 14,
+	/* R5XX group (presumably opcodes added for R5XX parts -- TODO confirm) */
+	VE_PRED_SET_EQ_PUSH		= 15,
+	VE_PRED_SET_GT_PUSH		= 16,
+	VE_PRED_SET_GTE_PUSH		= 17,
+	VE_PRED_SET_NEQ_PUSH		= 18,
+	VE_COND_WRITE_EQ		= 19,
+	VE_COND_WRITE_GT		= 20,
+	VE_COND_WRITE_GTE		= 21,
+	VE_COND_WRITE_NEQ		= 22,
+	VE_COND_MUX_EQ			= 23,
+	VE_COND_MUX_GT			= 24,
+	VE_COND_MUX_GTE			= 25,
+	VE_SET_GREATER_THAN		= 26,
+	VE_SET_EQUAL			= 27,
+	VE_SET_NOT_EQUAL		= 28
+};
+
+enum {
+	/* Math Engine (ME) opcodes -- R3XX group */
+	MATH_NO_OP			= 0,
+	ME_EXP_BASE2_DX			= 1,
+	ME_LOG_BASE2_DX			= 2,
+	ME_EXP_BASEE_FF			= 3,
+	ME_LIGHT_COEFF_DX		= 4,
+	ME_POWER_FUNC_FF		= 5,
+	ME_RECIP_DX			= 6,
+	ME_RECIP_FF			= 7,
+	ME_RECIP_SQRT_DX		= 8,
+	ME_RECIP_SQRT_FF		= 9,
+	ME_MULTIPLY			= 10,
+	ME_EXP_BASE2_FULL_DX		= 11,
+	ME_LOG_BASE2_FULL_DX		= 12,
+	ME_POWER_FUNC_FF_CLAMP_B	= 13,
+	ME_POWER_FUNC_FF_CLAMP_B1	= 14,
+	ME_POWER_FUNC_FF_CLAMP_01	= 15,
+	ME_SIN				= 16,
+	ME_COS				= 17,
+	/* R5XX group (presumably opcodes added for R5XX parts -- TODO confirm) */
+	ME_LOG_BASE2_IEEE		= 18,
+	ME_RECIP_IEEE			= 19,
+	ME_RECIP_SQRT_IEEE		= 20,
+	ME_PRED_SET_EQ			= 21,
+	ME_PRED_SET_GT			= 22,
+	ME_PRED_SET_GTE			= 23,
+	ME_PRED_SET_NEQ			= 24,
+	ME_PRED_SET_CLR			= 25,
+	ME_PRED_SET_INV			= 26,
+	ME_PRED_SET_POP			= 27,
+	ME_PRED_SET_RESTORE		= 28
+};
+
+enum {
+	/* R3XX: macro opcodes (presumably used when the PVS_DST_MACRO_INST bit is set -- TODO confirm) */
+	PVS_MACRO_OP_2CLK_MADD		= 0,
+	PVS_MACRO_OP_2CLK_M2X_ADD	= 1
+};
+
+enum {
+	PVS_SRC_REG_TEMPORARY		= 0,	/* Intermediate Storage (values fit the 2-bit PVS_SRC_REG_TYPE field) */
+	PVS_SRC_REG_INPUT		= 1,	/* Input Vertex Storage */
+	PVS_SRC_REG_CONSTANT		= 2,	/* Constant State Storage */
+	PVS_SRC_REG_ALT_TEMPORARY	= 3	/* Alternate Intermediate Storage */
+};
+
+enum {
+	PVS_DST_REG_TEMPORARY		= 0,	/* Intermediate Storage */
+	PVS_DST_REG_A0			= 1,	/* Address Register Storage */
+	PVS_DST_REG_OUT			= 2,	/* Output Memory. Used for all outputs */
+	PVS_DST_REG_OUT_REPL_X		= 3,	/* Output Memory & Replicate X to all channels */
+	PVS_DST_REG_ALT_TEMPORARY	= 4,	/* Alternate Intermediate Storage */
+	PVS_DST_REG_INPUT		= 5	/* NOTE(review): old text duplicated OUT_REPL_X; presumably Input Vertex Storage -- confirm */
+};
+
+enum {
+	PVS_SRC_SELECT_X		= 0,	/* Select X Component (values fit the 3-bit PVS_SRC_SWIZZLE_* fields) */
+	PVS_SRC_SELECT_Y		= 1,	/* Select Y Component */
+	PVS_SRC_SELECT_Z		= 2,	/* Select Z Component */
+	PVS_SRC_SELECT_W		= 3,	/* Select W Component */
+	PVS_SRC_SELECT_FORCE_0		= 4,	/* Force Component to 0.0 */
+	PVS_SRC_SELECT_FORCE_1		= 5	/* Force Component to 1.0 */
+};
+
+/* PVS Opcode & Destination Operand Description */
+
+enum {
+	PVS_DST_OPCODE_MASK		= 0x3f,	/* each field below is an unshifted *_MASK plus its bit *_SHIFT */
+	PVS_DST_OPCODE_SHIFT		= 0,
+	PVS_DST_MATH_INST_MASK		= 0x1,
+	PVS_DST_MATH_INST_SHIFT		= 6,
+	PVS_DST_MACRO_INST_MASK		= 0x1,
+	PVS_DST_MACRO_INST_SHIFT	= 7,
+	PVS_DST_REG_TYPE_MASK		= 0xf,
+	PVS_DST_REG_TYPE_SHIFT		= 8,
+	PVS_DST_ADDR_MODE_1_MASK	= 0x1,
+	PVS_DST_ADDR_MODE_1_SHIFT	= 12,
+	PVS_DST_OFFSET_MASK		= 0x7f,
+	PVS_DST_OFFSET_SHIFT		= 13,
+	PVS_DST_WE_X_MASK		= 0x1,
+	PVS_DST_WE_X_SHIFT		= 20,
+	PVS_DST_WE_Y_MASK		= 0x1,
+	PVS_DST_WE_Y_SHIFT		= 21,
+	PVS_DST_WE_Z_MASK		= 0x1,
+	PVS_DST_WE_Z_SHIFT		= 22,
+	PVS_DST_WE_W_MASK		= 0x1,
+	PVS_DST_WE_W_SHIFT		= 23,
+	PVS_DST_VE_SAT_MASK		= 0x1,
+	PVS_DST_VE_SAT_SHIFT		= 24,
+	PVS_DST_ME_SAT_MASK		= 0x1,
+	PVS_DST_ME_SAT_SHIFT		= 25,
+	PVS_DST_PRED_ENABLE_MASK	= 0x1,
+	PVS_DST_PRED_ENABLE_SHIFT	= 26,
+	PVS_DST_PRED_SENSE_MASK		= 0x1,
+	PVS_DST_PRED_SENSE_SHIFT	= 27,
+	PVS_DST_DUAL_MATH_OP_MASK	= 0x3,	/* NOTE(review): 2-bit field at shift 27 overlaps PRED_SENSE (bit 27) */
+	PVS_DST_DUAL_MATH_OP_SHIFT	= 27,	/* left unchanged here -- confirm intended position against hw docs */
+	PVS_DST_ADDR_SEL_MASK		= 0x3,
+	PVS_DST_ADDR_SEL_SHIFT		= 29,
+	PVS_DST_ADDR_MODE_0_MASK	= 0x1,
+	PVS_DST_ADDR_MODE_0_SHIFT	= 31
+};
+
+/* PVS Source Operand Description */
+
+enum {
+	PVS_SRC_REG_TYPE_MASK		= 0x3,	/* each field below is an unshifted *_MASK plus its bit *_SHIFT */
+	PVS_SRC_REG_TYPE_SHIFT		= 0,
+	SPARE_0_MASK			= 0x1,
+	SPARE_0_SHIFT			= 2,
+	PVS_SRC_ABS_XYZW_MASK		= 0x1,
+	PVS_SRC_ABS_XYZW_SHIFT		= 3,
+	PVS_SRC_ADDR_MODE_0_MASK	= 0x1,
+	PVS_SRC_ADDR_MODE_0_SHIFT	= 4,
+	PVS_SRC_OFFSET_MASK		= 0xff,
+	PVS_SRC_OFFSET_SHIFT		= 5,
+	PVS_SRC_SWIZZLE_X_MASK		= 0x7,
+	PVS_SRC_SWIZZLE_X_SHIFT		= 13,
+	PVS_SRC_SWIZZLE_Y_MASK		= 0x7,
+	PVS_SRC_SWIZZLE_Y_SHIFT		= 16,
+	PVS_SRC_SWIZZLE_Z_MASK		= 0x7,
+	PVS_SRC_SWIZZLE_Z_SHIFT		= 19,
+	PVS_SRC_SWIZZLE_W_MASK		= 0x7,
+	PVS_SRC_SWIZZLE_W_SHIFT		= 22,
+	PVS_SRC_MODIFIER_X_MASK		= 0x1,
+	PVS_SRC_MODIFIER_X_SHIFT	= 25,
+	PVS_SRC_MODIFIER_Y_MASK		= 0x1,
+	PVS_SRC_MODIFIER_Y_SHIFT	= 26,
+	PVS_SRC_MODIFIER_Z_MASK		= 0x1,
+	PVS_SRC_MODIFIER_Z_SHIFT	= 27,
+	PVS_SRC_MODIFIER_W_MASK		= 0x1,
+	PVS_SRC_MODIFIER_W_SHIFT	= 28,
+	PVS_SRC_ADDR_SEL_MASK		= 0x3,
+	PVS_SRC_ADDR_SEL_SHIFT		= 29,
+	PVS_SRC_ADDR_MODE_1_MASK	= 0x1,	/* was 0x0, which made the field impossible to set */
+	PVS_SRC_ADDR_MODE_1_SHIFT	= 31	/* was 32: shifting a 32-bit int by 32 is undefined; bit 31 is the one left free above ADDR_SEL */
+};
+
+/*\}*/
+
+/* BEGIN: Packet 3 commands */
+
+/* A primitive emission dword. */
+#define R300_PRIM_TYPE_NONE                     (0 << 0)
+#define R300_PRIM_TYPE_POINT                    (1 << 0)
+#define R300_PRIM_TYPE_LINE                     (2 << 0)
+#define R300_PRIM_TYPE_LINE_STRIP               (3 << 0)
+#define R300_PRIM_TYPE_TRI_LIST                 (4 << 0)
+#define R300_PRIM_TYPE_TRI_FAN                  (5 << 0)
+#define R300_PRIM_TYPE_TRI_STRIP                (6 << 0)
+#define R300_PRIM_TYPE_TRI_TYPE2                (7 << 0)
+#define R300_PRIM_TYPE_RECT_LIST                (8 << 0)
+#define R300_PRIM_TYPE_3VRT_POINT_LIST          (9 << 0)
+#define R300_PRIM_TYPE_3VRT_LINE_LIST           (10 << 0)
+	/* GUESS (based on r200) */
+#define R300_PRIM_TYPE_POINT_SPRITES            (11 << 0)
+#define R300_PRIM_TYPE_LINE_LOOP                (12 << 0)
+#define R300_PRIM_TYPE_QUADS                    (13 << 0)
+#define R300_PRIM_TYPE_QUAD_STRIP               (14 << 0)
+#define R300_PRIM_TYPE_POLYGON                  (15 << 0)
+#define R300_PRIM_TYPE_MASK                     0xF
+#define R300_PRIM_WALK_IND                      (1 << 4)
+#define R300_PRIM_WALK_LIST                     (2 << 4)
+#define R300_PRIM_WALK_RING                     (3 << 4)
+#define R300_PRIM_WALK_MASK                     (3 << 4)
+	/* GUESS (based on r200) */
+#define R300_PRIM_COLOR_ORDER_BGRA              (0 << 6)
+#define R300_PRIM_COLOR_ORDER_RGBA              (1 << 6)
+#define R300_PRIM_NUM_VERTICES_SHIFT            16
+#define R300_PRIM_NUM_VERTICES_MASK             0xffff
+
+
+
+/*
+ * The R500 unified shader (US) registers come in banks of 512 each, one
+ * for each instruction slot in the shader.  You can't touch them directly.
+ * R500_US_VECTOR_INDEX() sets the base instruction to modify; successive
+ * writes to R500_GA_US_VECTOR_DATA autoincrement the index after the
+ * instruction is fully specified.
+ */
+#define R500_US_ALU_ALPHA_INST_0			0xa800
+#   define R500_ALPHA_OP_MAD				0
+#   define R500_ALPHA_OP_DP				1
+#   define R500_ALPHA_OP_MIN				2
+#   define R500_ALPHA_OP_MAX				3
+/* #define R500_ALPHA_OP_RESERVED			4 */
+#   define R500_ALPHA_OP_CND				5
+#   define R500_ALPHA_OP_CMP				6
+#   define R500_ALPHA_OP_FRC				7
+#   define R500_ALPHA_OP_EX2				8
+#   define R500_ALPHA_OP_LN2				9
+#   define R500_ALPHA_OP_RCP				10
+#   define R500_ALPHA_OP_RSQ				11
+#   define R500_ALPHA_OP_SIN				12
+#   define R500_ALPHA_OP_COS				13
+#   define R500_ALPHA_OP_MDH				14
+#   define R500_ALPHA_OP_MDV				15
+#   define R500_ALPHA_ADDRD(x)				((x) << 4)
+#   define R500_ALPHA_ADDRD_REL				(1 << 11)
+#  define R500_ALPHA_SEL_A_SHIFT			12
+#   define R500_ALPHA_SEL_A_SRC0			(0 << 12)
+#   define R500_ALPHA_SEL_A_SRC1			(1 << 12)
+#   define R500_ALPHA_SEL_A_SRC2			(2 << 12)
+#   define R500_ALPHA_SEL_A_SRCP			(3 << 12)
+#   define R500_ALPHA_SWIZ_A_R				(0 << 14)
+#   define R500_ALPHA_SWIZ_A_G				(1 << 14)
+#   define R500_ALPHA_SWIZ_A_B				(2 << 14)
+#   define R500_ALPHA_SWIZ_A_A				(3 << 14)
+#   define R500_ALPHA_SWIZ_A_0				(4 << 14)
+#   define R500_ALPHA_SWIZ_A_HALF			(5 << 14)
+#   define R500_ALPHA_SWIZ_A_1				(6 << 14)
+/* #define R500_ALPHA_SWIZ_A_UNUSED			(7 << 14) */
+#   define R500_ALPHA_MOD_A_NOP				(0 << 17)
+#   define R500_ALPHA_MOD_A_NEG				(1 << 17)
+#   define R500_ALPHA_MOD_A_ABS				(2 << 17)
+#   define R500_ALPHA_MOD_A_NAB				(3 << 17)
+#  define R500_ALPHA_SEL_B_SHIFT			19
+#   define R500_ALPHA_SEL_B_SRC0			(0 << 19)
+#   define R500_ALPHA_SEL_B_SRC1			(1 << 19)
+#   define R500_ALPHA_SEL_B_SRC2			(2 << 19)
+#   define R500_ALPHA_SEL_B_SRCP			(3 << 19)
+#   define R500_ALPHA_SWIZ_B_R				(0 << 21)
+#   define R500_ALPHA_SWIZ_B_G				(1 << 21)
+#   define R500_ALPHA_SWIZ_B_B				(2 << 21)
+#   define R500_ALPHA_SWIZ_B_A				(3 << 21)
+#   define R500_ALPHA_SWIZ_B_0				(4 << 21)
+#   define R500_ALPHA_SWIZ_B_HALF			(5 << 21)
+#   define R500_ALPHA_SWIZ_B_1				(6 << 21)
+/* #define R500_ALPHA_SWIZ_B_UNUSED			(7 << 21) */
+#   define R500_ALPHA_MOD_B_NOP				(0 << 24)
+#   define R500_ALPHA_MOD_B_NEG				(1 << 24)
+#   define R500_ALPHA_MOD_B_ABS				(2 << 24)
+#   define R500_ALPHA_MOD_B_NAB				(3 << 24)
+#   define R500_ALPHA_OMOD_IDENTITY			(0 << 26)
+#   define R500_ALPHA_OMOD_MUL_2			(1 << 26)
+#   define R500_ALPHA_OMOD_MUL_4			(2 << 26)
+#   define R500_ALPHA_OMOD_MUL_8			(3 << 26)
+#   define R500_ALPHA_OMOD_DIV_2			(4 << 26)
+#   define R500_ALPHA_OMOD_DIV_4			(5 << 26)
+#   define R500_ALPHA_OMOD_DIV_8			(6 << 26)
+#   define R500_ALPHA_OMOD_DISABLE			(7 << 26)
+#   define R500_ALPHA_TARGET(x)				((x) << 29)
+#   define R500_ALPHA_W_OMASK				(1 << 31)
+#define R500_US_ALU_ALPHA_ADDR_0			0x9800
+#   define R500_ALPHA_ADDR0(x)				((x) << 0)
+#   define R500_ALPHA_ADDR0_CONST			(1 << 8)
+#   define R500_ALPHA_ADDR0_REL				(1 << 9)
+#   define R500_ALPHA_ADDR1(x)				((x) << 10)
+#   define R500_ALPHA_ADDR1_CONST			(1 << 18)
+#   define R500_ALPHA_ADDR1_REL				(1 << 19)
+#   define R500_ALPHA_ADDR2(x)				((x) << 20)
+#   define R500_ALPHA_ADDR2_CONST			(1 << 28)
+#   define R500_ALPHA_ADDR2_REL				(1 << 29)
+#   define R500_ALPHA_SRCP_OP_1_MINUS_2A0		(0 << 30)
+#   define R500_ALPHA_SRCP_OP_A1_MINUS_A0		(1 << 30)
+#   define R500_ALPHA_SRCP_OP_A1_PLUS_A0		(2 << 30)
+#   define R500_ALPHA_SRCP_OP_1_MINUS_A0		(3 << 30)
+#define R500_US_ALU_RGBA_INST_0				0xb000
+#   define R500_ALU_RGBA_OP_MAD				(0 << 0)
+#   define R500_ALU_RGBA_OP_DP3				(1 << 0)
+#   define R500_ALU_RGBA_OP_DP4				(2 << 0)
+#   define R500_ALU_RGBA_OP_D2A				(3 << 0)
+#   define R500_ALU_RGBA_OP_MIN				(4 << 0)
+#   define R500_ALU_RGBA_OP_MAX				(5 << 0)
+/* #define R500_ALU_RGBA_OP_RESERVED			(6 << 0) */
+#   define R500_ALU_RGBA_OP_CND				(7 << 0)
+#   define R500_ALU_RGBA_OP_CMP				(8 << 0)
+#   define R500_ALU_RGBA_OP_FRC				(9 << 0)
+#   define R500_ALU_RGBA_OP_SOP				(10 << 0)
+#   define R500_ALU_RGBA_OP_MDH				(11 << 0)
+#   define R500_ALU_RGBA_OP_MDV				(12 << 0)
+#   define R500_ALU_RGBA_ADDRD(x)			((x) << 4)
+#   define R500_ALU_RGBA_ADDRD_REL			(1 << 11)
+#  define R500_ALU_RGBA_SEL_C_SHIFT			12
+#   define R500_ALU_RGBA_SEL_C_SRC0			(0 << 12)
+#   define R500_ALU_RGBA_SEL_C_SRC1			(1 << 12)
+#   define R500_ALU_RGBA_SEL_C_SRC2			(2 << 12)
+#   define R500_ALU_RGBA_SEL_C_SRCP			(3 << 12)
+#   define R500_ALU_RGBA_R_SWIZ_R			(0 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_G			(1 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_B			(2 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_A			(3 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_0			(4 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_HALF			(5 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_1			(6 << 14)
+/* #define R500_ALU_RGBA_R_SWIZ_UNUSED			(7 << 14) */
+#   define R500_ALU_RGBA_G_SWIZ_R			(0 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_G			(1 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_B			(2 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_A			(3 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_0			(4 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_HALF			(5 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_1			(6 << 17)
+/* #define R500_ALU_RGBA_G_SWIZ_UNUSED			(7 << 17) */
+#   define R500_ALU_RGBA_B_SWIZ_R			(0 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_G			(1 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_B			(2 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_A			(3 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_0			(4 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_HALF			(5 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_1			(6 << 20)
+/* #define R500_ALU_RGBA_B_SWIZ_UNUSED			(7 << 20) */
+#   define R500_ALU_RGBA_MOD_C_NOP			(0 << 23)
+#   define R500_ALU_RGBA_MOD_C_NEG			(1 << 23)
+#   define R500_ALU_RGBA_MOD_C_ABS			(2 << 23)
+#   define R500_ALU_RGBA_MOD_C_NAB			(3 << 23)
+#  define R500_ALU_RGBA_ALPHA_SEL_C_SHIFT		25
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC0		(0 << 25)
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC1		(1 << 25)
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC2		(2 << 25)
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRCP		(3 << 25)
+#   define R500_ALU_RGBA_A_SWIZ_R			(0 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_G			(1 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_B			(2 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_A			(3 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_0			(4 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_HALF			(5 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_1			(6 << 27)
+/* #define R500_ALU_RGBA_A_SWIZ_UNUSED			(7 << 27) */
+#   define R500_ALU_RGBA_ALPHA_MOD_C_NOP		(0 << 30)
+#   define R500_ALU_RGBA_ALPHA_MOD_C_NEG		(1 << 30)
+#   define R500_ALU_RGBA_ALPHA_MOD_C_ABS		(2 << 30)
+#   define R500_ALU_RGBA_ALPHA_MOD_C_NAB		(3 << 30)
+#define R500_US_ALU_RGB_INST_0				0xa000
+#  define R500_ALU_RGB_SEL_A_SHIFT			0
+#   define R500_ALU_RGB_SEL_A_SRC0			(0 << 0)
+#   define R500_ALU_RGB_SEL_A_SRC1			(1 << 0)
+#   define R500_ALU_RGB_SEL_A_SRC2			(2 << 0)
+#   define R500_ALU_RGB_SEL_A_SRCP			(3 << 0)
+#   define R500_ALU_RGB_R_SWIZ_A_R			(0 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_G			(1 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_B			(2 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_A			(3 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_0			(4 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_HALF			(5 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_1			(6 << 2)
+/* #define R500_ALU_RGB_R_SWIZ_A_UNUSED			(7 << 2) */
+#   define R500_ALU_RGB_G_SWIZ_A_R			(0 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_G			(1 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_B			(2 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_A			(3 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_0			(4 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_HALF			(5 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_1			(6 << 5)
+/* #define R500_ALU_RGB_G_SWIZ_A_UNUSED			(7 << 5) */
+#   define R500_ALU_RGB_B_SWIZ_A_R			(0 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_G			(1 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_B			(2 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_A			(3 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_0			(4 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_HALF			(5 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_1			(6 << 8)
+/* #define R500_ALU_RGB_B_SWIZ_A_UNUSED			(7 << 8) */
+#   define R500_ALU_RGB_MOD_A_NOP			(0 << 11)
+#   define R500_ALU_RGB_MOD_A_NEG			(1 << 11)
+#   define R500_ALU_RGB_MOD_A_ABS			(2 << 11)
+#   define R500_ALU_RGB_MOD_A_NAB			(3 << 11)
+#  define R500_ALU_RGB_SEL_B_SHIFT			13
+#   define R500_ALU_RGB_SEL_B_SRC0			(0 << 13)
+#   define R500_ALU_RGB_SEL_B_SRC1			(1 << 13)
+#   define R500_ALU_RGB_SEL_B_SRC2			(2 << 13)
+#   define R500_ALU_RGB_SEL_B_SRCP			(3 << 13)
+#   define R500_ALU_RGB_R_SWIZ_B_R			(0 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_G			(1 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_B			(2 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_A			(3 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_0			(4 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_HALF			(5 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_1			(6 << 15)
+/* #define R500_ALU_RGB_R_SWIZ_B_UNUSED			(7 << 15) */
+#   define R500_ALU_RGB_G_SWIZ_B_R			(0 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_G			(1 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_B			(2 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_A			(3 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_0			(4 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_HALF			(5 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_1			(6 << 18)
+/* #define R500_ALU_RGB_G_SWIZ_B_UNUSED			(7 << 18) */
+#   define R500_ALU_RGB_B_SWIZ_B_R			(0 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_G			(1 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_B			(2 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_A			(3 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_0			(4 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_HALF			(5 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_1			(6 << 21)
+/* #define R500_ALU_RGB_B_SWIZ_B_UNUSED			(7 << 21) */
+#   define R500_ALU_RGB_MOD_B_NOP			(0 << 24)
+#   define R500_ALU_RGB_MOD_B_NEG			(1 << 24)
+#   define R500_ALU_RGB_MOD_B_ABS			(2 << 24)
+#   define R500_ALU_RGB_MOD_B_NAB			(3 << 24)
+#   define R500_ALU_RGB_OMOD_IDENTITY			(0 << 26)
+#   define R500_ALU_RGB_OMOD_MUL_2			(1 << 26)
+#   define R500_ALU_RGB_OMOD_MUL_4			(2 << 26)
+#   define R500_ALU_RGB_OMOD_MUL_8			(3 << 26)
+#   define R500_ALU_RGB_OMOD_DIV_2			(4 << 26)
+#   define R500_ALU_RGB_OMOD_DIV_4			(5 << 26)
+#   define R500_ALU_RGB_OMOD_DIV_8			(6 << 26)
+#   define R500_ALU_RGB_OMOD_DISABLE			(7 << 26)
+#   define R500_ALU_RGB_TARGET(x)			((x) << 29)
+#   define R500_ALU_RGB_WMASK				(1 << 31)
+#define R500_US_ALU_RGB_ADDR_0				0x9000
+#   define R500_RGB_ADDR0(x)				((x) << 0)
+#   define R500_RGB_ADDR0_CONST				(1 << 8)
+#   define R500_RGB_ADDR0_REL				(1 << 9)
+#   define R500_RGB_ADDR1(x)				((x) << 10)
+#   define R500_RGB_ADDR1_CONST				(1 << 18)
+#   define R500_RGB_ADDR1_REL				(1 << 19)
+#   define R500_RGB_ADDR2(x)				((x) << 20)
+#   define R500_RGB_ADDR2_CONST				(1 << 28)
+#   define R500_RGB_ADDR2_REL				(1 << 29)
+#   define R500_RGB_SRCP_OP_1_MINUS_2RGB0		(0 << 30)
+#   define R500_RGB_SRCP_OP_RGB1_MINUS_RGB0		(1 << 30)
+#   define R500_RGB_SRCP_OP_RGB1_PLUS_RGB0		(2 << 30)
+#   define R500_RGB_SRCP_OP_1_MINUS_RGB0		(3 << 30)
+#define R500_US_CMN_INST_0				0xb800
+#  define R500_INST_TYPE_MASK				(3 << 0)
+#   define R500_INST_TYPE_ALU				(0 << 0)
+#   define R500_INST_TYPE_OUT				(1 << 0)
+#   define R500_INST_TYPE_FC				(2 << 0)
+#   define R500_INST_TYPE_TEX				(3 << 0)
+#   define R500_INST_TEX_SEM_WAIT			(1 << 2)
+#   define R500_INST_RGB_PRED_SEL_NONE			(0 << 3)
+#   define R500_INST_RGB_PRED_SEL_RGBA			(1 << 3)
+#   define R500_INST_RGB_PRED_SEL_RRRR			(2 << 3)
+#   define R500_INST_RGB_PRED_SEL_GGGG			(3 << 3)
+#   define R500_INST_RGB_PRED_SEL_BBBB			(4 << 3)
+#   define R500_INST_RGB_PRED_SEL_AAAA			(5 << 3)
+#   define R500_INST_RGB_PRED_INV			(1 << 6)
+#   define R500_INST_WRITE_INACTIVE			(1 << 7)
+#   define R500_INST_LAST				(1 << 8)
+#   define R500_INST_NOP				(1 << 9)
+#   define R500_INST_ALU_WAIT				(1 << 10)
+#   define R500_INST_RGB_WMASK_R			(1 << 11)
+#   define R500_INST_RGB_WMASK_G			(1 << 12)
+#   define R500_INST_RGB_WMASK_B			(1 << 13)
+#   define R500_INST_RGB_WMASK_RGB			(7 << 11)
+#   define R500_INST_ALPHA_WMASK			(1 << 14)
+#   define R500_INST_RGB_OMASK_R			(1 << 15)
+#   define R500_INST_RGB_OMASK_G			(1 << 16)
+#   define R500_INST_RGB_OMASK_B			(1 << 17)
+#   define R500_INST_RGB_OMASK_RGB			(7 << 15)
+#   define R500_INST_ALPHA_OMASK			(1 << 18)
+#   define R500_INST_RGB_CLAMP				(1 << 19)
+#   define R500_INST_ALPHA_CLAMP			(1 << 20)
+#   define R500_INST_ALU_RESULT_SEL			(1 << 21)
+#   define R500_INST_ALPHA_PRED_INV			(1 << 22)
+#   define R500_INST_ALU_RESULT_OP_EQ			(0 << 23)
+#   define R500_INST_ALU_RESULT_OP_LT			(1 << 23)
+#   define R500_INST_ALU_RESULT_OP_GE			(2 << 23)
+#   define R500_INST_ALU_RESULT_OP_NE			(3 << 23)
+#   define R500_INST_ALPHA_PRED_SEL_NONE		(0 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_RGBA		(1 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_RRRR		(2 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_GGGG		(3 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_BBBB		(4 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_AAAA		(5 << 25)
+/* XXX next four are kind of guessed */
+#   define R500_INST_STAT_WE_R				(1 << 28)
+#   define R500_INST_STAT_WE_G				(1 << 29)
+#   define R500_INST_STAT_WE_B				(1 << 30)
+#   define R500_INST_STAT_WE_A				(1 << 31)
+
+/* note that these are 8 bit lengths, despite the offsets, at least for R500 */
+#define R500_US_CODE_ADDR				0x4630
+#   define R500_US_CODE_START_ADDR(x)			((x) << 0)
+#   define R500_US_CODE_END_ADDR(x)			((x) << 16)
+#define R500_US_CODE_OFFSET				0x4638
+#   define R500_US_CODE_OFFSET_ADDR(x)			((x) << 0)
+#define R500_US_CODE_RANGE				0x4634
+#   define R500_US_CODE_RANGE_ADDR(x)			((x) << 0)
+#   define R500_US_CODE_RANGE_SIZE(x)			((x) << 16)
+#define R500_US_CONFIG					0x4600
+#   define R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO		(1 << 1)
+#define R500_US_FC_ADDR_0				0xa000
+#   define R500_FC_BOOL_ADDR(x)				((x) << 0)
+#   define R500_FC_INT_ADDR(x)				((x) << 8)
+#   define R500_FC_JUMP_ADDR(x)				((x) << 16)
+#   define R500_FC_JUMP_GLOBAL				(1 << 31)
+#define R500_US_FC_BOOL_CONST				0x4620
+#   define R500_FC_KBOOL(x)				(x)
+#define R500_US_FC_CTRL					0x4624
+#   define R500_FC_TEST_EN				(1 << 30)
+#   define R500_FC_FULL_FC_EN				(1 << 31)
+#define R500_US_FC_INST_0				0x9800
+#   define R500_FC_OP_JUMP				(0 << 0)
+#   define R500_FC_OP_LOOP				(1 << 0)
+#   define R500_FC_OP_ENDLOOP				(2 << 0)
+#   define R500_FC_OP_REP				(3 << 0)
+#   define R500_FC_OP_ENDREP				(4 << 0)
+#   define R500_FC_OP_BREAKLOOP				(5 << 0)
+#   define R500_FC_OP_BREAKREP				(6 << 0)
+#   define R500_FC_OP_CONTINUE				(7 << 0)
+#   define R500_FC_B_ELSE				(1 << 4)
+#   define R500_FC_JUMP_ANY				(1 << 5)
+#   define R500_FC_A_OP_NONE				(0 << 6)
+#   define R500_FC_A_OP_POP				(1 << 6)
+#   define R500_FC_A_OP_PUSH				(2 << 6)
+#   define R500_FC_JUMP_FUNC(x)				((x) << 8)
+#   define R500_FC_B_POP_CNT(x)				((x) << 16)
+#   define R500_FC_B_OP0_NONE				(0 << 24)
+#   define R500_FC_B_OP0_DECR				(1 << 24)
+#   define R500_FC_B_OP0_INCR				(2 << 24)
+#   define R500_FC_B_OP1_NONE				(0 << 26) /* OP1 now uses the same NONE/DECR/INCR encoding as OP0 */
+#   define R500_FC_B_OP1_DECR				(1 << 26) /* NONE and DECR were swapped (DECR=0, NONE=1) */
+#   define R500_FC_B_OP1_INCR				(2 << 26)
+#   define R500_FC_IGNORE_UNCOVERED			(1 << 28)
+#define R500_US_FC_INT_CONST_0				0x4c00
+#   define R500_FC_INT_CONST_KR(x)			((x) << 0)
+#   define R500_FC_INT_CONST_KG(x)			((x) << 8)
+#   define R500_FC_INT_CONST_KB(x)			((x) << 16)
+/* _0 through _15 */
+#define R500_US_FORMAT0_0				0x4640
+#   define R500_FORMAT_TXWIDTH(x)			((x) << 0)
+#   define R500_FORMAT_TXHEIGHT(x)			((x) << 11)
+#   define R500_FORMAT_TXDEPTH(x)			((x) << 22)
+#define R500_US_PIXSIZE					0x4604
+#   define R500_PIX_SIZE(x)				(x)
+#define R500_US_TEX_ADDR_0				0x9800
+#   define R500_TEX_SRC_ADDR(x)				((x) << 0)
+#   define R500_TEX_SRC_ADDR_REL			(1 << 7)
+#   define R500_TEX_SRC_S_SWIZ_R			(0 << 8)
+#   define R500_TEX_SRC_S_SWIZ_G			(1 << 8)
+#   define R500_TEX_SRC_S_SWIZ_B			(2 << 8)
+#   define R500_TEX_SRC_S_SWIZ_A			(3 << 8)
+#   define R500_TEX_SRC_T_SWIZ_R			(0 << 10)
+#   define R500_TEX_SRC_T_SWIZ_G			(1 << 10)
+#   define R500_TEX_SRC_T_SWIZ_B			(2 << 10)
+#   define R500_TEX_SRC_T_SWIZ_A			(3 << 10)
+#   define R500_TEX_SRC_R_SWIZ_R			(0 << 12)
+#   define R500_TEX_SRC_R_SWIZ_G			(1 << 12)
+#   define R500_TEX_SRC_R_SWIZ_B			(2 << 12)
+#   define R500_TEX_SRC_R_SWIZ_A			(3 << 12)
+#   define R500_TEX_SRC_Q_SWIZ_R			(0 << 14)
+#   define R500_TEX_SRC_Q_SWIZ_G			(1 << 14)
+#   define R500_TEX_SRC_Q_SWIZ_B			(2 << 14)
+#   define R500_TEX_SRC_Q_SWIZ_A			(3 << 14)
+#   define R500_TEX_DST_ADDR(x)				((x) << 16)
+#   define R500_TEX_DST_ADDR_REL			(1 << 23)
+#   define R500_TEX_DST_R_SWIZ_R			(0 << 24)
+#   define R500_TEX_DST_R_SWIZ_G			(1 << 24)
+#   define R500_TEX_DST_R_SWIZ_B			(2 << 24)
+#   define R500_TEX_DST_R_SWIZ_A			(3 << 24)
+#   define R500_TEX_DST_G_SWIZ_R			(0 << 26)
+#   define R500_TEX_DST_G_SWIZ_G			(1 << 26)
+#   define R500_TEX_DST_G_SWIZ_B			(2 << 26)
+#   define R500_TEX_DST_G_SWIZ_A			(3 << 26)
+#   define R500_TEX_DST_B_SWIZ_R			(0 << 28)
+#   define R500_TEX_DST_B_SWIZ_G			(1 << 28)
+#   define R500_TEX_DST_B_SWIZ_B			(2 << 28)
+#   define R500_TEX_DST_B_SWIZ_A			(3 << 28)
+#   define R500_TEX_DST_A_SWIZ_R			(0 << 30)
+#   define R500_TEX_DST_A_SWIZ_G			(1 << 30)
+#   define R500_TEX_DST_A_SWIZ_B			(2 << 30)
+#   define R500_TEX_DST_A_SWIZ_A			(3 << 30)
+#define R500_US_TEX_ADDR_DXDY_0				0xa000
+#   define R500_DX_ADDR(x)				((x) << 0)
+#   define R500_DX_ADDR_REL				(1 << 7)
+#   define R500_DX_S_SWIZ_R				(0 << 8)
+#   define R500_DX_S_SWIZ_G				(1 << 8)
+#   define R500_DX_S_SWIZ_B				(2 << 8)
+#   define R500_DX_S_SWIZ_A				(3 << 8)
+#   define R500_DX_T_SWIZ_R				(0 << 10)
+#   define R500_DX_T_SWIZ_G				(1 << 10)
+#   define R500_DX_T_SWIZ_B				(2 << 10)
+#   define R500_DX_T_SWIZ_A				(3 << 10)
+#   define R500_DX_R_SWIZ_R				(0 << 12)
+#   define R500_DX_R_SWIZ_G				(1 << 12)
+#   define R500_DX_R_SWIZ_B				(2 << 12)
+#   define R500_DX_R_SWIZ_A				(3 << 12)
+#   define R500_DX_Q_SWIZ_R				(0 << 14)
+#   define R500_DX_Q_SWIZ_G				(1 << 14)
+#   define R500_DX_Q_SWIZ_B				(2 << 14)
+#   define R500_DX_Q_SWIZ_A				(3 << 14)
+#   define R500_DY_ADDR(x)				((x) << 16)
+#   define R500_DY_ADDR_REL				(1 << 23) /* was bit 17, inside DY_ADDR; DX/TEX_DST put REL at base+7 */
+#   define R500_DY_S_SWIZ_R				(0 << 24)
+#   define R500_DY_S_SWIZ_G				(1 << 24)
+#   define R500_DY_S_SWIZ_B				(2 << 24)
+#   define R500_DY_S_SWIZ_A				(3 << 24)
+#   define R500_DY_T_SWIZ_R				(0 << 26)
+#   define R500_DY_T_SWIZ_G				(1 << 26)
+#   define R500_DY_T_SWIZ_B				(2 << 26)
+#   define R500_DY_T_SWIZ_A				(3 << 26)
+#   define R500_DY_R_SWIZ_R				(0 << 28)
+#   define R500_DY_R_SWIZ_G				(1 << 28)
+#   define R500_DY_R_SWIZ_B				(2 << 28)
+#   define R500_DY_R_SWIZ_A				(3 << 28)
+#   define R500_DY_Q_SWIZ_R				(0 << 30)
+#   define R500_DY_Q_SWIZ_G				(1 << 30)
+#   define R500_DY_Q_SWIZ_B				(2 << 30)
+#   define R500_DY_Q_SWIZ_A				(3 << 30)
+#define R500_US_TEX_INST_0				0x9000
+#   define R500_TEX_ID(x)				((x) << 16)
+#   define R500_TEX_INST_NOP				(0 << 22)
+#   define R500_TEX_INST_LD				(1 << 22)
+#   define R500_TEX_INST_TEXKILL			(2 << 22)
+#   define R500_TEX_INST_PROJ				(3 << 22)
+#   define R500_TEX_INST_LODBIAS			(4 << 22)
+#   define R500_TEX_INST_LOD				(5 << 22)
+#   define R500_TEX_INST_DXDY				(6 << 22)
+#   define R500_TEX_SEM_ACQUIRE				(1 << 25)
+#   define R500_TEX_IGNORE_UNCOVERED			(1 << 26)
+#   define R500_TEX_UNSCALED				(1 << 27)
+#define R300_US_W_FMT					0x46b4
+#   define R300_W_FMT_W0				(0 << 0)
+#   define R300_W_FMT_W24				(1 << 0)
+#   define R300_W_FMT_W24FP				(2 << 0)
+#   define R300_W_SRC_US				(0 << 2)
+#   define R300_W_SRC_RAS				(1 << 2)
+
+/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
+ * Two parameter dwords:
+ * 0. VAP_VTX_FMT: The first parameter is not written to hardware
+ * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword.
+ */
+#define R300_PACKET3_3D_DRAW_VBUF           0x00002800
+
+/* Draw a primitive from immediate vertices in this packet
+ * Up to 16382 dwords:
+ * 0. VAP_VTX_FMT: The first parameter is not written to hardware
+ * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword.
+ * 2 to end: Up to 16380 dwords of vertex data.
+ */
+#define R300_PACKET3_3D_DRAW_IMMD           0x00002900
+
+/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR,
+ * selected by immediate indices in this packet
+ * Up to 16382 dwords:
+ * 0. VAP_VTX_FMT: The first parameter is not written to hardware
+ * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword.
+ * 2 to end: Up to 16380 dwords of index data.
+ */
+#define R300_PACKET3_3D_DRAW_INDX           0x00002A00
+
+
+/* Specify the full set of vertex arrays as (address, stride).
+ * The first parameter is the number of vertex arrays specified.
+ * The rest of the command is a variable length list of blocks, where
+ * each block is three dwords long and specifies two arrays.
+ * The first dword of a block is split into two words, the lower significant
+ * word refers to the first array, the more significant word to the second
+ * array in the block.
+ * The low byte of each word contains the size of an array entry in dwords,
+ * the high byte contains the stride of the array.
+ * The second dword of a block contains the pointer to the first array,
+ * the third dword of a block contains the pointer to the second array.
+ * Note that if the total number of arrays is odd, the third dword of
+ * the last block is omitted.
+ */
+#define R300_PACKET3_3D_LOAD_VBPNTR         0x00002F00
+#   define R300_VC_FORCE_PREFETCH  (1 << 5)
+#   define R300_VBPNTR_SIZE0(x)    ((x) >> 2) /* x / 4 at bit 0; presumably x is a byte count -- TODO confirm */
+#   define R300_VBPNTR_STRIDE0(x)  (((x) >> 2) << 8) /* x / 4 placed at bit 8 */
+#   define R300_VBPNTR_SIZE1(x)    (((x) >> 2) << 16) /* x / 4 placed at bit 16 */
+#   define R300_VBPNTR_STRIDE1(x)  (((x) >> 2) << 24) /* x / 4 placed at bit 24 */
+
+#define R300_PACKET3_INDX_BUFFER            0x00003300
+#    define R300_INDX_BUFFER_DST_SHIFT          0
+#    define R300_INDX_BUFFER_SKIP_SHIFT         16
+#    define R300_INDX_BUFFER_ONE_REG_WR		(1<<31)
+
+/* Same as R300_PACKET3_3D_DRAW_VBUF but without VAP_VTX_FMT */
+#define R300_PACKET3_3D_DRAW_VBUF_2         0x00003400
+/* Same as R300_PACKET3_3D_DRAW_IMMD but without VAP_VTX_FMT */
+#define R300_PACKET3_3D_DRAW_IMMD_2         0x00003500
+/* Same as R300_PACKET3_3D_DRAW_INDX but without VAP_VTX_FMT */
+#define R300_PACKET3_3D_DRAW_INDX_2         0x00003600
+
+/* Clears a portion of hierarchical Z RAM
+ * 3 dword parameters
+ * 0. START
+ * 1. COUNT: 13:0 (max is 0x3FFF)
+ * 2. CLEAR_VALUE: Value to write into HIZ RAM.
+ */
+#define R300_PACKET3_3D_CLEAR_HIZ           0x00003700
+
+/* Draws a set of primitives using vertex buffers pointed by the state data.
+ * At least 2 Parameters:
+ * 0. VAP_VF_CNTL: The first parameter is a standard primitive emission dword.
+ * 2 to end: Data or indices (see other 3D_DRAW_* packets for details)
+ */
+#define R300_PACKET3_3D_DRAW_128            0x00003900
+
+/* END: Packet 3 commands */
+
+
+/* Color formats for 2d packets
+ */
+#define R300_CP_COLOR_FORMAT_CI8	2
+#define R300_CP_COLOR_FORMAT_ARGB1555	3
+#define R300_CP_COLOR_FORMAT_RGB565	4
+#define R300_CP_COLOR_FORMAT_ARGB8888	6
+#define R300_CP_COLOR_FORMAT_RGB332	7
+#define R300_CP_COLOR_FORMAT_RGB8	9
+#define R300_CP_COLOR_FORMAT_ARGB4444	15
+
+/*
+ * CP type-3 packets
+ */
+#define R300_CP_CMD_BITBLT_MULTI	0xC0009B00
+
+/* XXX Corbin's stuff from radeon and r200 */
+
+#define RADEON_WAIT_UNTIL                   0x1720
+#       define RADEON_WAIT_CRTC_PFLIP       (1 << 0)
+#       define RADEON_WAIT_2D_IDLECLEAN     (1 << 16)
+#       define RADEON_WAIT_3D_IDLECLEAN     (1 << 17)
+#       define RADEON_WAIT_HOST_IDLECLEAN   (1 << 18)
+
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+
+#define RADEON_CP_PACKET0 0x0 /* XXX stolen from radeon_reg.h */
+#define RADEON_CP_PACKET3                           0xC0000000
+/* Bit 15 flag; not referenced by the two header builders below. */
+#define RADEON_ONE_REG_WR        (1 << 15)
+/* Packet-0 header word: PACKET0 tag, cnt at bit 16, reg shifted right by two. */
+#define CP_PACKET0(reg, cnt) \
+    ((((reg) >> 2) | ((cnt) << 16)) | RADEON_CP_PACKET0)
+/* Packet-3 header word: PACKET3 tag, the opcode value as given, cnt at bit 16. */
+#define CP_PACKET3(opcode, cnt) \
+    ((((cnt) << 16) | (opcode)) | RADEON_CP_PACKET3)
+
+#endif /* _R300_REG_H */
+
+/* *INDENT-ON* */
+
+/* vim: set foldenable foldmarker=\\{,\\} foldmethod=marker : */
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
new file mode 100644
index 0000000000..4afd124c0e
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/* r300_render: Vertex and index buffer primitive emission. Contains both
+ * HW TCL fastpath rendering, and SW TCL Draw-assisted rendering. */
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+
+#include "util/u_inlines.h"
+
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_prim.h"
+
+#include "r300_cs.h"
+#include "r300_cb.h"
+#include "r300_context.h"
+#include "r300_screen_buffer.h"
+#include "r300_emit.h"
+#include "r300_reg.h"
+#include "r300_state_derived.h"
+
+#include <limits.h>
+
+#define IMMD_DWORDS 32
+
+/* Map a PIPE_PRIM_* primitive type to the matching
+ * R300_VAP_VF_CNTL__PRIM_* value. Types without a mapping yield 0. */
+static uint32_t r300_translate_primitive(unsigned prim)
+{
+    uint32_t hw_prim = 0;
+
+    switch (prim) {
+    case PIPE_PRIM_POINTS:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_POINTS;
+        break;
+    case PIPE_PRIM_LINES:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_LINES;
+        break;
+    case PIPE_PRIM_LINE_LOOP:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_LINE_LOOP;
+        break;
+    case PIPE_PRIM_LINE_STRIP:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_LINE_STRIP;
+        break;
+    case PIPE_PRIM_TRIANGLES:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_TRIANGLES;
+        break;
+    case PIPE_PRIM_TRIANGLE_STRIP:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP;
+        break;
+    case PIPE_PRIM_TRIANGLE_FAN:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN;
+        break;
+    case PIPE_PRIM_QUADS:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_QUADS;
+        break;
+    case PIPE_PRIM_QUAD_STRIP:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_QUAD_STRIP;
+        break;
+    case PIPE_PRIM_POLYGON:
+        hw_prim = R300_VAP_VF_CNTL__PRIM_POLYGON;
+        break;
+    default:
+        /* Unknown primitive type: keep 0. */
+        break;
+    }
+
+    return hw_prim;
+}
+
+/* Compute the GA_COLOR_CONTROL value for drawing primitive type `mode`:
+ * the rasterizer state's color_control with one provoking-vertex selection
+ * ORed in. */
+static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
+                                            unsigned mode)
+{
+    struct r300_rs_state *rs_state =
+        (struct r300_rs_state*)r300->rs_state.state;
+    uint32_t provoking;
+
+    /* By default (see r300_state.c:r300_create_rs_state) color_control is
+     * initialized to provoking the first vertex.
+     *
+     * Triangle fans must be reduced to the second vertex, not the first, in
+     * Gallium flatshade-first mode, as per the GL spec.
+     * (http://www.opengl.org/registry/specs/ARB/provoking_vertex.txt)
+     *
+     * Quads never provoke correctly in flatshade-first mode. The first
+     * vertex is never considered as provoking, so only the second, third,
+     * and fourth vertices can be selected, and both "third" and "last" modes
+     * select the fourth vertex. This is probably due to D3D lacking quads.
+     *
+     * Similarly, polygons reduce to the first, not the last, vertex, when in
+     * "last" mode, and all other modes start from the second vertex.
+     *
+     * ~ C.
+     */
+
+    if (!rs_state->rs.flatshade_first) {
+        provoking = R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+    } else if (mode == PIPE_PRIM_TRIANGLE_FAN) {
+        provoking = R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND;
+    } else if (mode == PIPE_PRIM_QUADS ||
+               mode == PIPE_PRIM_QUAD_STRIP ||
+               mode == PIPE_PRIM_POLYGON) {
+        provoking = R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+    } else {
+        provoking = R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_FIRST;
+    }
+
+    return rs_state->color_control | provoking;
+}
+
+/* Tell whether index bias may be used: caps.is_r500 must be set and the
+ * winsys query for R300_VID_DRM_2_3_0 must return non-zero. */
+static boolean index_bias_supported(struct r300_context *r300)
+{
+    if (!r300->screen->caps.is_r500)
+        return FALSE;
+
+    return r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0) ? TRUE : FALSE;
+}
+
+/* Write R500_VAP_INDEX_OFFSET: the low 24 bits of index_bias, with bit 24
+ * additionally set when index_bias is negative. */
+static void r500_emit_index_bias(struct r300_context *r300, int index_bias)
+{
+    uint32_t offset_reg = index_bias & 0xFFFFFF;
+    CS_LOCALS(r300);
+
+    if (index_bias < 0)
+        offset_reg |= 1 << 24;
+
+    BEGIN_CS(2);
+    OUT_CS_REG(R500_VAP_INDEX_OFFSET, offset_reg);
+    END_CS;
+}
+
+/* This function splits the index bias value into two parts:
+ * - buffer_offset: the value that can be safely added to buffer offsets
+ *   in r300_emit_aos (it must yield a positive offset when added to
+ *   a vertex buffer offset)
+ * - index_offset: the value that must be manually subtracted from indices
+ *   in an index buffer to achieve negative offsets.
+ *
+ * The two outputs always sum to index_bias. For a non-negative index_bias,
+ * buffer_offset receives the whole bias and index_offset is 0. */
+static void r300_split_index_bias(struct r300_context *r300, int index_bias,
+                                  int *buffer_offset, int *index_offset)
+{
+    struct pipe_vertex_buffer *vb, *vbufs = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->velems->velem;
+    unsigned i, size;
+    int max_neg_bias;
+
+    if (index_bias < 0) {
+        /* See how large index bias we may subtract. We must be careful
+         * here because negative buffer offsets are not allowed
+         * by the DRM API. */
+        max_neg_bias = INT_MAX;
+        for (i = 0; i < r300->velems->count; i++) {
+            vb = &vbufs[velem[i].vertex_buffer_index];
+
+            /* A zero stride would divide by zero below. Skip such buffers
+             * so that they do not constrain the bias.
+             * NOTE(review): this assumes the bias is multiplied by the
+             * stride when it is applied (so it has no effect on a
+             * zero-stride buffer) - confirm against r300_emit_aos. */
+            if (!vb->stride)
+                continue;
+
+            /* Number of whole vertices that fit in front of this element. */
+            size = (vb->buffer_offset + velem[i].src_offset) / vb->stride;
+            max_neg_bias = MIN2(max_neg_bias, size);
+        }
+
+        /* Now set the minimum allowed value. */
+        *buffer_offset = MAX2(-max_neg_bias, index_bias);
+    } else {
+        /* A positive index bias is OK. */
+        *buffer_offset = index_bias;
+    }
+
+    /* Whatever could not be folded into the buffer offset. */
+    *index_offset = index_bias - *buffer_offset;
+}
+
+/* Bit flags telling r300_prepare_for_rendering what to do. */
+enum r300_prepare_flags {
+    /* call emit_dirty_state and friends? */
+    PREP_FIRST_DRAW     = 0x01,
+    /* validate VBOs? */
+    PREP_VALIDATE_VBOS  = 0x02,
+    /* call emit_aos? */
+    PREP_EMIT_AOS       = 0x04,
+    /* call emit_aos_swtcl? */
+    PREP_EMIT_AOS_SWTCL = 0x08,
+    /* is this draw_elements? */
+    PREP_INDEXED        = 0x10
+};
+
+/**
+ * Check if the requested number of dwords is available in the CS and
+ * if not, flush. Then validate buffers and emit dirty state.
+ *
+ * Buffer validation and the emission of dirty state, index bias and AOS
+ * only happen when PREP_FIRST_DRAW is set or when this call flushed the CS.
+ *
+ * \param r300          The context.
+ * \param flags         See r300_prepare_flags.
+ * \param index_buffer  The index buffer to validate. The parameter may be NULL.
+ * \param cs_dwords     The number of dwords to reserve in CS.
+ * \param aos_offset    The offset passed to emit_aos.
+ * \param index_bias    The index bias to emit.
+ * \param end_cs_dwords Output: the number of free dwords which must be
+ *                      available at the end of CS after drawing in case the
+ *                      CS space management is performed by a draw_* function
+ *                      manually. The parameter may be NULL.
+ */
+static void r300_prepare_for_rendering(struct r300_context *r300,
+                                       enum r300_prepare_flags flags,
+                                       struct pipe_resource *index_buffer,
+                                       unsigned cs_dwords,
+                                       int aos_offset,
+                                       int index_bias,
+                                       unsigned *end_cs_dwords)
+{
+    unsigned end_dwords    = 0;
+    boolean flushed        = FALSE;
+    boolean first_draw     = flags & PREP_FIRST_DRAW;
+    boolean emit_aos       = flags & PREP_EMIT_AOS;
+    boolean emit_aos_swtcl = flags & PREP_EMIT_AOS_SWTCL;
+    boolean indexed        = flags & PREP_INDEXED;
+    boolean hw_index_bias  = index_bias_supported(r300);
+
+    /* Add dirty state, index offset, and AOS. */
+    if (first_draw) {
+        cs_dwords += r300_get_num_dirty_dwords(r300);
+
+        if (hw_index_bias)
+            cs_dwords += 2; /* emit_index_offset */
+
+        if (emit_aos)
+            cs_dwords += 55; /* emit_aos */
+
+        if (emit_aos_swtcl)
+            cs_dwords += 7; /* emit_aos_swtcl */
+    }
+
+    /* Emitted in flush. */
+    end_dwords += 26; /* emit_query_end */
+
+    /* The space needed at the end of the CS is part of the reservation. */
+    cs_dwords += end_dwords;
+
+    /* Reserve requested CS space. */
+    if (!r300_check_cs(r300, cs_dwords)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        flushed = TRUE;
+    }
+
+    /* Validate buffers and emit dirty state if needed. */
+    if (first_draw || flushed) {
+        r300_emit_buffer_validate(r300, flags & PREP_VALIDATE_VBOS, index_buffer);
+        r300_emit_dirty_state(r300);
+        if (hw_index_bias) {
+            /* The requested bias is only programmed when caps.has_tcl is
+             * set; otherwise the register is written with 0. */
+            if (r300->screen->caps.has_tcl)
+                r500_emit_index_bias(r300, index_bias);
+            else
+                r500_emit_index_bias(r300, 0);
+        }
+
+        if (emit_aos)
+            r300_emit_aos(r300, aos_offset, indexed);
+
+        if (emit_aos_swtcl)
+            r300_emit_aos_swtcl(r300, indexed);
+    }
+
+    /* Report the end-of-CS reservation to callers that asked for it. */
+    if (end_cs_dwords)
+        *end_cs_dwords = end_dwords;
+}
+
+/* Decide whether a draw of `count` vertices should be emitted as immediate
+ * vertex data. Returns FALSE when immediate mode is disabled by the
+ * DBG_NO_IMMD debug flag, when r300->draw is set, when the vertex data
+ * exceeds IMMD_DWORDS dwords, or when any used vertex buffer is reported
+ * as referenced (R300_REF_CS | R300_REF_HW). */
+static boolean immd_is_good_idea(struct r300_context *r300,
+                                 unsigned count)
+{
+    boolean seen[PIPE_MAX_ATTRIBS] = {0};
+    unsigned num_elements = r300->velems->count;
+    unsigned elem;
+
+    if (DBG_ON(r300, DBG_NO_IMMD)) {
+        return FALSE;
+    }
+
+    if (r300->draw) {
+        return FALSE;
+    }
+
+    if (count * r300->velems->vertex_size_dwords > IMMD_DWORDS) {
+        return FALSE;
+    }
+
+    /* We shouldn't map buffers referenced by CS, busy buffers,
+     * and ones placed in VRAM. */
+    /* XXX Check for VRAM buffers. */
+    for (elem = 0; elem < num_elements; elem++) {
+        unsigned buf_index = r300->velems->velem[elem].vertex_buffer_index;
+
+        /* Each vertex buffer only needs to be queried once. */
+        if (seen[buf_index])
+            continue;
+
+        if (r300_buffer_is_referenced(&r300->context,
+                                      r300->vertex_buffer[buf_index].buffer,
+                                      R300_REF_CS | R300_REF_HW)) {
+            /* It's a very bad idea to map it... */
+            return FALSE;
+        }
+        seen[buf_index] = TRUE;
+    }
+
+    return TRUE;
+}
+
+/*****************************************************************************
+ * The emission of draw packets for r500. Older GPUs may use these functions *
+ * after resolving fallback issues (e.g. stencil ref two-sided).             *
+ ****************************************************************************/
+
+/* Draw `count` vertices of primitive type `mode`, beginning at vertex
+ * `start`, by copying the vertex data out of the mapped vertex buffers
+ * directly into the command stream (3D_DRAW_IMMD_2 packet).
+ *
+ * NOTE(review): the pointer returned by pipe_buffer_map is used without a
+ * NULL check - confirm that mapping cannot fail here. */
+static void r300_emit_draw_arrays_immediate(struct r300_context *r300,
+                                            unsigned mode,
+                                            unsigned start,
+                                            unsigned count)
+{
+    struct pipe_vertex_element* velem;
+    struct pipe_vertex_buffer* vbuf;
+    unsigned vertex_element_count = r300->velems->count;
+    unsigned i, v, vbi, dwords;
+
+    /* Size of the vertex, in dwords. */
+    unsigned vertex_size = r300->velems->vertex_size_dwords;
+
+    /* Offsets of the attribute, in dwords, from the start of the vertex. */
+    unsigned offset[PIPE_MAX_ATTRIBS];
+
+    /* Size of the vertex element, in dwords. */
+    unsigned size[PIPE_MAX_ATTRIBS];
+
+    /* Stride to the same attrib in the next vertex in the vertex buffer,
+     * in dwords. */
+    unsigned stride[PIPE_MAX_ATTRIBS] = {0};
+
+    /* Mapped vertex buffers. */
+    uint32_t* map[PIPE_MAX_ATTRIBS] = {0};
+    struct pipe_transfer* transfer[PIPE_MAX_ATTRIBS] = {NULL};
+
+    CB_LOCALS;
+
+    /* Calculate the vertex size, offsets, strides etc. and map the buffers. */
+    for (i = 0; i < vertex_element_count; i++) {
+        velem = &r300->velems->velem[i];
+        offset[i] = velem->src_offset / 4;
+        size[i] = r300->velems->hw_format_size[i] / 4;
+        vbi = velem->vertex_buffer_index;
+
+        /* Map the buffer. (Only once per buffer: map[] and stride[] are
+         * indexed by vertex buffer, offset[] and size[] by element.) */
+        if (!map[vbi]) {
+            vbuf = &r300->vertex_buffer[vbi];
+            map[vbi] = (uint32_t*)pipe_buffer_map(&r300->context,
+                                                  vbuf->buffer,
+                                                  PIPE_TRANSFER_READ,
+						  &transfer[vbi]);
+            stride[vbi] = vbuf->stride / 4;
+            /* Advance the mapped pointer to vertex `start`. */
+            map[vbi] += vbuf->buffer_offset / 4 + stride[vbi] * start;
+        }
+    }
+
+    /* Fixed part of the packet plus the embedded vertex data. */
+    dwords = 9 + count * vertex_size;
+
+    r300_prepare_for_rendering(r300, PREP_FIRST_DRAW, NULL, dwords, 0, 0, NULL);
+
+    BEGIN_CS_AS_CB(r300, dwords);
+    OUT_CB_REG(R300_GA_COLOR_CONTROL,
+            r300_provoking_vertex_fixes(r300, mode));
+    OUT_CB_REG(R300_VAP_VTX_SIZE, vertex_size);
+    OUT_CB_REG_SEQ(R300_VAP_VF_MAX_VTX_INDX, 2);
+    OUT_CB(count - 1);
+    OUT_CB(0);
+    OUT_CB_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
+    OUT_CB(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
+            r300_translate_primitive(mode));
+
+    /* Emit vertices. */
+    for (v = 0; v < count; v++) {
+        for (i = 0; i < vertex_element_count; i++) {
+            vbi = r300->velems->velem[i].vertex_buffer_index;
+
+            /* Copy element i of vertex v. */
+            OUT_CB_TABLE(&map[vbi][offset[i] + stride[vbi] * v], size[i]);
+        }
+    }
+    END_CB;
+
+    /* Unmap buffers. */
+    for (i = 0; i < vertex_element_count; i++) {
+        vbi = r300->velems->velem[i].vertex_buffer_index;
+
+        /* map[vbi] is reset below so a shared buffer is unmapped once. */
+        if (map[vbi]) {
+            vbuf = &r300->vertex_buffer[vbi];
+            pipe_buffer_unmap(&r300->context, vbuf->buffer, transfer[vbi]);
+            map[vbi] = NULL;
+        }
+    }
+}
+
+/* Emit a non-indexed draw of `count` vertices of primitive type `mode`
+ * (3D_DRAW_VBUF_2 packet, vertex-list walk).
+ *
+ * Counts above 65535 are additionally written to R500_VAP_ALT_NUM_VERTICES
+ * and flagged with USE_ALT_NUM_VERTS. Counts of 1 << 24 or more are
+ * rejected with a message on stderr and nothing is emitted. */
+static void r300_emit_draw_arrays(struct r300_context *r300,
+                                  unsigned mode,
+                                  unsigned count)
+{
+    boolean alt_num_verts = count > 65535;
+    CS_LOCALS(r300);
+
+    if (count >= (1 << 24)) {
+        fprintf(stderr, "r300: Got a huge number of vertices: %i, "
+                "refusing to render.\n", count);
+        return;
+    }
+
+    /* Two extra dwords for the optional ALT_NUM_VERTICES register write. */
+    BEGIN_CS(7 + (alt_num_verts ? 2 : 0));
+    if (alt_num_verts) {
+        OUT_CS_REG(R500_VAP_ALT_NUM_VERTICES, count);
+    }
+    OUT_CS_REG(R300_GA_COLOR_CONTROL,
+            r300_provoking_vertex_fixes(r300, mode));
+    /* Two consecutive registers starting at VAP_VF_MAX_VTX_INDX. */
+    OUT_CS_REG_SEQ(R300_VAP_VF_MAX_VTX_INDX, 2);
+    OUT_CS(count - 1);
+    OUT_CS(0);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (count << 16) |
+           r300_translate_primitive(mode) |
+           (alt_num_verts ? R500_VAP_VF_CNTL__USE_ALT_NUM_VERTS : 0));
+    END_CS;
+}
+
+/* Emit an indexed draw of `count` indices of size `indexSize` bytes taken
+ * from indexBuffer, beginning at index `start` (3D_DRAW_INDX_2 packet
+ * followed by an INDX_BUFFER packet with a relocation).
+ *
+ * maxIndex is clamped to r300->vertex_buffer_max_index. Counts above 65535
+ * are additionally written to R500_VAP_ALT_NUM_VERTICES. Counts of 1 << 24
+ * or more are rejected with a message on stderr and nothing is emitted. */
+static void r300_emit_draw_elements(struct r300_context *r300,
+                                    struct pipe_resource* indexBuffer,
+                                    unsigned indexSize,
+                                    unsigned minIndex,
+                                    unsigned maxIndex,
+                                    unsigned mode,
+                                    unsigned start,
+                                    unsigned count)
+{
+    uint32_t count_dwords;
+    /* Start of the indices within the buffer, in dwords.
+     * NOTE(review): for 2-byte indices and an odd `start` this division
+     * rounds down - confirm callers guarantee dword alignment. */
+    uint32_t offset_dwords = indexSize * start / sizeof(uint32_t);
+    boolean alt_num_verts = count > 65535;
+    CS_LOCALS(r300);
+
+    if (count >= (1 << 24)) {
+        fprintf(stderr, "r300: Got a huge number of vertices: %i, "
+                "refusing to render.\n", count);
+        return;
+    }
+
+    maxIndex = MIN2(maxIndex, r300->vertex_buffer_max_index);
+
+    DBG(r300, DBG_DRAW, "r300: Indexbuf of %u indices, min %u max %u\n",
+        count, minIndex, maxIndex);
+
+    /* Two extra dwords for the optional ALT_NUM_VERTICES register write. */
+    BEGIN_CS(13 + (alt_num_verts ? 2 : 0));
+    if (alt_num_verts) {
+        OUT_CS_REG(R500_VAP_ALT_NUM_VERTICES, count);
+    }
+    OUT_CS_REG(R300_GA_COLOR_CONTROL,
+            r300_provoking_vertex_fixes(r300, mode));
+    OUT_CS_REG_SEQ(R300_VAP_VF_MAX_VTX_INDX, 2);
+    OUT_CS(maxIndex);
+    OUT_CS(minIndex);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+    if (indexSize == 4) {
+        /* One index per dword. */
+        count_dwords = count;
+        OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
+               R300_VAP_VF_CNTL__INDEX_SIZE_32bit |
+               r300_translate_primitive(mode) |
+               (alt_num_verts ? R500_VAP_VF_CNTL__USE_ALT_NUM_VERTS : 0));
+    } else {
+        /* Two indices per dword, rounded up. */
+        count_dwords = (count + 1) / 2;
+        OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
+               r300_translate_primitive(mode) |
+               (alt_num_verts ? R500_VAP_VF_CNTL__USE_ALT_NUM_VERTS : 0));
+    }
+
+    /* INDX_BUFFER is a truly special packet3.
+     * Unlike most other packet3, where the offset is after the count,
+     * the order is reversed, so the relocation ends up carrying the
+     * size of the indexbuf instead of the offset.
+     */
+    OUT_CS_PKT3(R300_PACKET3_INDX_BUFFER, 2);
+    OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2) |
+           (0 << R300_INDX_BUFFER_SKIP_SHIFT));
+    /* Byte offset of the first index. */
+    OUT_CS(offset_dwords << 2);
+    OUT_CS_BUF_RELOC(indexBuffer, count_dwords,
+		     r300_buffer(indexBuffer)->domain, 0, 0);
+
+    END_CS;
+}
+
+/* This is the fast-path drawing & emission for HW TCL.
+ *
+ * Draws `count` indices of `indexSize` bytes from indexBuffer, beginning at
+ * index `start`, with vertex indices expected in [minIndex, maxIndex].
+ *
+ * When an index bias is requested but index_bias_supported() is false, the
+ * bias is split by r300_split_index_bias into a vertex buffer offset and
+ * an offset handed to r300_translate_index_buffer.
+ *
+ * Draws that are too large for one packet (more than 65535 indices without
+ * ALT_NUM_VERTICES support) are split into several packets. */
+static void r300_draw_range_elements(struct pipe_context* pipe,
+                                     struct pipe_resource* indexBuffer,
+                                     unsigned indexSize,
+                                     int indexBias,
+                                     unsigned minIndex,
+                                     unsigned maxIndex,
+                                     unsigned mode,
+                                     unsigned start,
+                                     unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct pipe_resource* orgIndexBuffer = indexBuffer;
+    boolean alt_num_verts = r300->screen->caps.is_r500 &&
+                            count > 65536 &&
+                            r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0);
+    unsigned short_count;
+    int buffer_offset = 0, index_offset = 0; /* for index bias emulation */
+    boolean translate = FALSE;
+
+    if (r300->skip_rendering) {
+        return;
+    }
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return;
+    }
+
+    /* Set up fallback for incompatible vertex layout if needed. */
+    if (r300->incompatible_vb_layout || r300->velems->incompatible_layout) {
+        r300_begin_vertex_translate(r300);
+        translate = TRUE;
+    }
+
+    if (indexBias && !index_bias_supported(r300)) {
+        r300_split_index_bias(r300, indexBias, &buffer_offset, &index_offset);
+    }
+
+    r300_translate_index_buffer(r300, &indexBuffer, &indexSize, index_offset,
+                                &start, count);
+
+    r300_update_derived_state(r300);
+    r300_upload_index_buffer(r300, &indexBuffer, indexSize, start, count);
+
+    /* 15 dwords for emit_draw_elements */
+    r300_prepare_for_rendering(r300,
+        PREP_FIRST_DRAW | PREP_VALIDATE_VBOS | PREP_EMIT_AOS | PREP_INDEXED,
+        indexBuffer, 15, buffer_offset, indexBias, NULL);
+
+    u_upload_flush(r300->upload_vb);
+    u_upload_flush(r300->upload_ib);
+    if (alt_num_verts || count <= 65535) {
+        r300_emit_draw_elements(r300, indexBuffer, indexSize,
+                                 minIndex, maxIndex, mode, start, count);
+    } else {
+        do {
+            /* The chunk size must be a multiple of 2, 3 and 4 so that line,
+             * triangle and quad lists are only ever cut between whole
+             * primitives; 65532 is the largest such value below 65536.
+             * (Strips, fans and loops are still not split correctly.) */
+            short_count = MIN2(count, 65532);
+            r300_emit_draw_elements(r300, indexBuffer, indexSize,
+                                     minIndex, maxIndex,
+                                     mode, start, short_count);
+
+            start += short_count;
+            count -= short_count;
+
+            /* 15 dwords for emit_draw_elements.
+             * The new `start` is passed to r300_emit_draw_elements itself,
+             * so the AOS offset stays the same for every chunk. */
+            if (count) {
+                r300_prepare_for_rendering(r300,
+                    PREP_VALIDATE_VBOS | PREP_EMIT_AOS | PREP_INDEXED,
+                    indexBuffer, 15, buffer_offset, indexBias, NULL);
+            }
+        } while (count);
+    }
+
+    /* Drop the reference to a replacement index buffer, if one was made. */
+    if (indexBuffer != orgIndexBuffer) {
+        pipe_resource_reference( &indexBuffer, NULL );
+    }
+
+    if (translate) {
+        r300_end_vertex_translate(r300);
+    }
+}
+
+/* Simple helper for context setup (should probably be moved to util):
+ * forwards to the context's draw_range_elements hook using the index range
+ * [0, r300->vertex_buffer_max_index]. */
+static void r300_draw_elements(struct pipe_context* pipe,
+                               struct pipe_resource* indexBuffer,
+                               unsigned indexSize, int indexBias, unsigned mode,
+                               unsigned start, unsigned count)
+{
+    const unsigned min_index = 0;
+    const unsigned max_index = r300_context(pipe)->vertex_buffer_max_index;
+
+    pipe->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                              min_index, max_index,
+                              mode, start, count);
+}
+
+/* HW TCL draw_arrays entry point: draws `count` vertices of primitive type
+ * `mode`, beginning at vertex `start`.
+ *
+ * Small draws accepted by immd_is_good_idea() are emitted as immediate
+ * data. Everything else goes through r300_emit_draw_arrays, split into
+ * several packets when the draw has more than 65535 vertices and
+ * ALT_NUM_VERTICES cannot be used. */
+static void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+                             unsigned start, unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    boolean alt_num_verts = r300->screen->caps.is_r500 &&
+                            count > 65536 &&
+                            r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0);
+    unsigned short_count;
+    boolean translate = FALSE;
+
+    if (r300->skip_rendering) {
+        return;
+    }
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return;
+    }
+
+    /* Set up fallback for incompatible vertex layout if needed. */
+    if (r300->incompatible_vb_layout || r300->velems->incompatible_layout) {
+        r300_begin_vertex_translate(r300);
+        translate = TRUE;
+    }
+
+    r300_update_derived_state(r300);
+
+    if (immd_is_good_idea(r300, count)) {
+        r300_emit_draw_arrays_immediate(r300, mode, start, count);
+    } else {
+        /* 9 spare dwords for emit_draw_arrays. */
+        r300_prepare_for_rendering(r300, PREP_FIRST_DRAW | PREP_VALIDATE_VBOS | PREP_EMIT_AOS,
+                               NULL, 9, start, 0, NULL);
+
+        if (alt_num_verts || count <= 65535) {
+            r300_emit_draw_arrays(r300, mode, count);
+        } else {
+            do {
+                /* The chunk size must be a multiple of 2, 3 and 4 so that
+                 * line, triangle and quad lists are only ever cut between
+                 * whole primitives; 65532 is the largest such value below
+                 * 65536. (Strips, fans and loops are still not split
+                 * correctly.) */
+                short_count = MIN2(count, 65532);
+                r300_emit_draw_arrays(r300, mode, short_count);
+
+                start += short_count;
+                count -= short_count;
+
+                /* 9 spare dwords for emit_draw_arrays.
+                 *
+                 * The first vertex of a chunk is selected solely through
+                 * the AOS offset, and r300_prepare_for_rendering only
+                 * emits AOS for PREP_FIRST_DRAW (or after a flush), so the
+                 * flag is required here to make the next chunk begin at
+                 * the advanced `start` rather than redraw the first one. */
+                if (count) {
+                    r300_prepare_for_rendering(r300,
+                        PREP_FIRST_DRAW | PREP_VALIDATE_VBOS | PREP_EMIT_AOS,
+                        NULL, 9, start, 0, NULL);
+                }
+            } while (count);
+        }
+        u_upload_flush(r300->upload_vb);
+    }
+
+    if (translate) {
+        r300_end_vertex_translate(r300);
+    }
+}
+
+/****************************************************************************
+ * The rest of this file is for SW TCL rendering only. Please be polite and *
+ * keep these functions separated so that they are easier to locate. ~C.    *
+ ***************************************************************************/
+
+/* SW TCL arrays, using Draw.
+ *
+ * Maps every bound vertex buffer for reading, hands the pointers to the
+ * Draw module, runs draw_arrays followed by draw_flush, then unmaps the
+ * buffers again and clears the pointers held by Draw. */
+static void r300_swtcl_draw_arrays(struct pipe_context* pipe,
+                                   unsigned mode,
+                                   unsigned start,
+                                   unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
+    int i;
+
+    if (r300->skip_rendering) {
+        return;
+    }
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return;
+    }
+
+    r300_update_derived_state(r300);
+
+    /* Map the vertex buffers and pass them to Draw. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        void* buf = pipe_buffer_map(pipe,
+                                    r300->vertex_buffer[i].buffer,
+                                    PIPE_TRANSFER_READ,
+				    &vb_transfer[i]);
+        draw_set_mapped_vertex_buffer(r300->draw, i, buf);
+    }
+
+    /* No index buffer for a non-indexed draw. */
+    draw_set_mapped_element_buffer(r300->draw, 0, 0, NULL);
+
+    draw_arrays(r300->draw, mode, start, count);
+
+    /* XXX Not sure whether this is the best fix.
+     * It prevents CS from being rejected and weird assertion failures. */
+    draw_flush(r300->draw);
+
+    /* Unmap the vertex buffers and clear Draw's pointers to them. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        pipe_buffer_unmap(pipe, r300->vertex_buffer[i].buffer,
+			  vb_transfer[i]);
+        draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
+    }
+}
+
+/* SW TCL elements, using Draw.
+ *
+ * Maps every bound vertex buffer and the index buffer for reading, hands
+ * the pointers to the Draw module, runs draw_arrays followed by
+ * draw_flush, then unmaps everything and clears the pointers held by
+ * Draw. */
+static void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+                                           struct pipe_resource* indexBuffer,
+                                           unsigned indexSize,
+                                           int indexBias,
+                                           unsigned minIndex,
+                                           unsigned maxIndex,
+                                           unsigned mode,
+                                           unsigned start,
+                                           unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
+    struct pipe_transfer *ib_transfer;
+    int i;
+    void* indices;
+
+    if (r300->skip_rendering) {
+        return;
+    }
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return;
+    }
+
+    r300_update_derived_state(r300);
+
+    /* Map the vertex buffers and pass them to Draw. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        void* buf = pipe_buffer_map(pipe,
+                                    r300->vertex_buffer[i].buffer,
+                                    PIPE_TRANSFER_READ,
+				    &vb_transfer[i]);
+        draw_set_mapped_vertex_buffer(r300->draw, i, buf);
+    }
+
+    /* Map the index buffer and pass it to Draw along with the index size,
+     * bias and range. */
+    indices = pipe_buffer_map(pipe, indexBuffer,
+                              PIPE_TRANSFER_READ, &ib_transfer);
+    draw_set_mapped_element_buffer_range(r300->draw, indexSize, indexBias,
+                                         minIndex, maxIndex, indices);
+
+    draw_arrays(r300->draw, mode, start, count);
+
+    /* XXX Not sure whether this is the best fix.
+     * It prevents CS from being rejected and weird assertion failures. */
+    draw_flush(r300->draw);
+
+    /* Unmap the vertex buffers and clear Draw's pointers to them. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        pipe_buffer_unmap(pipe, r300->vertex_buffer[i].buffer,
+			  vb_transfer[i]);
+        draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
+    }
+
+    /* Unmap the index buffer and clear Draw's pointer to it. */
+    pipe_buffer_unmap(pipe, indexBuffer,
+		      ib_transfer);
+    draw_set_mapped_element_buffer_range(r300->draw, 0, 0,
+                                         start, start + count - 1,
+                                         NULL);
+}
+
+/* Object for rendering using Draw.
+ *
+ * `base` must remain the first member: r300_render() casts a
+ * struct vbuf_render pointer directly to this type. */
+struct r300_render {
+    /* Parent class */
+    struct vbuf_render base;
+
+    /* Pipe context */
+    struct r300_context* r300;
+
+    /* Vertex information */
+    size_t vertex_size; /* set by allocate_vertices */
+    unsigned prim;      /* primitive type given to set_primitive */
+    unsigned hwprim;    /* r300_translate_primitive(prim) */
+
+    /* VBO */
+    struct pipe_resource* vbo;
+    size_t vbo_size;     /* compared against the size needed on allocation */
+    size_t vbo_offset;   /* added to the mapped pointer by map_vertices */
+    size_t vbo_max_used; /* added to vbo_offset by release_vertices */
+    void * vbo_ptr;      /* result of the last pipe_buffer_map of vbo */
+
+    /* Non-NULL between map_vertices and unmap_vertices. */
+    struct pipe_transfer *vbo_transfer;
+};
+
+/* Cast a vbuf_render pointer to the r300_render object it belongs to. */
+static INLINE struct r300_render*
+r300_render(struct vbuf_render* render)
+{
+    struct r300_render* self = (struct r300_render*)render;
+
+    return self;
+}
+
+/* vbuf_render hook: return the vertex_info stored in the r300 context. */
+static const struct vertex_info*
+r300_render_get_vertex_info(struct vbuf_render* render)
+{
+    return &r300_render(render)->r300->vertex_info;
+}
+
+/* vbuf_render hook: make room for `count` vertices of `vertex_size` bytes.
+ *
+ * If the vertices do not fit behind vbo_offset in the current VBO, a new
+ * VBO of R300_MAX_DRAW_VBO_SIZE bytes is created and the offset is reset.
+ * The context's vbo and vbo_offset are then updated to match.
+ *
+ * Returns TRUE if a VBO is available, FALSE otherwise. */
+static boolean r300_render_allocate_vertices(struct vbuf_render* render,
+                                                   ushort vertex_size,
+                                                   ushort count)
+{
+    struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
+    struct pipe_screen* screen = r300->context.screen;
+    size_t size = (size_t)vertex_size * (size_t)count;
+
+    if (size + r300render->vbo_offset > r300render->vbo_size)
+    {
+        /* NOTE(review): the reference is dropped through r300->vbo while
+         * r300render->vbo is simply overwritten - presumably both point to
+         * the same buffer and share one reference; confirm. */
+        pipe_resource_reference(&r300->vbo, NULL);
+        r300render->vbo = pipe_buffer_create(screen,
+                                             PIPE_BIND_VERTEX_BUFFER,
+                                             R300_MAX_DRAW_VBO_SIZE);
+        r300render->vbo_offset = 0;
+        r300render->vbo_size = R300_MAX_DRAW_VBO_SIZE;
+    }
+
+    r300render->vertex_size = vertex_size;
+    /* Plain pointer copy: no additional reference is taken here. */
+    r300->vbo = r300render->vbo;
+    r300->vbo_offset = r300render->vbo_offset;
+
+    return (r300render->vbo) ? TRUE : FALSE;
+}
+
+/* vbuf_render hook: map the VBO for writing and return a pointer to the
+ * position at vbo_offset within it.
+ *
+ * Returns NULL if the buffer could not be mapped. The transfer is kept in
+ * vbo_transfer until r300_render_unmap_vertices is called. */
+static void* r300_render_map_vertices(struct vbuf_render* render)
+{
+    struct r300_render* r300render = r300_render(render);
+
+    /* The VBO must not be mapped already. */
+    assert(!r300render->vbo_transfer);
+
+    r300render->vbo_ptr = pipe_buffer_map(&r300render->r300->context,
+                                          r300render->vbo,
+                                          PIPE_TRANSFER_WRITE,
+                                          &r300render->vbo_transfer);
+
+    /* Do not turn a failed mapping into a bogus non-NULL pointer by adding
+     * the offset to NULL. */
+    if (!r300render->vbo_ptr)
+        return NULL;
+
+    return ((uint8_t*)r300render->vbo_ptr + r300render->vbo_offset);
+}
+
+/* vbuf_render hook: unmap the VBO mapped by r300_render_map_vertices and
+ * raise vbo_max_used to cover vertices up to index `max`. The `min`
+ * argument is not used. */
+static void r300_render_unmap_vertices(struct vbuf_render* render,
+                                             ushort min,
+                                             ushort max)
+{
+    struct r300_render* r300render = r300_render(render);
+    size_t used_bytes = r300render->vertex_size * (max + 1);
+
+    /* The VBO must currently be mapped. */
+    assert(r300render->vbo_transfer);
+
+    if (used_bytes > r300render->vbo_max_used)
+        r300render->vbo_max_used = used_bytes;
+
+    pipe_buffer_unmap(&r300render->r300->context, r300render->vbo,
+                      r300render->vbo_transfer);
+    r300render->vbo_transfer = NULL;
+}
+
+/* vbuf_render hook: advance vbo_offset past the bytes recorded in
+ * vbo_max_used and reset that counter. */
+static void r300_render_release_vertices(struct vbuf_render* render)
+{
+    struct r300_render* r300render = r300_render(render);
+    size_t consumed = r300render->vbo_max_used;
+
+    r300render->vbo_max_used = 0;
+    r300render->vbo_offset += consumed;
+}
+
+/* vbuf_render hook: remember the primitive type together with its
+ * translation by r300_translate_primitive. Always returns TRUE. */
+static boolean r300_render_set_primitive(struct vbuf_render* render,
+                                               unsigned prim)
+{
+    struct r300_render* r300render = r300_render(render);
+    uint32_t hw_prim = r300_translate_primitive(prim);
+
+    r300render->hwprim = hw_prim;
+    r300render->prim = prim;
+
+    return TRUE;
+}
+
+/* vbuf_render hook: emit a non-indexed draw of `count` vertices using the
+ * primitive type stored by r300_render_set_primitive (3D_DRAW_VBUF_2
+ * packet, vertex-list walk).
+ *
+ * NOTE(review): `start` is not referenced anywhere in the body -
+ * presumably callers always pass 0; confirm. */
+static void r300_render_draw_arrays(struct vbuf_render* render,
+                                    unsigned start,
+                                    unsigned count)
+{
+    struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
+    uint8_t* ptr;
+    unsigned i;
+    /* Size of the command sequence emitted below. */
+    unsigned dwords = 6;
+
+    CS_LOCALS(r300);
+
+    /* Only used by the commented-out dump code below. */
+    (void) i; (void) ptr;
+
+    r300_prepare_for_rendering(r300, PREP_FIRST_DRAW | PREP_EMIT_AOS_SWTCL,
+                               NULL, dwords, 0, 0, NULL);
+
+    DBG(r300, DBG_DRAW, "r300: Doing vbuf render, count %d\n", count);
+
+    /* Uncomment to dump all VBOs rendered through this interface.
+     * Slow and noisy!
+    ptr = pipe_buffer_map(&r300render->r300->context,
+                          r300render->vbo, PIPE_TRANSFER_READ,
+                          &r300render->vbo_transfer);
+
+    for (i = 0; i < count; i++) {
+        printf("r300: Vertex %d\n", i);
+        draw_dump_emitted_vertex(&r300->vertex_info, ptr);
+        ptr += r300->vertex_info.size * 4;
+        printf("\n");
+    }
+
+    pipe_buffer_unmap(&r300render->r300->context, r300render->vbo,
+        r300render->vbo_transfer);
+    */
+
+    BEGIN_CS(dwords);
+    OUT_CS_REG(R300_GA_COLOR_CONTROL,
+            r300_provoking_vertex_fixes(r300, r300render->prim));
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count - 1);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (count << 16) |
+           r300render->hwprim);
+    END_CS;
+}
+
+/* vbuf callback: draw `count` 16-bit indices, packed two per CS dword and
+ * split over several draw packets when they do not all fit in the CS. */
+static void r300_render_draw_elements(struct vbuf_render* render,
+                                      const ushort* indices,
+                                      uint count)
+{
+    struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
+    unsigned i; /* unsigned, like the short_count it is compared with */
+    unsigned end_cs_dwords;
+    unsigned max_index = (r300render->vbo_size - r300render->vbo_offset) /
+                         (r300render->r300->vertex_info.size * 4) - 1;
+    unsigned short_count, free_dwords;
+    CS_LOCALS(r300);
+
+    /* Reserve at least 256 dwords.
+     *
+     * Below we manage the CS space manually because there may be more
+     * indices than can fit in the CS. */
+    r300_prepare_for_rendering(r300,
+        PREP_FIRST_DRAW | PREP_EMIT_AOS_SWTCL | PREP_INDEXED,
+        NULL, 256, 0, 0, &end_cs_dwords);
+
+    while (count) {
+        free_dwords = r300->rws->get_cs_free_dwords(r300->rws);
+        /* 6 dwords of packet overhead; every other dword holds 2 indices. */
+        short_count = MIN2(count, (free_dwords - end_cs_dwords - 6) * 2);
+
+        BEGIN_CS(6 + (short_count+1)/2);
+        OUT_CS_REG(R300_GA_COLOR_CONTROL,
+                r300_provoking_vertex_fixes(r300, r300render->prim));
+        OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, max_index);
+        OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (short_count+1)/2);
+        OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (short_count << 16) |
+               r300render->hwprim);
+        /* "i + 1 < n" rather than "i < n - 1": cannot wrap when n is 0. */
+        for (i = 0; i + 1 < short_count; i += 2) {
+            OUT_CS(indices[i+1] << 16 | indices[i]);
+        }
+        if (short_count % 2) {
+            OUT_CS(indices[short_count-1]); /* odd count: last index alone */
+        }
+        END_CS;
+
+        /* OK now subtract the emitted indices and see if we need to emit
+         * another draw packet. */
+        indices += short_count;
+        count -= short_count;
+        if (count) {
+            r300_prepare_for_rendering(r300,
+                PREP_EMIT_AOS_SWTCL | PREP_INDEXED,
+                NULL, 256, 0, 0, &end_cs_dwords);
+        }
+    }
+}
+
+static void r300_render_destroy(struct vbuf_render* render)
+{
+    FREE(render); /* NOTE(review): frees the struct only -- is ->vbo released elsewhere? */
+}
+
+/* Allocate an r300_render and hook up its vbuf_render callbacks.
+ * Returns the embedded base object, or NULL if the allocation fails. */
+static struct vbuf_render* r300_render_create(struct r300_context* r300)
+{
+    struct r300_render* r300render = CALLOC_STRUCT(r300_render);
+    if (!r300render)
+        return NULL; /* the caller, r300_draw_stage(), tests for NULL */
+    r300render->r300 = r300;
+
+    /* XXX find real numbers plz */
+    r300render->base.max_vertex_buffer_bytes = 128 * 1024;
+    r300render->base.max_indices = 16 * 1024;
+    r300render->base.get_vertex_info = r300_render_get_vertex_info;
+    r300render->base.allocate_vertices = r300_render_allocate_vertices;
+    r300render->base.map_vertices = r300_render_map_vertices;
+    r300render->base.unmap_vertices = r300_render_unmap_vertices;
+    r300render->base.set_primitive = r300_render_set_primitive;
+    r300render->base.draw_elements = r300_render_draw_elements;
+    r300render->base.draw_arrays = r300_render_draw_arrays;
+    r300render->base.release_vertices = r300_render_release_vertices;
+    r300render->base.destroy = r300_render_destroy;
+    r300render->vbo = NULL;
+    r300render->vbo_size = 0;
+    r300render->vbo_offset = 0;
+    return &r300render->base;
+}
+
+/* Build the vbuf draw stage used for SW TCL, on top of a freshly created
+ * r300 vbuf_render which is then registered with r300->draw.
+ * Returns NULL when either object cannot be created. */
+struct draw_stage* r300_draw_stage(struct r300_context* r300)
+{
+    struct vbuf_render* backend = r300_render_create(r300);
+    struct draw_stage* vbuf_stage = NULL;
+
+    if (backend) {
+        vbuf_stage = draw_vbuf_stage(r300->draw, backend);
+
+        if (vbuf_stage) {
+            /* Both exist: hand the backend to r300->draw. */
+            draw_set_render(r300->draw, backend);
+        } else {
+            /* The stage was not created, so dispose of the backend. */
+            backend->destroy(backend);
+        }
+    }
+
+    return vbuf_stage;
+}
+
+/****************************************************************************
+ *                         End of SW TCL functions                          *
+ ***************************************************************************/
+
+static void r300_resource_resolve(struct pipe_context* pipe,
+                                  struct pipe_resource* dest,
+                                  struct pipe_subresource subdest,
+                                  struct pipe_resource* src,
+                                  struct pipe_subresource subsrc)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_surface* destsurf = r300_surface(
+        dest->screen->get_tex_surface(dest->screen,
+            dest, subdest.face, subdest.level, 0, 0));
+    struct pipe_surface* srcsurf = src->screen->get_tex_surface(src->screen,
+            src, subsrc.face, subsrc.level, 0, 0);
+    float color[] = {0, 0, 0, 0};
+    CS_LOCALS(r300);
+    /* Point AARESOLVE at dest, clear src while resolve is on, then turn it off. */
+    DBG(r300, DBG_DRAW, "r300: Resolving resource...\n");
+    /* NOTE(review): no BEGIN_CS/END_CS pair here, unlike the draws above. */
+    OUT_CS_REG_SEQ(R300_RB3D_AARESOLVE_OFFSET, 1);
+    OUT_CS_RELOC(destsurf->buffer, destsurf->offset, 0, destsurf->domain, 0);
+
+    OUT_CS_REG_SEQ(R300_RB3D_AARESOLVE_PITCH, 1);
+    OUT_CS_RELOC(destsurf->buffer, destsurf->pitch, 0, destsurf->domain, 0);
+
+    OUT_CS_REG(R300_RB3D_AARESOLVE_CTL,
+        R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_RESOLVE |
+        R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE);
+    /* The clear targets the *source* surface, full width0 x height0. */
+    r300->context.clear_render_target(pipe,
+        srcsurf, color, 0, 0, src->width0, src->height0);
+
+    OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x0);
+    /* Release both surfaces obtained from get_tex_surface() above. */
+    pipe_surface_reference((struct pipe_surface**)&srcsurf, NULL);
+    pipe_surface_reference((struct pipe_surface**)&destsurf, NULL);
+}
+
+/* Install the draw-related entry points into the r300 pipe_context. */
+void r300_init_render_functions(struct r300_context *r300)
+{
+    struct pipe_context *pipe = &r300->context;
+
+    /* Hooks that are the same on every chip. */
+    pipe->draw_elements = r300_draw_elements;
+    pipe->resource_resolve = r300_resource_resolve;
+    /* Without HW TCL, array and range draws use the swtcl variants. */
+    if (!r300->screen->caps.has_tcl) {
+        pipe->draw_arrays = r300_swtcl_draw_arrays;
+        pipe->draw_range_elements = r300_swtcl_draw_range_elements;
+    } else {
+        pipe->draw_arrays = r300_draw_arrays;
+        pipe->draw_range_elements = r300_draw_range_elements;
+    }
+    /* Must come last: the fallback wraps the draw hooks installed above. */
+    if (!r300->screen->caps.is_r500)
+        r300_plug_in_stencil_ref_fallback(r300);
+}
diff --git a/src/gallium/drivers/r300/r300_render_stencilref.c b/src/gallium/drivers/r300/r300_render_stencilref.c
new file mode 100644
index 0000000000..d509ded3ec
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_render_stencilref.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * The two-sided stencil reference value fallback for r3xx-r4xx chips.
+ * These chips support two-sided stencil functions but they do not support
+ * a two-sided reference value.
+ *
+ * The functions below split every draw call which uses the two-sided
+ * reference value into two draw calls -- the first one renders front faces
+ * and the second renders back faces with the other reference value.
+ */
+
+#include "r300_context.h"
+#include "r300_reg.h"
+
+struct r300_stencilref_context {
+    void (*draw_arrays)(struct pipe_context *pipe,
+                        unsigned mode, unsigned start, unsigned count);
+    /* draw_arrays and draw_range_elements: the draw hooks that were replaced. */
+    void (*draw_range_elements)(
+        struct pipe_context *pipe, struct pipe_resource *indexBuffer,
+        unsigned indexSize, int indexBias, unsigned minIndex, unsigned maxIndex,
+        unsigned mode, unsigned start, unsigned count);
+    /* Saved by r300_stencilref_begin(), restored by r300_stencilref_end(). */
+    uint32_t rs_cull_mode;
+    uint32_t zb_stencilrefmask;
+    ubyte ref_value_front;
+};
+
+static boolean r300_stencilref_needed(struct r300_context *r300)
+{
+    struct r300_dsa_state *zs = (struct r300_dsa_state*)r300->dsa_state.state;
+    /* Explicit DSA request, or two-sided stencil with differing refs. */
+    return zs->two_sided_stencil_ref ? TRUE :
+           (zs->two_sided &&
+            r300->stencil_ref.ref_value[0] != r300->stencil_ref.ref_value[1]);
+}
+
+/* First half of the split draw: remember the state the fallback is about
+ * to modify, then cull back faces so only front faces get rendered. */
+static void r300_stencilref_begin(struct r300_context *r300)
+{
+    struct r300_stencilref_context *saved = r300->stencilref_fallback;
+    struct r300_dsa_state *zs = (struct r300_dsa_state*)r300->dsa_state.state;
+    struct r300_rs_state *raster = (struct r300_rs_state*)r300->rs_state.state;
+
+    /* Snapshot everything r300_stencilref_end() puts back. */
+    saved->ref_value_front = r300->stencil_ref.ref_value[0];
+    saved->zb_stencilrefmask = zs->stencil_ref_mask;
+    saved->rs_cull_mode = raster->cull_mode;
+
+    /* We *cull* pixels, therefore no need to mask out the bits. */
+    raster->cull_mode = raster->cull_mode | R300_CULL_BACK;
+    r300->rs_state.dirty = TRUE;
+}
+
+/* Set drawing for back faces: flip the culling and use the back-face ref. */
+static void r300_stencilref_switch_side(struct r300_context *r300)
+{
+    struct r300_stencilref_context *saved = r300->stencilref_fallback;
+    struct r300_dsa_state *zs = (struct r300_dsa_state*)r300->dsa_state.state;
+    struct r300_rs_state *raster = (struct r300_rs_state*)r300->rs_state.state;
+
+    /* Start from the saved cull mode, not the one begin() modified. */
+    raster->cull_mode = saved->rs_cull_mode | R300_CULL_FRONT;
+    r300->rs_state.dirty = TRUE;
+    zs->stencil_ref_mask = zs->stencil_ref_bf;
+    r300->stencil_ref.ref_value[0] = r300->stencil_ref.ref_value[1];
+    r300->dsa_state.dirty = TRUE;
+}
+
+/* Undo the fallback: put back every value r300_stencilref_begin() saved
+ * and mark both the rasterizer and the DSA state dirty. */
+static void r300_stencilref_end(struct r300_context *r300)
+{
+    struct r300_stencilref_context *saved = r300->stencilref_fallback;
+    struct r300_dsa_state *zs = (struct r300_dsa_state*)r300->dsa_state.state;
+    struct r300_rs_state *raster = (struct r300_rs_state*)r300->rs_state.state;
+
+    r300->stencil_ref.ref_value[0] = saved->ref_value_front;
+    zs->stencil_ref_mask = saved->zb_stencilrefmask;
+    r300->dsa_state.dirty = TRUE;
+
+    raster->cull_mode = saved->rs_cull_mode;
+    r300->rs_state.dirty = TRUE;
+}
+
+/* draw_arrays hook: when the fallback is needed, draw front then back faces. */
+static void r300_stencilref_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                                        unsigned start, unsigned count)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_stencilref_context *saved = r300->stencilref_fallback;
+    if (!r300_stencilref_needed(r300)) {
+        saved->draw_arrays(pipe, mode, start, count); /* plain pass-through */
+        return;
+    }
+    r300_stencilref_begin(r300);
+    saved->draw_arrays(pipe, mode, start, count);
+    r300_stencilref_switch_side(r300);
+    saved->draw_arrays(pipe, mode, start, count);
+    r300_stencilref_end(r300);
+}
+
+/* draw_range_elements hook: when needed, draw front then back faces. */
+static void r300_stencilref_draw_range_elements(
+    struct pipe_context *pipe, struct pipe_resource *indexBuffer,
+    unsigned indexSize, int indexBias, unsigned minIndex, unsigned maxIndex,
+    unsigned mode, unsigned start, unsigned count)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_stencilref_context *saved = r300->stencilref_fallback;
+    if (!r300_stencilref_needed(r300)) {
+        saved->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                                   minIndex, maxIndex, mode, start, count);
+        return;
+    }
+    r300_stencilref_begin(r300);
+    saved->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                               minIndex, maxIndex, mode, start, count);
+    r300_stencilref_switch_side(r300);
+    saved->draw_range_elements(pipe, indexBuffer, indexSize, indexBias,
+                               minIndex, maxIndex, mode, start, count);
+    r300_stencilref_end(r300);
+}
+
+/* Wrap the draw hooks with the fallback; a no-op if allocation fails. */
+void r300_plug_in_stencil_ref_fallback(struct r300_context *r300)
+{
+    r300->stencilref_fallback = CALLOC_STRUCT(r300_stencilref_context);
+    if (!r300->stencilref_fallback)
+        return; /* keep the original hooks: the wrappers would deref NULL */
+    /* Save original draw functions, then override them. */
+    r300->stencilref_fallback->draw_arrays = r300->context.draw_arrays;
+    r300->stencilref_fallback->draw_range_elements = r300->context.draw_range_elements;
+    r300->context.draw_arrays = r300_stencilref_draw_arrays;
+    r300->context.draw_range_elements = r300_stencilref_draw_range_elements;
+}
diff --git a/src/gallium/drivers/r300/r300_render_translate.c b/src/gallium/drivers/r300/r300_render_translate.c
new file mode 100644
index 0000000000..0ea11e5bfc
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_render_translate.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * The functions below translate vertex and index buffers to the layout
+ * compatible with the hardware, so that all vertex and index fetches are
+ * DWORD-aligned and all used vertex and index formats are supported.
+ * For indices, an optional index offset is added to each index.
+ */
+
+#include "r300_context.h"
+#include "translate/translate.h"
+
+/* Translate the vertex elements the hardware cannot fetch as-is into a
+ * temporary vertex buffer and bind a vertex-element state that reads it. */
+void r300_begin_vertex_translate(struct r300_context *r300)
+{
+    struct pipe_context *pipe = &r300->context;
+    struct translate_key key = {0};
+    struct translate_element *te;
+    unsigned tr_elem_index[PIPE_MAX_ATTRIBS] = {0};
+    boolean elem_translated[PIPE_MAX_ATTRIBS] = {0};
+    struct translate *tr;
+    struct r300_vertex_element_state *ve = r300->velems;
+    boolean vb_translated[PIPE_MAX_ATTRIBS] = {0};
+    void *vb_map[PIPE_MAX_ATTRIBS] = {0}, *out_map;
+    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0}, *out_transfer;
+    struct pipe_resource *out_buffer;
+    unsigned i, num_verts;
+
+    /* Initialize the translate key, i.e. the recipe how vertices should be
+     * translated. */
+    for (i = 0; i < ve->count; i++) {
+        struct pipe_vertex_buffer *vb =
+                &r300->vertex_buffer[ve->velem[i].vertex_buffer_index];
+        enum pipe_format output_format = ve->hw_format[i];
+        unsigned output_format_size = ve->hw_format_size[i];
+
+        /* Check for support. */
+        if (ve->velem[i].src_format == ve->hw_format[i] &&
+            (vb->buffer_offset + ve->velem[i].src_offset) % 4 == 0 &&
+            vb->stride % 4 == 0) {
+            continue;
+        }
+
+        /* Workaround for translate: output floats instead of halfs. */
+        switch (output_format) {
+            case PIPE_FORMAT_R16_FLOAT:
+                output_format = PIPE_FORMAT_R32_FLOAT;
+                output_format_size = 4;
+                break;
+            case PIPE_FORMAT_R16G16_FLOAT:
+                output_format = PIPE_FORMAT_R32G32_FLOAT;
+                output_format_size = 8;
+                break;
+            case PIPE_FORMAT_R16G16B16_FLOAT:
+                output_format = PIPE_FORMAT_R32G32B32_FLOAT;
+                output_format_size = 12;
+                break;
+            case PIPE_FORMAT_R16G16B16A16_FLOAT:
+                output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+                output_format_size = 16;
+                break;
+            default:;
+        }
+
+        /* Add this vertex element. */
+        te = &key.element[key.nr_elements];
+        /*te->type; te->instance_divisor;*/
+        te->input_buffer = ve->velem[i].vertex_buffer_index;
+        te->input_format = ve->velem[i].src_format;
+        te->input_offset = vb->buffer_offset + ve->velem[i].src_offset;
+        te->output_format = output_format;
+        te->output_offset = key.output_stride;
+
+        key.output_stride += output_format_size;
+        vb_translated[ve->velem[i].vertex_buffer_index] = TRUE;
+        elem_translated[i] = TRUE;
+        tr_elem_index[i] = key.nr_elements++;
+    }
+
+    /* Get a translate object. */
+    tr = translate_cache_find(r300->tran.translate_cache, &key);
+
+    /* Map buffers we want to translate. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        if (vb_translated[i]) {
+            struct pipe_vertex_buffer *vb = &r300->vertex_buffer[i];
+            vb_map[i] = pipe_buffer_map(pipe, vb->buffer,
+                                        PIPE_TRANSFER_READ, &vb_transfer[i]);
+            tr->set_buffer(tr, i, vb_map[i], vb->stride, vb->max_index);
+        }
+    }
+
+    /* Create and map the output buffer. */
+    num_verts = r300->vertex_buffer_max_index + 1;
+
+    out_buffer = pipe_buffer_create(&r300->screen->screen,
+                                    PIPE_BIND_VERTEX_BUFFER,
+                                    key.output_stride * num_verts);
+
+    out_map = pipe_buffer_map(pipe, out_buffer, PIPE_TRANSFER_WRITE,
+                              &out_transfer);
+
+    /* Translate. */
+    tr->run(tr, 0, num_verts, 0, out_map);
+
+    /* Unmap all buffers. */
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        if (vb_translated[i]) {
+            pipe_buffer_unmap(pipe, r300->vertex_buffer[i].buffer,
+                              vb_transfer[i]);
+        }
+    }
+
+    pipe_buffer_unmap(pipe, out_buffer, out_transfer);
+
+    /* Setup the new vertex buffer in the first free slot. */
+    for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+        struct pipe_vertex_buffer *vb = &r300->vertex_buffer[i];
+        if (!vb->buffer) {
+            pipe_resource_reference(&vb->buffer, out_buffer);
+            vb->buffer_offset = 0;
+            vb->max_index = num_verts - 1;
+            vb->stride = key.output_stride;
+            r300->tran.vb_slot = i;
+            break;
+        }
+    }
+
+    /* Save and replace vertex elements. */
+    {
+        struct pipe_vertex_element new_velems[PIPE_MAX_ATTRIBS];
+
+        r300->tran.saved_velems = r300->velems;
+        for (i = 0; i < ve->count; i++) {
+            /* Test the element, not its buffer: one skipped above may share
+             * a buffer with a translated one and has no entry in the key. */
+            if (elem_translated[i]) {
+                te = &key.element[tr_elem_index[i]];
+                new_velems[i].instance_divisor = ve->velem[i].instance_divisor;
+                new_velems[i].src_format = te->output_format;
+                new_velems[i].src_offset = te->output_offset;
+                new_velems[i].vertex_buffer_index = r300->tran.vb_slot;
+            } else {
+                memcpy(&new_velems[i], &ve->velem[i],
+                       sizeof(struct pipe_vertex_element));
+            }
+        }
+
+        r300->tran.new_velems =
+            pipe->create_vertex_elements_state(pipe, ve->count, new_velems);
+        pipe->bind_vertex_elements_state(pipe, r300->tran.new_velems);
+    }
+
+    pipe_resource_reference(&out_buffer, NULL);
+}
+
+/* Undo r300_begin_vertex_translate(): rebind the saved vertex elements and
+ * release the temporary state and buffer. */
+void r300_end_vertex_translate(struct r300_context *r300)
+{
+    struct pipe_context *ctx = &r300->context;
+
+    /* Put the saved element state back before deleting the temporary one. */
+    ctx->bind_vertex_elements_state(ctx, r300->tran.saved_velems);
+    ctx->delete_vertex_elements_state(ctx, r300->tran.new_velems);
+    /* Drop the slot's reference to the translated buffer. */
+    pipe_resource_reference(&r300->vertex_buffer[r300->tran.vb_slot].buffer, NULL);
+}
+
+/* Replace *elts with a new index buffer holding `count` 16-bit indices,
+ * each one the 8-bit source index (taken from position `start` onwards)
+ * plus index_bias. */
+static void r300_shorten_ubyte_elts(struct r300_context* r300,
+                                    struct pipe_resource** elts,
+                                    int index_bias,
+                                    unsigned start,
+                                    unsigned count)
+{
+    struct pipe_context* pipe = &r300->context;
+    struct pipe_transfer *in_xfer, *out_xfer;
+    struct pipe_resource* widened;
+    unsigned char *src;
+    unsigned short *dst;
+    unsigned n;
+
+    /* Two bytes per output index. */
+    widened = pipe_buffer_create(r300->context.screen,
+                                 PIPE_BIND_INDEX_BUFFER, 2 * count);
+
+    src = pipe_buffer_map(pipe, *elts, PIPE_TRANSFER_READ, &in_xfer);
+    dst = pipe_buffer_map(pipe, widened, PIPE_TRANSFER_WRITE, &out_xfer);
+    src += start;
+
+    for (n = 0; n < count; n++) {
+        dst[n] = (unsigned short)(src[n] + index_bias);
+    }
+
+    pipe_buffer_unmap(pipe, *elts, in_xfer);
+    pipe_buffer_unmap(pipe, widened, out_xfer);
+
+    /* Hand the new buffer to the caller; the old one is not unreferenced here. */
+    *elts = widened;
+}
+
+/* Replace *elts with a new 16-bit index buffer holding `count` indices
+ * copied from position `start` onwards, each with index_bias added. */
+static void r300_rebuild_ushort_elts(struct r300_context *r300,
+                                     struct pipe_resource **elts,
+                                     int index_bias,
+                                     unsigned start, unsigned count)
+{
+    struct pipe_context *pipe = &r300->context;
+    struct pipe_transfer *src_xfer = NULL;
+    struct pipe_transfer *dst_xfer = NULL;
+    struct pipe_resource *rebuilt;
+    unsigned short *src;
+    unsigned short *dst;
+    unsigned n;
+
+    /* Two bytes per index. */
+    rebuilt = pipe_buffer_create(pipe->screen, PIPE_BIND_INDEX_BUFFER,
+                                 2 * count);
+
+    src = pipe_buffer_map(pipe, *elts, PIPE_TRANSFER_READ, &src_xfer);
+    dst = pipe_buffer_map(pipe, rebuilt, PIPE_TRANSFER_WRITE, &dst_xfer);
+
+    /* Skip to the first index of the range, then copy with the bias. */
+    src += start;
+    for (n = 0; n < count; n++) {
+        dst[n] = (unsigned short)(src[n] + index_bias);
+    }
+
+    pipe_buffer_unmap(pipe, *elts, src_xfer);
+    pipe_buffer_unmap(pipe, rebuilt, dst_xfer);
+
+    /* The caller now sees the rebuilt buffer; the old one is untouched. */
+    *elts = rebuilt;
+}
+
+/* Replace *elts with a new 32-bit index buffer holding `count` indices
+ * copied from position `start` onwards, each with index_bias added. */
+static void r300_rebuild_uint_elts(struct r300_context *r300,
+                                   struct pipe_resource **elts,
+                                   int index_bias,
+                                   unsigned start, unsigned count)
+{
+    struct pipe_context *context = &r300->context;
+    struct pipe_transfer *in_transfer = NULL, *out_transfer = NULL;
+    struct pipe_resource *new_elts;
+    unsigned int *in_map, *out_map;
+    unsigned i;
+
+    /* Each output index is a full unsigned int, so size the buffer by
+     * that; sizing it at 2 bytes per index would be overrun below. */
+    new_elts = pipe_buffer_create(context->screen,
+                                  PIPE_BIND_INDEX_BUFFER,
+                                  sizeof(unsigned int) * count);
+
+    in_map = pipe_buffer_map(context, *elts,
+                             PIPE_TRANSFER_READ, &in_transfer);
+    out_map = pipe_buffer_map(context, new_elts,
+                              PIPE_TRANSFER_WRITE, &out_transfer);
+
+    in_map += start;
+    for (i = 0; i < count; i++) {
+        out_map[i] = (unsigned int)(in_map[i] + index_bias);
+    }
+
+    pipe_buffer_unmap(context, *elts, in_transfer);
+    pipe_buffer_unmap(context, new_elts, out_transfer);
+
+    *elts = new_elts;
+}
+
+/* Rework an index buffer where required: 8-bit indices are always widened
+ * to 16 bits, 16-bit indices are rebuilt when *start is odd or an index
+ * offset must be applied, and 32-bit indices only for an index offset.
+ * Whenever a new buffer is built, *start is reset to 0. */
+void r300_translate_index_buffer(struct r300_context *r300,
+                                 struct pipe_resource **index_buffer,
+                                 unsigned *index_size, unsigned index_offset,
+                                 unsigned *start, unsigned count)
+{
+    const unsigned size = *index_size;
+
+    if (size == 1) {
+        r300_shorten_ubyte_elts(r300, index_buffer, index_offset, *start, count);
+        *index_size = 2;
+        *start = 0;
+    } else if (size == 2) {
+        if (index_offset || *start % 2 != 0) {
+            r300_rebuild_ushort_elts(r300, index_buffer, index_offset, *start, count);
+            *start = 0;
+        }
+    } else if (size == 4) {
+        if (index_offset) {
+            r300_rebuild_uint_elts(r300, index_buffer, index_offset, *start, count);
+            *start = 0;
+        }
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_resource.c b/src/gallium/drivers/r300/r300_resource.c
new file mode 100644
index 0000000000..f6f33028dc
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_resource.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie
+ */
+
+#include "r300_context.h"
+#include "r300_texture.h"
+#include "r300_screen_buffer.h"
+
+/* Create a resource: PIPE_BUFFER targets become buffers, all others textures. */
+static struct pipe_resource *
+r300_resource_create(struct pipe_screen *screen,
+                    const struct pipe_resource *templ)
+{
+   if (templ->target != PIPE_BUFFER) {
+      return r300_texture_create(screen, templ);
+   }
+   return r300_buffer_create(screen, templ);
+}
+
+/* Import a winsys handle as a texture; PIPE_BUFFER targets yield NULL. */
+static struct pipe_resource *
+r300_resource_from_handle(struct pipe_screen * screen,
+                          const struct pipe_resource *templ,
+                          struct winsys_handle *whandle)
+{
+   return (templ->target == PIPE_BUFFER)
+      ? NULL
+      : r300_texture_from_handle(screen, templ, whandle);
+}
+
+void r300_init_resource_functions(struct r300_context *r300)
+{
+   r300->context.get_transfer = u_get_transfer_vtbl; /* every hook set here is a u_*_vtbl helper */
+   r300->context.transfer_map = u_transfer_map_vtbl;
+   r300->context.transfer_flush_region = u_transfer_flush_region_vtbl;
+   r300->context.transfer_unmap = u_transfer_unmap_vtbl;
+   r300->context.transfer_destroy = u_transfer_destroy_vtbl;
+   r300->context.transfer_inline_write = u_transfer_inline_write_vtbl;
+   r300->context.is_resource_referenced = u_is_resource_referenced_vtbl;
+}
+
+void r300_init_screen_resource_functions(struct r300_screen *r300screen)
+{
+   r300screen->screen.resource_create = r300_resource_create; /* picks buffer or texture by target */
+   r300screen->screen.resource_from_handle = r300_resource_from_handle;
+   r300screen->screen.resource_get_handle = u_resource_get_handle_vtbl;
+   r300screen->screen.resource_destroy = u_resource_destroy_vtbl;
+   r300screen->screen.user_buffer_create = r300_user_buffer_create;
+   /* Texture surface hooks. */
+   r300screen->screen.get_tex_surface = r300_get_tex_surface;
+   r300screen->screen.tex_surface_destroy = r300_tex_surface_destroy;
+}
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
new file mode 100644
index 0000000000..8f7c96b829
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "util/u_memory.h"
+
+#include "r300_context.h"
+#include "r300_texture.h"
+#include "r300_screen_buffer.h"
+#include "r300_state_inlines.h"
+#include "r300_winsys.h"
+
+/* Return the vendor string reported through pipe_screen::get_vendor: the
+ * identifier behind which the brave coders responsible for this amalgamation
+ * of code, sweat, and duct tape routinely obscure their names.
+ *
+ * ...I should have just put "Corbin Simpson", but I'm not that cool.
+ * (Or egotistical. Yet.) */
+static const char* r300_get_vendor(struct pipe_screen* pscreen)
+{
+    return "X.Org R300 Project";
+}
+
+static const char* chip_families[] = { /* indexed by caps.family in r300_get_name() */
+    "R300",
+    "R350",
+    "R360",
+    "RV350",
+    "RV370",
+    "RV380",
+    "R420",
+    "R423",
+    "R430",
+    "R480",
+    "R481",
+    "RV410",
+    "RS400",
+    "RC410",
+    "RS480",
+    "RS482",
+    "RS600",
+    "RS690",
+    "RS740",
+    "RV515",
+    "R520",
+    "RV530",
+    "R580",
+    "RV560",
+    "RV570"
+};
+
+static const char* r300_get_name(struct pipe_screen* pscreen)
+{
+    unsigned family = r300_screen(pscreen)->caps.family;
+    /* Return the chip name; never index past the table if the enum outgrows it. */
+    return family < sizeof(chip_families) / sizeof(chip_families[0]) ? chip_families[family] : "unknown";
+}
+
+static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
+{
+    struct r300_screen* r300screen = r300_screen(pscreen);
+    boolean is_r400 = r300screen->caps.is_r400;
+    boolean is_r500 = r300screen->caps.is_r500;
+
+    /* XXX extended shader capabilities of r400 unimplemented, so r400 is
+     * reported with the plain r300 limits everywhere below. */
+    is_r400 = FALSE;
+    switch (param) {
+        /* Supported features (boolean caps). */
+        case PIPE_CAP_NPOT_TEXTURES:
+        case PIPE_CAP_TWO_SIDED_STENCIL:
+        case PIPE_CAP_GLSL:
+            /* I'll be frank. This is a lie.
+             *
+             * We don't truly support GLSL on any of this driver's chipsets.
+             * To be fair, no chipset supports the full GLSL specification
+             * to the best of our knowledge, but some of the less esoteric
+             * features are still missing here.
+             *
+             * Rather than cripple ourselves intentionally, I'm going to set
+             * this flag, and as Gallium's interface continues to change, I
+             * hope that this single monolithic GLSL enable can slowly get
+             * split down into many different pieces and the state tracker
+             * will handle fallbacks transparently, like it should.
+             *
+             * ~ C.
+             */
+        case PIPE_CAP_ANISOTROPIC_FILTER:
+        case PIPE_CAP_POINT_SPRITE:
+        case PIPE_CAP_OCCLUSION_QUERY:
+        case PIPE_CAP_TEXTURE_SHADOW_MAP:
+        case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+        case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+        case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+        case PIPE_CAP_TEXTURE_SWIZZLE:
+            return 1;
+
+        /* Unsupported features (boolean caps). */
+        case PIPE_CAP_TIMER_QUERY:
+        case PIPE_CAP_DUAL_SOURCE_BLEND:
+        case PIPE_CAP_TGSI_CONT_SUPPORTED:
+        case PIPE_CAP_INDEP_BLEND_ENABLE:
+        case PIPE_CAP_INDEP_BLEND_FUNC:
+            return 0;
+
+        /* Texturing. */
+        case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+        case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+            return r300screen->caps.num_tex_units;
+        case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+            return 0;
+        case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+        case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+        case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+            /* Mipmap level count, not size: 13 == 4096, 12 == 2048 */
+            return is_r500 ? 13 : 12;
+
+        /* Render targets. */
+        case PIPE_CAP_MAX_RENDER_TARGETS:
+            return 4;
+
+        /* General shader limits and features. */
+        case PIPE_CAP_SM3:
+            return is_r500 ? 1 : 0;
+        case PIPE_CAP_MAX_CONST_BUFFERS:
+            return 1;
+        case PIPE_CAP_MAX_CONST_BUFFER_SIZE:
+            return 256;
+
+        case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+            return 1;
+
+        /* Fragment coordinate conventions. */
+        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+	    return 1;
+        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+            return 0;
+
+        /* Fragment shader limits. */
+        case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+            return is_r500 || is_r400 ? 512 : 96;
+        case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+            return is_r500 || is_r400 ? 512 : 64;
+        case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+            return is_r500 || is_r400 ? 512 : 32;
+        case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS:
+            return is_r500 ? 511 : 4;
+        case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH:
+            return is_r500 ? 64 : 0; /* Actually unlimited on r500. */
+        case PIPE_CAP_MAX_FS_INPUTS:
+            /* 2 colors + 8 texcoords are always supported
+             * (minus fog and wpos).
+             *
+             * R500 has the ability to turn 3rd and 4th color into
+             * additional texcoords but there is no two-sided color
+             * selection then. However the facing bit can be used instead. */
+            return 10;
+        case PIPE_CAP_MAX_FS_CONSTS:
+            return is_r500 ? 256 : 32;
+        case PIPE_CAP_MAX_FS_TEMPS:
+            return is_r500 ? 128 : is_r400 ? 64 : 32;
+        case PIPE_CAP_MAX_FS_ADDRS:
+            return 0;
+        case PIPE_CAP_MAX_FS_PREDS:
+            return is_r500 ? 1 : 0;
+
+        /* Vertex shader limits. */
+        case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+        case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+            return is_r500 ? 1024 : 256;
+        case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+        case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+            return 0;
+        case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+            return is_r500 ? 4 : 0; /* For loops; not sure about conditionals. */
+        case PIPE_CAP_MAX_VS_INPUTS:
+            return 16;
+        case PIPE_CAP_MAX_VS_CONSTS:
+            return 256;
+        case PIPE_CAP_MAX_VS_TEMPS:
+            return 32;
+        case PIPE_CAP_MAX_VS_ADDRS:
+            return 1; /* XXX guessed */
+        case PIPE_CAP_MAX_VS_PREDS:
+            return is_r500 ? 4 : 0; /* XXX guessed. */
+
+        default: /* Unknown cap: log it and report 0. */
+            fprintf(stderr, "r300: Implementation error: Bad param %d\n",
+                param);
+            return 0;
+    }
+}
+
+static float r300_get_paramf(struct pipe_screen* pscreen, enum pipe_cap param)
+{
+    const struct r300_capabilities* caps = &r300_screen(pscreen)->caps;
+
+    switch (param) {
+        case PIPE_CAP_MAX_LINE_WIDTH:
+        case PIPE_CAP_MAX_LINE_WIDTH_AA:
+        case PIPE_CAP_MAX_POINT_WIDTH:
+        case PIPE_CAP_MAX_POINT_WIDTH_AA:
+            /* The maximum dimensions of the colorbuffer are our practical
+             * rendering limits, and they depend on the chip generation:
+             * r500 is checked first, then r400, then everything older. */
+            if (caps->is_r500)
+                return 4096.0f;
+            if (caps->is_r400)
+                return 4021.0f;
+            return 2560.0f;
+
+        case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+        case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+            return 16.0f;
+
+        default:
+            fprintf(stderr, "r300: Implementation error: Bad paramf %d\n",
+                param);
+            return 0.0f;
+    }
+}
+
+static boolean r300_is_format_supported(struct pipe_screen* screen,
+                                        enum pipe_format format,
+                                        enum pipe_texture_target target,
+                                        unsigned sample_count,
+                                        unsigned usage,
+                                        unsigned geom_flags)
+{   /* Report whether `format` can serve every PIPE_BIND_x flag in `usage`. */
+    uint32_t retval = 0; /* bind flags found to be supported so far */
+    boolean is_r500 = r300_screen(screen)->caps.is_r500;
+    boolean is_r400 = r300_screen(screen)->caps.is_r400;
+    boolean is_rv350 = r300_screen(screen)->caps.is_rv350;
+    boolean is_z24 = format == PIPE_FORMAT_X8Z24_UNORM ||
+                     format == PIPE_FORMAT_S8_USCALED_Z24_UNORM;
+    boolean is_color2101010 = format == PIPE_FORMAT_R10G10B10A2_UNORM ||
+                              format == PIPE_FORMAT_R10G10B10X2_SNORM ||
+                              format == PIPE_FORMAT_B10G10R10A2_UNORM ||
+                              format == PIPE_FORMAT_R10SG10SB10SA2U_NORM;
+    boolean is_ati1n = format == PIPE_FORMAT_RGTC1_UNORM ||
+                       format == PIPE_FORMAT_RGTC1_SNORM;
+    boolean is_ati2n = format == PIPE_FORMAT_RGTC2_UNORM ||
+                       format == PIPE_FORMAT_RGTC2_SNORM;
+    boolean is_half_float = format == PIPE_FORMAT_R16_FLOAT ||
+                            format == PIPE_FORMAT_R16G16_FLOAT ||
+                            format == PIPE_FORMAT_R16G16B16_FLOAT ||
+                            format == PIPE_FORMAT_R16G16B16A16_FLOAT;
+
+    if (target >= PIPE_MAX_TEXTURE_TYPES) {
+        fprintf(stderr, "r300: Implementation error: Received bogus texture "
+            "target %d in %s\n", target, __FUNCTION__);
+        return FALSE;
+    }
+    /* Multisampling: only for usage == RENDER_TARGET with an rgba8 variant. */
+    switch (sample_count) {
+        case 0:
+        case 1:
+            break;
+        case 2:
+        case 3:
+        case 4:
+        case 6:
+            if (usage != PIPE_BIND_RENDER_TARGET ||
+                !util_format_is_rgba8_variant(
+                    util_format_description(format))) {
+                return FALSE;
+            }
+            break;
+        default:
+            return FALSE;
+    }
+
+    /* Check sampler format support. */
+    if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
+        /* Z24 cannot be sampled from on non-r5xx. */
+        (is_r500 || !is_z24) &&
+        /* ATI1N is r5xx-only. */
+        (is_r500 || !is_ati1n) &&
+        /* ATI2N is supported on r4xx-r5xx. */
+        (is_r400 || is_r500 || !is_ati2n) &&
+        r300_is_sampler_format_supported(format)) {
+        retval |= PIPE_BIND_SAMPLER_VIEW;
+    }
+
+    /* Check colorbuffer format support. */
+    if ((usage & (PIPE_BIND_RENDER_TARGET |
+                  PIPE_BIND_DISPLAY_TARGET |
+                  PIPE_BIND_SCANOUT |
+                  PIPE_BIND_SHARED)) &&
+        /* 2101010 cannot be rendered to on non-r5xx. */
+        (is_r500 || !is_color2101010) &&
+        r300_is_colorbuffer_format_supported(format)) {
+        retval |= usage &
+            (PIPE_BIND_RENDER_TARGET |
+             PIPE_BIND_DISPLAY_TARGET |
+             PIPE_BIND_SCANOUT |
+             PIPE_BIND_SHARED);
+    }
+
+    /* Check depth-stencil format support. */
+    if (usage & PIPE_BIND_DEPTH_STENCIL &&
+        r300_is_zs_format_supported(format)) {
+        retval |= PIPE_BIND_DEPTH_STENCIL;
+    }
+
+    /* Check vertex buffer format support. */
+    if (usage & PIPE_BIND_VERTEX_BUFFER &&
+        /* Half float is supported on >= RV350. */
+        (is_rv350 || !is_half_float) &&
+        r300_translate_vertex_data_type(format) != R300_INVALID_FORMAT) {
+        retval |= PIPE_BIND_VERTEX_BUFFER;
+    }
+
+    /* Transfers are always supported. */
+    if (usage & PIPE_BIND_TRANSFER_READ)
+        retval |= PIPE_BIND_TRANSFER_READ;
+    if (usage & PIPE_BIND_TRANSFER_WRITE)
+        retval |= PIPE_BIND_TRANSFER_WRITE;
+
+    return retval == usage; /* TRUE only if every requested flag was granted */
+}
+
+static void r300_destroy_screen(struct pipe_screen* pscreen)
+{
+    struct r300_screen* screen = r300_screen(pscreen);
+    struct r300_winsys_screen *winsys = screen->rws;
+
+    /* Tear down the winsys first (if there is one), then the screen itself. */
+    if (winsys != NULL)
+        winsys->destroy(winsys);
+    FREE(screen);
+}
+
+static void r300_fence_reference(struct pipe_screen *screen,
+                                 struct pipe_fence_handle **ptr,
+                                 struct pipe_fence_handle *fence)
+{
+    struct r300_fence **oldf = (struct r300_fence**)ptr;
+    struct r300_fence *newf = (struct r300_fence*)fence;
+    /* Point *ptr at fence; free the old fence if pipe_reference() says so. */
+    if (pipe_reference(&(*oldf)->reference, &newf->reference))
+        FREE(*oldf);
+    /* NOTE(review): a NULL *ptr or fence forms &NULL->reference above -- confirm pipe_reference() copes. */
+    *ptr = fence;
+}
+
+static int r300_fence_signalled(struct pipe_screen *screen,
+                                struct pipe_fence_handle *fence,
+                                unsigned flags)
+{
+    const struct r300_fence *f = (const struct r300_fence*)fence;
+    /* 0 == success, i.e. the fence has signalled; 1 means it has not. */
+    return !f->signalled;
+}
+
+static int r300_fence_finish(struct pipe_screen *screen,
+                             struct pipe_fence_handle *fence,
+                             unsigned flags)
+{
+    struct r300_fence *f = (struct r300_fence*)fence;
+    /* Run r300_finish() on the fence's context, then mark the fence signalled. */
+    r300_finish(f->ctx);
+    f->signalled = TRUE;
+    return 0; /* 0 == success */
+}
+
+/* Create the r300 pipe_screen on top of `rws`; returns NULL if allocation fails. */
+struct pipe_screen* r300_create_screen(struct r300_winsys_screen *rws)
+{
+    struct r300_screen *r300screen = CALLOC_STRUCT(r300_screen);
+
+    if (!r300screen) {
+        /* Nothing was allocated, so there is nothing to free. */
+        return NULL;
+    }
+    /* Read the PCI id and pipe counts from the winsys. */
+    r300screen->caps.pci_id = rws->get_value(rws, R300_VID_PCI_ID);
+    r300screen->caps.num_frag_pipes = rws->get_value(rws, R300_VID_GB_PIPES);
+    r300screen->caps.num_z_pipes = rws->get_value(rws, R300_VID_Z_PIPES);
+
+    r300_init_debug(r300screen);
+    r300_parse_chipset(&r300screen->caps);
+    /* Hook up the pipe_screen entry points. */
+    r300screen->rws = rws;
+    r300screen->screen.winsys = (struct pipe_winsys*)rws;
+    r300screen->screen.destroy = r300_destroy_screen;
+    r300screen->screen.get_name = r300_get_name;
+    r300screen->screen.get_vendor = r300_get_vendor;
+    r300screen->screen.get_param = r300_get_param;
+    r300screen->screen.get_paramf = r300_get_paramf;
+    r300screen->screen.is_format_supported = r300_is_format_supported;
+    r300screen->screen.context_create = r300_create_context;
+
+    r300screen->screen.fence_reference = r300_fence_reference;
+    r300screen->screen.fence_signalled = r300_fence_signalled;
+    r300screen->screen.fence_finish = r300_fence_finish;
+
+    r300_init_screen_resource_functions(r300screen);
+    util_format_s3tc_init();
+
+    return &r300screen->screen;
+}
+
+struct r300_winsys_screen *
+r300_winsys_screen(struct pipe_screen *screen)
+{
+    return ((struct r300_screen *)screen)->rws; /* same cast as r300_screen() */
+}
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
new file mode 100644
index 0000000000..29cd5dbe26
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_SCREEN_H
+#define R300_SCREEN_H
+
+#include "pipe/p_screen.h"
+
+#include "r300_chipset.h"
+
+#include <stdio.h>
+
+struct r300_screen {
+    /* Parent class; kept first so a pipe_screen* can be cast to r300_screen*. */
+    struct pipe_screen screen;
+    /* Winsys backend, stored by r300_create_screen(). */
+    struct r300_winsys_screen *rws;
+
+    /* Chipset capabilities */
+    struct r300_capabilities caps;
+
+    /** Combination of DBG_xxx flags */
+    unsigned debug;
+};
+
+
+/* Convenience cast wrapper; relies on pipe_screen being the first member. */
+static INLINE struct r300_screen* r300_screen(struct pipe_screen* screen) {
+    return (struct r300_screen*)screen;
+}
+
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If you find, while debugging the driver, that a different grouping
+ * of these flags would be beneficial, just feel free to change them
+ * but make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP        (1 << 0)
+/* Logging. */
+#define DBG_FP          (1 << 1)
+#define DBG_VP          (1 << 2)
+/* The bit (1 << 3) is unused. */
+#define DBG_DRAW        (1 << 4)
+#define DBG_TEX         (1 << 5)
+#define DBG_TEXALLOC    (1 << 6)
+#define DBG_RS          (1 << 7)
+#define DBG_FALL        (1 << 8)
+#define DBG_FB          (1 << 9)
+/* Features. */
+#define DBG_ANISOHQ     (1 << 16)
+#define DBG_NO_TILING   (1 << 17)
+#define DBG_NO_IMMD     (1 << 18)
+#define DBG_FAKE_OCC    (1 << 19)
+/* Statistics. */
+#define DBG_STATS       (1 << 24)
+/*@}*/
+
+static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
+{
+    return (screen->debug & flags) != 0; /* any one of the requested flags is set */
+}
+
+static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
+                              const char * fmt, ...)
+{
+    va_list args;
+    if (!SCREEN_DBG_ON(screen, flags))
+        return; /* none of the requested DBG_xxx flags is enabled */
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args); /* printf-style message to stderr */
+    va_end(args);
+}
+
+void r300_init_debug(struct r300_screen* ctx);
+
+void r300_init_screen_resource_functions(struct r300_screen *r300screen);
+
+#endif /* R300_SCREEN_H */
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
new file mode 100644
index 0000000000..7959e6a2f9
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie
+ */
+
+#include <stdio.h>
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_math.h"
+
+#include "r300_screen_buffer.h"
+#include "r300_winsys.h"
+
+unsigned r300_buffer_is_referenced(struct pipe_context *context,
+				   struct pipe_resource *buf,
+                                   enum r300_reference_domain domain)
+{
+    struct r300_context *r300 = r300_context(context);
+    struct r300_buffer *rbuf = r300_buffer(buf);
+
+    /* User buffers are never reported as referenced; for anything else the
+     * winsys decides, and a hit is treated as both read and write. */
+    if (!r300_buffer_is_user_buffer(buf) &&
+        r300->rws->is_buffer_referenced(r300->rws, rbuf->buf, domain))
+        return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+
+    return PIPE_UNREFERENCED;
+}
+
+static unsigned r300_buffer_is_referenced_by_cs(struct pipe_context *context,
+                                                struct pipe_resource *buf,
+                                                unsigned face, unsigned level)
+{   /* vtbl adapter: face and level are unused; always queries R300_REF_CS. */
+    return r300_buffer_is_referenced(context, buf, R300_REF_CS);
+}
+
+/* External helper, not required to implement u_resource_vtbl: uploads a user
+ * index buffer through r300->upload_ib and points *index_buffer at the copy. */
+int r300_upload_index_buffer(struct r300_context *r300,
+			     struct pipe_resource **index_buffer,
+			     unsigned index_size,
+			     unsigned start,
+			     unsigned count)
+{
+   struct pipe_resource *upload_buffer = NULL;
+   unsigned index_offset = start * index_size;
+   int ret = 0;
+
+    if (r300_buffer_is_user_buffer(*index_buffer)) {
+	ret = u_upload_buffer(r300->upload_ib,
+			      index_offset,
+			      count * index_size,
+			      *index_buffer,
+			      &index_offset,
+			      &upload_buffer);
+	if (ret) {
+	    goto done;
+	}
+	*index_buffer = upload_buffer;
+    }
+ done:
+    /* NOTE(review): the offset written to index_offset is never handed back to
+     * the caller, and upload_buffer is not unreferenced here -- confirm. */
+    return ret;
+}
+
+/* External helper, not required to implement u_resource_vtbl: replaces every
+ * user vertex buffer used by the bound vertex elements with an uploaded copy. */
+int r300_upload_user_buffers(struct r300_context *r300)
+{
+    enum pipe_error ret = PIPE_OK;
+    int i, nr;
+
+    nr = r300->velems->count;
+
+    for (i = 0; i < nr; i++) {
+        struct pipe_vertex_buffer *vb =
+            &r300->vertex_buffer[r300->velems->velem[i].vertex_buffer_index];
+        /* NOTE(review): the old vb->buffer_offset is overwritten below, not added -- confirm. */
+        if (r300_buffer_is_user_buffer(vb->buffer)) {
+            struct pipe_resource *upload_buffer = NULL;
+            unsigned offset = 0; /* whole buffer; was: vb->buffer_offset * 4 */
+            unsigned size = vb->buffer->width0;
+            unsigned upload_offset;
+            ret = u_upload_buffer(r300->upload_vb,
+                                  offset, size,
+                                  vb->buffer,
+                                  &upload_offset, &upload_buffer);
+            if (ret)
+                return ret;
+            /* Drop the user buffer and bind the uploaded copy in its place. */
+            pipe_resource_reference(&vb->buffer, NULL);
+            vb->buffer = upload_buffer;
+            vb->buffer_offset = upload_offset;
+        }
+    }
+    return ret;
+}
+
+static void r300_winsys_buffer_destroy(struct r300_screen *r300screen,
+				       struct r300_buffer *rbuf)
+{
+    struct r300_winsys_screen *winsys = r300screen->rws;
+
+    if (rbuf->buf == NULL)
+        return; /* no winsys buffer attached */
+    winsys->buffer_reference(winsys, &rbuf->buf, NULL); /* drop our reference */
+    rbuf->buf = NULL;
+}
+
+static void r300_buffer_destroy(struct pipe_screen *screen,
+				struct pipe_resource *buf)
+{
+    struct r300_buffer *rbuf = r300_buffer(buf);
+
+    /* Release the winsys buffer (if any) before freeing the wrapper. */
+    r300_winsys_buffer_destroy(r300_screen(screen), rbuf);
+    FREE(rbuf);
+}
+
+static void *
+r300_buffer_transfer_map( struct pipe_context *pipe,
+			  struct pipe_transfer *transfer )
+{
+    struct r300_screen *r300screen = r300_screen(pipe->screen);
+    struct r300_winsys_screen *rws = r300screen->rws;
+    struct r300_buffer *rbuf = r300_buffer(transfer->resource);
+    uint8_t *map;
+    boolean flush = FALSE;
+    unsigned i;
+
+    /* User buffers are plain client pointers: no winsys mapping involved. */
+    if (rbuf->user_buffer)
+        return (uint8_t *) rbuf->user_buffer + transfer->box.x;
+
+    /* Constant buffers are mapped as they are. For other buffers, a DISCARD
+     * map that starts inside a range we already flushed gets fresh storage,
+     * so check whether box.x falls into any recorded range. */
+    if (!(rbuf->b.b.bind & PIPE_BIND_CONSTANT_BUFFER) &&
+        (transfer->usage & PIPE_TRANSFER_DISCARD)) {
+	for (i = 0; i < rbuf->num_ranges && !flush; i++) {
+	    flush = (transfer->box.x >= rbuf->ranges[i].start) &&
+		    (transfer->box.x < rbuf->ranges[i].end);
+	}
+    }
+
+    if (flush) {
+	/* unreference this hw buffer and allocate a new one */
+	rws->buffer_reference(rws, &rbuf->buf, NULL);
+	rbuf->num_ranges = 0;
+	rbuf->buf = rws->buffer_create(rws, 16,
+				       rbuf->b.b.bind,
+				       rbuf->domain,
+				       rbuf->b.b.width0);
+	/* The allocation can fail (r300_buffer_create() checks it as well);
+	 * never hand a NULL buffer to buffer_map(). */
+	if (!rbuf->buf)
+	    return NULL;
+    }
+
+    map = rws->buffer_map(rws, rbuf->buf, transfer->usage);
+    if (map == NULL)
+        return NULL;
+
+    /* map_buffer() returned a pointer to the beginning of the buffer,
+     * but transfers are expected to return a pointer to just the
+     * region specified in the box.
+     */
+    return map + transfer->box.x;
+}
+
+static void r300_buffer_transfer_flush_region( struct pipe_context *pipe,
+					       struct pipe_transfer *transfer,
+					       const struct pipe_box *box)
+{
+    struct r300_buffer *rbuf = r300_buffer(transfer->resource);
+    unsigned start = transfer->box.x + box->x;
+    unsigned end = start + box->width;
+    unsigned i;
+
+    assert(box->x + box->width <= transfer->box.width);
+    if (rbuf->user_buffer || (rbuf->b.b.bind & PIPE_BIND_CONSTANT_BUFFER))
+	return; /* ranges are not tracked for user or constant buffers */
+    /* Record [start, end) as flushed: widen the first range it touches... */
+    for (i = 0; i < rbuf->num_ranges; ++i) {
+	if (start <= rbuf->ranges[i].end && rbuf->ranges[i].start <= end) {
+	    rbuf->ranges[i].start = MIN2(rbuf->ranges[i].start, start);
+	    rbuf->ranges[i].end   = MAX2(rbuf->ranges[i].end, end);
+	    return;
+	}
+    }
+    /* ...or append it. When ranges[] is full, fold it into the last slot
+     * instead of writing past the array; over-marking only costs a realloc. */
+    if (rbuf->num_ranges >= R300_BUFFER_MAX_RANGES) {
+	rbuf->num_ranges = R300_BUFFER_MAX_RANGES - 1;
+	start = MIN2(start, rbuf->ranges[rbuf->num_ranges].start);
+	end = MAX2(end, rbuf->ranges[rbuf->num_ranges].end);
+    }
+    rbuf->ranges[rbuf->num_ranges].start = start;
+    rbuf->ranges[rbuf->num_ranges].end = end;
+    rbuf->num_ranges++;
+}
+
+static void r300_buffer_transfer_unmap( struct pipe_context *pipe,
+			    struct pipe_transfer *transfer )
+{
+    struct r300_winsys_screen *winsys = r300_screen(pipe->screen)->rws;
+    struct r300_buffer *rbuf = r300_buffer(transfer->resource);
+
+    /* Without a winsys buffer there is nothing to unmap. */
+    if (!rbuf->buf)
+        return;
+    winsys->buffer_unmap(winsys, rbuf->buf);
+}
+
+struct u_resource_vtbl r300_buffer_vtbl = 
+{
+   u_default_resource_get_handle,      /* get_handle */
+   r300_buffer_destroy,		     /* resource_destroy */
+   r300_buffer_is_referenced_by_cs,  /* is_buffer_referenced */
+   u_default_get_transfer,	     /* get_transfer */
+   u_default_transfer_destroy,	     /* transfer_destroy */
+   r300_buffer_transfer_map,	     /* transfer_map */
+   r300_buffer_transfer_flush_region,  /* transfer_flush_region */
+   r300_buffer_transfer_unmap,	     /* transfer_unmap */
+   u_default_transfer_inline_write   /* transfer_inline_write */
+};
+
+/* Create a winsys-backed buffer resource from `templ`; NULL on failure. */
+struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
+					 const struct pipe_resource *templ)
+{
+    struct r300_winsys_screen *rws = r300_screen(screen)->rws;
+    struct r300_buffer *rbuf = CALLOC_STRUCT(r300_buffer);
+    const unsigned alignment = 16;
+
+    if (rbuf == NULL)
+        return NULL;
+
+    /* Start from the caller's template, then fill in the fields we own. */
+    rbuf->b.b = *templ;
+    rbuf->b.b.screen = screen;
+    rbuf->b.vtbl = &r300_buffer_vtbl;
+    pipe_reference_init(&rbuf->b.b.reference, 1);
+
+    rbuf->magic = R300_BUFFER_MAGIC;
+    rbuf->domain = R300_DOMAIN_GTT;
+
+    /* Ask the winsys for storage matching the template's bind and width0. */
+    rbuf->buf = rws->buffer_create(rws,
+                                   alignment,
+                                   rbuf->b.b.bind,
+                                   rbuf->domain,
+                                   rbuf->b.b.width0);
+    if (rbuf->buf == NULL) {
+        /* No storage: release the wrapper and report failure. */
+        FREE(rbuf);
+        return NULL;
+    }
+
+    return &rbuf->b.b;
+}
+
+/* Wrap `bytes` of client memory at `ptr` in a buffer resource; NULL on failure. */
+struct pipe_resource *r300_user_buffer_create(struct pipe_screen *screen,
+					      void *ptr,
+					      unsigned bytes,
+					      unsigned bind)
+{
+    struct r300_buffer *rbuf = CALLOC_STRUCT(r300_buffer);
+
+    if (rbuf == NULL)
+        return NULL;
+
+    /* No winsys buffer is created here; the resource only records `ptr`. */
+    rbuf->magic = R300_BUFFER_MAGIC;
+    rbuf->user_buffer = ptr;
+    rbuf->domain = R300_DOMAIN_GTT;
+    rbuf->b.vtbl = &r300_buffer_vtbl;
+
+    /* Describe the memory as width0 == bytes, one row, one layer, R8_UNORM. */
+    pipe_reference_init(&rbuf->b.b.reference, 1);
+    rbuf->b.b.screen = screen;
+    rbuf->b.b.format = PIPE_FORMAT_R8_UNORM;
+    rbuf->b.b.usage = PIPE_USAGE_IMMUTABLE;
+    rbuf->b.b.bind = bind;
+    rbuf->b.b.width0 = bytes;
+    rbuf->b.b.height0 = 1;
+    rbuf->b.b.depth0 = 1;
+
+    return &rbuf->b.b;
+}
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h
new file mode 100644
index 0000000000..ff35585870
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_screen_buffer.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie
+ */
+
+#ifndef R300_SCREEN_BUFFER_H
+#define R300_SCREEN_BUFFER_H
+
+#include <stdio.h>
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "util/u_transfer.h"
+
+#include "r300_screen.h"
+#include "r300_winsys.h"
+#include "r300_context.h"
+
+#define R300_BUFFER_MAGIC 0xabcd1234
+#define R300_BUFFER_MAX_RANGES 32
+
+struct r300_buffer_range {
+    uint32_t start; /* first byte offset of the range */
+    uint32_t end;   /* one past the last byte (offset + length) */
+};
+
+/* Buffer resource: either winsys-backed (buf) or a wrapped user pointer. */
+struct r300_buffer
+{
+    struct u_resource b;
+    /* Set to R300_BUFFER_MAGIC on creation; asserted by the r300_buffer() cast. */
+    uint32_t magic;
+    /* Winsys storage; not set by r300_user_buffer_create(). */
+    struct r300_winsys_buffer *buf;
+
+    enum r300_buffer_domain domain;
+    /* Client memory; only set by r300_user_buffer_create(). */
+    void *user_buffer;
+    struct r300_buffer_range ranges[R300_BUFFER_MAX_RANGES]; /* flushed ranges */
+    unsigned num_ranges; /* number of valid entries in ranges[] */
+};
+
+/* Functions. */
+
+int r300_upload_user_buffers(struct r300_context *r300);
+
+int r300_upload_index_buffer(struct r300_context *r300,
+			     struct pipe_resource **index_buffer,
+			     unsigned index_size,
+			     unsigned start,
+			     unsigned count);
+
+struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
+					 const struct pipe_resource *templ);
+
+struct pipe_resource *r300_user_buffer_create(struct pipe_screen *screen,
+					      void *ptr,
+					      unsigned bytes,
+					      unsigned bind); /* stored in the resource's bind field */
+
+unsigned r300_buffer_is_referenced(struct pipe_context *context,
+				   struct pipe_resource *buf,
+                                   enum r300_reference_domain domain);
+
+/* Inline functions. */
+
+static INLINE struct r300_buffer *r300_buffer(struct pipe_resource *buffer)
+{
+    /* NULL passes straight through; anything else must carry our magic. */
+    if (!buffer)
+	return NULL;
+    assert(((struct r300_buffer *)buffer)->magic == R300_BUFFER_MAGIC);
+    return (struct r300_buffer *)buffer;
+}
+
+static INLINE boolean r300_buffer_is_user_buffer(struct pipe_resource *buffer)
+{   /* r300_buffer() yields NULL for a NULL resource; report that as FALSE. */
+    return (buffer && r300_buffer(buffer)->user_buffer) ? TRUE : FALSE;
+}
+
+static INLINE boolean r300_add_buffer(struct r300_winsys_screen *rws,
+				      struct pipe_resource *buffer,
+				      int rd, int wr)
+{
+    struct r300_buffer *rbuf = r300_buffer(buffer);
+
+    /* A buffer without winsys storage has nothing to register with the
+     * winsys, which counts as success. */
+    if (rbuf->buf)
+	return rws->add_buffer(rws, rbuf->buf, rd, wr);
+    return true;
+}
+
+static INLINE boolean r300_add_texture(struct r300_winsys_screen *rws,
+				       struct r300_texture *tex,
+				       int rd, int wr)
+{   /* Unlike r300_add_buffer(), tex->buffer is forwarded without a NULL check. */
+    return rws->add_buffer(rws, tex->buffer, rd, wr);
+}
+
+#endif
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
new file mode 100644
index 0000000000..cb7a37033f
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_SHADER_SEMANTICS_H
+#define R300_SHADER_SEMANTICS_H
+
+#define ATTR_UNUSED             (-1)
+#define ATTR_COLOR_COUNT        2
+#define ATTR_GENERIC_COUNT      32
+
+/* This structure contains information about what attributes are written by VS
+ * or read by FS. (but not both) It's much easier to work with than
+ * tgsi_shader_info.
+ *
+ * The variables contain indices to tgsi_shader_info semantics and those
+ * indices are nothing else than input/output register numbers. */
+struct r300_shader_semantics {
+    /* Every member is a register index, or ATTR_UNUSED when the attribute
+     * is absent (that is the value r300_shader_semantics_reset stores). */
+    int pos;
+    int psize;
+    /* ATTR_COLOR_COUNT slots each.
+     * NOTE(review): bcolor is presumably the back-face color -- confirm. */
+    int color[ATTR_COLOR_COUNT];
+    int bcolor[ATTR_COLOR_COUNT];
+    /* ATTR_GENERIC_COUNT slots. */
+    int generic[ATTR_GENERIC_COUNT];
+    int fog;
+    int wpos;
+};
+
+/* Reset *info so that every attribute slot reads ATTR_UNUSED. */
+static INLINE void r300_shader_semantics_reset(
+    struct r300_shader_semantics* info)
+{
+    int slot;
+
+    /* Array members first: generic slots, then both color arrays. */
+    for (slot = 0; slot < ATTR_GENERIC_COUNT; slot++)
+        info->generic[slot] = ATTR_UNUSED;
+
+    for (slot = 0; slot < ATTR_COLOR_COUNT; slot++) {
+        info->bcolor[slot] = ATTR_UNUSED;
+        info->color[slot] = ATTR_UNUSED;
+    }
+
+    /* Scalar members. */
+    info->wpos = ATTR_UNUSED;
+    info->fog = ATTR_UNUSED;
+    info->psize = ATTR_UNUSED;
+    info->pos = ATTR_UNUSED;
+}
+
+#endif
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
new file mode 100644
index 0000000000..bc2b62ba54
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -0,0 +1,1750 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "draw/draw_context.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "pipe/p_config.h"
+
+#include "r300_cb.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_reg.h"
+#include "r300_screen.h"
+#include "r300_screen_buffer.h"
+#include "r300_state_inlines.h"
+#include "r300_fs.h"
+#include "r300_texture.h"
+#include "r300_vs.h"
+#include "r300_winsys.h"
+
+/* r300_state: Functions used to initialize state context by translating
+ * Gallium state objects into semi-native r300 state objects. */
+
+#define UPDATE_STATE(cso, atom) \
+    if (cso != atom.state) { \
+        atom.state = cso;    \
+        atom.dirty = TRUE;   \
+    }
+
+/* Returns nonzero when, for blend equation ADD or REVERSE_SUBTRACT, a
+ * source pixel with SRC_ALPHA == 0 leaves the colorbuffer unchanged under
+ * the given factors.
+ * The accepted dst factors are the accepted src factors inverted. */
+static boolean blend_discard_if_src_alpha_0(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    const boolean src_rgb_ok =
+        srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+        srcRGB == PIPE_BLENDFACTOR_ZERO;
+    const boolean src_a_ok =
+        srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+        srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+        srcA == PIPE_BLENDFACTOR_ZERO;
+    const boolean dst_rgb_ok =
+        dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        dstRGB == PIPE_BLENDFACTOR_ONE;
+    const boolean dst_a_ok =
+        dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        dstA == PIPE_BLENDFACTOR_ONE;
+
+    return src_rgb_ok && src_a_ok && dst_rgb_ok && dst_a_ok;
+}
+
+/* Returns nonzero when, for blend equation ADD or REVERSE_SUBTRACT, a
+ * source pixel with SRC_ALPHA == 1 leaves the colorbuffer unchanged under
+ * the given factors.
+ * The accepted dst factors are the accepted src factors inverted. */
+static boolean blend_discard_if_src_alpha_1(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    const boolean src_rgb_ok =
+        srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        srcRGB == PIPE_BLENDFACTOR_ZERO;
+    const boolean src_a_ok =
+        srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        srcA == PIPE_BLENDFACTOR_ZERO;
+    const boolean dst_rgb_ok =
+        dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        dstRGB == PIPE_BLENDFACTOR_ONE;
+    const boolean dst_a_ok =
+        dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+        dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        dstA == PIPE_BLENDFACTOR_ONE;
+
+    return src_rgb_ok && src_a_ok && dst_rgb_ok && dst_a_ok;
+}
+
+/* Returns nonzero when, for blend equation ADD or REVERSE_SUBTRACT, a
+ * source pixel with SRC_COLOR == (0,0,0) leaves the colorbuffer unchanged
+ * under the given factors.
+ * The accepted dst factors are the accepted src factors inverted. */
+static boolean blend_discard_if_src_color_0(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    const boolean src_rgb_ok =
+        srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+        srcRGB == PIPE_BLENDFACTOR_ZERO;
+    const boolean src_a_ok = srcA == PIPE_BLENDFACTOR_ZERO;
+    const boolean dst_rgb_ok =
+        dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        dstRGB == PIPE_BLENDFACTOR_ONE;
+    const boolean dst_a_ok = dstA == PIPE_BLENDFACTOR_ONE;
+
+    return src_rgb_ok && src_a_ok && dst_rgb_ok && dst_a_ok;
+}
+
+/* Returns nonzero when, for blend equation ADD or REVERSE_SUBTRACT, a
+ * source pixel with SRC_COLOR == (1,1,1) leaves the colorbuffer unchanged
+ * under the given factors.
+ * The accepted dst factors are the accepted src factors inverted. */
+static boolean blend_discard_if_src_color_1(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    const boolean src_rgb_ok =
+        srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        srcRGB == PIPE_BLENDFACTOR_ZERO;
+    const boolean src_a_ok = srcA == PIPE_BLENDFACTOR_ZERO;
+    const boolean dst_rgb_ok =
+        dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+        dstRGB == PIPE_BLENDFACTOR_ONE;
+    const boolean dst_a_ok = dstA == PIPE_BLENDFACTOR_ONE;
+
+    return src_rgb_ok && src_a_ok && dst_rgb_ok && dst_a_ok;
+}
+
+/* Returns nonzero when, for blend equation ADD or REVERSE_SUBTRACT, a
+ * source pixel with SRC_ALPHA_COLOR == (0,0,0,0) leaves the colorbuffer
+ * unchanged under the given factors.
+ * The accepted dst factors are the accepted src factors inverted. */
+static boolean blend_discard_if_src_alpha_color_0(unsigned srcRGB, unsigned srcA,
+                                                  unsigned dstRGB, unsigned dstA)
+{
+    const boolean src_rgb_ok =
+        srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+        srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+        srcRGB == PIPE_BLENDFACTOR_ZERO;
+    const boolean src_a_ok =
+        srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+        srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+        srcA == PIPE_BLENDFACTOR_ZERO;
+    const boolean dst_rgb_ok =
+        dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        dstRGB == PIPE_BLENDFACTOR_ONE;
+    const boolean dst_a_ok =
+        dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        dstA == PIPE_BLENDFACTOR_ONE;
+
+    return src_rgb_ok && src_a_ok && dst_rgb_ok && dst_a_ok;
+}
+
+/* Returns nonzero when, for blend equation ADD or REVERSE_SUBTRACT, a
+ * source pixel with SRC_ALPHA_COLOR == (1,1,1,1) leaves the colorbuffer
+ * unchanged under the given factors.
+ * The accepted dst factors are the accepted src factors inverted. */
+static boolean blend_discard_if_src_alpha_color_1(unsigned srcRGB, unsigned srcA,
+                                                  unsigned dstRGB, unsigned dstA)
+{
+    const boolean src_rgb_ok =
+        srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        srcRGB == PIPE_BLENDFACTOR_ZERO;
+    const boolean src_a_ok =
+        srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+        srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+        srcA == PIPE_BLENDFACTOR_ZERO;
+    const boolean dst_rgb_ok =
+        dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+        dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        dstRGB == PIPE_BLENDFACTOR_ONE;
+    const boolean dst_a_ok =
+        dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+        dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+        dstA == PIPE_BLENDFACTOR_ONE;
+
+    return src_rgb_ok && src_a_ok && dst_rgb_ok && dst_a_ok;
+}
+
+/* Convert a Gallium (RGBA-ordered) channel mask into the BGRA ordering
+ * R300 expects: the R bit moves up by two positions, the B bit moves down
+ * by two, and the G and A bits stay where they are. */
+static unsigned bgra_cmask(unsigned mask)
+{
+    const unsigned moved_red = (mask & PIPE_MASK_R) << 2;
+    const unsigned moved_blue = (mask & PIPE_MASK_B) >> 2;
+    const unsigned kept_green_alpha = mask & (PIPE_MASK_G | PIPE_MASK_A);
+
+    return moved_red | moved_blue | kept_green_alpha;
+}
+
+/* Create a new blend state based on the CSO blend state.
+ *
+ * This encompasses alpha blending, logic/raster ops, and blend dithering.
+ *
+ * Only rt[0] supplies the blend function and factors; rt[1..3] contribute
+ * their color masks on R500 when independent blending is enabled.
+ *
+ * Two command buffers are filled in: blend->cb with the translated state,
+ * and blend->cb_no_readwrite with the blend and color-mask dwords zeroed.
+ *
+ * Returns the new r300_blend_state, or NULL if the allocation fails. */
+static void* r300_create_blend_state(struct pipe_context* pipe,
+                                     const struct pipe_blend_state* state)
+{
+    struct r300_screen* r300screen = r300_screen(pipe->screen);
+    struct r300_blend_state* blend = CALLOC_STRUCT(r300_blend_state);
+    uint32_t blend_control = 0;       /* R300_RB3D_CBLEND: 0x4e04 */
+    uint32_t alpha_blend_control = 0; /* R300_RB3D_ABLEND: 0x4e08 */
+    uint32_t color_channel_mask = 0;  /* R300_RB3D_COLOR_CHANNEL_MASK: 0x4e0c */
+    uint32_t rop = 0;                 /* R300_RB3D_ROPCNTL: 0x4e18 */
+    uint32_t dither = 0;              /* R300_RB3D_DITHER_CTL: 0x4e50 */
+    CB_LOCALS;
+
+    /* Out of memory: report failure instead of writing through NULL. */
+    if (!blend)
+        return NULL;
+
+    if (state->rt[0].blend_enable)
+    {
+        unsigned eqRGB = state->rt[0].rgb_func;
+        unsigned srcRGB = state->rt[0].rgb_src_factor;
+        unsigned dstRGB = state->rt[0].rgb_dst_factor;
+
+        unsigned eqA = state->rt[0].alpha_func;
+        unsigned srcA = state->rt[0].alpha_src_factor;
+        unsigned dstA = state->rt[0].alpha_dst_factor;
+
+        /* despite the name, ALPHA_BLEND_ENABLE has nothing to do with alpha,
+         * this is just the crappy D3D naming */
+        blend_control = R300_ALPHA_BLEND_ENABLE |
+            r300_translate_blend_function(eqRGB) |
+            ( r300_translate_blend_factor(srcRGB) << R300_SRC_BLEND_SHIFT) |
+            ( r300_translate_blend_factor(dstRGB) << R300_DST_BLEND_SHIFT);
+
+        /* Optimization: some operations do not require the destination color.
+         *
+         * When SRC_ALPHA_SATURATE is used, colorbuffer reads must be enabled,
+         * otherwise blending gives incorrect results. It seems to be
+         * a hardware bug. */
+        if (eqRGB == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MIN ||
+            eqRGB == PIPE_BLEND_MAX || eqA == PIPE_BLEND_MAX ||
+            dstRGB != PIPE_BLENDFACTOR_ZERO ||
+            dstA != PIPE_BLENDFACTOR_ZERO ||
+            srcRGB == PIPE_BLENDFACTOR_DST_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_DST_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_INV_DST_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_DST_COLOR ||
+            srcA == PIPE_BLENDFACTOR_DST_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_INV_DST_COLOR ||
+            srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) {
+            /* Enable reading from the colorbuffer. */
+            blend_control |= R300_READ_ENABLE;
+
+            if (r300screen->caps.is_r500) {
+                /* Optimization: Depending on incoming pixels, we can
+                 * conditionally disable the reading in hardware... */
+                if (eqRGB != PIPE_BLEND_MIN && eqA != PIPE_BLEND_MIN &&
+                    eqRGB != PIPE_BLEND_MAX && eqA != PIPE_BLEND_MAX) {
+                    /* Disable reading if SRC_ALPHA == 0. */
+                    if ((dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+                         dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+                        (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+                         dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+                         dstA == PIPE_BLENDFACTOR_ZERO)) {
+                         blend_control |= R500_SRC_ALPHA_0_NO_READ;
+                    }
+
+                    /* Disable reading if SRC_ALPHA == 1. */
+                    if ((dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+                         dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+                        (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+                         dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+                         dstA == PIPE_BLENDFACTOR_ZERO)) {
+                         blend_control |= R500_SRC_ALPHA_1_NO_READ;
+                    }
+                }
+            }
+        }
+
+        /* Optimization: discard pixels which don't change the colorbuffer.
+         *
+         * The code below is non-trivial and some math is involved.
+         *
+         * Discarding pixels must be disabled when FP16 AA is enabled.
+         * This is a hardware bug. Also, this implementation wouldn't work
+         * with FP blending enabled and equation clamping disabled.
+         *
+         * Equations other than ADD are rarely used and therefore won't be
+         * optimized. */
+        if ((eqRGB == PIPE_BLEND_ADD || eqRGB == PIPE_BLEND_REVERSE_SUBTRACT) &&
+            (eqA == PIPE_BLEND_ADD || eqA == PIPE_BLEND_REVERSE_SUBTRACT)) {
+            /* ADD: X+Y
+             * REVERSE_SUBTRACT: Y-X
+             *
+             * The idea is:
+             * If X = src*srcFactor = 0 and Y = dst*dstFactor = 1,
+             * then CB will not be changed.
+             *
+             * Given the srcFactor and dstFactor variables, we can derive
+             * what src and dst should be equal to and discard appropriate
+             * pixels.
+             *
+             * The predicates are tried in order and only the first match
+             * sets a discard mode.
+             */
+            if (blend_discard_if_src_alpha_0(srcRGB, srcA, dstRGB, dstA)) {
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
+            } else if (blend_discard_if_src_alpha_1(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1;
+            } else if (blend_discard_if_src_color_0(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_0;
+            } else if (blend_discard_if_src_color_1(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_1;
+            } else if (blend_discard_if_src_alpha_color_0(srcRGB, srcA,
+                                                          dstRGB, dstA)) {
+                blend_control |=
+                    R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0;
+            } else if (blend_discard_if_src_alpha_color_1(srcRGB, srcA,
+                                                          dstRGB, dstA)) {
+                blend_control |=
+                    R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1;
+            }
+        }
+
+        /* separate alpha */
+        if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
+            blend_control |= R300_SEPARATE_ALPHA_ENABLE;
+            alpha_blend_control =
+                r300_translate_blend_function(eqA) |
+                (r300_translate_blend_factor(srcA) << R300_SRC_BLEND_SHIFT) |
+                (r300_translate_blend_factor(dstA) << R300_DST_BLEND_SHIFT);
+        }
+    }
+
+    /* PIPE_LOGICOP_* don't need to be translated, fortunately. */
+    if (state->logicop_enable) {
+        rop = R300_RB3D_ROPCNTL_ROP_ENABLE |
+                ((state->logicop_func) << R300_RB3D_ROPCNTL_ROP_SHIFT);
+    }
+
+    /* Color channel masks for all MRTs. */
+    color_channel_mask = bgra_cmask(state->rt[0].colormask);
+    if (r300screen->caps.is_r500 && state->independent_blend_enable) {
+        if (state->rt[1].blend_enable) {
+            color_channel_mask |= bgra_cmask(state->rt[1].colormask) << 4;
+        }
+        if (state->rt[2].blend_enable) {
+            color_channel_mask |= bgra_cmask(state->rt[2].colormask) << 8;
+        }
+        if (state->rt[3].blend_enable) {
+            color_channel_mask |= bgra_cmask(state->rt[3].colormask) << 12;
+        }
+    }
+
+    /* Neither fglrx nor classic r300 ever set this, regardless of dithering
+     * state. Since it's an optional implementation detail, we can leave it
+     * out and never dither.
+     *
+     * This could be revisited if we ever get quality or conformance hints.
+     *
+    if (state->dither) {
+        dither = R300_RB3D_DITHER_CTL_DITHER_MODE_LUT |
+                        R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT;
+    }
+    */
+
+    /* Build a command buffer. */
+    BEGIN_CB(blend->cb, 8);
+    OUT_CB_REG(R300_RB3D_ROPCNTL, rop);
+    OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
+    OUT_CB(blend_control);
+    OUT_CB(alpha_blend_control);
+    OUT_CB(color_channel_mask);
+    OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
+    END_CB;
+
+    /* The same as above, but with no colorbuffer reads and writes. */
+    BEGIN_CB(blend->cb_no_readwrite, 8);
+    OUT_CB_REG(R300_RB3D_ROPCNTL, rop);
+    OUT_CB_REG_SEQ(R300_RB3D_CBLEND, 3);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB_REG(R300_RB3D_DITHER_CTL, dither);
+    END_CB;
+
+    return (void*)blend;
+}
+
+/* Bind blend state.
+ *
+ * Stores the CSO handle in r300->blend_state through UPDATE_STATE, which
+ * marks the atom dirty only when the handle differs from the one already
+ * bound. */
+static void r300_bind_blend_state(struct pipe_context* pipe,
+                                  void* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    UPDATE_STATE(state, r300->blend_state);
+}
+
+/* Free blend state.
+ *
+ * Releases the object returned by r300_create_blend_state with FREE().
+ * The pipe argument is not used. */
+static void r300_delete_blend_state(struct pipe_context* pipe,
+                                    void* state)
+{
+    FREE(state);
+}
+
+/* Convert float to 10bit integer.
+ *
+ * Maps f from [0, 1] onto [0, 1023] by truncating f * 1023.9. Inputs at or
+ * below zero (and NaN) yield 0, inputs at or above one yield 1023.
+ *
+ * The range check is done on the float, before the cast: converting a
+ * negative or too-large float to unsigned is undefined behaviour in C, and
+ * clamping the already-converted value cannot map negatives to 0. */
+static unsigned float_to_fixed10(float f)
+{
+    /* Written as a negated '>' so that NaN also lands here. */
+    if (!(f > 0.0f))
+        return 0;
+
+    if (f >= 1.0f)
+        return 1023;
+
+    return (unsigned)(f * 1023.9f);
+}
+
+/* Set blend color.
+ *
+ * Rebuilds the command buffer of the blend-color state object in the
+ * layout of the chip at hand and marks the atom dirty:
+ *  - R500: two dwords starting at R500_RB3D_CONSTANT_COLOR_AR, each
+ *    holding two 10-bit values (the second one shifted left by 16).
+ *  - otherwise: one dword for R300_RB3D_BLEND_COLOR, packed as
+ *    PIPE_FORMAT_B8G8R8A8_UNORM. */
+static void r300_set_blend_color(struct pipe_context* pipe,
+                                 const struct pipe_blend_color* color)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_blend_color_state* bc =
+        (struct r300_blend_color_state*)r300->blend_color_state.state;
+    CB_LOCALS;
+
+    if (!r300->screen->caps.is_r500) {
+        union util_color packed;
+
+        util_pack_color(color->color, PIPE_FORMAT_B8G8R8A8_UNORM, &packed);
+
+        BEGIN_CB(bc->cb, 2);
+        OUT_CB_REG(R300_RB3D_BLEND_COLOR, packed.ui);
+        END_CB;
+    } else {
+        /* color[0] and color[3] share the first dword, color[2] and
+         * color[1] the second. */
+        const unsigned first_dword =
+            float_to_fixed10(color->color[0]) |
+            (float_to_fixed10(color->color[3]) << 16);
+        const unsigned second_dword =
+            float_to_fixed10(color->color[2]) |
+            (float_to_fixed10(color->color[1]) << 16);
+
+        /* XXX if FP16 blending is enabled, we should use the FP16 format */
+        BEGIN_CB(bc->cb, 3);
+        OUT_CB_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2);
+        OUT_CB(first_dword);
+        OUT_CB(second_dword);
+        END_CB;
+    }
+
+    r300->blend_color_state.dirty = TRUE;
+}
+
+/* Set the user clip planes.
+ *
+ * The incoming state is always copied into the clip state object. With
+ * hardware TCL a command buffer is built that uploads the 6 * 4 plane
+ * values and programs R300_VAP_CLIP_CNTL, and the atom is marked dirty.
+ * Without TCL the draw context is flushed and given the state instead. */
+static void r300_set_clip_state(struct pipe_context* pipe,
+                                const struct pipe_clip_state* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_clip_state *clip =
+            (struct r300_clip_state*)r300->clip_state.state;
+    CB_LOCALS;
+
+    clip->clip = *state;
+
+    if (!r300->screen->caps.has_tcl) {
+        draw_flush(r300->draw);
+        draw_set_clip_state(r300->draw, state);
+        return;
+    }
+
+    BEGIN_CB(clip->cb, 29);
+    /* The upload start index differs between R500 and older chips. */
+    OUT_CB_REG(R300_VAP_PVS_VECTOR_INDX_REG,
+               (r300->screen->caps.is_r500 ?
+                R500_PVS_UCP_START : R300_PVS_UCP_START));
+    OUT_CB_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, 6 * 4);
+    OUT_CB_TABLE(state->ucp, 6 * 4);
+    /* One enable bit per plane in use: the low state->nr bits. */
+    OUT_CB_REG(R300_VAP_CLIP_CNTL,
+               ((1 << state->nr) - 1) | R300_PS_UCP_MODE_CLIP_AS_TRIFAN);
+    END_CB;
+
+    r300->clip_state.dirty = TRUE;
+}
+
+/* Set the sample mask.
+ *
+ * Empty on purpose or by omission: both arguments are ignored and no
+ * driver state is touched. */
+static void
+r300_set_sample_mask(struct pipe_context *pipe,
+                     unsigned sample_mask)
+{
+}
+
+
+/* Create a new depth, stencil, and alpha state based on the CSO dsa state.
+ *
+ * This contains the depth buffer, stencil buffer, alpha test, and such.
+ * On the Radeon, depth and stencil buffer setup are intertwined, which is
+ * the reason for some of the strange-looking assignments across registers.
+ *
+ * The incoming state is copied into the object, the register values are
+ * derived from it, and two command buffers are emitted: one with the
+ * computed depth/stencil values and one (cb_no_readwrite) with them zeroed.
+ *
+ * Returns the new r300_dsa_state, or NULL if the allocation fails. */
+static void*
+        r300_create_dsa_state(struct pipe_context* pipe,
+                              const struct pipe_depth_stencil_alpha_state* state)
+{
+    struct r300_capabilities *caps = &r300_screen(pipe->screen)->caps;
+    struct r300_dsa_state* dsa = CALLOC_STRUCT(r300_dsa_state);
+    CB_LOCALS;
+
+    /* Out of memory: report failure instead of writing through NULL. */
+    if (!dsa)
+        return NULL;
+
+    dsa->dsa = *state;
+
+    /* Depth test setup. */
+    if (state->depth.enabled) {
+        dsa->z_buffer_control |= R300_Z_ENABLE;
+
+        if (state->depth.writemask) {
+            dsa->z_buffer_control |= R300_Z_WRITE_ENABLE;
+        }
+
+        dsa->z_stencil_control |=
+            (r300_translate_depth_stencil_function(state->depth.func) <<
+                R300_Z_FUNC_SHIFT);
+    }
+
+    /* Stencil buffer setup. */
+    if (state->stencil[0].enabled) {
+        dsa->z_buffer_control |= R300_STENCIL_ENABLE;
+        dsa->z_stencil_control |=
+            (r300_translate_depth_stencil_function(state->stencil[0].func) <<
+                R300_S_FRONT_FUNC_SHIFT) |
+            (r300_translate_stencil_op(state->stencil[0].fail_op) <<
+                R300_S_FRONT_SFAIL_OP_SHIFT) |
+            (r300_translate_stencil_op(state->stencil[0].zpass_op) <<
+                R300_S_FRONT_ZPASS_OP_SHIFT) |
+            (r300_translate_stencil_op(state->stencil[0].zfail_op) <<
+                R300_S_FRONT_ZFAIL_OP_SHIFT);
+
+        dsa->stencil_ref_mask =
+                (state->stencil[0].valuemask << R300_STENCILMASK_SHIFT) |
+                (state->stencil[0].writemask << R300_STENCILWRITEMASK_SHIFT);
+
+        /* The second stencil slot is only looked at when the first one is
+         * enabled. */
+        if (state->stencil[1].enabled) {
+            dsa->two_sided = TRUE;
+
+            dsa->z_buffer_control |= R300_STENCIL_FRONT_BACK;
+            dsa->z_stencil_control |=
+            (r300_translate_depth_stencil_function(state->stencil[1].func) <<
+                R300_S_BACK_FUNC_SHIFT) |
+            (r300_translate_stencil_op(state->stencil[1].fail_op) <<
+                R300_S_BACK_SFAIL_OP_SHIFT) |
+            (r300_translate_stencil_op(state->stencil[1].zpass_op) <<
+                R300_S_BACK_ZPASS_OP_SHIFT) |
+            (r300_translate_stencil_op(state->stencil[1].zfail_op) <<
+                R300_S_BACK_ZFAIL_OP_SHIFT);
+
+            dsa->stencil_ref_bf =
+                (state->stencil[1].valuemask << R300_STENCILMASK_SHIFT) |
+                (state->stencil[1].writemask << R300_STENCILWRITEMASK_SHIFT);
+
+            if (caps->is_r500) {
+                dsa->z_buffer_control |= R500_STENCIL_REFMASK_FRONT_BACK;
+            } else {
+                /* Non-R500: remember whether the two sides use different
+                 * value or write masks. */
+                dsa->two_sided_stencil_ref =
+                  (state->stencil[0].valuemask != state->stencil[1].valuemask ||
+                   state->stencil[0].writemask != state->stencil[1].writemask);
+            }
+        }
+    }
+
+    /* Alpha test setup. */
+    if (state->alpha.enabled) {
+        dsa->alpha_function =
+            r300_translate_alpha_function(state->alpha.func) |
+            R300_FG_ALPHA_FUNC_ENABLE;
+
+        /* We could use 10bit alpha ref but who needs that? */
+        dsa->alpha_function |= float_to_ubyte(state->alpha.ref_value);
+
+        if (caps->is_r500)
+            dsa->alpha_function |= R500_FG_ALPHA_FUNC_8BIT;
+    }
+
+    /* NOTE(review): this command buffer is built at &dsa->cb_begin rather
+     * than in a separate array; presumably the named register members of
+     * r300_dsa_state overlay it -- confirm against the struct definition
+     * before reordering any of the writes below. */
+    BEGIN_CB(&dsa->cb_begin, 8);
+    OUT_CB_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
+    OUT_CB_REG_SEQ(R300_ZB_CNTL, 3);
+    OUT_CB(dsa->z_buffer_control);
+    OUT_CB(dsa->z_stencil_control);
+    OUT_CB(dsa->stencil_ref_mask);
+    OUT_CB_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf);
+    END_CB;
+
+    /* Same layout, but with the depth/stencil dwords zeroed; the alpha
+     * function is kept. */
+    BEGIN_CB(dsa->cb_no_readwrite, 8);
+    OUT_CB_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
+    OUT_CB_REG_SEQ(R300_ZB_CNTL, 3);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB(0);
+    OUT_CB_REG(R500_ZB_STENCILREFMASK_BF, 0);
+    END_CB;
+
+    return (void*)dsa;
+}
+
+/* Merge the context's stencil reference values into the bound DSA state.
+ *
+ * The R300_STENCILREF_MASK bits of stencil_ref_mask and stencil_ref_bf are
+ * replaced by ref_value[0] and ref_value[1] respectively; all other bits
+ * are kept. Does nothing when no DSA state is bound. */
+static void r300_dsa_inject_stencilref(struct r300_context *r300)
+{
+    struct r300_dsa_state *bound =
+            (struct r300_dsa_state*)r300->dsa_state.state;
+
+    if (bound) {
+        bound->stencil_ref_mask &= ~R300_STENCILREF_MASK;
+        bound->stencil_ref_mask |= r300->stencil_ref.ref_value[0];
+
+        bound->stencil_ref_bf &= ~R300_STENCILREF_MASK;
+        bound->stencil_ref_bf |= r300->stencil_ref.ref_value[1];
+    }
+}
+
+/* Bind DSA state.
+ *
+ * A NULL handle is ignored and the previous binding stays in place.
+ * Otherwise the atom is updated (and marked dirty if the handle changed)
+ * and the current stencil reference values are merged into the new state. */
+static void r300_bind_dsa_state(struct pipe_context* pipe,
+                                void* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    if (state) {
+        UPDATE_STATE(state, r300->dsa_state);
+
+        r300_dsa_inject_stencilref(r300);
+    }
+}
+
+/* Free DSA state.
+ *
+ * Releases the object returned by r300_create_dsa_state with FREE().
+ * The pipe argument is not used. */
+static void r300_delete_dsa_state(struct pipe_context* pipe,
+                                  void* state)
+{
+    FREE(state);
+}
+
+/* Set the stencil reference values.
+ *
+ * Copies *sr into the context, merges the new values into the currently
+ * bound DSA state (if there is one) and marks the DSA atom dirty. The
+ * dirty flag is set whether or not a DSA state is bound. */
+static void r300_set_stencil_ref(struct pipe_context* pipe,
+                                 const struct pipe_stencil_ref* sr)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    r300->stencil_ref = *sr;
+
+    r300_dsa_inject_stencilref(r300);
+    r300->dsa_state.dirty = TRUE;
+}
+
+/* Push tiling parameters to the winsys for every surface of new_state.
+ *
+ * This switcheroo is needed just because of MACRO_SWITCH: the macrotile
+ * flag is looked up per mip level of the bound surface.
+ *
+ * For each color buffer, and for the depth/stencil buffer when present,
+ * buffer_set_tiling() receives the texture's buffer, the level-0 pitch
+ * multiplied by the format block size, the microtile flag and the
+ * macrotile flag of the surface's level. old_state is not used. */
+static void r300_fb_set_tiling_flags(struct r300_context *r300,
+                               const struct pipe_framebuffer_state *old_state,
+                               const struct pipe_framebuffer_state *new_state)
+{
+    const struct pipe_surface *surf;
+    struct r300_texture *rtex;
+    unsigned cb;
+
+    /* Color buffers. */
+    for (cb = 0; cb < new_state->nr_cbufs; cb++) {
+        surf = new_state->cbufs[cb];
+        rtex = r300_texture(surf->texture);
+
+        r300->rws->buffer_set_tiling(r300->rws, rtex->buffer,
+                rtex->pitch[0] * util_format_get_blocksize(rtex->b.b.format),
+                rtex->microtile,
+                rtex->mip_macrotile[surf->level]);
+    }
+
+    /* Depth/stencil buffer, if any. */
+    surf = new_state->zsbuf;
+    if (surf) {
+        rtex = r300_texture(surf->texture);
+
+        r300->rws->buffer_set_tiling(r300->rws, rtex->buffer,
+                rtex->pitch[0] * util_format_get_blocksize(rtex->b.b.format),
+                rtex->microtile,
+                rtex->mip_macrotile[surf->level]);
+    }
+}
+
+/* Dump one framebuffer surface to stderr as two lines: the first describes
+ * the surface itself (labelled "binding[index]"), the second the texture
+ * behind it, including its tiling flags and level-0 hardware pitch. */
+static void r300_print_fb_surf_info(struct pipe_surface *surf, unsigned index,
+                                    const char *binding)
+{
+    struct pipe_resource *res = surf->texture;
+    struct r300_texture *r300tex = r300_texture(res);
+    const char *macro_str = r300tex->macrotile ? "YES" : " NO";
+    const char *micro_str = r300tex->microtile ? "YES" : " NO";
+
+    fprintf(stderr,
+            "r300:   %s[%i] Dim: %ix%i, Offset: %i, ZSlice: %i, "
+            "Face: %i, Level: %i, Format: %s\n"
+
+            "r300:     TEX: Macro: %s, Micro: %s, Pitch: %i, "
+            "Dim: %ix%ix%i, LastLevel: %i, Format: %s\n",
+
+            /* Surface line. */
+            binding, index, surf->width, surf->height, surf->offset,
+            surf->zslice, surf->face, surf->level,
+            util_format_short_name(surf->format),
+
+            /* Texture line. */
+            macro_str, micro_str,
+            r300tex->hwpitch[0], res->width0, res->height0, res->depth0,
+            res->last_level, util_format_short_name(res->format));
+}
+
+/* Set the framebuffer state.
+ *
+ * Rejects (with a message on stderr, leaving the old state bound) any
+ * state with more than 4 color buffers or with dimensions above the
+ * per-chip maximum. Otherwise it flushes the draw context if there is one,
+ * marks dependent atoms dirty, updates tiling flags, copies the state into
+ * the context and recomputes the atom size.
+ *
+ * The old/new comparisons below must run before the memcpy, because
+ * old_state points at the very storage the memcpy overwrites. */
+static void
+    r300_set_framebuffer_state(struct pipe_context* pipe,
+                               const struct pipe_framebuffer_state* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct pipe_framebuffer_state *old_state = r300->fb_state.state;
+    unsigned max_width, max_height, i;
+    uint32_t zbuffer_bpp = 0;
+
+    if (state->nr_cbufs > 4) {
+        fprintf(stderr, "r300: Implementation error: Too many MRTs in %s, "
+            "refusing to bind framebuffer state!\n", __FUNCTION__);
+        return;
+    }
+
+    /* Maximum render target dimensions per chip family. */
+    if (r300->screen->caps.is_r500) {
+        max_width = max_height = 4096;
+    } else if (r300->screen->caps.is_r400) {
+        max_width = max_height = 4021;
+    } else {
+        max_width = max_height = 2560;
+    }
+
+    if (state->width > max_width || state->height > max_height) {
+        fprintf(stderr, "r300: Implementation error: Render targets are too "
+        "big in %s, refusing to bind framebuffer state!\n", __FUNCTION__);
+        return;
+    }
+
+    if (r300->draw) {
+        draw_flush(r300->draw);
+    }
+
+    r300->fb_state.dirty = TRUE;
+
+    /* If nr_cbufs is changed from zero to non-zero or vice versa... */
+    if (!!old_state->nr_cbufs != !!state->nr_cbufs) {
+        r300->blend_state.dirty = TRUE;
+    }
+    /* If zsbuf is set from NULL to non-NULL or vice versa.. */
+    if (!!old_state->zsbuf != !!state->zsbuf) {
+        r300->dsa_state.dirty = TRUE;
+    }
+
+    /* The tiling flags are dependent on the surface miplevel, unfortunately. */
+    r300_fb_set_tiling_flags(r300, r300->fb_state.state, state);
+
+    /* From here on old_state aliases the new contents. */
+    memcpy(r300->fb_state.state, state, sizeof(struct pipe_framebuffer_state));
+
+    /* NOTE(review): presumably the emitted size in dwords -- 10 per bound
+     * color buffer, 2 per unused slot out of 4, 10 more with a zsbuf, plus
+     * 11 fixed; confirm against the framebuffer emit function. */
+    r300->fb_state.size = (10 * state->nr_cbufs) + (2 * (4 - state->nr_cbufs)) +
+                          (state->zsbuf ? 10 : 0) + 11;
+
+    /* Polygon offset depends on the zbuffer bit depth. */
+    if (state->zsbuf && r300->polygon_offset_enabled) {
+        /* Block size 2 maps to 16 bits and 4 maps to 24; any other block
+         * size leaves zbuffer_bpp at 0. */
+        switch (util_format_get_blocksize(state->zsbuf->texture->format)) {
+            case 2:
+                zbuffer_bpp = 16;
+                break;
+            case 4:
+                zbuffer_bpp = 24;
+                break;
+        }
+
+        /* The rasterizer state is only re-emitted when the depth changed. */
+        if (r300->zbuffer_bpp != zbuffer_bpp) {
+            r300->zbuffer_bpp = zbuffer_bpp;
+            r300->rs_state.dirty = TRUE;
+        }
+    }
+
+    if (DBG_ON(r300, DBG_FB)) {
+        fprintf(stderr, "r300: set_framebuffer_state:\n");
+        for (i = 0; i < state->nr_cbufs; i++) {
+            r300_print_fb_surf_info(state->cbufs[i], i, "CB");
+        }
+        if (state->zsbuf) {
+            r300_print_fb_surf_info(state->zsbuf, 0, "ZB");
+        }
+    }
+}
+
+/* Create fragment shader state.
+ *
+ * Allocates an r300_fragment_shader, copies *shader into it and replaces
+ * the token pointer with a private duplicate made by tgsi_dup_tokens().
+ *
+ * Returns the new object, or NULL if the allocation fails. */
+static void* r300_create_fs_state(struct pipe_context* pipe,
+                                  const struct pipe_shader_state* shader)
+{
+    struct r300_fragment_shader* fs = NULL;
+
+    fs = (struct r300_fragment_shader*)CALLOC_STRUCT(r300_fragment_shader);
+    /* Out of memory: report failure instead of writing through NULL. */
+    if (!fs)
+        return NULL;
+
+    /* Copy state directly into shader. */
+    fs->state = *shader;
+    /* Own a copy of the tokens; r300_delete_fs_state frees it. */
+    fs->state.tokens = tgsi_dup_tokens(shader->tokens);
+
+    return (void*)fs;
+}
+
+/* Flag the fragment shader code atom, the RC constant state atom and the
+ * fragment constants atom as dirty, and recompute their sizes from the
+ * shader code currently selected for the bound fragment shader. */
+void r300_mark_fs_code_dirty(struct r300_context *r300)
+{
+    struct r300_fragment_shader* bound = r300_fs(r300);
+    const boolean r500 = r300->screen->caps.is_r500;
+
+    /* Shader code. */
+    r300->fs.dirty = TRUE;
+    r300->fs.size = bound->shader->cb_code_size;
+
+    /* RC state constants: the per-entry multiplier differs on R500. */
+    r300->fs_rc_constant_state.dirty = TRUE;
+    r300->fs_rc_constant_state.size =
+        bound->shader->rc_state_count * (r500 ? 7 : 5);
+
+    /* External constants: four values each plus a chip-dependent extra. */
+    r300->fs_constants.dirty = TRUE;
+    r300->fs_constants.size =
+        bound->shader->externals_count * 4 + (r500 ? 3 : 1);
+}
+
+/* Bind fragment shader state.
+ *
+ * Stores the shader in the context. For a non-NULL shader it also picks
+ * the shader code variant, marks the fragment shader atoms dirty and
+ * requests an RS block update. */
+static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    r300->fs.state = (struct r300_fragment_shader*)shader;
+
+    if (shader != NULL) {
+        r300_pick_fragment_shader(r300);
+        r300_mark_fs_code_dirty(r300);
+
+        /* Will be updated before the emission. */
+        r300->rs_block_state.dirty = TRUE;
+    }
+}
+
+/* Delete fragment shader state.
+ *
+ * Walks the shader's list of code variants, releasing each variant's
+ * constants, command buffer code and the variant itself, then frees the
+ * duplicated TGSI tokens and finally the shader object. */
+static void r300_delete_fs_state(struct pipe_context* pipe, void* shader)
+{
+    struct r300_fragment_shader* fs = (struct r300_fragment_shader*)shader;
+    struct r300_fragment_shader_code *code, *following;
+
+    for (code = fs->first; code != NULL; code = following) {
+        /* Remember the successor before the node is freed. */
+        following = code->next;
+
+        rc_constants_destroy(&code->code.constants);
+        FREE(code->cb_code);
+        FREE(code);
+    }
+
+    FREE((void*)fs->state.tokens);
+    FREE(shader);
+}
+
+static void r300_set_polygon_stipple(struct pipe_context* pipe,
+                                     const struct pipe_poly_stipple* state)
+{
+    /* Intentionally a no-op: the stipple pattern in `state` is ignored and
+     * no context state is touched.
+     * XXX no idea how to set this up, but not terribly important */
+}
+
+/* Create a new rasterizer state based on the CSO rasterizer state.
+ *
+ * This is a very large chunk of state, and covers most of the graphics
+ * backend (GB), geometry assembly (GA), and setup unit (SU) blocks.
+ *
+ * In a not entirely unironic sidenote, this state has nearly nothing to do
+ * with the actual block on the Radeon called the rasterizer (RS).
+ *
+ * Returns the newly allocated r300_rs_state, or NULL if the allocation
+ * failed. */
+static void* r300_create_rs_state(struct pipe_context* pipe,
+                                  const struct pipe_rasterizer_state* state)
+{
+    struct r300_rs_state* rs = CALLOC_STRUCT(r300_rs_state);
+    int i;
+    float psiz;
+
+    /* CALLOC_STRUCT can fail; bail out instead of writing through NULL. */
+    if (rs == NULL) {
+        return NULL;
+    }
+
+    /* Copy rasterizer state. */
+    rs->rs = *state;
+    rs->rs_draw = *state;
+
+    /* Override some states for Draw. */
+    rs->rs_draw.sprite_coord_enable = 0; /* We can do this in HW. */
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+    rs->vap_control_status = R300_VC_NO_SWAP;
+#else
+    rs->vap_control_status = R300_VC_32BIT_SWAP;
+#endif
+
+    /* If no TCL engine is present, turn off the HW TCL. */
+    if (!r300_screen(pipe->screen)->caps.has_tcl) {
+        rs->vap_control_status |= R300_VAP_TCL_BYPASS;
+    }
+
+    /* Point size width and height. */
+    rs->point_size =
+        pack_float_16_6x(state->point_size) |
+        (pack_float_16_6x(state->point_size) << R300_POINTSIZE_X_SHIFT);
+
+    /* Point size clamping. */
+    if (state->point_size_per_vertex) {
+        /* Per-vertex point size.
+         * Clamp to [0, max FB size] */
+        psiz = pipe->screen->get_paramf(pipe->screen,
+                                        PIPE_CAP_MAX_POINT_WIDTH);
+        rs->point_minmax =
+            pack_float_16_6x(psiz) << R300_GA_POINT_MINMAX_MAX_SHIFT;
+    } else {
+        /* We cannot disable the point-size vertex output,
+         * so clamp it. */
+        psiz = state->point_size;
+        rs->point_minmax =
+            (pack_float_16_6x(psiz) << R300_GA_POINT_MINMAX_MIN_SHIFT) |
+            (pack_float_16_6x(psiz) << R300_GA_POINT_MINMAX_MAX_SHIFT);
+    }
+
+    /* Line control. */
+    rs->line_control = pack_float_16_6x(state->line_width) |
+        R300_GA_LINE_CNTL_END_TYPE_COMP;
+
+    /* Enable polygon mode */
+    if (state->fill_front != PIPE_POLYGON_MODE_FILL ||
+        state->fill_back != PIPE_POLYGON_MODE_FILL) {
+        rs->polygon_mode = R300_GA_POLY_MODE_DUAL;
+    }
+
+    /* Front face */
+    if (state->front_ccw)
+        rs->cull_mode = R300_FRONT_FACE_CCW;
+    else
+        rs->cull_mode = R300_FRONT_FACE_CW;
+
+    /* Polygon offset */
+    if (util_get_offset(state, state->fill_front)) {
+       rs->polygon_offset_enable |= R300_FRONT_ENABLE;
+    }
+    if (util_get_offset(state, state->fill_back)) {
+       rs->polygon_offset_enable |= R300_BACK_ENABLE;
+    }
+
+    /* Polygon mode: only filled in when dual mode was enabled above. */
+    if (rs->polygon_mode) {
+       rs->polygon_mode |=
+          r300_translate_polygon_mode_front(state->fill_front);
+       rs->polygon_mode |=
+          r300_translate_polygon_mode_back(state->fill_back);
+    }
+
+    if (state->cull_face & PIPE_FACE_FRONT) {
+        rs->cull_mode |= R300_CULL_FRONT;
+    }
+    if (state->cull_face & PIPE_FACE_BACK) {
+        rs->cull_mode |= R300_CULL_BACK;
+    }
+
+    if (rs->polygon_offset_enable) {
+        rs->depth_offset = state->offset_units;
+        rs->depth_scale = state->offset_scale;
+    }
+
+    if (state->line_stipple_enable) {
+        rs->line_stipple_config =
+            R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE |
+            (fui((float)state->line_stipple_factor) &
+                R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK);
+        /* XXX this might need to be scaled up */
+        rs->line_stipple_value = state->line_stipple_pattern;
+    }
+
+    if (state->flatshade) {
+        rs->color_control = R300_SHADE_MODEL_FLAT;
+    } else {
+        rs->color_control = R300_SHADE_MODEL_SMOOTH;
+    }
+
+    rs->clip_rule = state->scissor ? 0xAAAA : 0xFFFF;
+
+    /* Point sprites */
+    if (state->sprite_coord_enable) {
+        rs->stuffing_enable = R300_GB_POINT_STUFF_ENABLE;
+        /* One two-bit source field per texture coordinate set. */
+        for (i = 0; i < 8; i++) {
+            if (state->sprite_coord_enable & (1 << i))
+                rs->stuffing_enable |=
+                    R300_GB_TEX_STR << (R300_GB_TEX0_SOURCE_SHIFT + (i*2));
+        }
+
+        rs->point_texcoord_left = 0.0f;
+        rs->point_texcoord_right = 1.0f;
+
+        switch (state->sprite_coord_mode) {
+            case PIPE_SPRITE_COORD_UPPER_LEFT:
+                rs->point_texcoord_top = 0.0f;
+                rs->point_texcoord_bottom = 1.0f;
+                break;
+            case PIPE_SPRITE_COORD_LOWER_LEFT:
+                rs->point_texcoord_top = 1.0f;
+                rs->point_texcoord_bottom = 0.0f;
+                break;
+        }
+    }
+
+    if (state->gl_rasterization_rules) {
+        rs->multisample_position_0 = 0x66666666;
+        rs->multisample_position_1 = 0x6666666;
+    }
+
+    return (void*)rs;
+}
+
+/* Bind rasterizer state.
+ *
+ * Forwards the Draw variant of the state to the Draw module when one is
+ * present, caches the polygon offset / sprite coord / two-sided color
+ * settings in the context, updates the rasterizer atom and its size, and
+ * marks the RS block dirty when the sprite coord or two-sided color
+ * setting changed. */
+static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_rs_state* rs = (struct r300_rs_state*)state;
+    const int prev_sprite_coord_enable = r300->sprite_coord_enable;
+    const boolean prev_two_sided_color = r300->two_sided_color;
+
+    if (rs == NULL) {
+        r300->polygon_offset_enabled = FALSE;
+        r300->sprite_coord_enable = 0;
+        r300->two_sided_color = FALSE;
+    } else {
+        if (r300->draw) {
+            draw_flush(r300->draw);
+            draw_set_rasterizer_state(r300->draw, &rs->rs_draw, state);
+        }
+
+        r300->polygon_offset_enabled = (rs->rs.offset_point ||
+                                        rs->rs.offset_line ||
+                                        rs->rs.offset_tri);
+        r300->sprite_coord_enable = rs->rs.sprite_coord_enable;
+        r300->two_sided_color = rs->rs.light_twoside;
+    }
+
+    UPDATE_STATE(state, r300->rs_state);
+
+    /* Base size plus two optional 5-entry groups. */
+    r300->rs_state.size = 25;
+    if (r300->polygon_offset_enabled) {
+        r300->rs_state.size += 5;
+    }
+    if (r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0)) {
+        r300->rs_state.size += 5;
+    }
+
+    if (prev_sprite_coord_enable != r300->sprite_coord_enable ||
+        prev_two_sided_color != r300->two_sided_color) {
+        r300->rs_block_state.dirty = TRUE;
+    }
+}
+
+/* Free rasterizer state.
+ *
+ * The object handed out by r300_create_rs_state is one CALLOC_STRUCT
+ * allocation with no nested allocations, so a single FREE releases it.
+ * `pipe` is unused. */
+static void r300_delete_rs_state(struct pipe_context* pipe, void* state)
+{
+    FREE(state);
+}
+
+/* Create a sampler state object from the CSO sampler state.
+ *
+ * Keeps a copy of the pipe_sampler_state (with the wrap modes adjusted for
+ * NEAREST filtering, see below) and precomputes the filter0/filter1 words,
+ * the integer LOD range and the packed border color.
+ *
+ * Returns the new r300_sampler_state, or NULL if the allocation failed. */
+static void*
+r300_create_sampler_state(struct pipe_context* pipe,
+                          const struct pipe_sampler_state* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_sampler_state* sampler = CALLOC_STRUCT(r300_sampler_state);
+    boolean is_r500 = r300->screen->caps.is_r500;
+    int lod_bias;
+    union util_color uc;
+
+    /* CALLOC_STRUCT can fail; do not dereference a NULL sampler. */
+    if (sampler == NULL) {
+        return NULL;
+    }
+
+    sampler->state = *state;
+
+    /* r300 doesn't handle CLAMP and MIRROR_CLAMP correctly when either MAG
+     * or MIN filter is NEAREST. Since texwrap produces same results
+     * for CLAMP and CLAMP_TO_EDGE, we use them instead. */
+    if (sampler->state.min_img_filter == PIPE_TEX_FILTER_NEAREST ||
+        sampler->state.mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
+        /* Wrap S. */
+        if (sampler->state.wrap_s == PIPE_TEX_WRAP_CLAMP)
+            sampler->state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        else if (sampler->state.wrap_s == PIPE_TEX_WRAP_MIRROR_CLAMP)
+            sampler->state.wrap_s = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
+
+        /* Wrap T. */
+        if (sampler->state.wrap_t == PIPE_TEX_WRAP_CLAMP)
+            sampler->state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        else if (sampler->state.wrap_t == PIPE_TEX_WRAP_MIRROR_CLAMP)
+            sampler->state.wrap_t = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
+
+        /* Wrap R. */
+        if (sampler->state.wrap_r == PIPE_TEX_WRAP_CLAMP)
+            sampler->state.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        else if (sampler->state.wrap_r == PIPE_TEX_WRAP_MIRROR_CLAMP)
+            sampler->state.wrap_r = PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE;
+    }
+
+    sampler->filter0 |=
+        (r300_translate_wrap(sampler->state.wrap_s) << R300_TX_WRAP_S_SHIFT) |
+        (r300_translate_wrap(sampler->state.wrap_t) << R300_TX_WRAP_T_SHIFT) |
+        (r300_translate_wrap(sampler->state.wrap_r) << R300_TX_WRAP_R_SHIFT);
+
+    sampler->filter0 |= r300_translate_tex_filters(state->min_img_filter,
+                                                   state->mag_img_filter,
+                                                   state->min_mip_filter,
+                                                   state->max_anisotropy > 0);
+
+    sampler->filter0 |= r300_anisotropy(state->max_anisotropy);
+
+    /* Unfortunately, r300-r500 don't support floating-point mipmap lods. */
+    /* We must pass these to the merge function to clamp them properly.
+     * Clamp to zero while the value is still a float: converting a
+     * negative float to unsigned is undefined, and clamping an unsigned
+     * value against 0 afterwards does nothing. */
+    sampler->min_lod = (unsigned)MAX2(state->min_lod, 0);
+    sampler->max_lod = (unsigned)MAX2(ceilf(state->max_lod), 0);
+
+    lod_bias = CLAMP((int)(state->lod_bias * 32 + 1), -(1 << 9), (1 << 9) - 1);
+
+    /* The clamp above yields a 10-bit signed value. Keep only those ten
+     * bits before shifting, otherwise the sign extension of a negative
+     * bias would set every higher bit of filter1. */
+    sampler->filter1 |= ((unsigned)lod_bias & 0x3ff) << R300_LOD_BIAS_SHIFT;
+
+    /* This is very high quality anisotropic filtering for R5xx.
+     * It's good for benchmarking the performance of texturing but
+     * in practice we don't want to slow down the driver because it's
+     * a pretty good performance killer. Feel free to play with it. */
+    if (DBG_ON(r300, DBG_ANISOHQ) && is_r500) {
+        sampler->filter1 |= r500_anisotropy(state->max_anisotropy);
+    }
+
+    util_pack_color(state->border_color, PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+    sampler->border_color = uc.ui;
+
+    /* R500-specific fixups and optimizations */
+    if (is_r500) {
+        sampler->filter1 |= R500_BORDER_FIX;
+    }
+
+    return (void*)sampler;
+}
+
+/* Bind `count` sampler state objects.
+ *
+ * Requests for more samplers than the chip has texture units are ignored
+ * entirely. Otherwise the pointers are copied into the textures state and
+ * the textures atom is marked dirty. */
+static void r300_bind_sampler_states(struct pipe_context* pipe,
+                                     unsigned count,
+                                     void** states)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_textures_state* textures =
+        (struct r300_textures_state*)r300->textures_state.state;
+
+    if (count <= r300->screen->caps.num_tex_units) {
+        memcpy(textures->sampler_states, states, count * sizeof(void*));
+        textures->sampler_state_count = count;
+
+        r300->textures_state.dirty = TRUE;
+    }
+}
+
+static void r300_lacks_vertex_textures(struct pipe_context* pipe,
+                                       unsigned count,
+                                       void** states)
+{   /* Intentionally empty: all arguments are ignored. The signature
+     * matches r300_bind_sampler_states; presumably this is installed as
+     * the vertex sampler hook -- TODO confirm at the registration site. */
+}
+
+static void r300_delete_sampler_state(struct pipe_context* pipe, void* state)
+{   /* Sampler states come from a single CALLOC_STRUCT in
+     * r300_create_sampler_state, so one FREE suffices; `pipe` is unused. */
+    FREE(state);
+}
+
+/* Pick the texture cache region for texture number `index` out of `num`
+ * bound textures. */
+static uint32_t r300_assign_texture_cache_region(unsigned index, unsigned num)
+{
+    /* This looks like a hack, but I believe it's supposed to work like
+     * that. To illustrate, assume there are 5 textures. From the docs,
+     * the region codes starting at 5 are:
+     *
+     * FOURTH_1     = 5
+     * FOURTH_2     = 6
+     * FOURTH_3     = 7
+     * EIGHTH_0     = 8
+     * EIGHTH_1     = 9
+     *
+     * The first 3 textures get 3/4 of the cache, divided evenly between
+     * them. The remaining 1/4 must be shared by the last 2 textures, so
+     * each of those gets 1/8 of the cache. Hence simply using
+     * "5 + texture_index" does the right thing.
+     *
+     * This simple trick works for all "num" <= 16.
+     */
+    return (num > 1) ? R300_TX_CACHE(num + index)
+                     : R300_TX_CACHE(R300_TX_CACHE_WHOLE);
+}
+
+/* Bind `count` fragment sampler views.
+ *
+ * Requests for more views than the chip has texture units are ignored.
+ * Each slot whose view changes gets re-referenced; a new non-NULL view is
+ * also assigned a texture cache region and triggers a texture cache
+ * invalidation. Slots from `count` up to the number of texture units are
+ * unreferenced. */
+static void r300_set_fragment_sampler_views(struct pipe_context* pipe,
+                                            unsigned count,
+                                            struct pipe_sampler_view** views)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_textures_state* state =
+        (struct r300_textures_state*)r300->textures_state.state;
+    const unsigned tex_units = r300->screen->caps.num_tex_units;
+    unsigned i, bound_views = 0, next_region = 0;
+    boolean new_texture = FALSE;
+
+    if (count > tex_units) {
+        return;
+    }
+
+    /* Count the non-NULL views; the cache split depends on it. */
+    for (i = 0; i < count; i++) {
+        bound_views += views[i] ? 1 : 0;
+    }
+
+    for (i = 0; i < count; i++) {
+        /* Slot already holds this view: nothing to do. */
+        if (&state->sampler_views[i]->base == views[i]) {
+            continue;
+        }
+
+        pipe_sampler_view_reference(
+                (struct pipe_sampler_view**)&state->sampler_views[i],
+                views[i]);
+
+        /* The slot was just cleared. */
+        if (views[i] == NULL) {
+            continue;
+        }
+
+        /* A new sampler view (= texture)... */
+        new_texture = TRUE;
+
+        /* Set the texrect factor in the fragment shader.
+         * Needed for RECT and NPOT fallback. */
+        if (r300_texture(views[i]->texture)->uses_pitch) {
+            r300->fs_rc_constant_state.dirty = TRUE;
+        }
+
+        state->sampler_views[i]->texcache_region =
+            r300_assign_texture_cache_region(next_region, bound_views);
+        next_region++;
+    }
+
+    /* Release whatever is still bound beyond the new count. */
+    for (i = count; i < tex_units; i++) {
+        if (state->sampler_views[i]) {
+            pipe_sampler_view_reference(
+                    (struct pipe_sampler_view**)&state->sampler_views[i],
+                    NULL);
+        }
+    }
+
+    state->sampler_view_count = count;
+    r300->textures_state.dirty = TRUE;
+
+    if (new_texture) {
+        r300->texture_cache_inval.dirty = TRUE;
+    }
+}
+
+/* Create a sampler view of `texture` described by `templ`.
+ *
+ * The view starts with a reference count of one, holds its own reference
+ * on the texture, and carries the texture's hardware format combined with
+ * the format/swizzle requested by the template.
+ * Returns NULL if the view could not be allocated. */
+static struct pipe_sampler_view *
+r300_create_sampler_view(struct pipe_context *pipe,
+                         struct pipe_resource *texture,
+                         const struct pipe_sampler_view *templ)
+{
+    struct r300_sampler_view *view = CALLOC_STRUCT(r300_sampler_view);
+    struct r300_texture *tex = r300_texture(texture);
+
+    if (view == NULL) {
+        return NULL;
+    }
+
+    /* Generic part of the view. */
+    view->base = *templ;
+    view->base.reference.count = 1;
+    view->base.context = pipe;
+    /* Clear the pointer copied from the template before taking our own
+     * reference on the texture. */
+    view->base.texture = NULL;
+    pipe_resource_reference(&view->base.texture, texture);
+
+    /* Driver-specific part. */
+    view->swizzle[0] = templ->swizzle_r;
+    view->swizzle[1] = templ->swizzle_g;
+    view->swizzle[2] = templ->swizzle_b;
+    view->swizzle[3] = templ->swizzle_a;
+
+    view->format = tex->tx_format;
+    view->format.format1 |= r300_translate_texformat(templ->format,
+                                                     view->swizzle);
+    if (r300_screen(pipe->screen)->caps.is_r500) {
+        view->format.format2 |= r500_tx_format_msb_bit(templ->format);
+    }
+
+    return (struct pipe_sampler_view*)view;
+}
+
+static void
+r300_sampler_view_destroy(struct pipe_context *pipe,
+                          struct pipe_sampler_view *view)
+{  /* Drop the reference the view holds on its texture (taken in
+    * r300_create_sampler_view), then release the view itself. */
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+/* Copy the new scissor rectangle into the scissor atom and mark the atom
+ * dirty. */
+static void r300_set_scissor_state(struct pipe_context* pipe,
+                                   const struct pipe_scissor_state* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    r300->scissor_state.dirty = TRUE;
+    memcpy(r300->scissor_state.state, state, sizeof(*state));
+}
+
+/* Set the viewport transform.
+ *
+ * The pipe state is always cached in the context. When the Draw module is
+ * present the viewport is handed to it and only vte_control is updated.
+ * Otherwise each scale/offset that differs from identity is stored and
+ * enabled in vte_control, and the viewport atom is marked dirty. */
+static void r300_set_viewport_state(struct pipe_context* pipe,
+                                    const struct pipe_viewport_state* state)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_viewport_state* viewport =
+        (struct r300_viewport_state*)r300->viewport_state.state;
+
+    r300->viewport = *state;
+
+    if (r300->draw) {
+        /* The Draw module is in use: give it the viewport. */
+        draw_flush(r300->draw);
+        draw_set_viewport_state(r300->draw, state);
+        viewport->vte_control = R300_VTX_XY_FMT | R300_VTX_Z_FMT;
+        return;
+    }
+
+    /* Do the transform in HW. */
+    viewport->vte_control = R300_VTX_W0_FMT;
+
+    /* X axis. */
+    if (state->scale[0] != 1.0f) {
+        viewport->xscale = state->scale[0];
+        viewport->vte_control |= R300_VPORT_X_SCALE_ENA;
+    }
+    if (state->translate[0] != 0.0f) {
+        viewport->xoffset = state->translate[0];
+        viewport->vte_control |= R300_VPORT_X_OFFSET_ENA;
+    }
+
+    /* Y axis. */
+    if (state->scale[1] != 1.0f) {
+        viewport->yscale = state->scale[1];
+        viewport->vte_control |= R300_VPORT_Y_SCALE_ENA;
+    }
+    if (state->translate[1] != 0.0f) {
+        viewport->yoffset = state->translate[1];
+        viewport->vte_control |= R300_VPORT_Y_OFFSET_ENA;
+    }
+
+    /* Z axis. */
+    if (state->scale[2] != 1.0f) {
+        viewport->zscale = state->scale[2];
+        viewport->vte_control |= R300_VPORT_Z_SCALE_ENA;
+    }
+    if (state->translate[2] != 0.0f) {
+        viewport->zoffset = state->translate[2];
+        viewport->vte_control |= R300_VPORT_Z_OFFSET_ENA;
+    }
+
+    r300->viewport_state.dirty = TRUE;
+
+    /* A bound fragment shader that reads wpos also needs its RC constant
+     * state re-emitted. */
+    if (r300->fs.state && r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED) {
+        r300->fs_rc_constant_state.dirty = TRUE;
+    }
+}
+
+static void r300_set_vertex_buffers(struct pipe_context* pipe,
+                                    unsigned count,
+                                    const struct pipe_vertex_buffer* buffers)
+{   /* Bind `count` vertex buffers: validate the layout and compute the
+     * maximum usable index (HW TCL) or forward them to Draw (SW TCL), then
+     * take references and store a copy of the array in the context. */
+    struct r300_context* r300 = r300_context(pipe);
+    struct pipe_vertex_buffer *vbo;
+    unsigned i, max_index = (1 << 24) - 1;
+    boolean any_user_buffer = FALSE;
+
+    if (count == r300->vertex_buffer_count &&
+        memcmp(r300->vertex_buffer, buffers,
+            sizeof(struct pipe_vertex_buffer) * count) == 0) {
+        return; /* Same buffers as already bound: nothing to do. */
+    }
+
+    if (r300->screen->caps.has_tcl) {
+        /* HW TCL. */
+        r300->incompatible_vb_layout = FALSE;
+
+        /* Check if the strides and offsets are aligned to the size of DWORD.
+         * One misaligned buffer is enough to flag the whole layout. */
+        for (i = 0; i < count; i++) {
+            if (buffers[i].buffer) {
+                if (buffers[i].stride % 4 != 0 ||
+                    buffers[i].buffer_offset % 4 != 0) {
+                    r300->incompatible_vb_layout = TRUE;
+                    break;
+                }
+            }
+        }
+
+        for (i = 0; i < count; i++) {
+            /* Why, yes, I AM casting away constness. How did you know?
+             * NOTE: max_index below is written into the caller's array, and
+             * that modified value is what the memcpy at the end stores. */
+            vbo = (struct pipe_vertex_buffer*)&buffers[i];
+
+            /* Skip NULL buffers */
+            if (!buffers[i].buffer) {
+                continue;
+            }
+
+            if (r300_buffer_is_user_buffer(vbo->buffer)) {
+                any_user_buffer = TRUE;
+            }
+
+            if (vbo->max_index == ~0) {
+                /* if no VBO stride then only one vertex value so max index is 1 */
+                /* should think about converting to VS constants like svga does */
+                if (!vbo->stride)
+                    vbo->max_index = 1;
+                else
+                    vbo->max_index =
+                             (vbo->buffer->width0 - vbo->buffer_offset) / vbo->stride;
+            }
+
+            /* Keep the smallest limit over all bound buffers. */
+            max_index = MIN2(vbo->max_index, max_index);
+        }
+
+        r300->any_user_vbs = any_user_buffer;
+        r300->vertex_buffer_max_index = max_index;
+
+    } else {
+        /* SW TCL: the Draw module consumes the vertex buffers. */
+        draw_flush(r300->draw);
+        draw_set_vertex_buffers(r300->draw, count, buffers);
+    }
+
+    /* Common code. */
+    for (i = 0; i < count; i++) {
+        /* Reference our buffer. */
+        pipe_resource_reference(&r300->vertex_buffer[i].buffer, buffers[i].buffer);
+    }
+    for (; i < r300->vertex_buffer_count; i++) {
+        /* Dereference any old buffers. */
+        pipe_resource_reference(&r300->vertex_buffer[i].buffer, NULL);
+    }
+
+    memcpy(r300->vertex_buffer, buffers,
+        sizeof(struct pipe_vertex_buffer) * count);
+    r300->vertex_buffer_count = count;
+}
+
+/* Initialize the PSC tables.
+ *
+ * Two vertex elements share each vap_prog_stream_cntl(_ext) word: even
+ * elements occupy the low 16 bits, odd elements the high 16 bits. The
+ * element count is capped at 16, and the final element is tagged with
+ * R300_LAST_VEC. */
+static void r300_vertex_psc(struct r300_vertex_element_state *velems)
+{
+    struct r300_vertex_stream_state *vstream = &velems->vertex_stream;
+    uint16_t type, swizzle;
+    enum pipe_format format;
+    unsigned i, word, shift;
+
+    if (velems->count > 16) {
+        fprintf(stderr, "r300: More than 16 vertex elements are not supported,"
+                " requested %i, using 16.\n", velems->count);
+        velems->count = 16;
+    }
+
+    /* Vertex shaders have no semantics on their inputs,
+     * so PSC should just route stuff based on the vertex elements,
+     * and not on attrib information. */
+    for (i = 0; i < velems->count; i++) {
+        format = velems->hw_format[i];
+        word = i >> 1;
+        shift = (i & 1) ? 16 : 0;
+
+        type = r300_translate_vertex_data_type(format);
+        if (type == R300_INVALID_FORMAT) {
+            fprintf(stderr, "r300: Bad vertex format %s.\n",
+                    util_format_short_name(format));
+            assert(0);
+            abort();
+        }
+
+        type |= i << R300_DST_VEC_LOC_SHIFT;
+        swizzle = r300_translate_vertex_data_swizzle(format);
+
+        vstream->vap_prog_stream_cntl[word] |= type << shift;
+        vstream->vap_prog_stream_cntl_ext[word] |= swizzle << shift;
+    }
+
+    /* Set the last vector in the PSC. With no elements at all, slot 0 is
+     * tagged. */
+    i = i ? i - 1 : 0;
+    vstream->vap_prog_stream_cntl[i >> 1] |=
+        (R300_LAST_VEC << (i & 1 ? 16 : 0));
+
+    vstream->count = (i >> 1) + 1;
+}
+
+/* Case label helper for the format switch below: rewrites *format from
+ * PIPE_FORMAT_<what> to PIPE_FORMAT_<withwhat>. Expects a local
+ * `enum pipe_format *format` to be in scope. */
+#define FORMAT_REPLACE(what, withwhat) \
+    case PIPE_FORMAT_##what: *format = PIPE_FORMAT_##withwhat; break
+
+/* Create a vertex elements state object from `count` vertex elements.
+ *
+ * The elements are always copied. With HW TCL, each element additionally
+ * gets a hardware format (unsupported source formats are replaced by a
+ * float format of the same channel count), the PSC tables are set up, and
+ * the DWORD-aligned per-element sizes and the total vertex size are
+ * computed.
+ * Returns NULL if the allocation failed. */
+static void* r300_create_vertex_elements_state(struct pipe_context* pipe,
+                                               unsigned count,
+                                               const struct pipe_vertex_element* attribs)
+{
+    struct r300_vertex_element_state *velems;
+    enum pipe_format *format;
+    unsigned i;
+
+    assert(count <= PIPE_MAX_ATTRIBS);
+
+    velems = CALLOC_STRUCT(r300_vertex_element_state);
+    if (velems == NULL) {
+        return NULL;
+    }
+
+    velems->count = count;
+    memcpy(velems->velem, attribs, count * sizeof(struct pipe_vertex_element));
+
+    /* Without HW TCL, nothing below is set up. */
+    if (!r300_screen(pipe->screen)->caps.has_tcl) {
+        return velems;
+    }
+
+    /* Set the best hw format in case the original format is not
+     * supported by hw. */
+    for (i = 0; i < count; i++) {
+        format = &velems->hw_format[i];
+        *format = velems->velem[i].src_format;
+
+        /* This is basically the list of unsupported formats.
+         * For now we don't care about the alignment, that's going to
+         * be sorted out after the PSC setup. */
+        switch (*format) {
+            FORMAT_REPLACE(R64_FLOAT,           R32_FLOAT);
+            FORMAT_REPLACE(R64G64_FLOAT,        R32G32_FLOAT);
+            FORMAT_REPLACE(R64G64B64_FLOAT,     R32G32B32_FLOAT);
+            FORMAT_REPLACE(R64G64B64A64_FLOAT,  R32G32B32A32_FLOAT);
+
+            FORMAT_REPLACE(R32_UNORM,           R32_FLOAT);
+            FORMAT_REPLACE(R32G32_UNORM,        R32G32_FLOAT);
+            FORMAT_REPLACE(R32G32B32_UNORM,     R32G32B32_FLOAT);
+            FORMAT_REPLACE(R32G32B32A32_UNORM,  R32G32B32A32_FLOAT);
+
+            FORMAT_REPLACE(R32_USCALED,         R32_FLOAT);
+            FORMAT_REPLACE(R32G32_USCALED,      R32G32_FLOAT);
+            FORMAT_REPLACE(R32G32B32_USCALED,   R32G32B32_FLOAT);
+            FORMAT_REPLACE(R32G32B32A32_USCALED,R32G32B32A32_FLOAT);
+
+            FORMAT_REPLACE(R32_SNORM,           R32_FLOAT);
+            FORMAT_REPLACE(R32G32_SNORM,        R32G32_FLOAT);
+            FORMAT_REPLACE(R32G32B32_SNORM,     R32G32B32_FLOAT);
+            FORMAT_REPLACE(R32G32B32A32_SNORM,  R32G32B32A32_FLOAT);
+
+            FORMAT_REPLACE(R32_SSCALED,         R32_FLOAT);
+            FORMAT_REPLACE(R32G32_SSCALED,      R32G32_FLOAT);
+            FORMAT_REPLACE(R32G32B32_SSCALED,   R32G32B32_FLOAT);
+            FORMAT_REPLACE(R32G32B32A32_SSCALED,R32G32B32A32_FLOAT);
+
+            FORMAT_REPLACE(R32_FIXED,           R32_FLOAT);
+            FORMAT_REPLACE(R32G32_FIXED,        R32G32_FLOAT);
+            FORMAT_REPLACE(R32G32B32_FIXED,     R32G32B32_FLOAT);
+            FORMAT_REPLACE(R32G32B32A32_FIXED,  R32G32B32A32_FLOAT);
+
+            default:;
+        }
+
+        /* A replaced format or a misaligned source offset makes the
+         * whole layout incompatible. */
+        if (velems->velem[i].src_format != velems->hw_format[i] ||
+            velems->velem[i].src_offset % 4 != 0) {
+            velems->incompatible_layout = TRUE;
+        }
+    }
+
+    /* Now setup PSC.
+     * The unused components will be replaced by (..., 0, 1). */
+    r300_vertex_psc(velems);
+
+    /* Align the formats to the size of DWORD.
+     * We only care about the blocksizes of the formats since
+     * swizzles are already set up.
+     * Also compute the vertex size. */
+    for (i = 0; i < count; i++) {
+        /* This is OK because we check for aligned strides too. */
+        velems->hw_format_size[i] =
+            align(util_format_get_blocksize(velems->hw_format[i]), 4);
+        velems->vertex_size_dwords += velems->hw_format_size[i] / 4;
+    }
+
+    return velems;
+}
+
+/* Bind a vertex elements state object; a NULL state is ignored.
+ *
+ * When the Draw module is present the elements are handed to it;
+ * otherwise the vertex stream atom is updated and resized. */
+static void r300_bind_vertex_elements_state(struct pipe_context *pipe,
+                                            void *state)
+{
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_vertex_element_state *velems = state;
+
+    if (!velems) {
+        return;
+    }
+
+    r300->velems = velems;
+
+    if (r300->draw) {
+        draw_flush(r300->draw);
+        draw_set_vertex_elements(r300->draw, velems->count, velems->velem);
+    } else {
+        UPDATE_STATE(&velems->vertex_stream, r300->vertex_stream_state);
+        r300->vertex_stream_state.size =
+            (1 + velems->vertex_stream.count) * 2;
+    }
+}
+
+static void r300_delete_vertex_elements_state(struct pipe_context *pipe, void *state)
+{  /* The state is a single CALLOC_STRUCT allocation made in
+    * r300_create_vertex_elements_state; `pipe` is unused. */
+   FREE(state);
+}
+
+/* Create vertex shader state.
+ *
+ * Allocates a zeroed r300_vertex_shader, copies the pipe_shader_state into
+ * it and duplicates the TGSI tokens (released in r300_delete_vs_state).
+ * With HW TCL the shader outputs are initialized and the shader is
+ * translated; otherwise the shader is set up through the Draw module.
+ *
+ * Returns the new shader object, or NULL if the allocation failed. */
+static void* r300_create_vs_state(struct pipe_context* pipe,
+                                  const struct pipe_shader_state* shader)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    struct r300_vertex_shader* vs = CALLOC_STRUCT(r300_vertex_shader);
+
+    /* CALLOC_STRUCT can fail; never write through a NULL shader. */
+    if (vs == NULL) {
+        return NULL;
+    }
+
+    /* Copy state directly into shader. */
+    vs->state = *shader;
+    vs->state.tokens = tgsi_dup_tokens(shader->tokens);
+
+    if (r300->screen->caps.has_tcl) {
+        r300_init_vs_outputs(vs);
+        r300_translate_vertex_shader(r300, vs);
+    } else {
+        r300_draw_init_vertex_shader(r300->draw, vs);
+    }
+
+    return vs;
+}
+
+/* Bind vertex shader state.
+ *
+ * Binding NULL just clears the bound shader; rebinding the shader that is
+ * already bound does nothing. Otherwise the RS block is marked dirty and
+ * either the HW TCL atoms are updated or the shader is bound in the Draw
+ * module. */
+static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
+    unsigned immediates_size;
+
+    /* Already bound. */
+    if (vs != NULL && vs == r300->vs_state.state) {
+        return;
+    }
+
+    r300->vs_state.state = vs;
+    if (vs == NULL) {
+        return;
+    }
+
+    /* The majority of the RS block bits is dependent on the vertex shader. */
+    r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
+
+    if (!r300->screen->caps.has_tcl) {
+        draw_flush(r300->draw);
+        draw_bind_vertex_shader(r300->draw,
+                (struct draw_vertex_shader*)vs->draw_vs);
+        return;
+    }
+
+    /* Immediates only take space when the shader has any. */
+    immediates_size = vs->immediates_count ? vs->immediates_count * 4 + 3 : 0;
+
+    r300->vs_state.dirty = TRUE;
+    r300->vs_state.size = vs->code.length + 9 + immediates_size;
+
+    if (vs->externals_count) {
+        r300->vs_constants.dirty = TRUE;
+        r300->vs_constants.size = vs->externals_count * 4 + 3;
+    } else {
+        r300->vs_constants.size = 0;
+    }
+
+    r300->pvs_flush.dirty = TRUE;
+}
+
+/* Delete vertex shader state.
+ *
+ * Releases the path-specific part first (the Draw shader without HW TCL,
+ * the compiled constants with it), then the duplicated TGSI tokens and
+ * the shader object. */
+static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
+
+    if (!r300->screen->caps.has_tcl) {
+        draw_delete_vertex_shader(r300->draw,
+                (struct draw_vertex_shader*)vs->draw_vs);
+    } else {
+        rc_constants_destroy(&vs->code.constants);
+    }
+
+    FREE((void*)vs->state.tokens);
+    FREE(shader);
+}
+
+/* Upload the contents of constant buffer `buf` for the given shader stage.
+ *
+ * The buffer is mapped for reading. For the fragment stage, and for the
+ * vertex stage with HW TCL, its contents are copied into the matching
+ * r300_constant_buffer, clamped to the per-stage maximum and converted to
+ * float24 for fragment shaders on non-R500 chips. Without HW TCL the
+ * vertex constants are handed to the Draw module instead. A NULL, empty or
+ * unmappable buffer just resets the constant count to zero. */
+static void r300_set_constant_buffer(struct pipe_context *pipe,
+                                     uint shader, uint index,
+                                     struct pipe_resource *buf)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    struct r300_constant_buffer *cbuf;
+    struct pipe_transfer *tr;
+    float *mapped;
+    int max_size = 0, max_size_bytes = 0, clamped_size = 0;
+
+    /* Select the destination and its capacity (in 4-float vectors). */
+    if (shader == PIPE_SHADER_VERTEX) {
+        cbuf = (struct r300_constant_buffer*)r300->vs_constants.state;
+        max_size = 256;
+    } else if (shader == PIPE_SHADER_FRAGMENT) {
+        cbuf = (struct r300_constant_buffer*)r300->fs_constants.state;
+        max_size = r300->screen->caps.is_r500 ? 256 : 32;
+    } else {
+        assert(0);
+        return;
+    }
+    max_size_bytes = max_size * 4 * sizeof(float);
+
+    /* Nothing to upload. */
+    if (buf == NULL || buf->width0 == 0) {
+        cbuf->count = 0;
+        return;
+    }
+
+    mapped = pipe_buffer_map(pipe, buf, PIPE_TRANSFER_READ, &tr);
+    if (mapped == NULL) {
+        cbuf->count = 0;
+        return;
+    }
+
+    if (shader == PIPE_SHADER_FRAGMENT ||
+        (shader == PIPE_SHADER_VERTEX && r300->screen->caps.has_tcl)) {
+        assert((buf->width0 % (4 * sizeof(float))) == 0);
+
+        /* Check the size of the constant buffer. */
+        /* XXX Subtract immediates and RC_STATE_* variables. */
+        if (buf->width0 > max_size_bytes) {
+            fprintf(stderr, "r300: Max size of the constant buffer is "
+                          "%i*4 floats.\n", max_size);
+        }
+
+        clamped_size = MIN2(buf->width0, max_size_bytes);
+        cbuf->count = clamped_size / (4 * sizeof(float));
+
+        if (shader != PIPE_SHADER_FRAGMENT || r300->screen->caps.is_r500) {
+            memcpy(cbuf->constants, mapped, clamped_size);
+        } else {
+            unsigned vec, chan;
+
+            /* Convert constants to float24. */
+            for (vec = 0; vec < cbuf->count; vec++) {
+                for (chan = 0; chan < 4; chan++) {
+                    cbuf->constants[vec][chan] =
+                        pack_float24(mapped[vec*4+chan]);
+                }
+            }
+        }
+    }
+
+    if (shader == PIPE_SHADER_FRAGMENT) {
+        r300->fs_constants.dirty = TRUE;
+    } else if (shader == PIPE_SHADER_VERTEX) {
+        if (r300->screen->caps.has_tcl) {
+            if (r300->vs_constants.size) {
+                r300->vs_constants.dirty = TRUE;
+            }
+            r300->pvs_flush.dirty = TRUE;
+        } else if (r300->draw) {
+            draw_set_mapped_constant_buffer(r300->draw, PIPE_SHADER_VERTEX,
+                0, mapped, buf->width0);
+        }
+    }
+
+    pipe_buffer_unmap(pipe, buf, tr);
+}
+
+/* Install the state-related callbacks of this file into the context's
+ * function table. */
+void r300_init_state_functions(struct r300_context* r300)
+{
+    /* Constant state objects: create / bind / delete hooks. */
+    r300->context.create_blend_state = r300_create_blend_state;
+    r300->context.bind_blend_state = r300_bind_blend_state;
+    r300->context.delete_blend_state = r300_delete_blend_state;
+
+    r300->context.create_depth_stencil_alpha_state = r300_create_dsa_state;
+    r300->context.bind_depth_stencil_alpha_state = r300_bind_dsa_state;
+    r300->context.delete_depth_stencil_alpha_state = r300_delete_dsa_state;
+
+    r300->context.create_rasterizer_state = r300_create_rs_state;
+    r300->context.bind_rasterizer_state = r300_bind_rs_state;
+    r300->context.delete_rasterizer_state = r300_delete_rs_state;
+
+    r300->context.create_fs_state = r300_create_fs_state;
+    r300->context.bind_fs_state = r300_bind_fs_state;
+    r300->context.delete_fs_state = r300_delete_fs_state;
+
+    r300->context.create_vs_state = r300_create_vs_state;
+    r300->context.bind_vs_state = r300_bind_vs_state;
+    r300->context.delete_vs_state = r300_delete_vs_state;
+
+    r300->context.create_vertex_elements_state = r300_create_vertex_elements_state;
+    r300->context.bind_vertex_elements_state = r300_bind_vertex_elements_state;
+    r300->context.delete_vertex_elements_state = r300_delete_vertex_elements_state;
+
+    r300->context.create_sampler_state = r300_create_sampler_state;
+    r300->context.bind_fragment_sampler_states = r300_bind_sampler_states;
+    r300->context.bind_vertex_sampler_states = r300_lacks_vertex_textures;
+    r300->context.delete_sampler_state = r300_delete_sampler_state;
+
+    r300->context.create_sampler_view = r300_create_sampler_view;
+    r300->context.sampler_view_destroy = r300_sampler_view_destroy;
+
+    /* Setters for state that is not wrapped in a state object. */
+    r300->context.set_blend_color = r300_set_blend_color;
+    r300->context.set_stencil_ref = r300_set_stencil_ref;
+    r300->context.set_sample_mask = r300_set_sample_mask;
+    r300->context.set_clip_state = r300_set_clip_state;
+    r300->context.set_polygon_stipple = r300_set_polygon_stipple;
+    r300->context.set_scissor_state = r300_set_scissor_state;
+    r300->context.set_viewport_state = r300_set_viewport_state;
+    r300->context.set_framebuffer_state = r300_set_framebuffer_state;
+    r300->context.set_constant_buffer = r300_set_constant_buffer;
+    r300->context.set_fragment_sampler_views = r300_set_fragment_sampler_views;
+    r300->context.set_vertex_buffers = r300_set_vertex_buffers;
+}
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
new file mode 100644
index 0000000000..3aa8deb63c
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "draw/draw_context.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "r300_context.h"
+#include "r300_fs.h"
+#include "r300_hyperz.h"
+#include "r300_screen.h"
+#include "r300_shader_semantics.h"
+#include "r300_state_derived.h"
+#include "r300_state_inlines.h"
+#include "r300_texture.h"
+#include "r300_vs.h"
+
+/* r300_state_derived: Various bits of state which are dependent upon
+ * currently bound CSO data. */
+
+/* Swizzles understood by the rX00_rs_col() / rX00_rs_tex() helpers below.
+ * The texcoord helpers only distinguish X001 and XY01; every other value
+ * is routed as XYZW. The color helpers only distinguish 0001. */
+enum r300_rs_swizzle {
+    SWIZ_XYZW = 0, /* all four source components are passed through */
+    SWIZ_X001,     /* source X, then the K0, K0, K1 selects (presumably 0, 0, 1) */
+    SWIZ_XY01,     /* source X and Y, then the K0, K1 selects (presumably 0, 1) */
+    SWIZ_0001,     /* selects the *_RS_COL_FMT_0001 color format */
+};
+
+/* Append one output of the bound vertex shader to r300->vertex_info.
+ *
+ * "index" selects an entry of the shader's output semantic arrays; the
+ * semantic name/index pair found there is looked up in Draw to obtain the
+ * shader output that is then emitted with the given "emit" format and
+ * "interp" mode. */
+static void r300_draw_emit_attrib(struct r300_context* r300,
+                                  enum attrib_emit emit,
+                                  enum interp_mode interp,
+                                  int index)
+{
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    int slot = draw_find_shader_output(r300->draw,
+                                       vs->info.output_semantic_name[index],
+                                       vs->info.output_semantic_index[index]);
+
+    draw_emit_vertex_attr(&r300->vertex_info, emit, interp, slot);
+}
+
+/* Emit the outputs of the bound vertex shader as Draw vertex attributes.
+ *
+ * Attributes are emitted in a fixed order: position, point size, colors,
+ * back-face colors, generics, fog and finally WPOS. Generics, fog and WPOS
+ * share a budget of 8 attributes. */
+static void r300_draw_emit_all_attribs(struct r300_context* r300)
+{
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    struct r300_shader_semantics* outs = &vs->outputs;
+    int i, gen_count = 0;
+
+    /* Position; a vertex shader without one is not expected. */
+    if (outs->pos == ATTR_UNUSED) {
+        assert(0);
+    } else {
+        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, outs->pos);
+    }
+
+    /* Point size. */
+    if (outs->psize != ATTR_UNUSED) {
+        r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS, outs->psize);
+    }
+
+    /* Front-face colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        if (outs->color[i] == ATTR_UNUSED)
+            continue;
+
+        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR, outs->color[i]);
+    }
+
+    /* Back-face colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        if (outs->bcolor[i] == ATTR_UNUSED)
+            continue;
+
+        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR, outs->bcolor[i]);
+    }
+
+    /* Texture coordinates (generics). Only 8 generic vertex attributes
+     * can be used; any beyond that won't be rasterized. */
+    for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) {
+        if (outs->generic[i] == ATTR_UNUSED)
+            continue;
+
+        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
+                              outs->generic[i]);
+        gen_count++;
+    }
+
+    /* Fog coordinates, if there is still room. */
+    if (gen_count < 8 && outs->fog != ATTR_UNUSED) {
+        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, outs->fog);
+        gen_count++;
+    }
+
+    /* WPOS, only when the fragment shader reads it and there is room. */
+    if (r300_fs(r300)->shader->inputs.wpos != ATTR_UNUSED && gen_count < 8) {
+        DBG(r300, DBG_DRAW, "draw_emit_attrib: WPOS, index: %i\n",
+            outs->wpos);
+        r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE, outs->wpos);
+    }
+}
+
+/* Rebuild the PSC tables (vap_prog_stream_cntl and its _ext companion) for
+ * SW TCL from the vertex layout computed by Draw.
+ *
+ * Two attributes are packed into every table entry: even attributes go
+ * into the low 16 bits, odd attributes into the high 16 bits. */
+static void r300_swtcl_vertex_psc(struct r300_context *r300)
+{
+    struct r300_vertex_stream_state *vstream = r300->vertex_stream_state.state;
+    struct vertex_info *vinfo = &r300->vertex_info;
+    int* vs_output_tab = r300->stream_loc_notcl;
+    unsigned i, last, attrib_count = vinfo->num_attribs;
+
+    memset(vstream, 0, sizeof(struct r300_vertex_stream_state));
+
+    /* For each Draw attribute, route it to the fragment shader according
+     * to the vs_output_tab. */
+    DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
+    for (i = 0; i < attrib_count; i++) {
+        enum pipe_format format;
+        uint16_t type, swizzle;
+        unsigned entry = i >> 1;
+        unsigned shift = (i & 1) ? 16 : 0;
+
+        DBG(r300, DBG_DRAW, "r300: attrib: index %d, interp %d, emit %d,"
+               " vs_output_tab %d\n", vinfo->attrib[i].src_index,
+               vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
+               vs_output_tab[i]);
+
+        /* Every attribute needs a valid destination. */
+        assert(vs_output_tab[i] != -1);
+
+        format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+
+        /* Data type of the attribute; unknown formats are fatal. */
+        type = r300_translate_vertex_data_type(format);
+        if (type == R300_INVALID_FORMAT) {
+            fprintf(stderr, "r300: Bad vertex format %s.\n",
+                    util_format_short_name(format));
+            assert(0);
+            abort();
+        }
+
+        type |= vs_output_tab[i] << R300_DST_VEC_LOC_SHIFT;
+
+        /* The swizzle is always set explicitly, because the default
+         * swizzle in the hardware is not XYZW! */
+        swizzle = r300_translate_vertex_data_swizzle(format);
+
+        /* Store the attribute in its half of the PSC entry. */
+        vstream->vap_prog_stream_cntl[entry] |= type << shift;
+        vstream->vap_prog_stream_cntl_ext[entry] |= swizzle << shift;
+    }
+
+    /* Flag the last vector in the PSC (attribute 0 if there are none). */
+    last = attrib_count ? attrib_count - 1 : 0;
+    vstream->vap_prog_stream_cntl[last >> 1] |=
+        (R300_LAST_VEC << (last & 1 ? 16 : 0));
+
+    vstream->count = (last >> 1) + 1;
+    r300->vertex_stream_state.dirty = TRUE;
+    r300->vertex_stream_state.size = (1 + vstream->count) * 2;
+}
+
+/* Non-R500 variant: route color "ptr" through rasterizer slot "id".
+ * SWIZ_0001 selects the 0001 color format, anything else RGBA. */
+static void r300_rs_col(struct r300_rs_block* rs, int id, int ptr,
+                        enum r300_rs_swizzle swiz)
+{
+    int fmt = R300_RS_COL_FMT_RGBA;
+
+    if (swiz == SWIZ_0001) {
+        fmt = R300_RS_COL_FMT_0001;
+    }
+
+    rs->ip[id] |= R300_RS_COL_PTR(ptr);
+    rs->ip[id] |= R300_RS_COL_FMT(fmt);
+    rs->inst[id] |= R300_RS_INST_COL_ID(id);
+}
+
+/* Non-R500 variant: make rasterizer instruction "id" write its color to
+ * the fragment shader input at "fp_offset". */
+static void r300_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+    rs->inst[id] |= R300_RS_INST_COL_ADDR(fp_offset) |
+                    R300_RS_INST_COL_CN_WRITE;
+}
+
+/* Non-R500 variant: route texcoord "ptr" through rasterizer slot "id".
+ * S always comes from the first source component; T, R and Q depend on
+ * the requested swizzle. */
+static void r300_rs_tex(struct r300_rs_block* rs, int id, int ptr,
+                        enum r300_rs_swizzle swiz)
+{
+    /* Common to all swizzles. */
+    rs->ip[id] |= R300_RS_TEX_PTR(ptr*4) |
+                  R300_RS_SEL_S(R300_RS_SEL_C0);
+
+    switch (swiz) {
+        case SWIZ_X001:
+            rs->ip[id] |= R300_RS_SEL_T(R300_RS_SEL_K0) |
+                          R300_RS_SEL_R(R300_RS_SEL_K0) |
+                          R300_RS_SEL_Q(R300_RS_SEL_K1);
+            break;
+        case SWIZ_XY01:
+            rs->ip[id] |= R300_RS_SEL_T(R300_RS_SEL_C1) |
+                          R300_RS_SEL_R(R300_RS_SEL_K0) |
+                          R300_RS_SEL_Q(R300_RS_SEL_K1);
+            break;
+        default:
+            /* Everything else is passed through as XYZW. */
+            rs->ip[id] |= R300_RS_SEL_T(R300_RS_SEL_C1) |
+                          R300_RS_SEL_R(R300_RS_SEL_C2) |
+                          R300_RS_SEL_Q(R300_RS_SEL_C3);
+            break;
+    }
+
+    rs->inst[id] |= R300_RS_INST_TEX_ID(id);
+}
+
+/* Non-R500 variant: make rasterizer instruction "id" write its texcoord
+ * to the fragment shader input at "fp_offset". */
+static void r300_rs_tex_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+    rs->inst[id] |= R300_RS_INST_TEX_ADDR(fp_offset) |
+                    R300_RS_INST_TEX_CN_WRITE;
+}
+
+/* R500 variant: route color "ptr" through rasterizer slot "id".
+ * SWIZ_0001 selects the 0001 color format, anything else RGBA. */
+static void r500_rs_col(struct r300_rs_block* rs, int id, int ptr,
+                        enum r300_rs_swizzle swiz)
+{
+    int fmt = R300_RS_COL_FMT_RGBA;
+
+    if (swiz == SWIZ_0001) {
+        fmt = R300_RS_COL_FMT_0001;
+    }
+
+    rs->ip[id] |= R500_RS_COL_PTR(ptr);
+    rs->ip[id] |= R500_RS_COL_FMT(fmt);
+    rs->inst[id] |= R500_RS_INST_COL_ID(id);
+}
+
+/* R500 variant: make rasterizer instruction "id" write its color to the
+ * fragment shader input at "fp_offset". */
+static void r500_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+    rs->inst[id] |= R500_RS_INST_COL_ADDR(fp_offset) |
+                    R500_RS_INST_COL_CN_WRITE;
+}
+
+/* R500 variant: route texcoord "ptr" through rasterizer slot "id".
+ * Components are addressed individually, starting at ptr*4. S always
+ * comes from the first component; T, R and Q depend on the swizzle. */
+static void r500_rs_tex(struct r300_rs_block* rs, int id, int ptr,
+                        enum r300_rs_swizzle swiz)
+{
+    int base = ptr*4;
+
+    /* Common to all swizzles. */
+    rs->ip[id] |= R500_RS_SEL_S(base);
+
+    switch (swiz) {
+        case SWIZ_X001:
+            rs->ip[id] |= R500_RS_SEL_T(R500_RS_IP_PTR_K0) |
+                          R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
+                          R500_RS_SEL_Q(R500_RS_IP_PTR_K1);
+            break;
+        case SWIZ_XY01:
+            rs->ip[id] |= R500_RS_SEL_T(base + 1) |
+                          R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
+                          R500_RS_SEL_Q(R500_RS_IP_PTR_K1);
+            break;
+        default:
+            /* Everything else is passed through as XYZW. */
+            rs->ip[id] |= R500_RS_SEL_T(base + 1) |
+                          R500_RS_SEL_R(base + 2) |
+                          R500_RS_SEL_Q(base + 3);
+            break;
+    }
+
+    rs->inst[id] |= R500_RS_INST_TEX_ID(id);
+}
+
+/* R500 variant: make rasterizer instruction "id" write its texcoord to
+ * the fragment shader input at "fp_offset". */
+static void r500_rs_tex_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+    rs->inst[id] |= R500_RS_INST_TEX_ADDR(fp_offset) |
+                    R500_RS_INST_TEX_CN_WRITE;
+}
+
+/* Set up the RS block.
+ *
+ * This is the part of the chipset that is responsible for linking vertex
+ * and fragment shaders and stuffed texture coordinates.
+ *
+ * The rasterizer reads data from VAP, which produces vertex shader outputs,
+ * and GA, which produces stuffed texture coordinates. VAP outputs have
+ * precedence over GA. All outputs must be rasterized otherwise it locks up.
+ * If there are more outputs rasterized than is set in VAP/GA, it locks up
+ * too. The funky part is that this info has been pretty much obtained by trial
+ * and error.
+ *
+ * Besides building a fresh r300_rs_block, this function fills
+ * r300->stream_loc_notcl with the VAP stream location of every attribute
+ * set up in VAP (in emission order) and -1 for the remaining entries.
+ * The new block is copied into r300->rs_block_state.state, and the atom
+ * size is recomputed, only if it differs from the stored one. */
+static void r300_update_rs_block(struct r300_context *r300)
+{
+    struct r300_vertex_shader *vs = r300->vs_state.state;
+    struct r300_shader_semantics *vs_outputs = &vs->outputs;
+    struct r300_shader_semantics *fs_inputs = &r300_fs(r300)->shader->inputs;
+    struct r300_rs_block rs = {0};
+    int i, col_count = 0, tex_count = 0, fp_offset = 0, count, loc = 0;
+    void (*rX00_rs_col)(struct r300_rs_block*, int, int, enum r300_rs_swizzle);
+    void (*rX00_rs_col_write)(struct r300_rs_block*, int, int);
+    void (*rX00_rs_tex)(struct r300_rs_block*, int, int, enum r300_rs_swizzle);
+    void (*rX00_rs_tex_write)(struct r300_rs_block*, int, int);
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
+    int *stream_loc_notcl = r300->stream_loc_notcl;
+
+    /* The register encoding differs between R500 and older chips. */
+    if (r300->screen->caps.is_r500) {
+        rX00_rs_col       = r500_rs_col;
+        rX00_rs_col_write = r500_rs_col_write;
+        rX00_rs_tex       = r500_rs_tex;
+        rX00_rs_tex_write = r500_rs_tex_write;
+    } else {
+        rX00_rs_col       = r300_rs_col;
+        rX00_rs_col_write = r300_rs_col_write;
+        rX00_rs_tex       = r300_rs_tex;
+        rX00_rs_tex_write = r300_rs_tex_write;
+    }
+
+    /* The position is always present in VAP. */
+    rs.vap_vsm_vtx_assm |= R300_INPUT_CNTL_POS;
+    rs.vap_out_vtx_fmt[0] |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
+    stream_loc_notcl[loc++] = 0;
+
+    /* Set up the point size in VAP. */
+    if (vs_outputs->psize != ATTR_UNUSED) {
+        rs.vap_out_vtx_fmt[0] |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
+        stream_loc_notcl[loc++] = 1;
+    }
+
+    /* Set up and rasterize colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        /* A color is also set up when any back-face color or color 1 is
+         * written, even if the vertex shader does not write it itself. */
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+            vs_outputs->color[1] != ATTR_UNUSED) {
+            /* Set up the color in VAP. */
+            rs.vap_vsm_vtx_assm |= R300_INPUT_CNTL_COLOR;
+            rs.vap_out_vtx_fmt[0] |=
+                    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i;
+            stream_loc_notcl[loc++] = 2 + i;
+
+            /* Rasterize it. */
+            rX00_rs_col(&rs, col_count, col_count, SWIZ_XYZW);
+
+            /* Write it to the FS input register if it's needed by the FS. */
+            if (fs_inputs->color[i] != ATTR_UNUSED) {
+                rX00_rs_col_write(&rs, col_count, fp_offset);
+                fp_offset++;
+
+                DBG(r300, DBG_RS,
+                    "r300: Rasterized color %i written to FS.\n", i);
+            } else {
+                DBG(r300, DBG_RS, "r300: Rasterized color %i unused.\n", i);
+            }
+            col_count++;
+        } else {
+            /* Skip the FS input register, leave it uninitialized. */
+            /* If we try to set it to (0,0,0,1), it will lock up. */
+            if (fs_inputs->color[i] != ATTR_UNUSED) {
+                fp_offset++;
+
+                /* Only "i" is passed, so the format must not contain any
+                 * further conversion. */
+                DBG(r300, DBG_RS, "r300: FS input color %i unassigned.\n",
+                    i);
+            }
+        }
+    }
+
+    /* Set up back-face colors. The rasterizer will do the color selection
+     * automatically. */
+    if (any_bcolor_used) {
+        if (r300->two_sided_color) {
+            /* Rasterize as back-face colors. */
+            for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+                rs.vap_vsm_vtx_assm |= R300_INPUT_CNTL_COLOR;
+                rs.vap_out_vtx_fmt[0] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << (2+i);
+                stream_loc_notcl[loc++] = 4 + i;
+            }
+        } else {
+            /* Rasterize two fake texcoords to prevent the two-sided color
+             * selection. */
+            /* XXX Consider recompiling the vertex shader to save 2 RS units. */
+            for (i = 0; i < 2; i++) {
+                rs.vap_vsm_vtx_assm |= (R300_INPUT_CNTL_TC0 << tex_count);
+                rs.vap_out_vtx_fmt[1] |= (4 << (3 * tex_count));
+                stream_loc_notcl[loc++] = 6 + tex_count;
+
+                /* Rasterize it. */
+                rX00_rs_tex(&rs, tex_count, tex_count, SWIZ_XYZW);
+                tex_count++;
+            }
+        }
+    }
+
+    /* Rasterize texture coordinates. */
+    for (i = 0; i < ATTR_GENERIC_COUNT && tex_count < 8; i++) {
+        bool sprite_coord = !!(r300->sprite_coord_enable & (1 << i));
+
+        if (vs_outputs->generic[i] != ATTR_UNUSED || sprite_coord) {
+            /* Sprite coordinates are not set up in VAP and get no stream
+             * location. */
+            if (!sprite_coord) {
+                /* Set up the texture coordinates in VAP. */
+                rs.vap_vsm_vtx_assm |= (R300_INPUT_CNTL_TC0 << tex_count);
+                rs.vap_out_vtx_fmt[1] |= (4 << (3 * tex_count));
+                stream_loc_notcl[loc++] = 6 + tex_count;
+            }
+
+            /* Rasterize it. */
+            rX00_rs_tex(&rs, tex_count, tex_count,
+                        sprite_coord ? SWIZ_XY01 : SWIZ_XYZW);
+
+            /* Write it to the FS input register if it's needed by the FS. */
+            if (fs_inputs->generic[i] != ATTR_UNUSED) {
+                rX00_rs_tex_write(&rs, tex_count, fp_offset);
+                fp_offset++;
+
+                DBG(r300, DBG_RS,
+                    "r300: Rasterized generic %i written to FS%s.\n",
+                    i, sprite_coord ? " (sprite coord)" : "");
+            } else {
+                DBG(r300, DBG_RS,
+                    "r300: Rasterized generic %i unused%s.\n",
+                    i, sprite_coord ? " (sprite coord)" : "");
+            }
+            tex_count++;
+        } else {
+            /* Skip the FS input register, leave it uninitialized. */
+            /* If we try to set it to (0,0,0,1), it will lock up. */
+            if (fs_inputs->generic[i] != ATTR_UNUSED) {
+                fp_offset++;
+
+                DBG(r300, DBG_RS, "r300: FS input generic %i unassigned%s.\n",
+                    i, sprite_coord ? " (sprite coord)" : "");
+            }
+        }
+    }
+
+    /* Rasterize fog coordinates. */
+    if (vs_outputs->fog != ATTR_UNUSED && tex_count < 8) {
+        /* Set up the fog coordinates in VAP. */
+        rs.vap_vsm_vtx_assm |= (R300_INPUT_CNTL_TC0 << tex_count);
+        rs.vap_out_vtx_fmt[1] |= (4 << (3 * tex_count));
+        stream_loc_notcl[loc++] = 6 + tex_count;
+
+        /* Rasterize it. */
+        rX00_rs_tex(&rs, tex_count, tex_count, SWIZ_X001);
+
+        /* Write it to the FS input register if it's needed by the FS. */
+        if (fs_inputs->fog != ATTR_UNUSED) {
+            rX00_rs_tex_write(&rs, tex_count, fp_offset);
+            fp_offset++;
+
+            DBG(r300, DBG_RS, "r300: Rasterized fog written to FS.\n");
+        } else {
+            DBG(r300, DBG_RS, "r300: Rasterized fog unused.\n");
+        }
+        tex_count++;
+    } else {
+        /* Skip the FS input register, leave it uninitialized. */
+        /* If we try to set it to (0,0,0,1), it will lock up. */
+        if (fs_inputs->fog != ATTR_UNUSED) {
+            fp_offset++;
+
+            DBG(r300, DBG_RS, "r300: FS input fog unassigned.\n");
+        }
+    }
+
+    /* Rasterize WPOS. */
+    /* Don't set it in VAP if the FS doesn't need it. */
+    if (fs_inputs->wpos != ATTR_UNUSED && tex_count < 8) {
+        /* Set up the WPOS coordinates in VAP. */
+        rs.vap_vsm_vtx_assm |= (R300_INPUT_CNTL_TC0 << tex_count);
+        rs.vap_out_vtx_fmt[1] |= (4 << (3 * tex_count));
+        stream_loc_notcl[loc++] = 6 + tex_count;
+
+        /* Rasterize it. */
+        rX00_rs_tex(&rs, tex_count, tex_count, SWIZ_XYZW);
+
+        /* Write it to the FS input register. */
+        rX00_rs_tex_write(&rs, tex_count, fp_offset);
+
+        DBG(r300, DBG_RS, "r300: Rasterized WPOS written to FS.\n");
+
+        fp_offset++;
+        tex_count++;
+    }
+
+    /* Invalidate the rest of the no-TCL (GA) stream locations. */
+    for (; loc < 16;) {
+        stream_loc_notcl[loc++] = -1;
+    }
+
+    /* Rasterize at least one color, or bad things happen. */
+    if (col_count == 0 && tex_count == 0) {
+        rX00_rs_col(&rs, 0, 0, SWIZ_0001);
+        col_count++;
+
+        DBG(r300, DBG_RS, "r300: Rasterized color 0 to prevent lockups.\n");
+    }
+
+    DBG(r300, DBG_RS, "r300: --- Rasterizer status ---: colors: %i, "
+        "generics: %i.\n", col_count, tex_count);
+
+    rs.count = (tex_count*4) | (col_count << R300_IC_COUNT_SHIFT) |
+        R300_HIRES_EN;
+
+    /* inst_count is stored as the number of instructions minus one. */
+    count = MAX3(col_count, tex_count, 1);
+    rs.inst_count = count - 1;
+
+    /* Now, after all that, see if we actually need to update the state. */
+    if (memcmp(r300->rs_block_state.state, &rs, sizeof(struct r300_rs_block))) {
+        memcpy(r300->rs_block_state.state, &rs, sizeof(struct r300_rs_block));
+        r300->rs_block_state.size = 11 + count*2;
+    }
+}
+
+/* Combine the bound sampler views and sampler states into per-unit
+ * texture register state.
+ *
+ * For every unit that has both a view and a sampler state, the merged
+ * registers are written to state->regs[i] and the unit is enabled in
+ * state->tx_enable. state->count ends up as the index of the last set-up
+ * unit plus one, and r300->textures_state.size as 2 plus 16 per set-up
+ * unit. On non-R500 chips unit 0 is always set up (see the KIL comments
+ * below). Finally the fragment shader variant is re-picked. */
+static void r300_merge_textures_and_samplers(struct r300_context* r300)
+{
+    struct r300_textures_state *state =
+        (struct r300_textures_state*)r300->textures_state.state;
+    struct r300_texture_sampler_state *texstate;
+    struct r300_sampler_state *sampler;
+    struct r300_sampler_view *view;
+    struct r300_texture *tex;
+    unsigned min_level, max_level, i, size;
+    /* Only units that can have both a view and a sampler state are
+     * visited. */
+    unsigned count = MIN2(state->sampler_view_count,
+                          state->sampler_state_count);
+    /* Source swizzle used for depth/stencil textures: X for every
+     * component. */
+    unsigned char depth_swizzle[4] = {
+        UTIL_FORMAT_SWIZZLE_X,
+        UTIL_FORMAT_SWIZZLE_X,
+        UTIL_FORMAT_SWIZZLE_X,
+        UTIL_FORMAT_SWIZZLE_X
+    };
+
+    /* The KIL opcode fix, see below. Unit 0 is visited even when nothing
+     * is bound. */
+    if (!count && !r300->screen->caps.is_r500)
+        count = 1;
+
+    state->tx_enable = 0;
+    state->count = 0;
+    size = 2;
+
+    for (i = 0; i < count; i++) {
+        if (state->sampler_views[i] && state->sampler_states[i]) {
+            state->tx_enable |= 1 << i;
+
+            view = state->sampler_views[i];
+            tex = r300_texture(view->base.texture);
+            sampler = state->sampler_states[i];
+
+            /* Start from the values precomputed in the view and the
+             * sampler state; the fix-ups below only touch this copy. */
+            texstate = &state->regs[i];
+            texstate->format = view->format;
+            texstate->filter0 = sampler->filter0;
+            texstate->filter1 = sampler->filter1;
+            texstate->border_color = sampler->border_color;
+
+            /* Assign a texture cache region. */
+            texstate->format.format1 |= view->texcache_region;
+
+            /* If compare mode is disabled, the sampler view swizzles
+             * are stored in the format.
+             * Otherwise, swizzles must be applied after the compare mode
+             * in the fragment shader. */
+            if (util_format_is_depth_or_stencil(tex->b.b.format)) {
+                if (sampler->state.compare_mode == PIPE_TEX_COMPARE_NONE) {
+                    texstate->format.format1 |=
+                        r300_get_swizzle_combined(depth_swizzle, view->swizzle);
+                } else {
+                    texstate->format.format1 |=
+                        r300_get_swizzle_combined(depth_swizzle, 0);
+                }
+            }
+
+            /* to emulate 1D textures through 2D ones correctly, the T
+             * wrap mode is forced to clamp-to-edge */
+            if (tex->b.b.target == PIPE_TEXTURE_1D) {
+                texstate->filter0 &= ~R300_TX_WRAP_T_MASK;
+                texstate->filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
+            }
+
+            if (tex->uses_pitch) {
+                /* NPOT textures don't support mip filter, unfortunately.
+                 * This prevents incorrect rendering. */
+                texstate->filter0 &= ~R300_TX_MIN_FILTER_MIP_MASK;
+
+                /* Mask out the mirrored flag. */
+                if (texstate->filter0 & R300_TX_WRAP_S(R300_TX_MIRRORED)) {
+                    texstate->filter0 &= ~R300_TX_WRAP_S(R300_TX_MIRRORED);
+                }
+                if (texstate->filter0 & R300_TX_WRAP_T(R300_TX_MIRRORED)) {
+                    texstate->filter0 &= ~R300_TX_WRAP_T(R300_TX_MIRRORED);
+                }
+
+                /* Change repeat to clamp-to-edge.
+                 * (the repeat bit has a value of 0, no masking needed). */
+                if ((texstate->filter0 & R300_TX_WRAP_S_MASK) ==
+                    R300_TX_WRAP_S(R300_TX_REPEAT)) {
+                    texstate->filter0 |= R300_TX_WRAP_S(R300_TX_CLAMP_TO_EDGE);
+                }
+                if ((texstate->filter0 & R300_TX_WRAP_T_MASK) ==
+                    R300_TX_WRAP_T(R300_TX_REPEAT)) {
+                    texstate->filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
+                }
+            } else {
+                /* determine min/max levels */
+                /* the MAX_MIP level is the largest (finest) one, which is
+                 * why min_level is programmed into MAX_MIP_LEVEL and
+                 * max_level into NUM_LEVELS */
+                max_level = MIN3(sampler->max_lod + view->base.first_level,
+                                 tex->b.b.last_level, view->base.last_level);
+                min_level = MIN2(sampler->min_lod + view->base.first_level,
+                                 max_level);
+                texstate->format.format0 |= R300_TX_NUM_LEVELS(max_level);
+                texstate->filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
+            }
+
+            /* The unit index goes into the bits from 28 upwards
+             * (presumably the texture ID field - TODO confirm). */
+            texstate->filter0 |= i << 28;
+
+            size += 16;
+            state->count = i+1;
+        } else {
+            /* For the KIL opcode to work on r3xx-r4xx, the texture unit
+             * assigned to this opcode (it's always the first one) must be
+             * enabled. Otherwise the opcode doesn't work.
+             *
+             * In order to not depend on the fragment shader, we just make
+             * the first unit enabled all the time. */
+            if (i == 0 && !r300->screen->caps.is_r500) {
+                /* Bind the context's texkill sampler view to unit 0. */
+                pipe_sampler_view_reference(
+                        (struct pipe_sampler_view**)&state->sampler_views[i],
+                        &r300->texkill_sampler->base);
+
+                state->tx_enable |= 1 << i;
+
+                texstate = &state->regs[i];
+
+                /* Just set some valid state. */
+                texstate->format = r300->texkill_sampler->format;
+                texstate->filter0 =
+                        r300_translate_tex_filters(PIPE_TEX_FILTER_NEAREST,
+                                                   PIPE_TEX_FILTER_NEAREST,
+                                                   PIPE_TEX_FILTER_NEAREST,
+                                                   FALSE);
+                texstate->filter1 = 0;
+                texstate->border_color = 0;
+
+                texstate->filter0 |= i << 28;
+                size += 16;
+                state->count = i+1;
+            }
+        }
+    }
+
+    r300->textures_state.size = size;
+
+    /* Pick a fragment shader based on either the texture compare state
+     * or the uses_pitch flag. The FS code is flagged dirty only when
+     * r300_pick_fragment_shader() returns non-zero. */
+    if (r300->fs.state && count) {
+        if (r300_pick_fragment_shader(r300)) {
+            r300_mark_fs_code_dirty(r300);
+        }
+    }
+}
+
+/* Recompute the state that is derived from the currently bound CSOs.
+ *
+ * Texture/sampler state and the RS block are only rebuilt when their atoms
+ * are dirty; with a Draw context the SW TCL vertex layout and PSC tables
+ * are rebuilt together with the RS block. HyperZ state is updated on
+ * every call. */
+void r300_update_derived_state(struct r300_context* r300)
+{
+    if (r300->textures_state.dirty) {
+        r300_merge_textures_and_samplers(r300);
+    }
+
+    if (r300->rs_block_state.dirty) {
+        r300_update_rs_block(r300);
+
+        if (r300->draw) {
+            struct vertex_info *vinfo = &r300->vertex_info;
+
+            /* Start from an empty layout, then let Draw size it. */
+            memset(vinfo, 0, sizeof(struct vertex_info));
+            r300_draw_emit_all_attribs(r300);
+            draw_compute_vertex_size(vinfo);
+            r300_swtcl_vertex_psc(r300);
+        }
+    }
+
+    r300_update_hyperz_state(r300);
+}
diff --git a/src/gallium/drivers/r300/r300_state_derived.h b/src/gallium/drivers/r300/r300_state_derived.h
new file mode 100644
index 0000000000..71a4a47b00
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_state_derived.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_STATE_DERIVED_H
+#define R300_STATE_DERIVED_H
+
+struct r300_context;
+
+void r300_update_derived_state(struct r300_context* r300);
+
+#endif /* R300_STATE_DERIVED_H */
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
new file mode 100644
index 0000000000..03ec127ff7
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -0,0 +1,454 @@
+/*
+ * Copyright 2009 Joakim Sindholt <opensource@zhasha.com>
+ *                Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_STATE_INLINES_H
+#define R300_STATE_INLINES_H
+
+#include "draw/draw_vertex.h"
+
+#include "pipe/p_format.h"
+
+#include "util/u_format.h"
+
+#include "r300_reg.h"
+
+#include <stdio.h>
+
+/* Some maths. These should probably find their way to u_math, if needed. */
+
+/* Multiply f by six, truncate the product to an int, and keep only the
+ * low 16 bits of that int. */
+static INLINE int pack_float_16_6x(float f) {
+    const int scaled = (int)(f * 6.0);
+
+    return scaled & 0xffff;
+}
+
+/* Blend state. */
+
+/* Convert a PIPE_BLEND_* equation into the matching R300_COMB_FCN_* value.
+ * An unrecognised equation is reported on stderr and trips assert(0); if
+ * execution continues past the assert the result is 0. */
+static INLINE uint32_t r300_translate_blend_function(int blend_func)
+{
+    uint32_t hw_func = 0;
+
+    switch (blend_func) {
+        case PIPE_BLEND_ADD:
+            hw_func = R300_COMB_FCN_ADD_CLAMP;
+            break;
+        case PIPE_BLEND_SUBTRACT:
+            hw_func = R300_COMB_FCN_SUB_CLAMP;
+            break;
+        case PIPE_BLEND_REVERSE_SUBTRACT:
+            hw_func = R300_COMB_FCN_RSUB_CLAMP;
+            break;
+        case PIPE_BLEND_MIN:
+            hw_func = R300_COMB_FCN_MIN;
+            break;
+        case PIPE_BLEND_MAX:
+            hw_func = R300_COMB_FCN_MAX;
+            break;
+        default:
+            fprintf(stderr, "r300: Unknown blend function %d\n", blend_func);
+            assert(0);
+            break;
+    }
+
+    return hw_func;
+}
+
+/* Convert a PIPE_BLENDFACTOR_* value into the R300_BLEND_GL_* encoding.
+ * Each factor is listed next to its inverse. The four SRC1 (dual-source)
+ * factors and any unknown value are reported on stderr, trip assert(0),
+ * and produce 0 if execution continues.
+ *
+ * XXX we can also offer the D3D versions of some of these... */
+static INLINE uint32_t r300_translate_blend_factor(int blend_fact)
+{
+    switch (blend_fact) {
+        case PIPE_BLENDFACTOR_ZERO:
+            return R300_BLEND_GL_ZERO;
+        case PIPE_BLENDFACTOR_ONE:
+            return R300_BLEND_GL_ONE;
+
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+            return R300_BLEND_GL_SRC_COLOR;
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+            return R300_BLEND_GL_ONE_MINUS_SRC_COLOR;
+
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+            return R300_BLEND_GL_SRC_ALPHA;
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+            return R300_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+
+        case PIPE_BLENDFACTOR_DST_COLOR:
+            return R300_BLEND_GL_DST_COLOR;
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+            return R300_BLEND_GL_ONE_MINUS_DST_COLOR;
+
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+            return R300_BLEND_GL_DST_ALPHA;
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+            return R300_BLEND_GL_ONE_MINUS_DST_ALPHA;
+
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+            return R300_BLEND_GL_CONST_COLOR;
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+            return R300_BLEND_GL_ONE_MINUS_CONST_COLOR;
+
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+            return R300_BLEND_GL_CONST_ALPHA;
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+            return R300_BLEND_GL_ONE_MINUS_CONST_ALPHA;
+
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+            return R300_BLEND_GL_SRC_ALPHA_SATURATE;
+
+        /* Dual-source factors are rejected explicitly. */
+        case PIPE_BLENDFACTOR_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_SRC1_ALPHA:
+        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+            fprintf(stderr, "r300: Implementation error: "
+                "Bad blend factor %d not supported!\n", blend_fact);
+            assert(0);
+            return 0;
+
+        default:
+            fprintf(stderr, "r300: Unknown blend factor %d\n", blend_fact);
+            assert(0);
+            return 0;
+    }
+}
+
+/* DSA state. */
+
+/* Convert a PIPE_FUNC_* comparison into the R300_ZS_* compare function
+ * used for depth and stencil tests. Unknown values are reported on stderr,
+ * trip assert(0), and produce 0 if execution continues. */
+static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func)
+{
+    uint32_t hw_func = 0;
+
+    switch (zs_func) {
+        case PIPE_FUNC_NEVER:
+            hw_func = R300_ZS_NEVER;
+            break;
+        case PIPE_FUNC_LESS:
+            hw_func = R300_ZS_LESS;
+            break;
+        case PIPE_FUNC_EQUAL:
+            hw_func = R300_ZS_EQUAL;
+            break;
+        case PIPE_FUNC_LEQUAL:
+            hw_func = R300_ZS_LEQUAL;
+            break;
+        case PIPE_FUNC_GREATER:
+            hw_func = R300_ZS_GREATER;
+            break;
+        case PIPE_FUNC_NOTEQUAL:
+            hw_func = R300_ZS_NOTEQUAL;
+            break;
+        case PIPE_FUNC_GEQUAL:
+            hw_func = R300_ZS_GEQUAL;
+            break;
+        case PIPE_FUNC_ALWAYS:
+            hw_func = R300_ZS_ALWAYS;
+            break;
+        default:
+            fprintf(stderr, "r300: Unknown depth/stencil function %d\n",
+                zs_func);
+            assert(0);
+            break;
+    }
+
+    return hw_func;
+}
+
+/* Translate a PIPE_STENCIL_OP_* value into the corresponding R300_ZS_*
+ * stencil operation.
+ *
+ * s_op: one of the PIPE_STENCIL_OP_* constants.
+ *
+ * Returns the R300_ZS_* encoding. An unknown operation is logged to stderr
+ * and trips assert(0); 0 is returned if execution continues past the
+ * assert. */
+static INLINE uint32_t r300_translate_stencil_op(int s_op)
+{
+    switch (s_op) {
+        case PIPE_STENCIL_OP_KEEP:
+            return R300_ZS_KEEP;
+        case PIPE_STENCIL_OP_ZERO:
+            return R300_ZS_ZERO;
+        case PIPE_STENCIL_OP_REPLACE:
+            return R300_ZS_REPLACE;
+        case PIPE_STENCIL_OP_INCR:
+            return R300_ZS_INCR;
+        case PIPE_STENCIL_OP_DECR:
+            return R300_ZS_DECR;
+        case PIPE_STENCIL_OP_INCR_WRAP:
+            return R300_ZS_INCR_WRAP;
+        case PIPE_STENCIL_OP_DECR_WRAP:
+            return R300_ZS_DECR_WRAP;
+        case PIPE_STENCIL_OP_INVERT:
+            return R300_ZS_INVERT;
+        default:
+            /* End the message with a newline, like the blend and
+             * depth/stencil-function translators in this file, so it does
+             * not run into whatever is printed next. */
+            fprintf(stderr, "r300: Unknown stencil op %d\n", s_op);
+            assert(0);
+            break;
+    }
+    return 0;
+}
+
+/* Translate a PIPE_FUNC_* comparison into the R300_FG_ALPHA_FUNC_* value
+ * used for the alpha test.
+ *
+ * alpha_func: one of the PIPE_FUNC_* constants.
+ *
+ * Returns the R300_FG_ALPHA_FUNC_* encoding. An unknown function is logged
+ * to stderr and trips assert(0); 0 is returned if execution continues past
+ * the assert. */
+static INLINE uint32_t r300_translate_alpha_function(int alpha_func)
+{
+    switch (alpha_func) {
+        case PIPE_FUNC_NEVER:
+            return R300_FG_ALPHA_FUNC_NEVER;
+        case PIPE_FUNC_LESS:
+            return R300_FG_ALPHA_FUNC_LESS;
+        case PIPE_FUNC_EQUAL:
+            return R300_FG_ALPHA_FUNC_EQUAL;
+        case PIPE_FUNC_LEQUAL:
+            return R300_FG_ALPHA_FUNC_LE;
+        case PIPE_FUNC_GREATER:
+            return R300_FG_ALPHA_FUNC_GREATER;
+        case PIPE_FUNC_NOTEQUAL:
+            return R300_FG_ALPHA_FUNC_NOTEQUAL;
+        case PIPE_FUNC_GEQUAL:
+            return R300_FG_ALPHA_FUNC_GE;
+        case PIPE_FUNC_ALWAYS:
+            return R300_FG_ALPHA_FUNC_ALWAYS;
+        default:
+            /* End the message with a newline, like the blend and
+             * depth/stencil-function translators in this file, so it does
+             * not run into whatever is printed next. */
+            fprintf(stderr, "r300: Unknown alpha function %d\n", alpha_func);
+            assert(0);
+            break;
+    }
+    return 0;
+}
+
+/* Translate a PIPE_POLYGON_MODE_* value into the front-face primitive type
+ * bits (R300_GA_POLY_MODE_FRONT_PTYPE_*).
+ *
+ * mode: one of the PIPE_POLYGON_MODE_* constants.
+ *
+ * Returns the front-face PTYPE encoding. Unlike most translators in this
+ * file, a bad mode does not assert: it is logged to stderr and the
+ * triangle (fill) encoding is returned as a fallback. */
+static INLINE uint32_t
+r300_translate_polygon_mode_front(unsigned mode) {
+    switch (mode)
+    {
+        case PIPE_POLYGON_MODE_FILL:
+            return R300_GA_POLY_MODE_FRONT_PTYPE_TRI;
+        case PIPE_POLYGON_MODE_LINE:
+            return R300_GA_POLY_MODE_FRONT_PTYPE_LINE;
+        case PIPE_POLYGON_MODE_POINT:
+            return R300_GA_POLY_MODE_FRONT_PTYPE_POINT;
+
+        default:
+            /* mode is unsigned, so print it with %u rather than %i. */
+            fprintf(stderr, "r300: Bad polygon mode %u in %s\n", mode,
+                __FUNCTION__);
+            return R300_GA_POLY_MODE_FRONT_PTYPE_TRI;
+    }
+}
+
+/* Translate a PIPE_POLYGON_MODE_* value into the back-face primitive type
+ * bits (R300_GA_POLY_MODE_BACK_PTYPE_*).
+ *
+ * mode: one of the PIPE_POLYGON_MODE_* constants.
+ *
+ * Returns the back-face PTYPE encoding. Unlike most translators in this
+ * file, a bad mode does not assert: it is logged to stderr and the
+ * triangle (fill) encoding is returned as a fallback. */
+static INLINE uint32_t
+r300_translate_polygon_mode_back(unsigned mode) {
+    switch (mode)
+    {
+        case PIPE_POLYGON_MODE_FILL:
+            return R300_GA_POLY_MODE_BACK_PTYPE_TRI;
+        case PIPE_POLYGON_MODE_LINE:
+            return R300_GA_POLY_MODE_BACK_PTYPE_LINE;
+        case PIPE_POLYGON_MODE_POINT:
+            return R300_GA_POLY_MODE_BACK_PTYPE_POINT;
+
+        default:
+            /* mode is unsigned, so print it with %u rather than %i. */
+            fprintf(stderr, "r300: Bad polygon mode %u in %s\n", mode,
+                __FUNCTION__);
+            return R300_GA_POLY_MODE_BACK_PTYPE_TRI;
+    }
+}
+
+/* Texture sampler state. */
+
+/* Translate a PIPE_TEX_WRAP_* mode into R300_TX_* wrap bits.
+ *
+ * wrap: one of the PIPE_TEX_WRAP_* constants.
+ *
+ * Returns the wrap encoding; each MIRROR_* mode is its non-mirrored
+ * counterpart ORed with R300_TX_MIRRORED. An unknown mode is logged to
+ * stderr and trips assert(0); 0 is returned if execution continues past
+ * the assert. */
+static INLINE uint32_t r300_translate_wrap(int wrap)
+{
+    switch (wrap) {
+        case PIPE_TEX_WRAP_REPEAT:
+            return R300_TX_REPEAT;
+        case PIPE_TEX_WRAP_CLAMP:
+            return R300_TX_CLAMP;
+        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+            return R300_TX_CLAMP_TO_EDGE;
+        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+            return R300_TX_CLAMP_TO_BORDER;
+        case PIPE_TEX_WRAP_MIRROR_REPEAT:
+            return R300_TX_REPEAT | R300_TX_MIRRORED;
+        case PIPE_TEX_WRAP_MIRROR_CLAMP:
+            return R300_TX_CLAMP | R300_TX_MIRRORED;
+        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+            return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+            return R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+        default:
+            /* End the message with a newline, like the texture-filter
+             * diagnostics in this file, so it does not run into whatever
+             * is printed next. */
+            fprintf(stderr, "r300: Unknown texture wrap %d\n", wrap);
+            assert(0);
+            return 0;
+    }
+}
+
+/* Build the R300_TX_* filter bits for a sampler.
+ *
+ * When is_anisotropic is non-zero, min and mag are ignored and both
+ * filters are set to ANISO; otherwise each is translated from its
+ * PIPE_TEX_FILTER_* value. The mip filter is always translated from its
+ * PIPE_TEX_MIPFILTER_* value. Any unknown value is logged to stderr and
+ * trips assert(0); if execution continues, that filter contributes no
+ * bits. */
+static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
+                                                  int is_anisotropic)
+{
+    uint32_t bits = 0;
+
+    if (is_anisotropic) {
+        bits = R300_TX_MIN_FILTER_ANISO | R300_TX_MAG_FILTER_ANISO;
+    } else {
+        /* Minification filter. */
+        if (min == PIPE_TEX_FILTER_NEAREST) {
+            bits |= R300_TX_MIN_FILTER_NEAREST;
+        } else if (min == PIPE_TEX_FILTER_LINEAR) {
+            bits |= R300_TX_MIN_FILTER_LINEAR;
+        } else {
+            fprintf(stderr, "r300: Unknown texture filter %d\n", min);
+            assert(0);
+        }
+
+        /* Magnification filter. */
+        if (mag == PIPE_TEX_FILTER_NEAREST) {
+            bits |= R300_TX_MAG_FILTER_NEAREST;
+        } else if (mag == PIPE_TEX_FILTER_LINEAR) {
+            bits |= R300_TX_MAG_FILTER_LINEAR;
+        } else {
+            fprintf(stderr, "r300: Unknown texture filter %d\n", mag);
+            assert(0);
+        }
+    }
+
+    /* Mipmap filter. */
+    if (mip == PIPE_TEX_MIPFILTER_NONE) {
+        bits |= R300_TX_MIN_FILTER_MIP_NONE;
+    } else if (mip == PIPE_TEX_MIPFILTER_NEAREST) {
+        bits |= R300_TX_MIN_FILTER_MIP_NEAREST;
+    } else if (mip == PIPE_TEX_MIPFILTER_LINEAR) {
+        bits |= R300_TX_MIN_FILTER_MIP_LINEAR;
+    } else {
+        fprintf(stderr, "r300: Unknown texture filter %d\n", mip);
+        assert(0);
+    }
+
+    return bits;
+}
+
+/* Pick the R300_TX_MAX_ANISO_* setting for a requested anisotropy level.
+ * The thresholds are 2, 4, 8 and 16; anything below 2 selects 1:1 and
+ * anything from 16 upwards selects 16:1. */
+static INLINE uint32_t r300_anisotropy(unsigned max_aniso)
+{
+    if (max_aniso < 2) {
+        return R300_TX_MAX_ANISO_1_TO_1;
+    }
+    if (max_aniso < 4) {
+        return R300_TX_MAX_ANISO_2_TO_1;
+    }
+    if (max_aniso < 8) {
+        return R300_TX_MAX_ANISO_4_TO_1;
+    }
+    if (max_aniso < 16) {
+        return R300_TX_MAX_ANISO_8_TO_1;
+    }
+    return R300_TX_MAX_ANISO_16_TO_1;
+}
+
+/* Compute the R500 anisotropy bits for a requested level.
+ * A level of 0 yields 0. Otherwise (max_aniso - 1) is scaled by 4.2001,
+ * truncated, clamped to 63, passed through R500_TX_MAX_ANISO, and combined
+ * with R500_TX_ANISO_HIGH_QUALITY. */
+static INLINE uint32_t r500_anisotropy(unsigned max_aniso)
+{
+    unsigned level;
+
+    if (max_aniso == 0) {
+        return 0;
+    }
+
+    /* Map the range [0, 15] to [0, 63]. */
+    level = (unsigned)((max_aniso - 1) * 4.2001);
+
+    return R500_TX_MAX_ANISO(MIN2(level, 63)) | R500_TX_ANISO_HIGH_QUALITY;
+}
+
+/* Non-CSO state. (For now.) */
+
+/* Look up the R300_GB_TILE_PIPE_COUNT_* value for a pipe count of 1 to 4.
+ * Any other count yields 0. */
+static INLINE uint32_t r300_translate_gb_pipes(int pipe_count)
+{
+    /* Indexed by pipe_count - 1. */
+    const uint32_t tile_pipe_count[4] = {
+        R300_GB_TILE_PIPE_COUNT_RV300,
+        R300_GB_TILE_PIPE_COUNT_R300,
+        R300_GB_TILE_PIPE_COUNT_R420_3P,
+        R300_GB_TILE_PIPE_COUNT_R420
+    };
+
+    if (pipe_count < 1 || pipe_count > 4) {
+        return 0;
+    }
+    return tile_pipe_count[pipe_count - 1];
+}
+
+
+/* Translate pipe_formats into PSC vertex types. */
+/* Derive the PSC data-type word for a vertex element format.
+ *
+ * format: the pipe_format of the vertex element.
+ *
+ * Only channel 0 of the format description is inspected for type, size and
+ * normalization; nr_channels selects the width variant.
+ *
+ * Returns an R300_DATA_TYPE_* value, ORed with R300_SIGNED for signed
+ * channels and R300_NORMALIZE for normalized ones, or R300_INVALID_FORMAT
+ * when the layout is not PLAIN, the channel type is not float/unsigned/
+ * signed, or the channel size is not handled below. The value is built in
+ * a uint32_t and narrowed to uint16_t on return. */
+static INLINE uint16_t
+r300_translate_vertex_data_type(enum pipe_format format) {
+    uint32_t result = 0;
+    const struct util_format_description *desc;
+
+    desc = util_format_description(format);
+
+    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+        return R300_INVALID_FORMAT;
+    }
+
+    switch (desc->channel[0].type) {
+        /* Half-floats and floats; any other float size (e.g. 64-bit
+         * doubles) falls into the default case and is rejected. */
+        case UTIL_FORMAT_TYPE_FLOAT:
+            switch (desc->channel[0].size) {
+                case 16:
+                    /* Supported only on RV350 and later. */
+                    if (desc->nr_channels > 2) {
+                        result = R300_DATA_TYPE_FLT16_4;
+                    } else {
+                        result = R300_DATA_TYPE_FLT16_2;
+                    }
+                    break;
+                case 32:
+                    /* Offsets from FLOAT_1 by the channel count; assumes
+                     * the FLOAT_1..FLOAT_4 codes are consecutive --
+                     * TODO confirm against r300_reg.h. */
+                    result = R300_DATA_TYPE_FLOAT_1 + (desc->nr_channels - 1);
+                    break;
+                default:
+                    return R300_INVALID_FORMAT;
+            }
+            break;
+        /* Unsigned ints */
+        case UTIL_FORMAT_TYPE_UNSIGNED:
+        /* Signed ints (intentionally shares the code below; the sign is
+         * applied after the switch). */
+        case UTIL_FORMAT_TYPE_SIGNED:
+            switch (desc->channel[0].size) {
+                case 8:
+                    /* One BYTE code regardless of the channel count. */
+                    result = R300_DATA_TYPE_BYTE;
+                    break;
+                case 16:
+                    if (desc->nr_channels > 2) {
+                        result = R300_DATA_TYPE_SHORT_4;
+                    } else {
+                        result = R300_DATA_TYPE_SHORT_2;
+                    }
+                    break;
+                default:
+                    return R300_INVALID_FORMAT;
+            }
+            break;
+        default:
+            return R300_INVALID_FORMAT;
+    }
+
+    /* Modifier flags shared by all data types. */
+    if (desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+        result |= R300_SIGNED;
+    }
+    if (desc->channel[0].normalized) {
+        result |= R300_NORMALIZE;
+    }
+
+    return result;
+}
+
+/* Build the PSC swizzle word for a vertex element format.
+ *
+ * format: the pipe_format of the vertex element; asserted to be non-zero.
+ *
+ * Each of the four components gets a 3-bit selector at bit position 3*i.
+ * Components present in the format take their selector from the format's
+ * swizzle, capped at R300_SWIZZLE_SELECT_FP_ONE; missing components are
+ * filled with (0, 0, 0, 1). The write-enable field is set to 0xf.
+ *
+ * Returns the swizzle word, or 0 (after logging to stderr) when the format
+ * layout is not PLAIN. */
+static INLINE uint16_t
+r300_translate_vertex_data_swizzle(enum pipe_format format) {
+    /* The description is looked up before the assert below, but it is not
+     * dereferenced until after it. */
+    const struct util_format_description *desc = util_format_description(format);
+    unsigned i, swizzle = 0;
+
+    assert(format);
+
+    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+        fprintf(stderr, "r300: Bad format %s in %s:%d\n",
+            util_format_short_name(format), __FUNCTION__, __LINE__);
+        return 0;
+    }
+
+    /* Selectors for the channels the format actually has. */
+    for (i = 0; i < desc->nr_channels; i++) {
+        swizzle |=
+            MIN2(desc->swizzle[i], R300_SWIZZLE_SELECT_FP_ONE) << (3*i);
+    }
+    /* Set (0,0,0,1) in unused components. i carries over from the loop
+     * above, so only the missing components are touched: zero for
+     * components 0-2, one for component 3. */
+    for (; i < 3; i++) {
+        swizzle |= R300_SWIZZLE_SELECT_FP_ZERO << (3*i);
+    }
+    for (; i < 4; i++) {
+        swizzle |= R300_SWIZZLE_SELECT_FP_ONE << (3*i);
+    }
+
+    return swizzle | (0xf << R300_WRITE_ENA_SHIFT);
+}
+
+#endif /* R300_STATE_INLINES_H */
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
new file mode 100644
index 0000000000..e67a0ae244
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2009 Joakim Sindholt <opensource@zhasha.com>
+ *                Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_context.h"
+#include "r300_cs.h"
+#include "r300_reg.h"
+#include "r300_screen.h"
+#include "r300_state_invariant.h"
+
+/* Calculate and emit invariant state. This is data that the 3D engine
+ * will probably want at the beginning of every CS, but it's not currently
+ * handled by any CSO setup, and in addition it doesn't really change much.
+ *
+ * Note that eventually this should be empty, but it's useful for development
+ * and general unduplication of code.
+ *
+ * r300:  context handed to CS_LOCALS; its screen->caps flags (has_tcl,
+ *        is_rv350, is_r400) select the optional register writes.
+ * size:  not referenced in this function.
+ * state: not referenced in this function.
+ *
+ * The registers go out in two BEGIN_CS/END_CS sections. Each BEGIN_CS
+ * argument is a hand-maintained total that has to track the writes that
+ * follow it. The totals below are consistent with two units per OUT_CS_REG
+ * and 1 + n units for OUT_CS_REG_SEQ(reg, n) plus its n values.
+ * NOTE(review): the macro definitions are not visible here -- confirm
+ * before adding or removing writes. */
+void r300_emit_invariant_state(struct r300_context* r300,
+                               unsigned size, void* state)
+{
+    CS_LOCALS(r300);
+
+    /* Six unconditional register writes (12), plus one more with TCL (2). */
+    BEGIN_CS(12 + (r300->screen->caps.has_tcl ? 2 : 0));
+
+    /*** Graphics Backend (GB) ***/
+    /* Source of fog depth */
+    OUT_CS_REG(R300_GB_SELECT, R300_GB_FOG_SELECT_1_1_W);
+
+    /*** Fog (FG) ***/
+    OUT_CS_REG(R300_FG_FOG_BLEND, 0x0);
+    OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x0);
+    OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x0);
+    OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x0);
+
+    /*** VAP ***/
+    /* Sign/normalize control */
+    OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, R300_SGN_NORM_NO_ZERO);
+    /* TCL-only stuff */
+    if (r300->screen->caps.has_tcl) {
+        /* Amount of time to wait for vertex fetches in PVS */
+        OUT_CS_REG(VAP_PVS_VTX_TIMEOUT_REG, 0xffff);
+    }
+
+    END_CS;
+
+    /* XXX unsorted stuff from surface_fill */
+    /* 38 covers the unconditional writes below; +7 for the TCL block (one
+     * register write and a four-value sequence), +4 for the two RV350
+     * writes, +2 for the R400 write. */
+    BEGIN_CS(38 + (r300->screen->caps.has_tcl ? 7 : 0) +
+             (r300->screen->caps.is_rv350 ? 4 : 0) +
+             (r300->screen->caps.is_r400 ? 2 : 0));
+
+    if (r300->screen->caps.has_tcl) {
+        /*Flushing PVS is required before the VAP_GB registers can be changed*/
+        OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+        /* Four consecutive values starting at VERT_CLIP_ADJ, all 1.0. */
+        OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4);
+        OUT_CS_32F(1.0);
+        OUT_CS_32F(1.0);
+        OUT_CS_32F(1.0);
+        OUT_CS_32F(1.0);
+    }
+    /* XXX line tex stuffing */
+    OUT_CS_REG_SEQ(R300_GA_LINE_S0, 1);
+    OUT_CS_32F(0.0);
+    OUT_CS_REG_SEQ(R300_GA_LINE_S1, 1);
+    OUT_CS_32F(1.0);
+    OUT_CS_REG(R300_GA_TRIANGLE_STIPPLE, 0x5 |
+        (0x5 << R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT));
+    /* XXX this big chunk should be refactored into rs_state */
+    OUT_CS_REG(R300_GA_SOLID_RG, 0x00000000);
+    OUT_CS_REG(R300_GA_SOLID_BA, 0x00000000);
+    OUT_CS_REG(R300_GA_ROUND_MODE, 0x00000001);
+    OUT_CS_REG(R300_GA_OFFSET, 0x00000000);
+    OUT_CS_REG(R300_GA_FOG_SCALE, 0x3DBF1412);
+    OUT_CS_REG(R300_GA_FOG_OFFSET, 0x00000000);
+    OUT_CS_REG(R300_SU_TEX_WRAP, 0x00000000);
+    OUT_CS_REG(R300_SU_DEPTH_SCALE, 0x4B7FFFFF);
+    OUT_CS_REG(R300_SU_DEPTH_OFFSET, 0x00000000);
+    OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C);
+    OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
+    OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
+
+    /* These two registers carry an R500_ prefix but are gated on the
+     * is_rv350 capability flag. */
+    if (r300->screen->caps.is_rv350) {
+        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
+        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFEFEFEFE);
+    }
+
+    OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000);
+    OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
+    OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000);
+    OUT_CS_REG(R300_ZB_HIZ_PITCH, 0x00000000);
+    /* NOTE(review): unbraced if around a macro; only correct if OUT_CS_REG
+     * expands to a single statement -- confirm in r300_cs.h. */
+    if (r300->screen->caps.is_r400)
+        OUT_CS_REG(R400_US_CODE_BANK, 0);
+    END_CS;
+}
diff --git a/src/gallium/drivers/r300/r300_state_invariant.h b/src/gallium/drivers/r300/r300_state_invariant.h
new file mode 100644
index 0000000000..83d031c7fe
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_state_invariant.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_STATE_INVARIANT_H
+#define R300_STATE_INVARIANT_H
+
+struct r300_context;
+
+/* Emit register state that is not covered by any CSO, in two command
+ * stream sections. The writes made depend on the has_tcl, is_rv350 and
+ * is_r400 capability flags of r300->screen. The definition in
+ * r300_state_invariant.c does not reference size or state. */
+void r300_emit_invariant_state(struct r300_context* r300,
+                               unsigned size, void* state);
+
+#endif /* R300_STATE_INVARIANT_H */
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
new file mode 100644
index 0000000000..ddb6600056
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -0,0 +1,1149 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/* Always include headers in the reverse order!! ~ M. */
+#include "r300_texture.h"
+
+#include "r300_context.h"
+#include "r300_reg.h"
+#include "r300_transfer.h"
+#include "r300_screen.h"
+#include "r300_winsys.h"
+
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "pipe/p_screen.h"
+#include "state_tracker/drm_api.h"
+
+/* Names for the two texture dimensions, with fixed values 0 and 1.
+ * NOTE(review): presumably used as array indices further down this file;
+ * the users are not visible here -- confirm. */
+enum r300_dim {
+    DIM_WIDTH  = 0,
+    DIM_HEIGHT = 1
+};
+
+/* Compose a format swizzle with an optional view swizzle and encode the
+ * result as R300_TX_FORMAT_* selectors in the R/G/B/A fields.
+ *
+ * swizzle_format: four UTIL_FORMAT_SWIZZLE_* values from the format.
+ * swizzle_view:   four UTIL_FORMAT_SWIZZLE_* values from the sampler view,
+ *                 or NULL to use the format swizzle unchanged.
+ *
+ * A view entry up to UTIL_FORMAT_SWIZZLE_W is used as an index into the
+ * format swizzle; any other view entry is taken as is. Anything that is
+ * not Y, Z, W, 0 or 1 after composition selects X. */
+unsigned r300_get_swizzle_combined(const unsigned char *swizzle_format,
+                                   const unsigned char *swizzle_view)
+{
+    const uint32_t field_shift[4] = {
+        R300_TX_FORMAT_R_SHIFT,
+        R300_TX_FORMAT_G_SHIFT,
+        R300_TX_FORMAT_B_SHIFT,
+        R300_TX_FORMAT_A_SHIFT
+    };
+    unsigned chan;
+    unsigned combined = 0;
+
+    for (chan = 0; chan < 4; chan++) {
+        unsigned char source;
+        uint32_t selector;
+
+        /* Resolve which source this output channel reads. */
+        if (!swizzle_view) {
+            source = swizzle_format[chan];
+        } else if (swizzle_view[chan] <= UTIL_FORMAT_SWIZZLE_W) {
+            source = swizzle_format[swizzle_view[chan]];
+        } else {
+            source = swizzle_view[chan];
+        }
+
+        /* Turn it into the hardware selector. */
+        switch (source) {
+            case UTIL_FORMAT_SWIZZLE_Y:
+                selector = R300_TX_FORMAT_Y;
+                break;
+            case UTIL_FORMAT_SWIZZLE_Z:
+                selector = R300_TX_FORMAT_Z;
+                break;
+            case UTIL_FORMAT_SWIZZLE_W:
+                selector = R300_TX_FORMAT_W;
+                break;
+            case UTIL_FORMAT_SWIZZLE_0:
+                selector = R300_TX_FORMAT_ZERO;
+                break;
+            case UTIL_FORMAT_SWIZZLE_1:
+                selector = R300_TX_FORMAT_ONE;
+                break;
+            default: /* UTIL_FORMAT_SWIZZLE_X */
+                selector = R300_TX_FORMAT_X;
+                break;
+        }
+
+        combined |= selector << field_shift[chan];
+    }
+
+    return combined;
+}
+
+/* Translate a pipe_format into a useful texture format for sampling.
+ *
+ * Some special formats are translated directly using R300_EASY_TX_FORMAT,
+ * but the majority of them are translated in a generic way, automatically
+ * supporting all the formats hw can support.
+ *
+ * R300_EASY_TX_FORMAT swizzles the texture.
+ * Note the signature of R300_EASY_TX_FORMAT:
+ *   R300_EASY_TX_FORMAT(B, G, R, A, FORMAT);
+ *
+ * The FORMAT specifies how the texture sampler will treat the texture, and
+ * makes available X, Y, Z, W, ZERO, and ONE for swizzling.
+ *
+ * format:       the pipe_format to translate.
+ * swizzle_view: optional view swizzle forwarded to
+ *               r300_get_swizzle_combined(); may be NULL. It is not applied
+ *               to depth/stencil, YUV or the two subsampled RGB formats,
+ *               because those return before the swizzle is combined.
+ *
+ * Returns the texture format word, or ~0 when the format is unsupported or
+ * unknown. */
+uint32_t r300_translate_texformat(enum pipe_format format,
+                                  const unsigned char *swizzle_view)
+{
+    uint32_t result = 0;
+    const struct util_format_description *desc;
+    unsigned i;
+    boolean uniform = TRUE;
+    const uint32_t sign_bit[4] = {
+        R300_TX_FORMAT_SIGNED_X,
+        R300_TX_FORMAT_SIGNED_Y,
+        R300_TX_FORMAT_SIGNED_Z,
+        R300_TX_FORMAT_SIGNED_W,
+    };
+
+    desc = util_format_description(format);
+
+    /* Colorspace (return non-RGB formats directly). */
+    switch (desc->colorspace) {
+        /* Depth stencil formats.
+         * Swizzles are added in r300_merge_textures_and_samplers. */
+        case UTIL_FORMAT_COLORSPACE_ZS:
+            switch (format) {
+                case PIPE_FORMAT_Z16_UNORM:
+                    return R300_TX_FORMAT_X16;
+                case PIPE_FORMAT_X8Z24_UNORM:
+                case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+                    return R500_TX_FORMAT_Y8X24;
+                default:
+                    return ~0; /* Unsupported. */
+            }
+
+        /* YUV formats. */
+        case UTIL_FORMAT_COLORSPACE_YUV:
+            result |= R300_TX_FORMAT_YUV_TO_RGB;
+
+            switch (format) {
+                case PIPE_FORMAT_UYVY:
+                    return R300_EASY_TX_FORMAT(X, Y, Z, ONE, YVYU422) | result;
+                case PIPE_FORMAT_YUYV:
+                    return R300_EASY_TX_FORMAT(X, Y, Z, ONE, VYUY422) | result;
+                default:
+                    return ~0; /* Unsupported/unknown. */
+            }
+
+        /* Add gamma correction. */
+        case UTIL_FORMAT_COLORSPACE_SRGB:
+            result |= R300_TX_FORMAT_GAMMA;
+            break;
+
+        default:
+            switch (format) {
+                /* Same as YUV but without the YUV->RGB conversion. */
+                case PIPE_FORMAT_R8G8_B8G8_UNORM:
+                    return R300_EASY_TX_FORMAT(X, Y, Z, ONE, YVYU422) | result;
+                case PIPE_FORMAT_G8R8_G8B8_UNORM:
+                    return R300_EASY_TX_FORMAT(X, Y, Z, ONE, VYUY422) | result;
+                /* Everything else continues to the generic path below. */
+                default:;
+            }
+    }
+
+    /* From here on, result carries the combined format/view swizzle (plus
+     * the gamma bit for sRGB) and is ORed into every successful return. */
+    result |= r300_get_swizzle_combined(desc->swizzle, swizzle_view);
+
+    /* S3TC formats. */
+    if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+        if (!util_format_s3tc_enabled) {
+            return ~0; /* Unsupported. */
+        }
+
+        switch (format) {
+            case PIPE_FORMAT_DXT1_RGB:
+            case PIPE_FORMAT_DXT1_RGBA:
+            case PIPE_FORMAT_DXT1_SRGB:
+            case PIPE_FORMAT_DXT1_SRGBA:
+                return R300_TX_FORMAT_DXT1 | result;
+            case PIPE_FORMAT_DXT3_RGBA:
+            case PIPE_FORMAT_DXT3_SRGBA:
+                return R300_TX_FORMAT_DXT3 | result;
+            case PIPE_FORMAT_DXT5_RGBA:
+            case PIPE_FORMAT_DXT5_SRGBA:
+                return R300_TX_FORMAT_DXT5 | result;
+            default:
+                return ~0; /* Unsupported/unknown. */
+        }
+    }
+
+    /* Add sign. The sign bit is chosen by channel index, one per signed
+     * channel. */
+    for (i = 0; i < desc->nr_channels; i++) {
+        if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+            result |= sign_bit[i];
+        }
+    }
+
+    /* This is truly a special format.
+     * It stores R8G8 and B is computed using sqrt(1 - R^2 - G^2)
+     * in the sampler unit. Also known as D3DFMT_CxV8U8. */
+    if (format == PIPE_FORMAT_R8G8Bx_SNORM) {
+        return R300_TX_FORMAT_CxV8U8 | result;
+    }
+
+    /* RGTC formats. */
+    if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+        switch (format) {
+            case PIPE_FORMAT_RGTC1_UNORM:
+            case PIPE_FORMAT_RGTC1_SNORM:
+                return R500_TX_FORMAT_ATI1N | result;
+            case PIPE_FORMAT_RGTC2_UNORM:
+            case PIPE_FORMAT_RGTC2_SNORM:
+                return R400_TX_FORMAT_ATI2N | result;
+            default:
+                return ~0; /* Unsupported/unknown. */
+        }
+    }
+
+    /* See whether the components are of the same size. */
+    for (i = 1; i < desc->nr_channels; i++) {
+        uniform = uniform && desc->channel[0].size == desc->channel[i].size;
+    }
+
+    /* Non-uniform formats: matched purely on the per-channel bit sizes. */
+    if (!uniform) {
+        switch (desc->nr_channels) {
+            case 3:
+                if (desc->channel[0].size == 5 &&
+                    desc->channel[1].size == 6 &&
+                    desc->channel[2].size == 5) {
+                    return R300_TX_FORMAT_Z5Y6X5 | result;
+                }
+                if (desc->channel[0].size == 5 &&
+                    desc->channel[1].size == 5 &&
+                    desc->channel[2].size == 6) {
+                    return R300_TX_FORMAT_Z6Y5X5 | result;
+                }
+                return ~0; /* Unsupported/unknown. */
+
+            case 4:
+                if (desc->channel[0].size == 5 &&
+                    desc->channel[1].size == 5 &&
+                    desc->channel[2].size == 5 &&
+                    desc->channel[3].size == 1) {
+                    return R300_TX_FORMAT_W1Z5Y5X5 | result;
+                }
+                if (desc->channel[0].size == 10 &&
+                    desc->channel[1].size == 10 &&
+                    desc->channel[2].size == 10 &&
+                    desc->channel[3].size == 2) {
+                    return R300_TX_FORMAT_W2Z10Y10X10 | result;
+                }
+                /* No match: leaves the switch and is rejected below. */
+        }
+        return ~0; /* Unsupported/unknown. */
+    }
+
+    /* And finally, uniform formats. */
+    switch (desc->channel[0].type) {
+        case UTIL_FORMAT_TYPE_UNSIGNED:
+        case UTIL_FORMAT_TYPE_SIGNED:
+            /* Integer channels must be normalized unless the format is
+             * sRGB. */
+            if (!desc->channel[0].normalized &&
+                desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB) {
+                return ~0;
+            }
+
+            switch (desc->channel[0].size) {
+                case 4:
+                    switch (desc->nr_channels) {
+                        case 2:
+                            return R300_TX_FORMAT_Y4X4 | result;
+                        case 4:
+                            return R300_TX_FORMAT_W4Z4Y4X4 | result;
+                    }
+                    return ~0;
+
+                case 8:
+                    switch (desc->nr_channels) {
+                        case 1:
+                            return R300_TX_FORMAT_X8 | result;
+                        case 2:
+                            return R300_TX_FORMAT_Y8X8 | result;
+                        case 4:
+                            return R300_TX_FORMAT_W8Z8Y8X8 | result;
+                    }
+                    return ~0;
+
+                case 16:
+                    switch (desc->nr_channels) {
+                        case 1:
+                            return R300_TX_FORMAT_X16 | result;
+                        case 2:
+                            return R300_TX_FORMAT_Y16X16 | result;
+                        case 4:
+                            return R300_TX_FORMAT_W16Z16Y16X16 | result;
+                    }
+            }
+            return ~0;
+
+        case UTIL_FORMAT_TYPE_FLOAT:
+            switch (desc->channel[0].size) {
+                case 16:
+                    switch (desc->nr_channels) {
+                        case 1:
+                            return R300_TX_FORMAT_16F | result;
+                        case 2:
+                            return R300_TX_FORMAT_16F_16F | result;
+                        case 4:
+                            return R300_TX_FORMAT_16F_16F_16F_16F | result;
+                    }
+                    return ~0;
+
+                case 32:
+                    switch (desc->nr_channels) {
+                        case 1:
+                            return R300_TX_FORMAT_32F | result;
+                        case 2:
+                            return R300_TX_FORMAT_32F_32F | result;
+                        case 4:
+                            return R300_TX_FORMAT_32F_32F_32F_32F | result;
+                    }
+            }
+    }
+
+    return ~0; /* Unsupported/unknown. */
+}
+
+/* Returns R500_TXFORMAT_MSB for the formats tested below and 0 for every
+ * other format. */
+uint32_t r500_tx_format_msb_bit(enum pipe_format format)
+{
+    boolean needs_msb =
+        format == PIPE_FORMAT_RGTC1_UNORM ||
+        format == PIPE_FORMAT_RGTC1_SNORM ||
+        format == PIPE_FORMAT_X8Z24_UNORM ||
+        format == PIPE_FORMAT_S8_USCALED_Z24_UNORM;
+
+    return needs_msb ? R500_TXFORMAT_MSB : 0;
+}
+
+/* Buffer formats. */
+
+/* Colorbuffer formats. This is the unswizzled format of the RB3D block's
+ * output. For the swizzling of the targets, check the shader's format.
+ *
+ * Returns the R300_COLOR_FORMAT_* / R500_COLOR_FORMAT_* value for the given
+ * pipe format, or ~0 when the format has no entry below. Within this file
+ * the result is compared against ~0 by
+ * r300_is_colorbuffer_format_supported() and OR'ed into the per-level pitch
+ * word by r300_texture_setup_fb_state().
+ *
+ * Formats that differ only in channel order or signedness share one value
+ * here; r300_translate_out_fmt() supplies the swizzle and the sign bits. */
+static uint32_t r300_translate_colorformat(enum pipe_format format)
+{
+    switch (format) {
+        /* 8-bit buffers. */
+        case PIPE_FORMAT_A8_UNORM:
+        case PIPE_FORMAT_I8_UNORM:
+        case PIPE_FORMAT_L8_UNORM:
+        case PIPE_FORMAT_R8_UNORM:
+        case PIPE_FORMAT_R8_SNORM:
+            return R300_COLOR_FORMAT_I8;
+
+        /* 16-bit buffers. */
+        case PIPE_FORMAT_B5G6R5_UNORM:
+            return R300_COLOR_FORMAT_RGB565;
+
+        case PIPE_FORMAT_B5G5R5A1_UNORM:
+        case PIPE_FORMAT_B5G5R5X1_UNORM:
+            return R300_COLOR_FORMAT_ARGB1555;
+
+        case PIPE_FORMAT_B4G4R4A4_UNORM:
+        case PIPE_FORMAT_B4G4R4X4_UNORM:
+            return R300_COLOR_FORMAT_ARGB4444;
+
+        /* 32-bit buffers. */
+        case PIPE_FORMAT_B8G8R8A8_UNORM:
+        case PIPE_FORMAT_B8G8R8X8_UNORM:
+        case PIPE_FORMAT_A8R8G8B8_UNORM:
+        case PIPE_FORMAT_X8R8G8B8_UNORM:
+        case PIPE_FORMAT_A8B8G8R8_UNORM:
+        case PIPE_FORMAT_R8G8B8A8_SNORM:
+        case PIPE_FORMAT_X8B8G8R8_UNORM:
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
+        case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
+            return R300_COLOR_FORMAT_ARGB8888;
+
+        case PIPE_FORMAT_R10G10B10A2_UNORM:
+        case PIPE_FORMAT_R10G10B10X2_SNORM:
+        case PIPE_FORMAT_B10G10R10A2_UNORM:
+        case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
+            return R500_COLOR_FORMAT_ARGB2101010;  /* R5xx-only? */
+
+        /* 64-bit buffers. */
+        case PIPE_FORMAT_R16G16B16A16_UNORM:
+        case PIPE_FORMAT_R16G16B16A16_SNORM:
+        case PIPE_FORMAT_R16G16B16A16_FLOAT:
+            return R300_COLOR_FORMAT_ARGB16161616;
+
+        /* 128-bit buffers. */
+        case PIPE_FORMAT_R32G32B32A32_FLOAT:
+            return R300_COLOR_FORMAT_ARGB32323232;
+
+        /* YUV buffers. */
+        case PIPE_FORMAT_UYVY:
+            return R300_COLOR_FORMAT_YVYU;
+        case PIPE_FORMAT_YUYV:
+            return R300_COLOR_FORMAT_VYUY;
+        default:
+            return ~0; /* Unsupported. */
+    }
+}
+
+/* Depth/stencil buffer format for the given pipe format, or ~0 when the
+ * format is not one of the depth/stencil layouts handled here. Only two
+ * hardware flavors are produced. */
+static uint32_t r300_translate_zsformat(enum pipe_format format)
+{
+    /* 16-bit depth, no stencil. */
+    if (format == PIPE_FORMAT_Z16_UNORM) {
+        return R300_DEPTHFORMAT_16BIT_INT_Z;
+    }
+
+    /* 24-bit depth; the other 8 bits are either stencil or ignored. */
+    if (format == PIPE_FORMAT_X8Z24_UNORM ||
+        format == PIPE_FORMAT_S8_USCALED_Z24_UNORM) {
+        return R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+    }
+
+    return ~0; /* Unsupported. */
+}
+
+/* Shader output formats. This is essentially the swizzle from the shader
+ * to the RB3D block.
+ *
+ * Note that formats are stored from C3 to C0.
+ *
+ * The returned word is built from three parts: a component-size/type
+ * selector chosen from channel 0 of the format description, one sign bit
+ * per channel whose type is SIGNED, and the C0..C3 component selects for
+ * the format. Returns ~0 when the format has no swizzle entry below. */
+static uint32_t r300_translate_out_fmt(enum pipe_format format)
+{
+    uint32_t modifier = 0;
+    unsigned i;
+    const struct util_format_description *desc;
+    /* Sign flag for channel i, indexed in the same order as desc->channel[]. */
+    static const uint32_t sign_bit[4] = {
+        R300_OUT_SIGN(0x1),
+        R300_OUT_SIGN(0x2),
+        R300_OUT_SIGN(0x4),
+        R300_OUT_SIGN(0x8),
+    };
+
+    desc = util_format_description(format);
+
+    /* Specifies how the shader output is written to the fog unit. */
+    /* Only channel 0 is inspected: float formats pick the 32- or 16-bit FP
+     * selector, everything else picks C4_16 for 16-bit channels and C4_8
+     * otherwise. */
+    if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) {
+        if (desc->channel[0].size == 32) {
+            modifier |= R300_US_OUT_FMT_C4_32_FP;
+        } else {
+            modifier |= R300_US_OUT_FMT_C4_16_FP;
+        }
+    } else {
+        if (desc->channel[0].size == 16) {
+            modifier |= R300_US_OUT_FMT_C4_16;
+        } else {
+            /* C4_8 seems to be used for the formats whose pixel size
+             * is <= 32 bits. */
+            modifier |= R300_US_OUT_FMT_C4_8;
+        }
+    }
+
+    /* Add sign. */
+    /* All four channel slots are visited regardless of how many channels
+     * the format has; only slots typed SIGNED contribute a bit. */
+    for (i = 0; i < 4; i++)
+        if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+            modifier |= sign_bit[i];
+        }
+
+    /* Add swizzles and return. */
+    switch (format) {
+        /* 8-bit outputs.
+         * COLORFORMAT_I8 stores the C2 component. */
+        case PIPE_FORMAT_A8_UNORM:
+            return modifier | R300_C2_SEL_A;
+        case PIPE_FORMAT_I8_UNORM:
+        case PIPE_FORMAT_L8_UNORM:
+        case PIPE_FORMAT_R8_UNORM:
+        case PIPE_FORMAT_R8_SNORM:
+            return modifier | R300_C2_SEL_R;
+
+        /* BGRA outputs. */
+        case PIPE_FORMAT_B5G6R5_UNORM:
+        case PIPE_FORMAT_B5G5R5A1_UNORM:
+        case PIPE_FORMAT_B5G5R5X1_UNORM:
+        case PIPE_FORMAT_B4G4R4A4_UNORM:
+        case PIPE_FORMAT_B4G4R4X4_UNORM:
+        case PIPE_FORMAT_B8G8R8A8_UNORM:
+        case PIPE_FORMAT_B8G8R8X8_UNORM:
+        case PIPE_FORMAT_B10G10R10A2_UNORM:
+            return modifier |
+                R300_C0_SEL_B | R300_C1_SEL_G |
+                R300_C2_SEL_R | R300_C3_SEL_A;
+
+        /* ARGB outputs. */
+        case PIPE_FORMAT_A8R8G8B8_UNORM:
+        case PIPE_FORMAT_X8R8G8B8_UNORM:
+            return modifier |
+                R300_C0_SEL_A | R300_C1_SEL_R |
+                R300_C2_SEL_G | R300_C3_SEL_B;
+
+        /* ABGR outputs. */
+        case PIPE_FORMAT_A8B8G8R8_UNORM:
+        case PIPE_FORMAT_X8B8G8R8_UNORM:
+            return modifier |
+                R300_C0_SEL_A | R300_C1_SEL_B |
+                R300_C2_SEL_G | R300_C3_SEL_R;
+
+        /* RGBA outputs. */
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
+        case PIPE_FORMAT_R8G8B8A8_SNORM:
+        case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
+        case PIPE_FORMAT_R10G10B10A2_UNORM:
+        case PIPE_FORMAT_R10G10B10X2_SNORM:
+        case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
+        case PIPE_FORMAT_R16G16B16A16_UNORM:
+        case PIPE_FORMAT_R16G16B16A16_SNORM:
+        case PIPE_FORMAT_R16G16B16A16_FLOAT:
+        case PIPE_FORMAT_R32G32B32A32_FLOAT:
+            return modifier |
+                R300_C0_SEL_R | R300_C1_SEL_G |
+                R300_C2_SEL_B | R300_C3_SEL_A;
+
+        default:
+            return ~0; /* Unsupported. */
+    }
+}
+
+/* A colorbuffer format is usable only when both the RB3D colorformat and
+ * the shader output format can be translated (neither yields ~0). */
+boolean r300_is_colorbuffer_format_supported(enum pipe_format format)
+{
+    if (r300_translate_colorformat(format) == ~0) {
+        return FALSE;
+    }
+
+    return r300_translate_out_fmt(format) != ~0;
+}
+
+/* True when r300_translate_zsformat() knows the format (result is not ~0). */
+boolean r300_is_zs_format_supported(enum pipe_format format)
+{
+    uint32_t hw_format = r300_translate_zsformat(format);
+
+    return hw_format != ~0;
+}
+
+/* True when r300_translate_texformat() can translate the format; no view
+ * swizzle is supplied for the query. */
+boolean r300_is_sampler_format_supported(enum pipe_format format)
+{
+    uint32_t hw_format = r300_translate_texformat(format, 0);
+
+    return hw_format != ~0;
+}
+
+/* Fill tex->tx_format (format0/format1/format2 and tile_config) from the
+ * texture's dimensions, target, pitch usage and tiling flags. */
+static void r300_texture_setup_immutable_state(struct r300_screen* screen,
+                                               struct r300_texture* tex)
+{
+    struct r300_texture_format_state* state = &tex->tx_format;
+    struct pipe_resource *res = &tex->b.b;
+
+    /* Set sampler state: width and height are stored minus one and masked
+     * to 11 bits each. */
+    state->format0 = R300_TX_WIDTH((res->width0 - 1) & 0x7ff) |
+                     R300_TX_HEIGHT((res->height0 - 1) & 0x7ff);
+
+    if (!tex->uses_pitch) {
+        /* power of two textures (3D, mipmaps, and no pitch) */
+        state->format0 |= R300_TX_DEPTH(util_logbase2(res->depth0) & 0xf);
+    } else {
+        /* rectangles love this */
+        state->format0 |= R300_TX_PITCH_EN;
+        state->format2 = (tex->hwpitch[0] - 1) & 0x1fff;
+    }
+
+    /* Only cube maps and 3D textures need a flag in format1. */
+    switch (res->target) {
+        case PIPE_TEXTURE_CUBE:
+            state->format1 = R300_TX_FORMAT_CUBIC_MAP;
+            break;
+        case PIPE_TEXTURE_3D:
+            state->format1 = R300_TX_FORMAT_3D;
+            break;
+        default:
+            state->format1 = 0;
+            break;
+    }
+
+    /* large textures on r500 */
+    if (screen->caps.is_r500) {
+        if (res->width0 > 2048) {
+            state->format2 |= R500_TXWIDTH_BIT11;
+        }
+        if (res->height0 > 2048) {
+            state->format2 |= R500_TXHEIGHT_BIT11;
+        }
+    }
+
+    state->tile_config = R300_TXO_MACRO_TILE(tex->macrotile) |
+                         R300_TXO_MICRO_TILE(tex->microtile);
+}
+
+/* Fill tex->fb_state (per-level pitch words and the format word) according
+ * to the texture's current format: depth/stencil formats use the Z/S
+ * translation, everything else the colorbuffer + shader-output pair. */
+static void r300_texture_setup_fb_state(struct r300_screen* screen,
+                                        struct r300_texture* tex)
+{
+    const enum pipe_format format = tex->b.b.format;
+    unsigned level;
+
+    /* Set framebuffer state. */
+    if (!util_format_is_depth_or_stencil(format)) {
+        /* The colorformat is the same for every level. */
+        const uint32_t colorformat = r300_translate_colorformat(format);
+
+        for (level = 0; level <= tex->b.b.last_level; level++) {
+            tex->fb_state.pitch[level] =
+                tex->hwpitch[level] |
+                colorformat |
+                R300_COLOR_TILE(tex->mip_macrotile[level]) |
+                R300_COLOR_MICROTILE(tex->microtile);
+        }
+        tex->fb_state.format = r300_translate_out_fmt(format);
+        return;
+    }
+
+    for (level = 0; level <= tex->b.b.last_level; level++) {
+        tex->fb_state.pitch[level] =
+            tex->hwpitch[level] |
+            R300_DEPTHMACROTILE(tex->mip_macrotile[level]) |
+            R300_DEPTHMICROTILE(tex->microtile);
+    }
+    tex->fb_state.format = r300_translate_zsformat(format);
+}
+
+/* Change the format stored in the resource and recompute the framebuffer
+ * state derived from it. Nothing else about the texture is recomputed. */
+void r300_texture_reinterpret_format(struct pipe_screen *screen,
+                                     struct pipe_resource *tex,
+                                     enum pipe_format new_format)
+{
+    struct r300_screen *r300screen = r300_screen(screen);
+    struct r300_texture *r300tex = r300_texture(tex);
+
+    SCREEN_DBG(r300screen, DBG_TEX,
+        "r300: texture_reinterpret_format: %s -> %s\n",
+        util_format_short_name(tex->format),
+        util_format_short_name(new_format));
+
+    tex->format = new_format;
+
+    r300_texture_setup_fb_state(r300screen, r300tex);
+}
+
+/* Offset of one image inside the texture: the level's base offset plus
+ * the layer selected by zslice (3D textures) or face (cube maps). */
+unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
+                                 unsigned zslice, unsigned face)
+{
+    const unsigned base = tex->offset[level];
+    unsigned layer;
+
+    if (tex->b.b.target == PIPE_TEXTURE_3D) {
+        assert(face == 0);
+        layer = zslice;
+    } else if (tex->b.b.target == PIPE_TEXTURE_CUBE) {
+        assert(zslice == 0);
+        layer = face;
+    } else {
+        /* Other targets have a single layer per level. */
+        assert(zslice == 0 && face == 0);
+        return base;
+    }
+
+    return base + layer * tex->layer_size[level];
+}
+
+/* Returns the number of pixels that the texture should be aligned to
+ * in the given dimension.
+ *
+ * tex       supplies the format (pixel size), microtiling mode and sample
+ *           count.
+ * macrotile macrotiling mode to look up; passed separately from the
+ *           texture by the callers in this file.
+ * dim       DIM_WIDTH or DIM_HEIGHT (asserted to be <= DIM_HEIGHT).
+ *
+ * The result is asserted to be non-zero. */
+static unsigned r300_get_pixel_alignment(struct r300_texture *tex,
+                                         enum r300_buffer_tiling macrotile,
+                                         enum r300_dim dim)
+{
+    /* Indexed as table[macrotile][log2(bytes per pixel)][microtile][dim].
+     * A {0, 0} entry marks a combination with no alignment defined; looking
+     * one up trips the assert(res) at the end. */
+    static const unsigned table[2][5][3][2] =
+    {
+        {
+    /* Macro: linear    linear    linear
+       Micro: linear    tiled  square-tiled */
+            {{ 32, 1}, { 8,  4}, { 0,  0}}, /*   8 bits per pixel */
+            {{ 16, 1}, { 8,  2}, { 4,  4}}, /*  16 bits per pixel */
+            {{  8, 1}, { 4,  2}, { 0,  0}}, /*  32 bits per pixel */
+            {{  4, 1}, { 0,  0}, { 2,  2}}, /*  64 bits per pixel */
+            {{  2, 1}, { 0,  0}, { 0,  0}}  /* 128 bits per pixel */
+        },
+        {
+    /* Macro: tiled     tiled     tiled
+       Micro: linear    tiled  square-tiled */
+            {{256, 8}, {64, 32}, { 0,  0}}, /*   8 bits per pixel */
+            {{128, 8}, {64, 16}, {32, 32}}, /*  16 bits per pixel */
+            {{ 64, 8}, {32, 16}, { 0,  0}}, /*  32 bits per pixel */
+            {{ 32, 8}, { 0,  0}, {16, 16}}, /*  64 bits per pixel */
+            {{ 16, 8}, { 0,  0}, { 0,  0}}  /* 128 bits per pixel */
+        }
+    };
+    /* Alignment for multisampled textures, indexed by dim. */
+    static const unsigned aa_block[2] = {4, 8};
+    unsigned res = 0;
+    unsigned pixsize = util_format_get_blocksize(tex->b.b.format);
+
+    /* Guard every table index before it is used. */
+    assert(macrotile <= R300_BUFFER_TILED);
+    assert(tex->microtile <= R300_BUFFER_SQUARETILED);
+    assert(pixsize <= 16);
+    assert(dim <= DIM_HEIGHT);
+
+    if (tex->b.b.nr_samples > 1) {
+        /* Multisampled textures have their own alignment scheme. */
+        /* Only 4-byte pixels are handled; any other size leaves res at 0
+         * and fails the assert below. */
+        if (pixsize == 4)
+            res = aa_block[dim];
+    } else {
+        /* Standard alignment. */
+        res = table[macrotile][util_logbase2(pixsize)][tex->microtile][dim];
+    }
+
+    assert(res);
+    return res;
+}
+
+/* Return true if macrotiling should be enabled on the miplevel, judged in
+ * one dimension: the minified size is compared with the macrotile
+ * alignment for that dimension. */
+static boolean r300_texture_macro_switch(struct r300_texture *tex,
+                                         unsigned level,
+                                         boolean rv350_mode,
+                                         enum r300_dim dim)
+{
+    const unsigned tile_size =
+        r300_get_pixel_alignment(tex, R300_BUFFER_TILED, dim);
+    const unsigned base_size =
+        (dim == DIM_WIDTH) ? tex->b.b.width0 : tex->b.b.height0;
+    const unsigned level_size = u_minify(base_size, level);
+
+    /* See TX_FILTER1_n.MACRO_SWITCH: rv350 mode also accepts a level that
+     * is exactly one tile large. */
+    return rv350_mode ? level_size >= tile_size : level_size > tile_size;
+}
+
+/**
+ * Return the stride, in bytes, of the texture images of the given texture
+ * at the given level.
+ *
+ * A non-zero stride_override wins over everything else. Returns 0 when the
+ * level is beyond last_level.
+ */
+unsigned r300_texture_get_stride(struct r300_screen* screen,
+                                 struct r300_texture* tex, unsigned level)
+{
+    const enum pipe_format format = tex->b.b.format;
+    unsigned level_width, row_bytes;
+    boolean is_rs6xx_igp;
+
+    if (tex->stride_override)
+        return tex->stride_override;
+
+    /* Check the level. */
+    if (level > tex->b.b.last_level) {
+        SCREEN_DBG(screen, DBG_TEX, "%s: level (%u) > last_level (%u)\n",
+                   __FUNCTION__, level, tex->b.b.last_level);
+        return 0;
+    }
+
+    level_width = u_minify(tex->b.b.width0, level);
+
+    /* Non-plain formats only get their byte stride rounded up to 32. */
+    if (!util_format_is_plain(format)) {
+        return align(util_format_get_stride(format, level_width), 32);
+    }
+
+    /* Plain formats: pad the width to the tile alignment of this level. */
+    level_width = align(level_width,
+                        r300_get_pixel_alignment(tex,
+                                                 tex->mip_macrotile[level],
+                                                 DIM_WIDTH));
+    row_bytes = util_format_get_stride(format, level_width);
+
+    /* Some IGPs need a minimum stride of 64 bytes, hmm...
+     * This doesn't seem to apply to tiled textures, according to r300c. */
+    is_rs6xx_igp = screen->caps.family == CHIP_FAMILY_RS600 ||
+                   screen->caps.family == CHIP_FAMILY_RS690 ||
+                   screen->caps.family == CHIP_FAMILY_RS740;
+
+    if (is_rs6xx_igp && !tex->microtile && !tex->mip_macrotile[level]) {
+        return row_bytes < 64 ? 64 : row_bytes;
+    }
+
+    /* The alignment to 32 bytes is sort of implied by the layout... */
+    return row_bytes;
+}
+
+/* Number of block rows occupied by one layer of the given level. For plain
+ * formats the height is first padded to the tile alignment and then to the
+ * next power of two. */
+static unsigned r300_texture_get_nblocksy(struct r300_texture* tex,
+                                          unsigned level)
+{
+    const enum pipe_format format = tex->b.b.format;
+    unsigned rows = u_minify(tex->b.b.height0, level);
+
+    if (util_format_is_plain(format)) {
+        rows = align(rows,
+                     r300_get_pixel_alignment(tex, tex->mip_macrotile[level],
+                                              DIM_HEIGHT));
+
+        /* This is needed for the kernel checker, unfortunately. */
+        rows = util_next_power_of_two(rows);
+    }
+
+    return util_format_get_nblocksy(format, rows);
+}
+
+/* The kernels <= 2.6.34-rc4 compute the size of mipmapped 3D textures
+ * incorrectly. This is a workaround to prevent CS from being rejected:
+ * when the winsys does not report R300_VID_DRM_2_3_0, tex->size of a
+ * mipmapped 3D texture is replaced by the sum of all level layer sizes
+ * multiplied by the full depth. */
+static void r300_texture_3d_fix_mipmapping(struct r300_screen *screen,
+                                           struct r300_texture *tex)
+{
+    unsigned level, total;
+
+    if (screen->rws->get_value(screen->rws, R300_VID_DRM_2_3_0)) {
+        return;
+    }
+
+    if (tex->b.b.target != PIPE_TEXTURE_3D || tex->b.b.last_level < 1) {
+        return;
+    }
+
+    total = 0;
+    for (level = 0; level <= tex->b.b.last_level; level++) {
+        total += r300_texture_get_stride(screen, tex, level) *
+                 r300_texture_get_nblocksy(tex, level);
+    }
+
+    tex->size = total * tex->b.b.depth0;
+}
+
+/* Lay out every mip level of the texture.
+ *
+ * For each level from 0 to last_level this decides whether the level is
+ * macrotiled, then records its offset, layer size and pitch, and grows
+ * tex->size by the size of the level. Offsets start from whatever
+ * tex->size holds on entry. */
+static void r300_setup_miptree(struct r300_screen* screen,
+                               struct r300_texture* tex)
+{
+    struct pipe_resource* base = &tex->b.b;
+    unsigned stride, size, layer_size, nblocksy, i;
+    boolean rv350_mode = screen->caps.is_rv350;
+
+    SCREEN_DBG(screen, DBG_TEXALLOC,
+        "r300: Making miptree for texture, format %s\n",
+        util_format_short_name(base->format));
+
+    for (i = 0; i <= base->last_level; i++) {
+        /* Let's see if this miplevel can be macrotiled. */
+        /* This must be set before the stride/nblocksy queries below, which
+         * read tex->mip_macrotile[i]. */
+        tex->mip_macrotile[i] =
+            (tex->macrotile == R300_BUFFER_TILED &&
+             r300_texture_macro_switch(tex, i, rv350_mode, DIM_WIDTH) &&
+             r300_texture_macro_switch(tex, i, rv350_mode, DIM_HEIGHT)) ?
+             R300_BUFFER_TILED : R300_BUFFER_LINEAR;
+
+        stride = r300_texture_get_stride(screen, tex, i);
+        nblocksy = r300_texture_get_nblocksy(tex, i);
+        layer_size = stride * nblocksy;
+
+        /* A non-zero sample count multiplies the size of each layer. */
+        if (base->nr_samples) {
+            layer_size *= base->nr_samples;
+        }
+
+        /* Cube maps always have six layers; everything else has as many
+         * layers as the minified depth. */
+        if (base->target == PIPE_TEXTURE_CUBE)
+            size = layer_size * 6;
+        else
+            size = layer_size * u_minify(base->depth0, i);
+
+        /* Levels are packed back to back: this level starts where the
+         * previous one ended. */
+        tex->offset[i] = tex->size;
+        tex->size = tex->offset[i] + size;
+        tex->layer_size[i] = layer_size;
+        /* pitch is the stride in blocks, hwpitch the same in pixels. */
+        tex->pitch[i] = stride / util_format_get_blocksize(base->format);
+        tex->hwpitch[i] =
+                tex->pitch[i] * util_format_get_blockwidth(base->format);
+
+        SCREEN_DBG(screen, DBG_TEXALLOC, "r300: Texture miptree: Level %d "
+                "(%dx%dx%d px, pitch %d bytes) %d bytes total, macrotiled %s\n",
+                i, u_minify(base->width0, i), u_minify(base->height0, i),
+                u_minify(base->depth0, i), stride, tex->size,
+                tex->mip_macrotile[i] ? "TRUE" : "FALSE");
+    }
+}
+
+/* Pitch addressing is used when either base dimension is not a power of
+ * two, or when a stride override is set. */
+static void r300_setup_flags(struct r300_texture* tex)
+{
+    boolean width_is_pot = util_is_power_of_two(tex->b.b.width0);
+    boolean height_is_pot = util_is_power_of_two(tex->b.b.height0);
+
+    tex->uses_pitch = !(width_is_pot && height_is_pot) ||
+                      tex->stride_override;
+}
+
+/* Choose the microtiling and macrotiling modes for a new texture.
+ *
+ * tex->microtile and tex->macrotile are only ever raised here; on every
+ * early return they keep the value they had on entry. */
+static void r300_setup_tiling(struct pipe_screen *screen,
+                              struct r300_texture *tex)
+{
+    struct r300_winsys_screen *rws = (struct r300_winsys_screen *)screen->winsys;
+    enum pipe_format format = tex->b.b.format;
+    boolean rv350_mode = r300_screen(screen)->caps.is_rv350;
+    boolean is_zb = util_format_is_depth_or_stencil(format);
+    boolean dbg_no_tiling = SCREEN_DBG_ON(r300_screen(screen), DBG_NO_TILING);
+
+    /* Non-plain formats are never tiled. */
+    if (!util_format_is_plain(format)) {
+        return;
+    }
+
+    /* If height == 1, disable microtiling except for zbuffer. */
+    /* The same early-out applies to non-zbuffer textures when the
+     * DBG_NO_TILING debug option is on. */
+    if (!is_zb && (tex->b.b.height0 == 1 || dbg_no_tiling)) {
+        return;
+    }
+
+    /* Set microtiling. */
+    /* Block sizes other than 1, 2, 4 and 8 bytes get no microtiling. */
+    switch (util_format_get_blocksize(format)) {
+        case 1:
+        case 4:
+            tex->microtile = R300_BUFFER_TILED;
+            break;
+
+        case 2:
+        case 8:
+            /* Square tiling is used only when the winsys reports support. */
+            if (rws->get_value(rws, R300_VID_SQUARE_TILING_SUPPORT)) {
+                tex->microtile = R300_BUFFER_SQUARETILED;
+            }
+            break;
+    }
+
+    /* Reached with DBG_NO_TILING only for zbuffers: they keep the
+     * microtiling chosen above but skip macrotiling. */
+    if (dbg_no_tiling) {
+        return;
+    }
+
+    /* Set macrotiling. */
+    /* Level 0 must qualify in both dimensions. */
+    if (r300_texture_macro_switch(tex, 0, rv350_mode, DIM_WIDTH) &&
+        r300_texture_macro_switch(tex, 0, rv350_mode, DIM_HEIGHT)) {
+        tex->macrotile = R300_BUFFER_TILED;
+    }
+}
+
+/* Report whether the texture's buffer is referenced by the command stream.
+ * The winsys query does not distinguish reads from writes, so a referenced
+ * buffer is reported as both. face and level are not consulted. */
+static unsigned r300_texture_is_referenced(struct pipe_context *context,
+                                           struct pipe_resource *texture,
+                                           unsigned face, unsigned level)
+{
+    struct r300_texture *rtex = (struct r300_texture *)texture;
+    struct r300_context *r300 = r300_context(context);
+
+    if (!r300->rws->is_buffer_referenced(r300->rws, rtex->buffer,
+                                         R300_REF_CS)) {
+        return PIPE_UNREFERENCED;
+    }
+
+    return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+}
+
+/* Drop the texture's reference on its winsys buffer and free the texture
+ * object. The winsys is taken from the resource's own screen pointer. */
+static void r300_texture_destroy(struct pipe_screen *screen,
+                                 struct pipe_resource* texture)
+{
+    struct r300_winsys_screen *rws =
+        (struct r300_winsys_screen *)texture->screen->winsys;
+    struct r300_texture* tex = (struct r300_texture*)texture;
+
+    rws->buffer_reference(rws, &tex->buffer, NULL);
+
+    FREE(tex);
+}
+
+/* Export the texture's buffer through a winsys handle. The handle's stride
+ * is set to the level-0 stride before the winsys fills in the rest.
+ * Returns FALSE when no texture is given. */
+static boolean r300_texture_get_handle(struct pipe_screen* screen,
+                                       struct pipe_resource *texture,
+                                       struct winsys_handle *whandle)
+{
+    struct r300_texture* tex = (struct r300_texture*)texture;
+    struct r300_winsys_screen *rws;
+
+    if (tex == NULL) {
+        return FALSE;
+    }
+
+    rws = (struct r300_winsys_screen *)screen->winsys;
+    whandle->stride = r300_texture_get_stride(r300_screen(screen), tex, 0);
+
+    return rws->buffer_get_handle(rws, tex->buffer, whandle);
+}
+
+/* Resource vtable installed on every r300 texture (see the tex->b.vtbl
+ * assignments in r300_texture_create() and r300_texture_from_handle()).
+ * The initializer is positional: each entry is matched to a member of
+ * struct u_resource_vtbl by order, as named in the trailing comments.
+ * Transfer flush_region and inline_write use the u_default_* helpers. */
+struct u_resource_vtbl r300_texture_vtbl = 
+{
+   r300_texture_get_handle,	      /* get_handle */
+   r300_texture_destroy,	      /* resource_destroy */
+   r300_texture_is_referenced,	      /* is_resource_referenced */
+   r300_texture_get_transfer,	      /* get_transfer */
+   r300_texture_transfer_destroy,     /* transfer_destroy */
+   r300_texture_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   r300_texture_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+
+/* Create a new texture from the template 'base'.
+ *
+ * Returns NULL when the texture object cannot be allocated, when the
+ * template has a zero dimension that matters for its target, or when the
+ * winsys buffer cannot be created. */
+struct pipe_resource* r300_texture_create(struct pipe_screen* screen,
+                                          const struct pipe_resource* base)
+{
+    struct r300_screen* rscreen = r300_screen(screen);
+    struct r300_winsys_screen *rws = (struct r300_winsys_screen *)screen->winsys;
+    struct r300_texture* tex = CALLOC_STRUCT(r300_texture);
+    boolean is_transfer, has_zero_dim;
+
+    if (!tex) {
+        return NULL;
+    }
+
+    /* Refuse to create a texture with size 0. Width always matters; height
+     * only for 2D and cube targets, depth only for 3D. */
+    switch (base->target) {
+        case PIPE_TEXTURE_2D:
+        case PIPE_TEXTURE_CUBE:
+            has_zero_dim = !base->width0 || !base->height0;
+            break;
+        case PIPE_TEXTURE_3D:
+            has_zero_dim = !base->width0 || !base->depth0;
+            break;
+        default:
+            has_zero_dim = !base->width0;
+            break;
+    }
+
+    if (has_zero_dim) {
+        fprintf(stderr, "r300: texture_create: "
+                "Got invalid texture dimensions: %ix%ix%i\n",
+                base->width0, base->height0, base->depth0);
+        FREE(tex);
+        return NULL;
+    }
+
+    is_transfer = (base->flags & R300_RESOURCE_FLAG_TRANSFER) != 0;
+
+    tex->b.b = *base;
+    tex->b.vtbl = &r300_texture_vtbl;
+    pipe_reference_init(&tex->b.b.reference, 1);
+    tex->b.b.screen = screen;
+
+    r300_setup_flags(tex);
+
+    /* Transfer and scanout textures are never tiled here. */
+    if (!is_transfer && !(base->bind & PIPE_BIND_SCANOUT)) {
+        r300_setup_tiling(screen, tex);
+    }
+
+    r300_setup_miptree(rscreen, tex);
+    r300_texture_3d_fix_mipmapping(rscreen, tex);
+    r300_texture_setup_immutable_state(rscreen, tex);
+    r300_texture_setup_fb_state(rscreen, tex);
+
+    SCREEN_DBG(rscreen, DBG_TEX,
+               "r300: texture_create: Macro: %s, Micro: %s, Pitch: %i, "
+               "Dim: %ix%ix%i, LastLevel: %i, Size: %i, Format: %s\n",
+               tex->macrotile ? "YES" : " NO",
+               tex->microtile ? "YES" : " NO",
+               tex->hwpitch[0],
+               base->width0, base->height0, base->depth0, base->last_level,
+               tex->size,
+               util_format_short_name(base->format));
+
+    /* Transfer textures live in GTT, everything else in VRAM. */
+    if (is_transfer) {
+        tex->domain = R300_DOMAIN_GTT;
+    } else {
+        tex->domain = R300_DOMAIN_VRAM;
+    }
+
+    tex->buffer = rws->buffer_create(rws, 2048, base->bind, tex->domain,
+                                     tex->size);
+    if (!tex->buffer) {
+        FREE(tex);
+        return NULL;
+    }
+
+    rws->buffer_set_tiling(rws, tex->buffer,
+            tex->pitch[0] * util_format_get_blocksize(tex->b.b.format),
+            tex->microtile,
+            tex->macrotile);
+
+    return (struct pipe_resource*)tex;
+}
+
+/* Not required to implement u_resource_vtbl, consider moving to another file:
+ *
+ * Create a surface describing one image (face / level / zslice) of a
+ * texture. The surface takes a reference on the texture and copies the
+ * buffer, domain, offset, pitch word and format word needed to use the
+ * image as a render target.
+ *
+ * Returns NULL when the surface cannot be allocated.
+ */
+struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen,
+                                          struct pipe_resource* texture,
+                                          unsigned face,
+                                          unsigned level,
+                                          unsigned zslice,
+                                          unsigned flags)
+{
+    struct r300_texture* tex = r300_texture(texture);
+    struct r300_surface* surface = CALLOC_STRUCT(r300_surface);
+
+    /* Do not form &surface->base from a NULL pointer; report the
+     * allocation failure explicitly. */
+    if (!surface) {
+        return NULL;
+    }
+
+    pipe_reference_init(&surface->base.reference, 1);
+    pipe_resource_reference(&surface->base.texture, texture);
+    surface->base.format = texture->format;
+    surface->base.width = u_minify(texture->width0, level);
+    surface->base.height = u_minify(texture->height0, level);
+    surface->base.usage = flags;
+    surface->base.zslice = zslice;
+    surface->base.face = face;
+    surface->base.level = level;
+
+    /* The buffer pointer is copied without taking a buffer reference; the
+     * texture reference taken above is the only reference acquired here. */
+    surface->buffer = tex->buffer;
+    surface->domain = tex->domain;
+    surface->offset = r300_texture_get_offset(tex, level, zslice, face);
+    surface->pitch = tex->fb_state.pitch[level];
+    surface->format = tex->fb_state.format;
+
+    return &surface->base;
+}
+
+/* Not required to implement u_resource_vtbl, consider moving to another file:
+ *
+ * Destroy a surface: drop its reference on the texture, then free it.
+ */
+void r300_tex_surface_destroy(struct pipe_surface* s)
+{
+    struct pipe_resource **texture_ref = &s->texture;
+
+    pipe_resource_reference(texture_ref, NULL);
+
+    FREE(s);
+}
+
+/* Wrap an existing winsys buffer, identified by 'whandle', in a texture.
+ *
+ * Only 2D textures with depth 1 and a single mip level are accepted. The
+ * stride comes from the handle and the tiling flags from the buffer, except
+ * that a linear depth/stencil buffer is forced to be microtiled when its
+ * block size allows it.
+ *
+ * Returns NULL for unsupported templates, when the handle cannot be opened,
+ * or when the texture object cannot be allocated.
+ */
+struct pipe_resource*
+r300_texture_from_handle(struct pipe_screen* screen,
+                         const struct pipe_resource* base,
+                         struct winsys_handle *whandle)
+{
+    struct r300_winsys_screen *rws = (struct r300_winsys_screen*)screen->winsys;
+    struct r300_screen* rscreen = r300_screen(screen);
+    struct r300_winsys_buffer *buffer;
+    struct r300_texture* tex;
+    boolean override_zb_flags;
+
+    /* Support only 2D textures without mipmaps */
+    if (base->target != PIPE_TEXTURE_2D ||
+        base->depth0 != 1 ||
+        base->last_level != 0) {
+        return NULL;
+    }
+
+    buffer = rws->buffer_from_handle(rws, whandle->handle);
+    if (!buffer) {
+        return NULL;
+    }
+
+    tex = CALLOC_STRUCT(r300_texture);
+    if (!tex) {
+        /* The reference taken by buffer_from_handle() has no owner yet;
+         * release it so the buffer is not leaked. */
+        rws->buffer_reference(rws, &buffer, NULL);
+        return NULL;
+    }
+
+    tex->b.b = *base;
+    tex->b.vtbl = &r300_texture_vtbl;
+    pipe_reference_init(&tex->b.b.reference, 1);
+    tex->b.b.screen = screen;
+    tex->domain = R300_DOMAIN_VRAM;
+
+    tex->stride_override = whandle->stride;
+
+    /* one ref already taken */
+    tex->buffer = buffer;
+
+    rws->buffer_get_tiling(rws, buffer, &tex->microtile, &tex->macrotile);
+    r300_setup_flags(tex);
+    SCREEN_DBG(rscreen, DBG_TEX,
+               "r300: texture_from_handle: Macro: %s, Micro: %s, "
+               "Pitch: % 4i, Dim: %ix%i, Format: %s\n",
+               tex->macrotile ? "YES" : " NO",
+               tex->microtile ? "YES" : " NO",
+               whandle->stride / util_format_get_blocksize(base->format),
+               base->width0, base->height0,
+               util_format_short_name(base->format));
+
+    /* Enforce microtiled zbuffer. */
+    override_zb_flags = util_format_is_depth_or_stencil(base->format) &&
+                        tex->microtile == R300_BUFFER_LINEAR;
+
+    if (override_zb_flags) {
+        switch (util_format_get_blocksize(base->format)) {
+            case 4:
+                tex->microtile = R300_BUFFER_TILED;
+                break;
+
+            case 2:
+                if (rws->get_value(rws, R300_VID_SQUARE_TILING_SUPPORT)) {
+                    tex->microtile = R300_BUFFER_SQUARETILED;
+                    break;
+                }
+                /* Pass through. */
+
+            default:
+                /* No suitable microtiling mode: leave the buffer linear. */
+                override_zb_flags = FALSE;
+        }
+    }
+
+    r300_setup_miptree(rscreen, tex);
+    r300_texture_setup_immutable_state(rscreen, tex);
+    r300_texture_setup_fb_state(rscreen, tex);
+
+    /* Tell the winsys about the new tiling only if it was changed above. */
+    if (override_zb_flags) {
+        rws->buffer_set_tiling(rws, tex->buffer,
+                tex->pitch[0] * util_format_get_blocksize(tex->b.b.format),
+                tex->microtile,
+                tex->macrotile);
+    }
+    return (struct pipe_resource*)tex;
+}
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
new file mode 100644
index 0000000000..99e7694254
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_TEXTURE_H
+#define R300_TEXTURE_H
+
+#include "pipe/p_format.h"
+
+struct pipe_screen;
+struct pipe_resource;
+struct winsys_handle;
+struct r300_texture;
+struct r300_screen;
+
+unsigned r300_get_swizzle_combined(const unsigned char *swizzle_format,
+                                   const unsigned char *swizzle_view);
+
+uint32_t r300_translate_texformat(enum pipe_format format,
+                                  const unsigned char *swizzle_view);
+
+uint32_t r500_tx_format_msb_bit(enum pipe_format format);
+
+unsigned r300_texture_get_stride(struct r300_screen* screen,
+                                 struct r300_texture* tex, unsigned level);
+
+unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
+                                 unsigned zslice, unsigned face);
+
+void r300_texture_reinterpret_format(struct pipe_screen *screen,
+                                     struct pipe_resource *tex,
+                                     enum pipe_format new_format);
+
+boolean r300_is_colorbuffer_format_supported(enum pipe_format format);
+
+boolean r300_is_zs_format_supported(enum pipe_format format);
+
+boolean r300_is_sampler_format_supported(enum pipe_format format);
+
+
+struct pipe_resource*
+r300_texture_from_handle(struct pipe_screen* screen,
+			 const struct pipe_resource* base,
+			 struct winsys_handle *whandle);
+
+struct pipe_resource*
+r300_texture_create(struct pipe_screen* screen,
+		    const struct pipe_resource* templ);
+
+
+struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen,
+					  struct pipe_resource* texture,
+					  unsigned face,
+					  unsigned level,
+					  unsigned zslice,
+					  unsigned flags);
+
+void r300_tex_surface_destroy(struct pipe_surface* s);
+
+#endif /* R300_TEXTURE_H */
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
new file mode 100644
index 0000000000..5394e04f72
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_tgsi_to_rc.h"
+
+#include "radeon_compiler.h"
+#include "radeon_program.h"
+
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_util.h"
+
+/**
+ * Map a TGSI opcode onto the corresponding radeon compiler (RC) opcode.
+ *
+ * Only opcodes with an active case label are translated.  Everything else
+ * (including the commented-out entries kept below as a checklist) falls out
+ * of the switch, is reported on stderr and yields RC_OPCODE_ILLEGAL_OPCODE.
+ */
+static unsigned translate_opcode(unsigned opcode)
+{
+    switch(opcode) {
+        case TGSI_OPCODE_ARL: return RC_OPCODE_ARL;
+        case TGSI_OPCODE_MOV: return RC_OPCODE_MOV;
+        case TGSI_OPCODE_LIT: return RC_OPCODE_LIT;
+        case TGSI_OPCODE_RCP: return RC_OPCODE_RCP;
+        case TGSI_OPCODE_RSQ: return RC_OPCODE_RSQ;
+        case TGSI_OPCODE_EXP: return RC_OPCODE_EXP;
+        case TGSI_OPCODE_LOG: return RC_OPCODE_LOG;
+        case TGSI_OPCODE_MUL: return RC_OPCODE_MUL;
+        case TGSI_OPCODE_ADD: return RC_OPCODE_ADD;
+        case TGSI_OPCODE_DP3: return RC_OPCODE_DP3;
+        case TGSI_OPCODE_DP4: return RC_OPCODE_DP4;
+        case TGSI_OPCODE_DST: return RC_OPCODE_DST;
+        case TGSI_OPCODE_MIN: return RC_OPCODE_MIN;
+        case TGSI_OPCODE_MAX: return RC_OPCODE_MAX;
+        case TGSI_OPCODE_SLT: return RC_OPCODE_SLT;
+        case TGSI_OPCODE_SGE: return RC_OPCODE_SGE;
+        case TGSI_OPCODE_MAD: return RC_OPCODE_MAD;
+        case TGSI_OPCODE_SUB: return RC_OPCODE_SUB;
+        case TGSI_OPCODE_LRP: return RC_OPCODE_LRP;
+     /* case TGSI_OPCODE_CND: return RC_OPCODE_CND; */
+     /* case TGSI_OPCODE_CND0: return RC_OPCODE_CND0; */
+     /* case TGSI_OPCODE_DP2A: return RC_OPCODE_DP2A; */
+                                        /* gap */
+        case TGSI_OPCODE_FRC: return RC_OPCODE_FRC;
+     /* case TGSI_OPCODE_CLAMP: return RC_OPCODE_CLAMP; */
+        case TGSI_OPCODE_FLR: return RC_OPCODE_FLR;
+     /* case TGSI_OPCODE_ROUND: return RC_OPCODE_ROUND; */
+        case TGSI_OPCODE_EX2: return RC_OPCODE_EX2;
+        case TGSI_OPCODE_LG2: return RC_OPCODE_LG2;
+        case TGSI_OPCODE_POW: return RC_OPCODE_POW;
+        case TGSI_OPCODE_XPD: return RC_OPCODE_XPD;
+                                        /* gap */
+        case TGSI_OPCODE_ABS: return RC_OPCODE_ABS;
+     /* case TGSI_OPCODE_RCC: return RC_OPCODE_RCC; */
+        case TGSI_OPCODE_DPH: return RC_OPCODE_DPH;
+        case TGSI_OPCODE_COS: return RC_OPCODE_COS;
+        case TGSI_OPCODE_DDX: return RC_OPCODE_DDX;
+        case TGSI_OPCODE_DDY: return RC_OPCODE_DDY;
+     /* case TGSI_OPCODE_KILP: return RC_OPCODE_KILP; */
+     /* case TGSI_OPCODE_PK2H: return RC_OPCODE_PK2H; */
+     /* case TGSI_OPCODE_PK2US: return RC_OPCODE_PK2US; */
+     /* case TGSI_OPCODE_PK4B: return RC_OPCODE_PK4B; */
+     /* case TGSI_OPCODE_PK4UB: return RC_OPCODE_PK4UB; */
+     /* case TGSI_OPCODE_RFL: return RC_OPCODE_RFL; */
+        case TGSI_OPCODE_SEQ: return RC_OPCODE_SEQ;
+        case TGSI_OPCODE_SFL: return RC_OPCODE_SFL;
+        case TGSI_OPCODE_SGT: return RC_OPCODE_SGT;
+        case TGSI_OPCODE_SIN: return RC_OPCODE_SIN;
+        case TGSI_OPCODE_SLE: return RC_OPCODE_SLE;
+        case TGSI_OPCODE_SNE: return RC_OPCODE_SNE;
+     /* case TGSI_OPCODE_STR: return RC_OPCODE_STR; */
+        case TGSI_OPCODE_TEX: return RC_OPCODE_TEX;
+        case TGSI_OPCODE_TXD: return RC_OPCODE_TXD;
+        case TGSI_OPCODE_TXP: return RC_OPCODE_TXP;
+     /* case TGSI_OPCODE_UP2H: return RC_OPCODE_UP2H; */
+     /* case TGSI_OPCODE_UP2US: return RC_OPCODE_UP2US; */
+     /* case TGSI_OPCODE_UP4B: return RC_OPCODE_UP4B; */
+     /* case TGSI_OPCODE_UP4UB: return RC_OPCODE_UP4UB; */
+     /* case TGSI_OPCODE_X2D: return RC_OPCODE_X2D; */
+     /* case TGSI_OPCODE_ARA: return RC_OPCODE_ARA; */
+     /* case TGSI_OPCODE_ARR: return RC_OPCODE_ARR; */
+     /* case TGSI_OPCODE_BRA: return RC_OPCODE_BRA; */
+     /* case TGSI_OPCODE_CAL: return RC_OPCODE_CAL; */
+     /* case TGSI_OPCODE_RET: return RC_OPCODE_RET; */
+     /* case TGSI_OPCODE_SSG: return RC_OPCODE_SSG; */
+        case TGSI_OPCODE_CMP: return RC_OPCODE_CMP;
+        case TGSI_OPCODE_SCS: return RC_OPCODE_SCS;
+        case TGSI_OPCODE_TXB: return RC_OPCODE_TXB;
+     /* case TGSI_OPCODE_NRM: return RC_OPCODE_NRM; */
+     /* case TGSI_OPCODE_DIV: return RC_OPCODE_DIV; */
+     /* case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; */
+        case TGSI_OPCODE_TXL: return RC_OPCODE_TXL;
+        case TGSI_OPCODE_BRK: return RC_OPCODE_BRK;
+        case TGSI_OPCODE_IF: return RC_OPCODE_IF;
+        case TGSI_OPCODE_BGNLOOP: return RC_OPCODE_BGNLOOP;
+        case TGSI_OPCODE_ELSE: return RC_OPCODE_ELSE;
+        case TGSI_OPCODE_ENDIF: return RC_OPCODE_ENDIF;
+        case TGSI_OPCODE_ENDLOOP: return RC_OPCODE_ENDLOOP;
+     /* case TGSI_OPCODE_PUSHA: return RC_OPCODE_PUSHA; */
+     /* case TGSI_OPCODE_POPA: return RC_OPCODE_POPA; */
+        case TGSI_OPCODE_CEIL: return RC_OPCODE_CEIL;
+     /* case TGSI_OPCODE_I2F: return RC_OPCODE_I2F; */
+     /* case TGSI_OPCODE_NOT: return RC_OPCODE_NOT; */
+        /* NOTE(review): TRUNC is translated to FLR.  Floor and truncate
+         * differ for negative inputs -- confirm this approximation is
+         * intended. */
+        case TGSI_OPCODE_TRUNC: return RC_OPCODE_FLR;
+     /* case TGSI_OPCODE_SHL: return RC_OPCODE_SHL; */
+     /* case TGSI_OPCODE_ISHR: return RC_OPCODE_SHR; */
+     /* case TGSI_OPCODE_AND: return RC_OPCODE_AND; */
+     /* case TGSI_OPCODE_OR: return RC_OPCODE_OR; */
+     /* case TGSI_OPCODE_MOD: return RC_OPCODE_MOD; */
+     /* case TGSI_OPCODE_XOR: return RC_OPCODE_XOR; */
+     /* case TGSI_OPCODE_SAD: return RC_OPCODE_SAD; */
+     /* case TGSI_OPCODE_TXF: return RC_OPCODE_TXF; */
+     /* case TGSI_OPCODE_TXQ: return RC_OPCODE_TXQ; */
+     /* case TGSI_OPCODE_CONT: return RC_OPCODE_CONT; */
+     /* case TGSI_OPCODE_EMIT: return RC_OPCODE_EMIT; */
+     /* case TGSI_OPCODE_ENDPRIM: return RC_OPCODE_ENDPRIM; */
+     /* case TGSI_OPCODE_BGNLOOP2: return RC_OPCODE_BGNLOOP2; */
+     /* case TGSI_OPCODE_BGNSUB: return RC_OPCODE_BGNSUB; */
+     /* case TGSI_OPCODE_ENDLOOP2: return RC_OPCODE_ENDLOOP2; */
+     /* case TGSI_OPCODE_ENDSUB: return RC_OPCODE_ENDSUB; */
+        case TGSI_OPCODE_NOP: return RC_OPCODE_NOP;
+                                        /* gap */
+     /* case TGSI_OPCODE_NRM4: return RC_OPCODE_NRM4; */
+     /* case TGSI_OPCODE_CALLNZ: return RC_OPCODE_CALLNZ; */
+     /* case TGSI_OPCODE_IFC: return RC_OPCODE_IFC; */
+     /* case TGSI_OPCODE_BREAKC: return RC_OPCODE_BREAKC; */
+        case TGSI_OPCODE_KIL: return RC_OPCODE_KIL;
+    }
+
+    /* No active case matched: report the opcode by name and flag it. */
+    fprintf(stderr, "r300: Unknown TGSI/RC opcode: %s\n", tgsi_get_opcode_name(opcode));
+    return RC_OPCODE_ILLEGAL_OPCODE;
+}
+
+/**
+ * Convert a TGSI saturate mode into the RC saturate mode.
+ *
+ * Unrecognised modes are reported on stderr and treated as "no saturation".
+ */
+static unsigned translate_saturate(unsigned saturate)
+{
+    if (saturate == TGSI_SAT_ZERO_ONE)
+        return RC_SATURATE_ZERO_ONE;
+
+    /* Anything that is neither ZERO_ONE nor NONE is unexpected. */
+    if (saturate != TGSI_SAT_NONE)
+        fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
+
+    return RC_SATURATE_NONE;
+}
+
+/**
+ * Convert a TGSI register file into the RC register file.
+ *
+ * Constants and immediates both live in the RC constant file.  Files that
+ * are not handled are reported on stderr and treated as temporaries.
+ */
+static unsigned translate_register_file(unsigned file)
+{
+    if (file == TGSI_FILE_CONSTANT || file == TGSI_FILE_IMMEDIATE)
+        return RC_FILE_CONSTANT;
+    if (file == TGSI_FILE_INPUT)
+        return RC_FILE_INPUT;
+    if (file == TGSI_FILE_OUTPUT)
+        return RC_FILE_OUTPUT;
+    if (file == TGSI_FILE_ADDRESS)
+        return RC_FILE_ADDRESS;
+
+    /* Everything left is either a real temporary or an unhandled file. */
+    if (file != TGSI_FILE_TEMPORARY)
+        fprintf(stderr, "Unhandled register file: %i\n", file);
+
+    return RC_FILE_TEMPORARY;
+}
+
+/**
+ * Translate a TGSI register index into an RC register index.
+ *
+ * Registers outside the immediate file keep their index.  Immediates are
+ * stored in the RC constant pool starting at ttr->immediate_offset, but
+ * only those that could NOT be expressed as a constant swizzle are actually
+ * appended there (see handle_immediate).  The pool slot of an immediate is
+ * therefore its TGSI index minus the number of swizzled immediates that
+ * precede it; using the raw TGSI index would address the wrong slot as soon
+ * as one swizzled immediate comes first.
+ *
+ * For an immediate that is itself swizzled the result is meaningless, but
+ * transform_srcreg overrides the index in that case.
+ *
+ * \param ttr    translation context (immediate_offset, imms_to_swizzle)
+ * \param file   TGSI register file of the register
+ * \param index  TGSI register index
+ * \return the RC register index
+ */
+static int translate_register_index(
+    struct tgsi_to_rc * ttr,
+    unsigned file,
+    int index)
+{
+    unsigned i;
+    int skipped = 0;
+
+    if (file != TGSI_FILE_IMMEDIATE)
+        return index;
+
+    /* Count the immediates before this one that never got a pool slot.
+     * The loop body is not entered while imms_to_swizzle_count is zero. */
+    for (i = 0; i < ttr->imms_to_swizzle_count; i++) {
+        if (ttr->imms_to_swizzle[i].index < (unsigned)index)
+            skipped++;
+    }
+
+    return ttr->immediate_offset + index - skipped;
+}
+
+/**
+ * Fill an RC destination register from a TGSI destination register:
+ * file, index, write mask and relative-addressing flag.
+ */
+static void transform_dstreg(
+    struct tgsi_to_rc * ttr,
+    struct rc_dst_register * dst,
+    struct tgsi_full_dst_register * src)
+{
+    const unsigned tgsi_file = src->Register.File;
+
+    dst->WriteMask = src->Register.WriteMask;
+    dst->RelAddr = src->Register.Indirect;
+    dst->File = translate_register_file(tgsi_file);
+    dst->Index = translate_register_index(ttr, tgsi_file, src->Register.Index);
+}
+
+/**
+ * Fill an RC source register from a TGSI source register.
+ *
+ * The four per-channel TGSI swizzles are packed three bits per channel.
+ * If the source is an immediate that was recorded in ttr->imms_to_swizzle,
+ * the operand is rewritten to temporary register 0 with a swizzle composed
+ * from the recorded constant selectors and the TGSI swizzle.
+ */
+static void transform_srcreg(
+    struct tgsi_to_rc * ttr,
+    struct rc_src_register * dst,
+    struct tgsi_full_src_register * src)
+{
+    unsigned chan, n;
+    unsigned packed = 0;
+
+    /* Pack the TGSI swizzle, 3 bits per channel. */
+    for (chan = 0; chan < 4; chan++)
+        packed |= tgsi_util_get_full_src_register_swizzle(src, chan) << (chan * 3);
+
+    dst->File = translate_register_file(src->Register.File);
+    dst->Index = translate_register_index(ttr, src->Register.File, src->Register.Index);
+    dst->RelAddr = src->Register.Indirect;
+    dst->Swizzle = packed;
+    dst->Abs = src->Register.Absolute;
+    if (src->Register.Negate)
+        dst->Negate = RC_MASK_XYZW;
+    else
+        dst->Negate = 0;
+
+    if (src->Register.File != TGSI_FILE_IMMEDIATE)
+        return;
+
+    /* Look for a recorded swizzle replacement for this immediate. */
+    for (n = 0; n < ttr->imms_to_swizzle_count; n++) {
+        if (ttr->imms_to_swizzle[n].index != src->Register.Index)
+            continue;
+
+        /* Route every channel through the recorded constant selectors. */
+        packed = 0;
+        for (chan = 0; chan < 4; chan++) {
+            packed |= GET_SWZ(ttr->imms_to_swizzle[n].swizzle,
+                tgsi_util_get_full_src_register_swizzle(src, chan)) << (chan * 3);
+        }
+
+        dst->File = RC_FILE_TEMPORARY;
+        dst->Index = 0;
+        dst->Swizzle = packed;
+        return;
+    }
+}
+
+/**
+ * Set the texture target of an RC instruction from the TGSI texture info.
+ *
+ * Shadow targets additionally set TexShadow and mark the instruction's
+ * texture unit in *shadowSamplers.  Texture types not listed leave the
+ * instruction untouched.
+ */
+static void transform_texture(struct rc_instruction * dst, struct tgsi_instruction_texture src,
+                              uint32_t *shadowSamplers)
+{
+    unsigned target;
+    int is_shadow = 0;
+
+    switch(src.Texture) {
+        case TGSI_TEXTURE_SHADOW1D:
+            is_shadow = 1;
+            /* fall-through */
+        case TGSI_TEXTURE_1D:
+            target = RC_TEXTURE_1D;
+            break;
+        case TGSI_TEXTURE_SHADOW2D:
+            is_shadow = 1;
+            /* fall-through */
+        case TGSI_TEXTURE_2D:
+            target = RC_TEXTURE_2D;
+            break;
+        case TGSI_TEXTURE_SHADOWRECT:
+            is_shadow = 1;
+            /* fall-through */
+        case TGSI_TEXTURE_RECT:
+            target = RC_TEXTURE_RECT;
+            break;
+        case TGSI_TEXTURE_3D:
+            target = RC_TEXTURE_3D;
+            break;
+        case TGSI_TEXTURE_CUBE:
+            target = RC_TEXTURE_CUBE;
+            break;
+        default:
+            /* Unknown texture type: nothing to set. */
+            return;
+    }
+
+    dst->U.I.TexSrcTarget = target;
+
+    if (is_shadow) {
+        dst->U.I.TexShadow = 1;
+        *shadowSamplers |= 1 << dst->U.I.TexSrcUnit;
+    }
+}
+
+/**
+ * Append one translated instruction to the end of the RC program.
+ *
+ * Translates opcode and saturate mode, the first destination register (if
+ * there is one), every source register, and the texture info.  A source in
+ * the sampler file is not an operand; it only supplies the texture unit.
+ */
+static void transform_instruction(struct tgsi_to_rc * ttr, struct tgsi_full_instruction * src)
+{
+    struct rc_instruction * rc_inst =
+        rc_insert_new_instruction(ttr->compiler, ttr->compiler->Program.Instructions.Prev);
+    int n;
+
+    rc_inst->U.I.Opcode = translate_opcode(src->Instruction.Opcode);
+    rc_inst->U.I.SaturateMode = translate_saturate(src->Instruction.Saturate);
+
+    if (src->Instruction.NumDstRegs)
+        transform_dstreg(ttr, &rc_inst->U.I.DstReg, &src->Dst[0]);
+
+    for (n = 0; n < src->Instruction.NumSrcRegs; ++n) {
+        struct tgsi_full_src_register * reg = &src->Src[n];
+
+        if (reg->Register.File != TGSI_FILE_SAMPLER) {
+            transform_srcreg(ttr, &rc_inst->U.I.SrcReg[n], reg);
+            continue;
+        }
+
+        /* Sampler "sources" select the texture unit. */
+        rc_inst->U.I.TexSrcUnit = reg->Register.Index;
+    }
+
+    /* Texturing.  Runs after the loop so that TexSrcUnit is already set. */
+    if (src->Instruction.Texture) {
+        transform_texture(rc_inst, src->Texture,
+                          &ttr->compiler->Program.ShadowSamplers);
+    }
+}
+
+/**
+ * Process one TGSI immediate.
+ *
+ * If every component is 0.0, 1.0 or (when ttr->use_half_swizzles is set)
+ * 0.5, the immediate is recorded in ttr->imms_to_swizzle together with the
+ * packed constant selectors (3 bits per channel) and is NOT added to the
+ * constant pool.  Otherwise it is appended to the program constants as an
+ * RC_CONSTANT_IMMEDIATE.
+ *
+ * \param index  TGSI index of this immediate
+ */
+static void handle_immediate(struct tgsi_to_rc * ttr,
+                             struct tgsi_full_immediate * imm,
+                             unsigned index)
+{
+    struct rc_constant constant;
+    unsigned packed = 0;
+    unsigned chan;
+
+    /* Try to express each component as a constant swizzle selector; stop at
+     * the first component that cannot be expressed that way. */
+    for (chan = 0; chan < 4; chan++) {
+        const float value = imm->u[chan].Float;
+        unsigned selector;
+
+        if (value == 0.0f)
+            selector = RC_SWIZZLE_ZERO;
+        else if (value == 0.5f && ttr->use_half_swizzles)
+            selector = RC_SWIZZLE_HALF;
+        else if (value == 1.0f)
+            selector = RC_SWIZZLE_ONE;
+        else
+            break;
+
+        packed |= selector << (chan * 3);
+    }
+
+    if (chan == 4) {
+        /* All four components matched: remember the replacement swizzle. */
+        struct swizzled_imms * slot = &ttr->imms_to_swizzle[ttr->imms_to_swizzle_count];
+
+        slot->index = index;
+        slot->swizzle = packed;
+        ttr->imms_to_swizzle_count++;
+        return;
+    }
+
+    /* A real constant is needed. */
+    constant.Type = RC_CONSTANT_IMMEDIATE;
+    constant.Size = 4;
+    for (chan = 0; chan < 4; ++chan)
+        constant.u.Immediate[chan] = imm->u[chan].Float;
+    rc_constants_add(&ttr->compiler->Program.Constants, &constant);
+}
+
+/**
+ * Translate a TGSI token stream into the RC program of ttr->compiler.
+ *
+ * Steps:
+ *  1. add one external-constant placeholder per declared TGSI constant;
+ *  2. remember where immediates start in the constant pool;
+ *  3. walk the tokens, handling immediates and instructions (declarations
+ *     are ignored; END and RET are dropped);
+ *  4. release the temporary swizzle table and compute inputs/outputs.
+ */
+void r300_tgsi_to_rc(struct tgsi_to_rc * ttr,
+                     const struct tgsi_token * tokens)
+{
+    struct tgsi_parse_context parser;
+    struct tgsi_full_instruction *inst;
+    unsigned next_imm = 0;
+    int c;
+
+    /* Allocate constants placeholders.
+     *
+     * Note: What if declared constants are not contiguous? */
+    c = 0;
+    while (c <= ttr->info->file_max[TGSI_FILE_CONSTANT]) {
+        struct rc_constant placeholder;
+
+        memset(&placeholder, 0, sizeof(placeholder));
+        placeholder.Type = RC_CONSTANT_EXTERNAL;
+        placeholder.Size = 4;
+        placeholder.u.External = c;
+        rc_constants_add(&ttr->compiler->Program.Constants, &placeholder);
+        ++c;
+    }
+
+    /* Immediates are appended right after the external constants. */
+    ttr->immediate_offset = ttr->compiler->Program.Constants.Count;
+
+    ttr->imms_to_swizzle_count = 0;
+    ttr->imms_to_swizzle = malloc(ttr->info->immediate_count * sizeof(struct swizzled_imms));
+
+    tgsi_parse_init(&parser, tokens);
+
+    while (!tgsi_parse_end_of_tokens(&parser)) {
+        unsigned token_type;
+
+        tgsi_parse_token(&parser);
+        token_type = parser.FullToken.Token.Type;
+
+        if (token_type == TGSI_TOKEN_TYPE_IMMEDIATE) {
+            handle_immediate(ttr, &parser.FullToken.FullImmediate, next_imm);
+            next_imm++;
+        } else if (token_type == TGSI_TOKEN_TYPE_INSTRUCTION) {
+            inst = &parser.FullToken.FullInstruction;
+
+            /* This hack with the RET opcode wouldn't work with
+             * conditionals. */
+            if (inst->Instruction.Opcode != TGSI_OPCODE_END &&
+                inst->Instruction.Opcode != TGSI_OPCODE_RET) {
+                transform_instruction(ttr, inst);
+            }
+        }
+        /* Declarations and any other token types need no work here. */
+    }
+
+    tgsi_parse_free(&parser);
+
+    free(ttr->imms_to_swizzle);
+
+    rc_calculate_inputs_outputs(ttr->compiler);
+}
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.h b/src/gallium/drivers/r300/r300_tgsi_to_rc.h
new file mode 100644
index 0000000000..97641a954b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_TGSI_TO_RC_H
+#define R300_TGSI_TO_RC_H
+
+#include "pipe/p_compiler.h"
+
+struct radeon_compiler;
+
+struct tgsi_full_declaration;
+struct tgsi_shader_info;
+struct tgsi_token;
+
+/* One TGSI immediate that is replaced by a constant swizzle instead of
+ * being stored in the constant pool. */
+struct swizzled_imms {
+    /* TGSI index of the immediate. */
+    unsigned index;
+    /* Packed constant swizzle selectors, 3 bits per channel. */
+    unsigned swizzle;
+};
+
+/* State shared by the TGSI -> radeon compiler translation. */
+struct tgsi_to_rc {
+    /* Compiler whose program receives the translated instructions. */
+    struct radeon_compiler * compiler;
+    /* Scan results for the shader being translated (constant range and
+     * immediate count are read from here). */
+    const struct tgsi_shader_info * info;
+
+    /* Constant-pool count at the point where immediates start, i.e. after
+     * the external-constant placeholders were added. */
+    int immediate_offset;
+    /* Table of immediates replaced by swizzles; allocated and freed inside
+     * r300_tgsi_to_rc(). */
+    struct swizzled_imms * imms_to_swizzle;
+    /* Number of valid entries in imms_to_swizzle. */
+    unsigned imms_to_swizzle_count;
+
+    /* Vertex shaders have no half swizzles, and no way to handle them, so
+     * until rc grows proper support, indicate if they're safe to use. */
+    boolean use_half_swizzles;
+};
+
+void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens);
+
+#endif /* R300_TGSI_TO_RC_H */
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
new file mode 100644
index 0000000000..d41f258836
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_transfer.h"
+#include "r300_texture.h"
+#include "r300_screen_buffer.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+#include <string.h>
+
+/* Driver-private transfer object.  The generic pipe_transfer is the first
+ * member so that a pipe_transfer pointer can be cast to this type. */
+struct r300_transfer {
+    /* Parent class */
+    struct pipe_transfer transfer;
+
+    /* Offset from start of buffer.  Only set for unpipelined transfers,
+     * where the texture buffer itself is mapped. */
+    unsigned offset;
+
+    /* Detiled texture.  Temporary texture used as staging storage; NULL
+     * when the texture buffer is mapped directly. */
+    struct r300_texture *detiled_texture;
+};
+
+/* Downcast a generic pipe_transfer to the driver's r300_transfer. */
+static INLINE struct r300_transfer*
+r300_transfer(struct pipe_transfer* transfer)
+{
+    struct r300_transfer *derived = (struct r300_transfer*)transfer;
+
+    return derived;
+}
+
+/* Blit the transfer's box from the real texture into the origin of the
+ * temporary detiled texture (face 0, level 0). */
+static void r300_copy_from_tiled_texture(struct pipe_context *ctx,
+                                         struct r300_transfer *r300transfer)
+{
+    struct pipe_transfer *xfer = (struct pipe_transfer*)r300transfer;
+    struct pipe_resource *staging = &r300transfer->detiled_texture->b.b;
+    struct pipe_resource *source = xfer->resource;
+    struct pipe_subresource staging_sub;
+
+    staging_sub.level = 0;
+    staging_sub.face = 0;
+
+    ctx->resource_copy_region(ctx,
+                              /* destination */
+                              staging, staging_sub, 0, 0, 0,
+                              /* source */
+                              source, xfer->sr,
+                              xfer->box.x, xfer->box.y, xfer->box.z,
+                              /* size */
+                              xfer->box.width, xfer->box.height);
+}
+
+/* Blit the temporary detiled texture (face 0, level 0, from its origin)
+ * back into the transfer's box of the real texture. */
+static void r300_copy_into_tiled_texture(struct pipe_context *ctx,
+                                         struct r300_transfer *r300transfer)
+{
+    struct pipe_transfer *xfer = (struct pipe_transfer*)r300transfer;
+    struct pipe_resource *staging = &r300transfer->detiled_texture->b.b;
+    struct pipe_resource *target = xfer->resource;
+    struct pipe_subresource staging_sub;
+
+    staging_sub.level = 0;
+    staging_sub.face = 0;
+
+    ctx->resource_copy_region(ctx,
+                              /* destination */
+                              target, xfer->sr,
+                              xfer->box.x, xfer->box.y, xfer->box.z,
+                              /* source */
+                              staging, staging_sub, 0, 0, 0,
+                              /* size */
+                              xfer->box.width, xfer->box.height);
+}
+
+/**
+ * Create a transfer object for a box of one texture subresource.
+ *
+ * Two strategies are used:
+ *  - Pipelined: a temporary linear ("detiled") texture of the box size is
+ *    created and used as staging storage.  This is required for tiled
+ *    textures and is also chosen for write-only transfers of blittable
+ *    textures that the hardware still references.  For read transfers the
+ *    box is blitted into the temporary texture right away and flushed.
+ *  - Unpipelined: the texture buffer itself will be mapped; stride and
+ *    offset of the requested level/slice/face are stored in the transfer.
+ *
+ * If the temporary texture cannot be created even after a flush, linear
+ * textures fall back to the unpipelined path; tiled textures fail.
+ *
+ * \param ctx      context performing the transfer
+ * \param texture  texture being accessed; a reference is held by the
+ *                 returned transfer
+ * \param sr       subresource (face, level) being accessed
+ * \param usage    PIPE_TRANSFER_* flags
+ * \param box      region being accessed
+ * \return the new transfer, or NULL on allocation failure
+ */
+struct pipe_transfer*
+r300_texture_get_transfer(struct pipe_context *ctx,
+                          struct pipe_resource *texture,
+                          struct pipe_subresource sr,
+                          unsigned usage,
+                          const struct pipe_box *box)
+{
+    struct r300_texture *tex = r300_texture(texture);
+    struct r300_screen *r300screen = r300_screen(ctx->screen);
+    struct r300_transfer *trans;
+    struct pipe_resource base;
+    boolean referenced_cs, referenced_hw, blittable;
+
+    referenced_cs = r300screen->rws->is_buffer_referenced(
+                                r300screen->rws, tex->buffer, R300_REF_CS);
+    if (referenced_cs) {
+        referenced_hw = TRUE;
+    } else {
+        referenced_hw = r300screen->rws->is_buffer_referenced(
+                                r300screen->rws, tex->buffer, R300_REF_HW);
+    }
+
+    blittable = ctx->screen->is_format_supported(
+            ctx->screen, texture->format, texture->target, 0,
+            PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET, 0);
+
+    trans = CALLOC_STRUCT(r300_transfer);
+    if (trans) {
+        /* Initialize the transfer object. */
+        pipe_resource_reference(&trans->transfer.resource, texture);
+        trans->transfer.sr = sr;
+        trans->transfer.usage = usage;
+        trans->transfer.box = *box;
+
+        /* If the texture is tiled, we must create a temporary detiled texture
+         * for this transfer.
+         * Also make write transfers pipelined.
+         *
+         * Logical operators are used on purpose: the boolean operands are
+         * only guaranteed to be zero / non-zero. */
+        if (tex->microtile || tex->macrotile ||
+            (referenced_hw && !(usage & PIPE_TRANSFER_READ) && blittable)) {
+            /* Start from a fully zeroed template so that no uninitialized
+             * field is handed to resource_create(). */
+            memset(&base, 0, sizeof(base));
+            base.target = PIPE_TEXTURE_2D;
+            base.format = texture->format;
+            base.width0 = box->width;
+            base.height0 = box->height;
+            base.depth0 = 0;
+            base.last_level = 0;
+            base.nr_samples = 0;
+            base.usage = PIPE_USAGE_DYNAMIC;
+            base.bind = 0;
+            base.flags = R300_RESOURCE_FLAG_TRANSFER;
+
+            /* For texture reading, the temporary (detiled) texture is used as
+             * a render target when blitting from a tiled texture. */
+            if (usage & PIPE_TRANSFER_READ) {
+                base.bind |= PIPE_BIND_RENDER_TARGET;
+            }
+            /* For texture writing, the temporary texture is used as a sampler
+             * when blitting into a tiled texture. */
+            if (usage & PIPE_TRANSFER_WRITE) {
+                base.bind |= PIPE_BIND_SAMPLER_VIEW;
+            }
+
+            /* Create the temporary texture. */
+            trans->detiled_texture = r300_texture(
+               ctx->screen->resource_create(ctx->screen,
+                                            &base));
+
+            if (!trans->detiled_texture) {
+                /* Oh crap, the thing can't create the texture.
+                 * Let's flush and try again. */
+                ctx->flush(ctx, 0, NULL);
+
+                trans->detiled_texture = r300_texture(
+                   ctx->screen->resource_create(ctx->screen,
+                                                &base));
+
+                if (!trans->detiled_texture) {
+                    /* For linear textures, it's safe to fallback to
+                     * an unpipelined transfer. */
+                    if (!tex->microtile && !tex->macrotile) {
+                        goto unpipelined;
+                    }
+
+                    /* Otherwise, go to hell. */
+                    fprintf(stderr,
+                        "r300: Failed to create a transfer object, praise.\n");
+                    /* Drop the texture reference taken above before the
+                     * transfer object goes away, otherwise it leaks. */
+                    pipe_resource_reference(&trans->transfer.resource, NULL);
+                    FREE(trans);
+                    return NULL;
+                }
+            }
+
+            assert(!trans->detiled_texture->microtile &&
+                   !trans->detiled_texture->macrotile);
+
+            /* Set the stride.
+             *
+             * Even though we are using an internal texture for this,
+             * the transfer sr, box and usage parameters still reflect
+             * the arguments received to get_transfer.  We just do the
+             * right thing internally.
+             */
+            trans->transfer.stride =
+                r300_texture_get_stride(r300screen, trans->detiled_texture, 0);
+
+            if (usage & PIPE_TRANSFER_READ) {
+                /* We cannot map a tiled texture directly because the data is
+                 * in a different order, therefore we do detiling using a blit. */
+                r300_copy_from_tiled_texture(ctx, trans);
+
+                /* Always referenced in the blit. */
+                ctx->flush(ctx, 0, NULL);
+            }
+            return &trans->transfer;
+        }
+
+    unpipelined:
+        /* Unpipelined transfer. */
+        trans->transfer.stride =
+                r300_texture_get_stride(r300screen, tex, sr.level);
+        trans->offset = r300_texture_get_offset(tex, sr.level, box->z, sr.face);
+
+        /* Pending rendering to the texture must land before it is read. */
+        if (referenced_cs && (usage & PIPE_TRANSFER_READ))
+            ctx->flush(ctx, PIPE_FLUSH_RENDER_CACHE, NULL);
+        return &trans->transfer;
+    }
+    return NULL;
+}
+
+/**
+ * Destroy a transfer created by r300_texture_get_transfer().
+ *
+ * If a temporary detiled texture was used, write transfers are first
+ * blitted back into the real texture and the temporary texture is released.
+ * Finally the texture reference is dropped and the transfer freed.
+ */
+void r300_texture_transfer_destroy(struct pipe_context *ctx,
+                                   struct pipe_transfer *trans)
+{
+    struct r300_transfer *xfer = r300_transfer(trans);
+    const int has_staging = xfer->detiled_texture != NULL;
+
+    /* Flush written data from the staging texture to the real one. */
+    if (has_staging && (trans->usage & PIPE_TRANSFER_WRITE))
+        r300_copy_into_tiled_texture(ctx, xfer);
+
+    if (has_staging) {
+        pipe_resource_reference(
+            (struct pipe_resource**)&xfer->detiled_texture, NULL);
+    }
+
+    pipe_resource_reference(&trans->resource, NULL);
+    FREE(trans);
+}
+
+/**
+ * Map the memory behind a transfer.
+ *
+ * With a temporary detiled texture, its buffer is mapped and returned as
+ * is.  Otherwise the texture buffer is mapped and the pointer is advanced
+ * to the first block of the transfer box.
+ *
+ * \return the mapped pointer, or NULL if mapping the texture buffer failed
+ */
+void* r300_texture_transfer_map(struct pipe_context *ctx,
+                                struct pipe_transfer *transfer)
+{
+    struct r300_winsys_screen *rws = (struct r300_winsys_screen *)ctx->winsys;
+    struct r300_transfer *xfer = r300_transfer(transfer);
+    struct r300_texture *tex = r300_texture(transfer->resource);
+    enum pipe_format format = tex->b.b.format;
+    char *mapped;
+
+    if (xfer->detiled_texture) {
+        /* The detiled texture is of the same size as the region being mapped
+         * (no offset needed). */
+        return rws->buffer_map(rws,
+                               xfer->detiled_texture->buffer,
+                               transfer->usage);
+    }
+
+    /* Tiling is disabled. */
+    mapped = rws->buffer_map(rws, tex->buffer, transfer->usage);
+    if (!mapped)
+        return NULL;
+
+    /* Skip to the subresource, then to the box's block row and column. */
+    return mapped + xfer->offset +
+        transfer->box.y / util_format_get_blockheight(format) * transfer->stride +
+        transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+}
+
+/**
+ * Unmap the buffer mapped by r300_texture_transfer_map(): the temporary
+ * detiled texture's buffer if there is one, the texture's buffer otherwise.
+ */
+void r300_texture_transfer_unmap(struct pipe_context *ctx,
+                                 struct pipe_transfer *transfer)
+{
+    struct r300_winsys_screen *rws = (struct r300_winsys_screen *)ctx->winsys;
+    struct r300_transfer *xfer = r300_transfer(transfer);
+    struct r300_texture *tex = r300_texture(transfer->resource);
+
+    if (!xfer->detiled_texture) {
+        rws->buffer_unmap(rws, tex->buffer);
+        return;
+    }
+
+    rws->buffer_unmap(rws, xfer->detiled_texture->buffer);
+}
diff --git a/src/gallium/drivers/r300/r300_transfer.h b/src/gallium/drivers/r300/r300_transfer.h
new file mode 100644
index 0000000000..0d32a68d1f
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_transfer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_TRANSFER
+#define R300_TRANSFER
+
+#include "pipe/p_context.h"
+
+struct r300_context;
+
+struct pipe_transfer*
+r300_texture_get_transfer(struct pipe_context *ctx,
+			  struct pipe_resource *texture,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box);
+
+void
+r300_texture_transfer_destroy(struct pipe_context *ctx,
+			      struct pipe_transfer *trans);
+
+void*
+r300_texture_transfer_map(struct pipe_context *ctx,
+			  struct pipe_transfer *transfer);
+
+void
+r300_texture_transfer_unmap(struct pipe_context *ctx,
+			    struct pipe_transfer *transfer);
+
+
+#endif
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
new file mode 100644
index 0000000000..b25c786d6b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_vs.h"
+
+#include "r300_context.h"
+#include "r300_screen.h"
+#include "r300_tgsi_to_rc.h"
+#include "r300_reg.h"
+
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "radeon_compiler.h"
+
+/* Walk the output declarations collected in "info" and record, for every
+ * semantic the driver knows about, which output register carries it.
+ * The slot right after the last declared output is reserved for WPOS. */
+static void r300_shader_read_vs_outputs(
+    struct tgsi_shader_info* info,
+    struct r300_shader_semantics* vs_outputs)
+{
+    int i;
+
+    /* Start from a clean table before filling in the used slots. */
+    r300_shader_semantics_reset(vs_outputs);
+
+    for (i = 0; i < info->num_outputs; i++) {
+        const unsigned sem_name = info->output_semantic_name[i];
+        const unsigned sem_index = info->output_semantic_index[i];
+
+        switch (sem_name) {
+            case TGSI_SEMANTIC_POSITION:
+                assert(sem_index == 0);
+                vs_outputs->pos = i;
+                break;
+
+            case TGSI_SEMANTIC_PSIZE:
+                assert(sem_index == 0);
+                vs_outputs->psize = i;
+                break;
+
+            case TGSI_SEMANTIC_FOG:
+                assert(sem_index == 0);
+                vs_outputs->fog = i;
+                break;
+
+            case TGSI_SEMANTIC_COLOR:
+                assert(sem_index < ATTR_COLOR_COUNT);
+                vs_outputs->color[sem_index] = i;
+                break;
+
+            case TGSI_SEMANTIC_BCOLOR:
+                assert(sem_index < ATTR_COLOR_COUNT);
+                vs_outputs->bcolor[sem_index] = i;
+                break;
+
+            case TGSI_SEMANTIC_GENERIC:
+                assert(sem_index < ATTR_GENERIC_COUNT);
+                vs_outputs->generic[sem_index] = i;
+                break;
+
+            case TGSI_SEMANTIC_EDGEFLAG:
+                /* Recognized, but there is nowhere to route it. */
+                assert(sem_index == 0);
+                fprintf(stderr, "r300 VP: cannot handle edgeflag output.\n");
+                break;
+
+            default:
+                fprintf(stderr, "r300 VP: unknown vertex output semantic: %i.\n",
+                        info->output_semantic_name[i]);
+        }
+    }
+
+    /* Here i == info->num_outputs: WPOS, a straight copy of POSITION that is
+     * always emitted, takes the first slot past the declared outputs. */
+    vs_outputs->wpos = i;
+}
+
+/* Compiler callback (installed as SetHwInputOutput by
+ * r300_translate_vertex_shader) that fills in c->code->inputs/outputs.
+ *
+ * Inputs are mapped one-to-one. Outputs are numbered with a running counter
+ * "reg" in this fixed order: position, point size, colors, back-face colors,
+ * generics (texture coordinates), fog, WPOS. c->UserData is the
+ * r300_vertex_shader being compiled. */
+static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
+{
+    struct r300_vertex_shader * vs = c->UserData;
+    struct r300_shader_semantics* outputs = &vs->outputs;
+    struct tgsi_shader_info* info = &vs->info;
+    int i, reg = 0;
+    boolean any_bcolor_used = outputs->bcolor[0] != ATTR_UNUSED ||
+                              outputs->bcolor[1] != ATTR_UNUSED;
+
+    /* Fill in the input mapping */
+    for (i = 0; i < info->num_inputs; i++)
+        c->code->inputs[i] = i;
+
+    /* Position. */
+    if (outputs->pos != ATTR_UNUSED) {
+        c->code->outputs[outputs->pos] = reg++;
+    } else {
+        /* A vertex shader without a position output is not expected here. */
+        assert(0);
+    }
+
+    /* Point size. */
+    if (outputs->psize != ATTR_UNUSED) {
+        c->code->outputs[outputs->psize] = reg++;
+    }
+
+    /* If we're writing back facing colors we need to send
+     * four colors to make front/back face colors selection work.
+     * If the vertex program doesn't write all 4 colors, lets
+     * pretend it does by skipping output index reg so the colors
+     * get written into appropriate output vectors.
+     */
+
+    /* Colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        if (outputs->color[i] != ATTR_UNUSED) {
+            c->code->outputs[outputs->color[i]] = reg++;
+        } else if (any_bcolor_used ||
+                   outputs->color[1] != ATTR_UNUSED) {
+            /* Unwritten color: still reserve its slot. The color[1] test can
+             * only be true here for i == 0 (COLOR1 written, COLOR0 not). */
+            reg++;
+        }
+    }
+
+    /* Back-face colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        if (outputs->bcolor[i] != ATTR_UNUSED) {
+            c->code->outputs[outputs->bcolor[i]] = reg++;
+        } else if (any_bcolor_used) {
+            /* Reserve the slot of the missing back-face color. */
+            reg++;
+        }
+    }
+
+    /* Texture coordinates. */
+    for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+        if (outputs->generic[i] != ATTR_UNUSED) {
+            c->code->outputs[outputs->generic[i]] = reg++;
+        }
+    }
+
+    /* Fog coordinates. */
+    if (outputs->fog != ATTR_UNUSED) {
+        c->code->outputs[outputs->fog] = reg++;
+    }
+
+    /* WPOS. Always assigned; outputs->wpos is set unconditionally by
+     * r300_shader_read_vs_outputs. */
+    c->code->outputs[outputs->wpos] = reg++;
+}
+
+/* Scan the shader tokens into vs->info, then derive the per-semantic output
+ * table vs->outputs from that scan. */
+void r300_init_vs_outputs(struct r300_vertex_shader *vs)
+{
+    struct tgsi_shader_info *scan = &vs->info;
+
+    tgsi_scan_shader(vs->state.tokens, scan);
+    r300_shader_read_vs_outputs(scan, &vs->outputs);
+}
+
+/* Replace the tokens of "shader" with a trivial program that writes the
+ * constant (0, 0, 0, 1) to POSITION, i.e. one that renders nothing, and
+ * compile it. Used as the fallback when the real shader fails to compile. */
+static void r300_dummy_vertex_shader(
+    struct r300_context* r300,
+    struct r300_vertex_shader* shader)
+{
+    struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
+    struct ureg_dst pos_out = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
+    struct ureg_src origin = ureg_imm4f(ureg, 0, 0, 0, 1);
+
+    /* MOV OUT[pos], (0, 0, 0, 1); END */
+    ureg_MOV(ureg, pos_out, origin);
+    ureg_END(ureg);
+
+    /* Keep a private copy of the tokens; the builder is destroyed below. */
+    shader->state.tokens = tgsi_dup_tokens(ureg_finalize(ureg));
+    ureg_destroy(ureg);
+
+    /* Mark the shader as a dummy before compiling, so that a second failure
+     * is detected by r300_translate_vertex_shader. */
+    shader->dummy = TRUE;
+    r300_init_vs_outputs(shader);
+    r300_translate_vertex_shader(r300, shader);
+}
+
+/* Compile the TGSI tokens of "vs" into hardware vertex program code
+ * (vs->code).
+ *
+ * If the compiler reports an error, the shader is replaced by a dummy one
+ * (see r300_dummy_vertex_shader); if even the dummy shader fails to compile,
+ * the process is aborted. On success the constant counts in "vs" are
+ * updated. */
+void r300_translate_vertex_shader(struct r300_context *r300,
+                                  struct r300_vertex_shader *vs)
+{
+    struct r300_vertex_program_compiler compiler;
+    struct tgsi_to_rc ttr;
+    /* Every declared output plus the extra WPOS output must be written. */
+    const unsigned num_required = vs->info.num_outputs + 1;
+
+    /* Setup the compiler */
+    rc_init(&compiler.Base);
+
+    compiler.Base.Debug = DBG_ON(r300, DBG_VP);
+    compiler.code = &vs->code;
+    compiler.UserData = vs;
+    compiler.Base.is_r500 = r300->screen->caps.is_r500;
+    compiler.Base.max_temp_regs = 32;
+
+    if (compiler.Base.Debug) {
+        debug_printf("r300: Initial vertex program\n");
+        tgsi_dump(vs->state.tokens, 0);
+    }
+
+    /* Translate TGSI to our internal representation */
+    ttr.compiler = &compiler.Base;
+    ttr.info = &vs->info;
+    ttr.use_half_swizzles = FALSE;
+
+    r300_tgsi_to_rc(&ttr, vs->state.tokens);
+
+    /* Mask with the low num_required bits set. Built from an unsigned
+     * constant because left-shifting a negative int is undefined, and
+     * guarded because shifting by the full width is undefined as well. */
+    compiler.RequiredOutputs =
+        num_required >= 32 ? ~0u : ~(~0u << num_required);
+    compiler.SetHwInputOutput = &set_vertex_inputs_outputs;
+
+    /* Insert the WPOS output. */
+    rc_copy_output(&compiler.Base, 0, vs->outputs.wpos);
+
+    /* Invoke the compiler */
+    r3xx_compile_vertex_program(&compiler);
+    if (compiler.Base.Error) {
+        /* XXX We should fallback using Draw. */
+        fprintf(stderr, "r300 VP: Compiler error:\n%sUsing a dummy shader"
+                " instead.\nIf there's an 'unknown opcode' message, please"
+                " file a bug report and attach this log.\n", compiler.Base.ErrorMsg);
+
+        /* The dummy shader itself failed: nothing left to fall back to. */
+        if (vs->dummy) {
+            fprintf(stderr, "r300 VP: Cannot compile the dummy shader! "
+                    "Giving up...\n");
+            abort();
+        }
+
+        /* Release this compiler before the dummy shader re-enters us. */
+        rc_destroy(&compiler.Base);
+        r300_dummy_vertex_shader(r300, vs);
+        return;
+    }
+
+    /* Initialize numbers of constants for each type. */
+    vs->externals_count = ttr.immediate_offset;
+    vs->immediates_count = vs->code.constants.Count - vs->externals_count;
+
+    /* And, finally... */
+    rc_destroy(&compiler.Base);
+}
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
new file mode 100644
index 0000000000..170de6c79d
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_VS_H
+#define R300_VS_H
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+#include "radeon_code.h"
+
+#include "r300_context.h"
+#include "r300_shader_semantics.h"
+
+struct r300_context;
+
+/* Driver-side vertex shader object, shared by the HW TCL path (r300_vs.c)
+ * and the SW TCL path (r300_vs_draw.c). */
+struct r300_vertex_shader {
+    /* Parent class */
+    struct pipe_shader_state state;
+
+    /* Result of tgsi_scan_shader on state.tokens and the per-semantic output
+     * table derived from it (both filled by r300_init_vs_outputs). */
+    struct tgsi_shader_info info;
+    struct r300_shader_semantics outputs;
+
+    /* Whether the shader was replaced by a dummy one due to a shader
+     * compilation failure. */
+    boolean dummy;
+
+    /* Numbers of constants for each type. Both are set at the end of a
+     * successful r300_translate_vertex_shader. */
+    unsigned externals_count;
+    unsigned immediates_count;
+
+    /* HWTCL-specific.  */
+    /* Machine code (if translated) */
+    struct r300_vertex_program_code code;
+
+    /* SWTCL-specific. */
+    /* Handle returned by draw_create_vertex_shader. */
+    void *draw_vs;
+};
+
+/* Scan vs->state.tokens into vs->info and fill vs->outputs from it. */
+void r300_init_vs_outputs(struct r300_vertex_shader *vs);
+
+/* Compile vs->state.tokens into vs->code. On a compiler error the shader is
+ * replaced by a dummy one that outputs a constant position. */
+void r300_translate_vertex_shader(struct r300_context *r300,
+                                  struct r300_vertex_shader *vs);
+
+/* SW TCL path: rewrite the shader tokens for the r300 rasterizer, create
+ * vs->draw_vs from them and set up vs->outputs. */
+void r300_draw_init_vertex_shader(struct draw_context *draw,
+                                  struct r300_vertex_shader *vs);
+
+#endif /* R300_VS_H */
diff --git a/src/gallium/drivers/r300/r300_vs_draw.c b/src/gallium/drivers/r300/r300_vs_draw.c
new file mode 100644
index 0000000000..d64040b891
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_vs_draw.c
@@ -0,0 +1,358 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* This file contains the vertex shader transformations for SW TCL needed
+ * to overcome the limitations of the r300 rasterizer.
+ *
+ * Transformations:
+ * 1) If the secondary color output is present, the primary color must be
+ *    inserted before it.
+ * 2) If any back-face color output is present, there must be all 4 color
+ *    outputs and missing ones must be inserted.
+ * 3) Insert a trailing texcoord output containing a copy of POS, for WPOS.
+ *
+ * I know this code is cumbersome, but I don't know of any nicer way
+ * of transforming TGSI shaders. ~ M.
+ */
+
+#include "r300_vs.h"
+
+#include <stdio.h>
+
+#include "tgsi/tgsi_transform.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "draw/draw_context.h"
+
+/* State of one run of the SW TCL vertex shader transform. The embedded
+ * tgsi_transform_context must stay the first member: the callbacks cast the
+ * base pointer back to this type. */
+struct vs_transform_context {
+    struct tgsi_transform_context base;
+
+    /* Which COLOR / BCOLOR outputs are declared (or were inserted) and which
+     * temporaries the shader declares, indexed by semantic index and by
+     * register index respectively. */
+    boolean color_used[2];
+    boolean bcolor_used[2];
+    boolean temp_used[128];
+
+    /* Index of the pos output, typically 0. */
+    unsigned pos_output;
+    /* Index of the pos temp where all writes of pos are redirected to. */
+    unsigned pos_temp;
+    /* The index of the last generic output, after which we insert a new
+     * output for WPOS. */
+    int last_generic;
+
+    /* Number of output declarations emitted so far, original and inserted. */
+    unsigned num_outputs;
+    /* Used to shift output decl. indices when inserting new ones. */
+    unsigned decl_shift;
+    /* Used to remap writes to output decls if their indices changed. */
+    unsigned out_remap[32];
+
+    /* First instruction processed? */
+    boolean first_instruction;
+    /* End instruction processed? */
+    boolean end_instruction;
+};
+
+/* Emit a declaration for the single temporary register TEMP[reg]. */
+static void emit_temp(struct tgsi_transform_context *ctx, unsigned reg)
+{
+    struct tgsi_full_declaration temp_decl = tgsi_default_full_declaration();
+
+    temp_decl.Declaration.File = TGSI_FILE_TEMPORARY;
+    /* One-register range: First == Last. */
+    temp_decl.Range.Last = reg;
+    temp_decl.Range.First = temp_decl.Range.Last;
+
+    ctx->emit_declaration(ctx, &temp_decl);
+}
+
+/* Emit a declaration for the single output register OUT[reg] carrying the
+ * semantic name/index with the given interpolation mode, and count it in
+ * the context's num_outputs. */
+static void emit_output(struct tgsi_transform_context *ctx,
+                        unsigned name, unsigned index, unsigned interp,
+                        unsigned reg)
+{
+    struct tgsi_full_declaration out_decl = tgsi_default_full_declaration();
+    struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
+
+    out_decl.Declaration.File = TGSI_FILE_OUTPUT;
+    out_decl.Declaration.Interpolate = interp;
+    out_decl.Declaration.Semantic = TRUE;
+
+    out_decl.Semantic.Name = name;
+    out_decl.Semantic.Index = index;
+
+    /* One-register range: First == Last. */
+    out_decl.Range.Last = reg;
+    out_decl.Range.First = out_decl.Range.Last;
+
+    ctx->emit_declaration(ctx, &out_decl);
+    vsctx->num_outputs++;
+}
+
+/* Insert a new output declaration in front of the declaration "before".
+ *
+ * "before" must still carry its ORIGINAL register range (transform_decl
+ * applies decl_shift to it only after all insertions are done). Every
+ * original output from before->Range.First upwards is moved one register to
+ * the right: out_remap is bumped for writes, decl_shift for declarations. */
+static void insert_output(struct tgsi_transform_context *ctx,
+                          struct tgsi_full_declaration *before,
+                          unsigned name, unsigned index, unsigned interp)
+{
+    struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
+    unsigned i;
+
+    /* Make a place for the new output. */
+    for (i = before->Range.First; i < Elements(vsctx->out_remap); i++) {
+        ++vsctx->out_remap[i];
+    }
+
+    /* Insert the new output. Earlier insertions have already pushed the
+     * outputs to the right by decl_shift, so the new register must be placed
+     * at the shifted position; otherwise a second insertion would reuse a
+     * register that is already taken. */
+    emit_output(ctx, name, index, interp,
+                before->Range.First + vsctx->decl_shift);
+
+    ++vsctx->decl_shift;
+}
+
+/* Make sure BCOLOR1 is declared whenever BCOLOR0 is; without it the
+ * rasterizer doesn't do the color selection correctly.
+ *
+ * With a non-NULL "before" the output is inserted in front of that
+ * declaration, otherwise it is appended after the outputs emitted so far. */
+static void insert_trailing_bcolor(struct tgsi_transform_context *ctx,
+                                   struct tgsi_full_declaration *before)
+{
+    struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
+
+    /* Nothing to do unless BCOLOR0 is present and BCOLOR1 is missing. */
+    if (!vsctx->bcolor_used[0] || vsctx->bcolor_used[1]) {
+        return;
+    }
+
+    if (!before) {
+        emit_output(ctx, TGSI_SEMANTIC_BCOLOR, 1,
+                    TGSI_INTERPOLATE_LINEAR, vsctx->num_outputs);
+    } else {
+        insert_output(ctx, before, TGSI_SEMANTIC_BCOLOR, 1,
+                      TGSI_INTERPOLATE_LINEAR);
+    }
+
+    vsctx->bcolor_used[1] = TRUE;
+}
+
+/* Declaration callback of the SW TCL shader transform.
+ *
+ * For output declarations it records which semantics are present, inserts
+ * the color outputs the rasterizer needs but the shader lacks, and shifts
+ * the declaration's register range past the inserted ones. For temporary
+ * declarations it records which registers are taken, so that a free one can
+ * later be picked for the position. The (possibly shifted) declaration is
+ * always re-emitted. */
+static void transform_decl(struct tgsi_transform_context *ctx,
+                           struct tgsi_full_declaration *decl)
+{
+    struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
+    unsigned i;
+
+    if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+        switch (decl->Semantic.Name) {
+            case TGSI_SEMANTIC_POSITION:
+                vsctx->pos_output = decl->Range.First;
+                break;
+
+            case TGSI_SEMANTIC_COLOR:
+                assert(decl->Semantic.Index < 2);
+                vsctx->color_used[decl->Semantic.Index] = TRUE;
+
+                /* We must rasterize the first color if the second one is
+                 * used, otherwise the rasterizer doesn't do the color
+                 * selection correctly. Declare it, but don't write to it. */
+                if (decl->Semantic.Index == 1 && !vsctx->color_used[0]) {
+                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
+                                  TGSI_INTERPOLATE_LINEAR);
+                    vsctx->color_used[0] = TRUE;
+                }
+                break;
+
+            case TGSI_SEMANTIC_BCOLOR:
+                assert(decl->Semantic.Index < 2);
+                vsctx->bcolor_used[decl->Semantic.Index] = TRUE;
+
+                /* We must rasterize all 4 colors if back-face colors are
+                 * used, otherwise the rasterizer doesn't do the color
+                 * selection correctly. Declare it, but don't write to it. */
+                if (!vsctx->color_used[0]) {
+                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
+                                  TGSI_INTERPOLATE_LINEAR);
+                    vsctx->color_used[0] = TRUE;
+                }
+                if (!vsctx->color_used[1]) {
+                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 1,
+                                  TGSI_INTERPOLATE_LINEAR);
+                    vsctx->color_used[1] = TRUE;
+                }
+                if (decl->Semantic.Index == 1 && !vsctx->bcolor_used[0]) {
+                    insert_output(ctx, decl, TGSI_SEMANTIC_BCOLOR, 0,
+                                  TGSI_INTERPOLATE_LINEAR);
+                    /* BCOLOR0 now exists. This used to set color_used[2],
+                     * which is past the end of that 2-element array. */
+                    vsctx->bcolor_used[0] = TRUE;
+                }
+                /* One more case is handled in insert_trailing_bcolor. */
+                break;
+
+            case TGSI_SEMANTIC_GENERIC:
+                vsctx->last_generic = MAX2(vsctx->last_generic, decl->Semantic.Index);
+                break;
+        }
+
+        if (decl->Semantic.Name != TGSI_SEMANTIC_BCOLOR) {
+            /* Insert it as soon as possible. */
+            insert_trailing_bcolor(ctx, decl);
+        }
+
+        /* Since we're inserting new outputs in between, the following outputs
+         * should be moved to the right so that they don't overlap with
+         * the newly added ones. */
+        decl->Range.First += vsctx->decl_shift;
+        decl->Range.Last += vsctx->decl_shift;
+
+        ++vsctx->num_outputs;
+    } else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) {
+        /* Remember every declared temp; transform_inst picks an unused one
+         * to hold the position. */
+        for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+            vsctx->temp_used[i] = TRUE;
+        }
+    }
+
+    ctx->emit_declaration(ctx, decl);
+}
+
+/* Instruction callback of the SW TCL shader transform.
+ *
+ * - Before the first instruction: emits the remaining declarations (trailing
+ *   BCOLOR1 if needed, the generic output used for WPOS, and a free temp that
+ *   stands in for the position).
+ * - For every instruction but END: redirects position writes to that temp,
+ *   remaps writes to outputs whose registers were shifted, and adjusts the
+ *   labels affected by the two inserted instructions.
+ * - At END: first emits two MOVs copying the temp to the position output and
+ *   to the last output (WPOS).
+ * The incoming instruction itself is always emitted last. */
+static void transform_inst(struct tgsi_transform_context *ctx,
+                           struct tgsi_full_instruction *inst)
+{
+    struct vs_transform_context *vsctx = (struct vs_transform_context *) ctx;
+    struct tgsi_full_instruction new_inst;
+    unsigned i;
+
+    if (!vsctx->first_instruction) {
+        vsctx->first_instruction = TRUE;
+
+        /* The trailing BCOLOR should be inserted before the code
+         * if it hasn't already been done so. */
+        insert_trailing_bcolor(ctx, NULL);
+
+        /* Insert the generic output for WPOS. */
+        emit_output(ctx, TGSI_SEMANTIC_GENERIC, vsctx->last_generic + 1,
+                    TGSI_INTERPOLATE_PERSPECTIVE, vsctx->num_outputs);
+
+        /* Find a free temp for POSITION. */
+        /* NOTE(review): if all temps are in use, no declaration is emitted
+         * and pos_temp keeps its initial value. */
+        for (i = 0; i < Elements(vsctx->temp_used); i++) {
+            if (!vsctx->temp_used[i]) {
+                emit_temp(ctx, i);
+                vsctx->pos_temp = i;
+                break;
+            }
+        }
+    }
+
+    if (inst->Instruction.Opcode == TGSI_OPCODE_END) {
+        /* MOV OUT[pos_output], TEMP[pos_temp]; */
+        new_inst = tgsi_default_full_instruction();
+        new_inst.Instruction.Opcode = TGSI_OPCODE_MOV;
+        new_inst.Instruction.NumDstRegs = 1;
+        new_inst.Dst[0].Register.File = TGSI_FILE_OUTPUT;
+        new_inst.Dst[0].Register.Index = vsctx->pos_output;
+        new_inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+        new_inst.Instruction.NumSrcRegs = 1;
+        new_inst.Src[0].Register.File = TGSI_FILE_TEMPORARY;
+        new_inst.Src[0].Register.Index = vsctx->pos_temp;
+        ctx->emit_instruction(ctx, &new_inst);
+
+        /* MOV OUT[n-1], TEMP[pos_temp]; */
+        /* OUT[n-1] is the WPOS generic, the last output emitted above. */
+        new_inst = tgsi_default_full_instruction();
+        new_inst.Instruction.Opcode = TGSI_OPCODE_MOV;
+        new_inst.Instruction.NumDstRegs = 1;
+        new_inst.Dst[0].Register.File = TGSI_FILE_OUTPUT;
+        new_inst.Dst[0].Register.Index = vsctx->num_outputs - 1;
+        new_inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+        new_inst.Instruction.NumSrcRegs = 1;
+        new_inst.Src[0].Register.File = TGSI_FILE_TEMPORARY;
+        new_inst.Src[0].Register.Index = vsctx->pos_temp;
+        ctx->emit_instruction(ctx, &new_inst);
+
+        /* From now on label-carrying opcodes need their labels moved. */
+        vsctx->end_instruction = TRUE;
+    } else {
+        /* Not an END instruction. */
+        /* Fix writes to outputs. */
+        for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
+            struct tgsi_full_dst_register *dst = &inst->Dst[i];
+            if (dst->Register.File == TGSI_FILE_OUTPUT) {
+                if (dst->Register.Index == vsctx->pos_output) {
+                    /* Replace writes to OUT[pos_output] with TEMP[pos_temp]. */
+                    dst->Register.File = TGSI_FILE_TEMPORARY;
+                    dst->Register.Index = vsctx->pos_temp;
+                } else {
+                    /* Not a position, good...
+                     * Since we were changing the indices of output decls,
+                     * we must redirect writes into them too. */
+                    dst->Register.Index = vsctx->out_remap[dst->Register.Index];
+                }
+            }
+        }
+
+        /* Inserting 2 instructions before the END opcode moves all following
+         * labels by 2. Subroutines are always after the END opcode so
+         * they're always moved. */
+        if (inst->Instruction.Opcode == TGSI_OPCODE_CAL) {
+            inst->Label.Label += 2;
+        }
+        /* The labels of the following opcodes are moved only after
+         * the END opcode. */
+        if (vsctx->end_instruction &&
+            (inst->Instruction.Opcode == TGSI_OPCODE_IF ||
+             inst->Instruction.Opcode == TGSI_OPCODE_ELSE ||
+             inst->Instruction.Opcode == TGSI_OPCODE_BGNLOOP ||
+             inst->Instruction.Opcode == TGSI_OPCODE_ENDLOOP)) {
+            inst->Label.Label += 2;
+        }
+    }
+
+    /* Emit the (possibly patched) original instruction, END included. */
+    ctx->emit_instruction(ctx, inst);
+}
+
+/* Prepare a vertex shader for the SW TCL (Draw module) path.
+ *
+ * Runs the transform implemented by transform_decl/transform_inst over
+ * vs->state.tokens, replaces the shader's tokens with the transformed ones,
+ * creates the Draw shader (vs->draw_vs) and rebuilds vs->outputs, turning
+ * the generic output appended by the transform into WPOS.
+ *
+ * NOTE(review): if the token allocation fails the function returns silently
+ * and leaves vs->draw_vs untouched -- confirm callers cope with that. */
+void r300_draw_init_vertex_shader(struct draw_context *draw,
+                                  struct r300_vertex_shader *vs)
+{
+    struct pipe_shader_state new_vs;
+    struct vs_transform_context transform;
+    /* Room for the original tokens plus the inserted declarations and
+     * instructions; the headroom of 100 tokens is an estimate (hence XXX). */
+    const uint newLen = tgsi_num_tokens(vs->state.tokens) + 100 /* XXX */;
+    unsigned i;
+
+    new_vs.tokens = tgsi_alloc_tokens(newLen);
+    if (new_vs.tokens == NULL)
+        return;
+
+    memset(&transform, 0, sizeof(transform));
+    /* Start with the identity mapping; insert_output bumps the entries. */
+    for (i = 0; i < Elements(transform.out_remap); i++) {
+        transform.out_remap[i] = i;
+    }
+    /* -1 so that the WPOS generic gets index 0 if no generic is declared. */
+    transform.last_generic = -1;
+    transform.base.transform_instruction = transform_inst;
+    transform.base.transform_declaration = transform_decl;
+
+    tgsi_transform_shader(vs->state.tokens,
+                          (struct tgsi_token*)new_vs.tokens,
+                          newLen, &transform.base);
+
+#if 0
+    printf("----------------------------------------------\norig shader:\n");
+    tgsi_dump(vs->state.tokens, 0);
+    printf("----------------------------------------------\nnew shader:\n");
+    tgsi_dump(new_vs.tokens, 0);
+    printf("----------------------------------------------\n");
+#endif
+
+    /* Free old tokens. */
+    FREE((void*)vs->state.tokens);
+
+    vs->draw_vs = draw_create_vertex_shader(draw, &new_vs);
+
+    /* Instead of duplicating and freeing the tokens, copy the pointer directly. */
+    vs->state.tokens = new_vs.tokens;
+
+    /* Init the VS output table for the rasterizer. */
+    r300_init_vs_outputs(vs);
+
+    /* Make the last generic be WPOS. */
+    /* The transform appended it with semantic index last_generic + 1. */
+    vs->outputs.wpos = vs->outputs.generic[transform.last_generic + 1];
+    vs->outputs.generic[transform.last_generic + 1] = ATTR_UNUSED;
+}
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
new file mode 100644
index 0000000000..77c1c13ef9
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_WINSYS_H
+#define R300_WINSYS_H
+
+/* The public interface header for the r300 pipe driver.
+ * Any winsys hosting this pipe needs to implement r300_winsys and then
+ * call r300_create_screen to start things. */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "r300_defines.h"
+
+struct r300_winsys_buffer;
+
+/* Identifiers of the values that can be queried through
+ * r300_winsys_screen::get_value. */
+enum r300_value_id {
+    R300_VID_PCI_ID,
+    R300_VID_GB_PIPES,
+    R300_VID_Z_PIPES,
+    R300_VID_SQUARE_TILING_SUPPORT,
+    R300_VID_DRM_2_3_0,
+};
+
+/* Domains accepted by r300_winsys_screen::is_buffer_referenced. */
+enum r300_reference_domain { /* bitfield */
+    R300_REF_CS = 1,
+    R300_REF_HW = 2
+};
+
+/* Table of callbacks that a winsys hosting the r300 pipe driver has to
+ * implement. */
+struct r300_winsys_screen {
+    void (*destroy)(struct r300_winsys_screen *ws);
+    
+    /**
+     * Buffer management. Buffer attributes are mostly fixed over its lifetime.
+     *
+     * Remember that gallium gets to choose the interface it needs, and the
+     * window systems must then implement that interface (rather than the
+     * other way around...).
+     *
+     * usage is a bitmask of R300_WINSYS_BUFFER_USAGE_PIXEL/VERTEX/INDEX/CONSTANT. This
+     * usage argument is only an optimization hint, not a guarantee, therefore
+     * proper behavior must be observed in all circumstances.
+     *
+     * alignment indicates the client's alignment requirements, eg for
+     * SSE instructions.
+     */
+    struct r300_winsys_buffer *(*buffer_create)(struct r300_winsys_screen *ws,
+						unsigned alignment,
+						unsigned usage,
+                                                enum r300_buffer_domain domain,
+						unsigned size);
+
+    /**
+     * Map the entire data store of a buffer object into the client's address.
+     * usage is a bitmask of R300_WINSYS_BUFFER_USAGE_CPU_READ/WRITE flags.
+     */
+    void *(*buffer_map)( struct r300_winsys_screen *ws,
+			 struct r300_winsys_buffer *buf,
+			 unsigned usage);
+
+    void (*buffer_unmap)( struct r300_winsys_screen *ws,
+			  struct r300_winsys_buffer *buf );
+
+    void (*buffer_destroy)( struct r300_winsys_buffer *buf );
+
+
+    void (*buffer_reference)(struct r300_winsys_screen *rws,
+			     struct r300_winsys_buffer **pdst,
+			     struct r300_winsys_buffer *src);
+
+    void (*buffer_wait)(struct r300_winsys_screen *rws,
+                        struct r300_winsys_buffer *buf);
+
+    /* Add a winsys buffer to the list of buffer objects to validate.
+     * NOTE(review): rd/wd look like the read and write domains -- confirm. */
+    boolean (*add_buffer)(struct r300_winsys_screen *winsys,
+                          struct r300_winsys_buffer *buf,
+                          enum r300_buffer_domain rd,
+                          enum r300_buffer_domain wd);
+
+
+    /* Revalidate all currently setup pipe_buffers.
+     * Returns TRUE if a flush is required. */
+    boolean (*validate)(struct r300_winsys_screen* winsys);
+
+    /* Return the number of free dwords in CS. */
+    unsigned (*get_cs_free_dwords)(struct r300_winsys_screen *winsys);
+
+    /* Return the pointer to the first free dword in CS and assume a pipe
+     * driver wants to fill "count" dwords. */
+    uint32_t *(*get_cs_pointer)(struct r300_winsys_screen *winsys,
+                                unsigned count);
+
+    /* Write a dword to the command buffer. */
+    void (*write_cs_dword)(struct r300_winsys_screen* winsys, uint32_t dword);
+
+    /* Write a table of dwords to the command buffer. */
+    void (*write_cs_table)(struct r300_winsys_screen* winsys,
+                           const void *dwords, unsigned count);
+
+    /* Write a relocated dword to the command buffer. */
+    void (*write_cs_reloc)(struct r300_winsys_screen *winsys,
+                           struct r300_winsys_buffer *buf,
+                           enum r300_buffer_domain rd,
+                           enum r300_buffer_domain wd,
+                           uint32_t flags);
+
+    /* Flush the CS. */
+    void (*flush_cs)(struct r300_winsys_screen* winsys);
+
+    /* winsys flush - callback from winsys when flush required */
+    void (*set_flush_cb)(struct r300_winsys_screen *winsys,
+			 void (*flush_cb)(void *), void *data);
+
+    void (*reset_bos)(struct r300_winsys_screen *winsys);
+
+    /* Query / set the micro- and macro-tiling modes of a buffer. */
+    void (*buffer_get_tiling)(struct r300_winsys_screen *winsys,
+                              struct r300_winsys_buffer *buffer,
+                              enum r300_buffer_tiling *microtiled,
+                              enum r300_buffer_tiling *macrotiled);
+
+    void (*buffer_set_tiling)(struct r300_winsys_screen *winsys,
+                              struct r300_winsys_buffer *buffer,
+                              uint32_t pitch,
+                              enum r300_buffer_tiling microtiled,
+                              enum r300_buffer_tiling macrotiled);
+
+    /* Query one of the values identified by enum r300_value_id. */
+    uint32_t (*get_value)(struct r300_winsys_screen *winsys,
+			  enum r300_value_id vid);
+
+    struct r300_winsys_buffer *(*buffer_from_handle)(struct r300_winsys_screen *winsys,
+                                                     unsigned handle);
+
+    boolean (*buffer_get_handle)(struct r300_winsys_screen *winsys,
+				 struct r300_winsys_buffer *buffer,
+				 struct winsys_handle *whandle);
+
+    /* domain is a bitfield of enum r300_reference_domain values. */
+    boolean (*is_buffer_referenced)(struct r300_winsys_screen *winsys,
+                                    struct r300_winsys_buffer *buffer,
+                                    enum r300_reference_domain domain);
+};
+
+/* NOTE(review): presumably returns the winsys screen that backs the given
+ * pipe_screen -- confirm against the implementation. */
+struct r300_winsys_screen *
+r300_winsys_screen(struct pipe_screen *screen);
+
+/* Creates a new r300 screen. */
+struct pipe_screen* r300_create_screen(struct r300_winsys_screen *rws);
+
+#endif /* R300_WINSYS_H */
diff --git a/src/gallium/drivers/r600/Makefile b/src/gallium/drivers/r600/Makefile
new file mode 100644
index 0000000000..aae31a6a6e
--- /dev/null
+++ b/src/gallium/drivers/r600/Makefile
@@ -0,0 +1,27 @@
+# Build description for the r600 Gallium driver library.  The rules
+# themselves come from the shared Makefile.template included at the bottom.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = r600
+
+# Simply expanded (:=) on purpose: with a recursive '=' the pkg-config
+# command would be forked again on every expansion of LIBRARY_INCLUDES.
+LIBRARY_INCLUDES := \
+	$(shell pkg-config libdrm --cflags-only-I)
+
+C_SOURCES = \
+	r600_buffer.c \
+	r600_context.c \
+	r600_draw.c \
+	r600_blit.c \
+	r600_helper.c \
+	r600_query.c \
+	r600_resource.c \
+	r600_screen.c \
+	r600_state.c \
+	r600_texture.c \
+	r600_shader.c \
+	r600_compiler.c \
+	r600_compiler_tgsi.c \
+	r600_compiler_dump.c \
+	r600_compiler_r600.c \
+	r600_compiler_r700.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/r600/SConscript b/src/gallium/drivers/r600/SConscript
new file mode 100644
index 0000000000..26e2f1941c
--- /dev/null
+++ b/src/gallium/drivers/r600/SConscript
@@ -0,0 +1,37 @@
+Import('*')
+
+env = env.Clone()
+
+try:
+    env.ParseConfig('pkg-config --cflags libdrm_radeon')
+except OSError:
+    print 'warning: not building r600'
+    Return()
+
+env.Append(CPPPATH = [
+    '#/include', 
+    '#/src/mesa',
+])
+
+r600 = env.ConvenienceLibrary(
+    target = 'r600',
+    source = [
+        'r600_buffer.c',
+        'r600_context.c',
+        'r600_draw.c',
+        'r600_blit.c',
+        'r600_helper.c',
+        'r600_query.c',
+        'r600_resource.c',
+        'r600_screen.c',
+        'r600_state.c',
+        'r600_texture.c',
+        'r600_shader.c',
+        'r600_compiler.c',
+        'r600_compiler_tgsi.c',
+        'r600_compiler_dump.c',
+        'r600_compiler_r600.c',
+        'r600_compiler_r700.c'
+    ])
+
+Export('r600')
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
new file mode 100644
index 0000000000..1dcb19babc
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Marek Olšák
+ */
+#include <pipe/p_screen.h>
+#include <util/u_blitter.h>
+#include <util/u_inlines.h>
+#include <util/u_memory.h>
+#include "util/u_surface.h"
+#include "r600_screen.h"
+#include "r600_context.h"
+
+/*
+ * Register the state currently bound on rctx with the blitter through the
+ * util_blitter_save_*() calls.  The clip state is not handed over (see the
+ * XXX note below).
+ */
+static void r600_blitter_save_states(struct r600_context *rctx)
+{
+	/* shaders and vertex inputs */
+	util_blitter_save_fragment_shader(rctx->blitter, rctx->ps_shader);
+	util_blitter_save_vertex_shader(rctx->blitter, rctx->vs_shader);
+	util_blitter_save_vertex_elements(rctx->blitter, rctx->vertex_elements);
+	util_blitter_save_vertex_buffers(rctx->blitter,
+					 rctx->nvertex_buffer,
+					 rctx->vertex_buffer);
+
+	/* fixed-function state objects kept in the draw state array */
+	util_blitter_save_blend(rctx->blitter, rctx->draw->state[R600_BLEND]);
+	util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->draw->state[R600_DSA]);
+	util_blitter_save_rasterizer(rctx->blitter, rctx->draw->state[R600_RASTERIZER]);
+
+	/* state stored by value in the context */
+	util_blitter_save_stencil_ref(rctx->blitter, &rctx->stencil_ref);
+	util_blitter_save_viewport(rctx->blitter, &rctx->viewport);
+	/* XXX util_blitter_save_clip(rctx->blitter, &rctx->clip); */
+}
+
+/*
+ * pipe_context::clear hook: clears the selected buffers of the currently
+ * bound framebuffer by delegating to util_blitter_clear().
+ */
+static void r600_clear(struct pipe_context *ctx, unsigned buffers,
+		       const float *rgba, double depth, unsigned stencil)
+{
+	struct r600_context *r600ctx = r600_context(ctx);
+
+	r600_blitter_save_states(r600ctx);
+	util_blitter_clear(r600ctx->blitter,
+			   r600ctx->fb_state.width,
+			   r600ctx->fb_state.height,
+			   r600ctx->fb_state.nr_cbufs,
+			   buffers, rgba, depth, stencil);
+}
+
+/*
+ * pipe_context::clear_render_target hook: clears a width x height rectangle
+ * at (dstx, dsty) of dst to rgba via util_blitter_clear_render_target().
+ * Besides the common state, the bound framebuffer is saved as well.
+ */
+static void r600_clear_render_target(struct pipe_context *pipe,
+				     struct pipe_surface *dst,
+				     const float *rgba,
+				     unsigned dstx, unsigned dsty,
+				     unsigned width, unsigned height)
+{
+	struct r600_context *r600ctx = r600_context(pipe);
+
+	r600_blitter_save_states(r600ctx);
+	util_blitter_save_framebuffer(r600ctx->blitter, &r600ctx->fb_state);
+
+	util_blitter_clear_render_target(r600ctx->blitter,
+					 dst, rgba,
+					 dstx, dsty,
+					 width, height);
+}
+
+/*
+ * pipe_context::clear_depth_stencil hook: clears a width x height rectangle
+ * at (dstx, dsty) of dst via util_blitter_clear_depth_stencil(), passing
+ * clear_flags, depth and stencil through unchanged.  Besides the common
+ * state, the bound framebuffer is saved as well.
+ */
+static void r600_clear_depth_stencil(struct pipe_context *pipe,
+				     struct pipe_surface *dst,
+				     unsigned clear_flags,
+				     double depth,
+				     unsigned stencil,
+				     unsigned dstx, unsigned dsty,
+				     unsigned width, unsigned height)
+{
+	struct r600_context *r600ctx = r600_context(pipe);
+
+	r600_blitter_save_states(r600ctx);
+	util_blitter_save_framebuffer(r600ctx->blitter, &r600ctx->fb_state);
+
+	util_blitter_clear_depth_stencil(r600ctx->blitter,
+					 dst, clear_flags,
+					 depth, stencil,
+					 dstx, dsty,
+					 width, height);
+}
+
+/*
+ * pipe_context::resource_copy_region hook.  No driver specific path exists:
+ * every argument is forwarded untouched to util_resource_copy_region().
+ */
+static void r600_resource_copy_region(struct pipe_context *pipe,
+				      struct pipe_resource *dst,
+				      struct pipe_subresource subdst,
+				      unsigned dstx, unsigned dsty, unsigned dstz,
+				      struct pipe_resource *src,
+				      struct pipe_subresource subsrc,
+				      unsigned srcx, unsigned srcy, unsigned srcz,
+				      unsigned width, unsigned height)
+{
+	util_resource_copy_region(pipe,
+				  dst, subdst, dstx, dsty, dstz,
+				  src, subsrc, srcx, srcy, srcz,
+				  width, height);
+}
+
+/* Install the clear and copy entry points of this file into rctx->context. */
+void r600_init_blit_functions(struct r600_context *rctx)
+{
+	/* copies */
+	rctx->context.resource_copy_region = r600_resource_copy_region;
+
+	/* clears */
+	rctx->context.clear = r600_clear;
+	rctx->context.clear_render_target = r600_clear_render_target;
+	rctx->context.clear_depth_stencil = r600_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c
new file mode 100644
index 0000000000..272f4dd673
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson <MostAwesomeDude@gmail.com>
+ */
+#include <pipe/p_screen.h>
+#include <util/u_format.h>
+#include <util/u_math.h>
+#include <util/u_inlines.h>
+#include <util/u_memory.h>
+#include "state_tracker/drm_api.h"
+#include "r600_screen.h"
+#include "r600_context.h"
+
+extern struct u_resource_vtbl r600_buffer_vtbl;
+
+/*
+ * Translate PIPE_BIND_* usage bits into RADEON_GEM_DOMAIN_* bits.
+ *
+ * GTT is always part of the result.  VRAM is added for render targets,
+ * depth/stencil surfaces and sampler views.  Vertex and index buffer
+ * bindings only ask for GTT, which is already set.
+ */
+static u32 r600_domain_from_usage(unsigned usage)
+{
+	/* also need BIND_BLIT_SOURCE/DESTINATION ? */
+	const unsigned vram_binds = PIPE_BIND_RENDER_TARGET |
+				    PIPE_BIND_DEPTH_STENCIL |
+				    PIPE_BIND_SAMPLER_VIEW;
+
+	if (usage & vram_binds)
+		return RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM;
+	return RADEON_GEM_DOMAIN_GTT;
+}
+
+/*
+ * Create a buffer resource described by templ.
+ *
+ * Buffers bound as PIPE_BIND_CONSTANT_BUFFER are backed by a buffer obtained
+ * from pb_malloc_buffer_create() (rbuffer->pb, no bo); every other buffer is
+ * backed by a radeon_bo of width0 bytes (rbuffer->bo, no pb).
+ *
+ * Returns the embedded pipe_resource with a reference count of 1, or NULL
+ * when any allocation fails.
+ */
+struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
+					 const struct pipe_resource *templ)
+{
+	struct r600_screen *rscreen = r600_screen(screen);
+	struct r600_buffer *rbuffer;
+	struct radeon_bo *bo;
+	struct pb_desc desc;
+	/* XXX We probably want a different alignment for buffers and textures. */
+	unsigned alignment = 4096;
+
+	rbuffer = CALLOC_STRUCT(r600_buffer);
+	if (rbuffer == NULL)
+		return NULL;
+
+	rbuffer->b.b = *templ;
+	pipe_reference_init(&rbuffer->b.b.reference, 1);
+	rbuffer->b.b.screen = screen;
+	rbuffer->b.vtbl = &r600_buffer_vtbl;
+
+	if (rbuffer->b.b.bind & PIPE_BIND_CONSTANT_BUFFER) {
+		desc.alignment = alignment;
+		desc.usage = rbuffer->b.b.bind;
+		rbuffer->pb = pb_malloc_buffer_create(rbuffer->b.b.width0,
+						      &desc);
+		if (rbuffer->pb == NULL) {
+			/* release with FREE() to pair with CALLOC_STRUCT(),
+			 * exactly like the bo failure path below */
+			FREE(rbuffer);
+			return NULL;
+		}
+		return &rbuffer->b.b;
+	}
+	rbuffer->domain = r600_domain_from_usage(rbuffer->b.b.bind);
+	bo = radeon_bo(rscreen->rw, 0, rbuffer->b.b.width0, alignment, NULL);
+	if (bo == NULL) {
+		FREE(rbuffer);
+		return NULL;
+	}
+	rbuffer->bo = bo;
+	return &rbuffer->b.b;
+}
+
+/*
+ * Create a buffer of 'bytes' bytes and fill it with a copy of the user
+ * memory at ptr.
+ *
+ * The buffer comes from r600_buffer_create(), so it is backed either by a
+ * pb_buffer (constant buffer binding) or by a radeon_bo; the copy goes
+ * through whichever backing store exists.
+ *
+ * Returns the new resource, or NULL when allocation or mapping fails (the
+ * half-built buffer is destroyed in that case).
+ */
+struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen,
+					      void *ptr, unsigned bytes,
+					      unsigned bind)
+{
+	struct r600_buffer *rbuffer;
+	struct r600_screen *rscreen = r600_screen(screen);
+	struct pipe_resource templ;
+	void *map;
+
+	memset(&templ, 0, sizeof(struct pipe_resource));
+	templ.target = PIPE_BUFFER;
+	templ.format = PIPE_FORMAT_R8_UNORM;
+	templ.usage = PIPE_USAGE_IMMUTABLE;
+	templ.bind = bind;
+	templ.width0 = bytes;
+	templ.height0 = 1;
+	templ.depth0 = 1;
+
+	rbuffer = (struct r600_buffer*)r600_buffer_create(screen, &templ);
+	if (rbuffer == NULL) {
+		return NULL;
+	}
+
+	if (rbuffer->pb) {
+		/* constant buffers have no bo: copy through the pb mapping */
+		map = pb_map(rbuffer->pb, PIPE_TRANSFER_WRITE);
+		if (map == NULL)
+			goto out_err;
+		memcpy(map, ptr, bytes);
+		pb_unmap(rbuffer->pb);
+		return &rbuffer->b.b;
+	}
+
+	/* radeon_bo_map() reports failure with a non-zero return value */
+	if (radeon_bo_map(rscreen->rw, rbuffer->bo))
+		goto out_err;
+	memcpy(rbuffer->bo->data, ptr, bytes);
+	radeon_bo_unmap(rscreen->rw, rbuffer->bo);
+	return &rbuffer->b.b;
+
+out_err:
+	/* go through the vtbl installed by r600_buffer_create() so both
+	 * kinds of backing store are released correctly */
+	rbuffer->b.vtbl->resource_destroy(screen, &rbuffer->b.b);
+	return NULL;
+}
+
+/*
+ * resource_destroy hook: release whichever backing store the buffer owns
+ * (pb_buffer and/or radeon_bo) and then the r600_buffer itself.
+ */
+static void r600_buffer_destroy(struct pipe_screen *screen,
+				struct pipe_resource *buf)
+{
+	struct r600_screen *r600scr = r600_screen(screen);
+	struct r600_buffer *buffer = (struct r600_buffer*)buf;
+
+	if (buffer->pb != NULL) {
+		/* the reference count is forced to zero before pb_destroy() */
+		pipe_reference_init(&buffer->pb->base.reference, 0);
+		pb_destroy(buffer->pb);
+		buffer->pb = NULL;
+	}
+
+	if (buffer->bo != NULL)
+		radeon_bo_decref(r600scr->rw, buffer->bo);
+
+	FREE(buffer);
+}
+
+/*
+ * transfer_map hook: map the buffer behind 'transfer' and return a CPU
+ * pointer to byte offset transfer->box.x inside it.
+ *
+ * Returns NULL when the underlying pb_buffer or radeon_bo cannot be mapped.
+ */
+static void *r600_buffer_transfer_map(struct pipe_context *pipe,
+				      struct pipe_transfer *transfer)
+{
+	struct r600_buffer *rbuffer = (struct r600_buffer*)transfer->resource;
+	struct r600_screen *rscreen = r600_screen(pipe->screen);
+	uint8_t *map;
+
+	if (rbuffer->pb) {
+		map = (uint8_t*)pb_map(rbuffer->pb, transfer->usage);
+		/* do not turn a failed mapping into a bogus non-NULL
+		 * pointer by adding the offset to NULL */
+		if (map == NULL)
+			return NULL;
+		return map + transfer->box.x;
+	}
+	/* FIXME: PIPE_TRANSFER_DONTBLOCK is not honoured; the read/write
+	 * direction in transfer->usage is not used for bo mappings either */
+	if (radeon_bo_map(rscreen->rw, rbuffer->bo)) {
+		return NULL;
+	}
+	return (uint8_t*)rbuffer->bo->data + transfer->box.x;
+}
+
+/*
+ * transfer_unmap hook: undo r600_buffer_transfer_map() on whichever backing
+ * store the buffer uses.
+ */
+static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
+				       struct pipe_transfer *transfer)
+{
+	struct r600_screen *r600scr = r600_screen(pipe->screen);
+	struct r600_buffer *buffer = (struct r600_buffer*)transfer->resource;
+
+	if (buffer->pb != NULL) {
+		pb_unmap(buffer->pb);
+		return;
+	}
+	radeon_bo_unmap(r600scr->rw, buffer->bo);
+}
+
+static void r600_buffer_transfer_flush_region(struct pipe_context *pipe,
+					      struct pipe_transfer *transfer,
+					      const struct pipe_box *box)
+{	/* transfer_flush_region hook: no work is performed, all arguments are ignored */
+}
+
+unsigned r600_buffer_is_referenced_by_cs(struct pipe_context *context,
+					 struct pipe_resource *buf,
+					 unsigned face, unsigned level)
+{
+	/* XXX no tracking implemented: every buffer is reported as referenced for both read and write, the arguments are ignored */
+	return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+}
+
+/*
+ * Wrap the buffer object named by whandle->handle in a new r600_buffer.
+ *
+ * Only the reference count, target, screen, vtbl and bo of the resource are
+ * filled in; the remaining fields stay zero.  Returns NULL when the bo
+ * cannot be obtained or the wrapper cannot be allocated (the bo reference is
+ * dropped again in the latter case).
+ */
+struct pipe_resource *r600_buffer_from_handle(struct pipe_screen *screen,
+					      struct winsys_handle *whandle)
+{
+	struct radeon *radeon = (struct radeon*)screen->winsys;
+	struct radeon_bo *handle_bo;
+	struct r600_buffer *buffer;
+
+	handle_bo = radeon_bo(radeon, whandle->handle, 0, 0, NULL);
+	if (handle_bo == NULL)
+		return NULL;
+
+	buffer = CALLOC_STRUCT(r600_buffer);
+	if (buffer != NULL) {
+		pipe_reference_init(&buffer->b.b.reference, 1);
+		buffer->b.b.target = PIPE_BUFFER;
+		buffer->b.b.screen = screen;
+		buffer->b.vtbl = &r600_buffer_vtbl;
+		buffer->bo = handle_bo;
+		return &buffer->b.b;
+	}
+
+	radeon_bo_decref(radeon, handle_bo);
+	return NULL;
+}
+
+struct u_resource_vtbl r600_buffer_vtbl =	/* hook table installed on every r600_buffer; positional initialisers, order must match struct u_resource_vtbl */
+{
+	u_default_resource_get_handle,		/* get_handle */
+	r600_buffer_destroy,			/* resource_destroy */
+	r600_buffer_is_referenced_by_cs,	/* is_buffer_referenced */
+	u_default_get_transfer,			/* get_transfer */
+	u_default_transfer_destroy,		/* transfer_destroy */
+	r600_buffer_transfer_map,		/* transfer_map */
+	r600_buffer_transfer_flush_region,	/* transfer_flush_region */
+	r600_buffer_transfer_unmap,		/* transfer_unmap */
+	u_default_transfer_inline_write		/* transfer_inline_write */
+};
diff --git a/src/gallium/drivers/r600/r600_compiler.c b/src/gallium/drivers/r600/r600_compiler.c
new file mode 100644
index 0000000000..f1be2bbdf4
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_compiler.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include "r600_compiler.h"
+
+/*
+ * Allocate a zeroed, stand-alone vector whose list pointers refer to
+ * itself.  No channels are allocated.  Returns NULL when out of memory.
+ */
+struct c_vector *c_vector_new(void)
+{
+	struct c_vector *vec;
+
+	vec = calloc(1, sizeof(*vec));
+	if (vec != NULL) {
+		c_list_init(vec);
+	}
+	return vec;
+}
+
+static unsigned c_opcode_is_alu(unsigned opcode)	/* returns 1 for the C_OPCODE_* values classified as ALU below, 0 for everything else; NOTE(review): no caller is visible in r600_compiler.c */
+{
+	switch (opcode) {
+	case C_OPCODE_MOV:	/* ALU group: every case down to C_OPCODE_USNE falls through to 'return 1' */
+	case C_OPCODE_MUL:
+	case C_OPCODE_MAD:
+	case C_OPCODE_ARL:
+	case C_OPCODE_LIT:
+	case C_OPCODE_RCP:
+	case C_OPCODE_RSQ:
+	case C_OPCODE_EXP:
+	case C_OPCODE_LOG:
+	case C_OPCODE_ADD:
+	case C_OPCODE_DP3:
+	case C_OPCODE_DP4:
+	case C_OPCODE_DST:
+	case C_OPCODE_MIN:
+	case C_OPCODE_MAX:
+	case C_OPCODE_SLT:
+	case C_OPCODE_SGE:
+	case C_OPCODE_SUB:
+	case C_OPCODE_LRP:
+	case C_OPCODE_CND:
+	case C_OPCODE_DP2A:
+	case C_OPCODE_FRC:
+	case C_OPCODE_CLAMP:
+	case C_OPCODE_FLR:
+	case C_OPCODE_ROUND:
+	case C_OPCODE_EX2:
+	case C_OPCODE_LG2:
+	case C_OPCODE_POW:
+	case C_OPCODE_XPD:
+	case C_OPCODE_ABS:
+	case C_OPCODE_RCC:
+	case C_OPCODE_DPH:
+	case C_OPCODE_COS:
+	case C_OPCODE_DDX:
+	case C_OPCODE_DDY:
+	case C_OPCODE_PK2H:
+	case C_OPCODE_PK2US:
+	case C_OPCODE_PK4B:
+	case C_OPCODE_PK4UB:
+	case C_OPCODE_RFL:
+	case C_OPCODE_SEQ:
+	case C_OPCODE_SFL:
+	case C_OPCODE_SGT:
+	case C_OPCODE_SIN:
+	case C_OPCODE_SLE:
+	case C_OPCODE_SNE:
+	case C_OPCODE_STR:
+	case C_OPCODE_UP2H:
+	case C_OPCODE_UP2US:
+	case C_OPCODE_UP4B:
+	case C_OPCODE_UP4UB:
+	case C_OPCODE_X2D:
+	case C_OPCODE_ARA:
+	case C_OPCODE_ARR:
+	case C_OPCODE_BRA:
+	case C_OPCODE_SSG:
+	case C_OPCODE_CMP:
+	case C_OPCODE_SCS:
+	case C_OPCODE_NRM:
+	case C_OPCODE_DIV:
+	case C_OPCODE_DP2:
+	case C_OPCODE_CEIL:
+	case C_OPCODE_I2F:
+	case C_OPCODE_NOT:
+	case C_OPCODE_TRUNC:
+	case C_OPCODE_SHL:
+	case C_OPCODE_AND:
+	case C_OPCODE_OR:
+	case C_OPCODE_MOD:
+	case C_OPCODE_XOR:
+	case C_OPCODE_SAD:
+	case C_OPCODE_NRM4:
+	case C_OPCODE_F2I:
+	case C_OPCODE_IDIV:
+	case C_OPCODE_IMAX:
+	case C_OPCODE_IMIN:
+	case C_OPCODE_INEG:
+	case C_OPCODE_ISGE:
+	case C_OPCODE_ISHR:
+	case C_OPCODE_ISLT:
+	case C_OPCODE_F2U:
+	case C_OPCODE_U2F:
+	case C_OPCODE_UADD:
+	case C_OPCODE_UDIV:
+	case C_OPCODE_UMAD:
+	case C_OPCODE_UMAX:
+	case C_OPCODE_UMIN:
+	case C_OPCODE_UMOD:
+	case C_OPCODE_UMUL:
+	case C_OPCODE_USEQ:
+	case C_OPCODE_USGE:
+	case C_OPCODE_USHR:
+	case C_OPCODE_USLT:
+	case C_OPCODE_USNE:
+		return 1;
+	case C_OPCODE_END:	/* non-ALU group: listed explicitly although 'default' yields the same result */
+	case C_OPCODE_VFETCH:
+	case C_OPCODE_KILP:
+	case C_OPCODE_CAL:
+	case C_OPCODE_RET:
+	case C_OPCODE_TXB:
+	case C_OPCODE_TXL:
+	case C_OPCODE_BRK:
+	case C_OPCODE_IF:
+	case C_OPCODE_BGNFOR:
+	case C_OPCODE_REP:
+	case C_OPCODE_ELSE:
+	case C_OPCODE_ENDIF:
+	case C_OPCODE_ENDFOR:
+	case C_OPCODE_ENDREP:
+	case C_OPCODE_PUSHA:
+	case C_OPCODE_POPA:
+	case C_OPCODE_TXF:
+	case C_OPCODE_TXQ:
+	case C_OPCODE_CONT:
+	case C_OPCODE_EMIT:
+	case C_OPCODE_ENDPRIM:
+	case C_OPCODE_BGNLOOP:
+	case C_OPCODE_BGNSUB:
+	case C_OPCODE_ENDLOOP:
+	case C_OPCODE_ENDSUB:
+	case C_OPCODE_NOP:
+	case C_OPCODE_CALLNZ:
+	case C_OPCODE_IFC:
+	case C_OPCODE_BREAKC:
+	case C_OPCODE_KIL:
+	case C_OPCODE_TEX:
+	case C_OPCODE_TXD:
+	case C_OPCODE_TXP:
+	case C_OPCODE_SWITCH:
+	case C_OPCODE_CASE:
+	case C_OPCODE_DEFAULT:
+	case C_OPCODE_ENDSWITCH:
+	default:	/* also catches C_OPCODE_ENTRY and any value not named above */
+		return 0;
+	}
+}
+
+
+/* NEW */
+/*
+ * Reset node to an empty state: all fields zeroed, no parent, and the four
+ * embedded list heads (predecessors, successors, childs, insts) empty.
+ */
+void c_node_init(struct c_node *node)
+{
+	memset(node, 0, sizeof(*node));
+	node->parent = NULL;
+
+	/* instruction list */
+	c_list_init(&node->insts);
+	/* flow graph edges */
+	c_list_init(&node->predecessors);
+	c_list_init(&node->successors);
+	/* tree children */
+	c_list_init(&node->childs);
+}
+
+/*
+ * Allocate an unlinked list element that points at node.
+ * Returns NULL when out of memory.
+ */
+static struct c_node_link *c_node_link_new(struct c_node *node)
+{
+	struct c_node_link *edge = calloc(1, sizeof(*edge));
+
+	if (edge != NULL) {
+		c_list_init(edge);
+		edge->node = node;
+	}
+	return edge;
+}
+
+/*
+ * Add a flow graph edge predecessor -> successor.  One link is appended to
+ * predecessor->successors and one to successor->predecessors.
+ *
+ * Returns 0 on success or -ENOMEM, in which case neither list is modified.
+ */
+int c_node_cfg_link(struct c_node *predecessor, struct c_node *successor)
+{
+	struct c_node_link *to_successor, *to_predecessor;
+
+	to_successor = c_node_link_new(successor);
+	to_predecessor = c_node_link_new(predecessor);
+	if (to_successor != NULL && to_predecessor != NULL) {
+		c_list_add_tail(to_successor, &predecessor->successors);
+		c_list_add_tail(to_predecessor, &successor->predecessors);
+		return 0;
+	}
+
+	/* free(NULL) is a no-op, so whichever allocation worked is released */
+	free(to_predecessor);
+	free(to_successor);
+	return -ENOMEM;
+}
+
+/*
+ * Insert a heap copy of *instruction at the front of node's instruction
+ * list.  Returns 0 on success or -ENOMEM.
+ */
+int c_node_add_new_instruction_head(struct c_node *node, struct c_instruction *instruction)
+{
+	struct c_instruction *copy;
+
+	copy = calloc(1, sizeof(*copy));
+	if (copy == NULL) {
+		return -ENOMEM;
+	}
+	/* the copied next/prev are overwritten by the list insertion */
+	*copy = *instruction;
+	c_list_add(copy, &node->insts);
+	return 0;
+}
+
+/*
+ * Append a heap copy of *instruction to the end of node's instruction list.
+ * Returns 0 on success or -ENOMEM.
+ */
+int c_node_add_new_instruction(struct c_node *node, struct c_instruction *instruction)
+{
+	struct c_instruction *copy;
+
+	copy = calloc(1, sizeof(*copy));
+	if (copy == NULL) {
+		return -ENOMEM;
+	}
+	/* the copied next/prev are overwritten by the list insertion */
+	*copy = *instruction;
+	c_list_add_tail(copy, &node->insts);
+	return 0;
+}
+
+/*
+ * Allocate a new node, make it a flow graph successor of predecessor and
+ * append it to shader->nodes.  Returns the node, or NULL when out of memory
+ * (nothing is added to the shader in that case).
+ */
+struct c_node *c_shader_cfg_new_node_after(struct c_shader *shader, struct c_node *predecessor)
+{
+	struct c_node *fresh;
+
+	fresh = calloc(1, sizeof(*fresh));
+	if (fresh == NULL) {
+		return NULL;
+	}
+	c_node_init(fresh);
+
+	if (c_node_cfg_link(predecessor, fresh) != 0) {
+		free(fresh);
+		return NULL;
+	}
+
+	c_list_add_tail(fresh, &shader->nodes);
+	return fresh;
+}
+
+/*
+ * Initialise shader as an empty program of the given type: no vectors in
+ * any file, no nodes, and an entry node linked directly to the end node.
+ *
+ * Returns 0 on success or the negative errno reported by c_node_cfg_link().
+ */
+int c_shader_init(struct c_shader *shader, unsigned type)
+{
+	unsigned i;
+
+	shader->type = type;
+	/* c_shader_vector_new() hands out vector ids from this counter, so it
+	 * must start from a known value even if the caller did not zero the
+	 * structure beforehand */
+	shader->nvectors = 0;
+	for (i = 0; i < C_FILE_COUNT; i++) {
+		shader->files[i].nvectors = 0;
+		c_list_init(&shader->files[i].vectors);
+	}
+	c_list_init(&shader->nodes);
+	c_node_init(&shader->entry);
+	c_node_init(&shader->end);
+	shader->entry.opcode = C_OPCODE_ENTRY;
+	shader->end.opcode = C_OPCODE_END;
+	return c_node_cfg_link(&shader->entry, &shader->end);
+}
+
+/*
+ * Allocate a vector with its four channels and append it to the register
+ * file 'file' of shader.
+ *
+ * file is a C_FILE_* value and must be below C_FILE_COUNT; name and sid are
+ * stored as given.  The vector receives the next shader-wide id.
+ *
+ * Returns the new vector, or NULL when file is out of range or memory runs
+ * out (the shader is left untouched in both cases).
+ */
+struct c_vector *c_shader_vector_new(struct c_shader *shader, unsigned file, unsigned name, int sid)
+{
+	struct c_vector *v;
+	int i;
+
+	/* file indexes shader->files[]; refuse values that would run past it */
+	if (file >= C_FILE_COUNT) {
+		return NULL;
+	}
+	v = calloc(1, sizeof(struct c_vector));
+	if (v == NULL) {
+		return NULL;
+	}
+	for (i = 0; i < 4; i++) {
+		v->channel[i] = calloc(1, sizeof(struct c_channel));
+		if (v->channel[i] == NULL)
+			goto out_err;
+		v->channel[i]->vindex = i;
+		v->channel[i]->vector = v;
+	}
+	v->file = file;
+	v->name = name;
+	v->sid = sid;
+	shader->files[v->file].nvectors++;
+	v->id = shader->nvectors++;
+	c_list_add_tail(v, &shader->files[v->file].vectors);
+	return v;
+out_err:
+	/* channels not reached yet are still NULL from calloc */
+	for (i = 0; i < 4; i++) {
+		free(v->channel[i]);
+	}
+	free(v);
+	return NULL;
+}
+
+/* Unlink and free every element of the list at head that points at node. */
+static void c_node_remove_link(struct c_node_link *head, struct c_node *node)
+{
+	struct c_node_link *cur, *next;
+
+	c_list_for_each_safe(cur, next, head) {
+		if (cur->node != node)
+			continue;
+		c_list_del(cur);
+		free(cur);
+	}
+}
+
+/*
+ * Release everything hanging off node: its instructions, its entry in the
+ * parent's child list, all flow graph links in both directions (including
+ * the mirror links held by neighbouring nodes) and its child links.
+ * The node structure itself is not freed here.
+ */
+static void c_node_destroy(struct c_node *node)
+{
+	struct c_instruction *inst, *inst_next;
+	struct c_node_link *edge, *edge_next;
+
+	/* instructions owned by the node */
+	c_list_for_each_safe(inst, inst_next, &node->insts) {
+		c_list_del(inst);
+		free(inst);
+	}
+
+	/* detach from the parent's child list */
+	if (node->parent != NULL) {
+		c_node_remove_link(&node->parent->childs, node);
+	}
+	node->parent = NULL;
+
+	/* incoming edges: drop the mirror link kept by each predecessor */
+	c_list_for_each_safe(edge, edge_next, &node->predecessors) {
+		c_node_remove_link(&edge->node->successors, node);
+		c_list_del(edge);
+		free(edge);
+	}
+
+	/* outgoing edges: drop the mirror link kept by each successor */
+	c_list_for_each_safe(edge, edge_next, &node->successors) {
+		c_node_remove_link(&edge->node->predecessors, node);
+		c_list_del(edge);
+		free(edge);
+	}
+
+	/* children lose their parent */
+	c_list_for_each_safe(edge, edge_next, &node->childs) {
+		edge->node->parent = NULL;
+		c_list_del(edge);
+		free(edge);
+	}
+}
+
+/*
+ * Release everything owned by an initialised shader and zero the structure.
+ *
+ * Frees all vectors with their channels, all nodes on shader->nodes
+ * (allocated by c_shader_cfg_new_node_after()) and the links held by the
+ * embedded entry and end nodes.  The shader must have been set up with
+ * c_shader_init(); the c_shader structure itself is not freed.
+ */
+void c_shader_destroy(struct c_shader *shader)
+{
+	struct c_node *n, *nn;
+	struct c_vector *v, *nv;
+	unsigned i, c;
+
+	for (i = 0; i < C_FILE_COUNT; i++) {
+		shader->files[i].nvectors = 0;
+		c_list_for_each_safe(v, nv, &shader->files[i].vectors) {
+			c_list_del(v);
+			for (c = 0; c < 4; c++) {
+				free(v->channel[c]);
+			}
+			free(v);
+		}
+	}
+	c_list_for_each_safe(n, nn, &shader->nodes) {
+		c_list_del(n);
+		c_node_destroy(n);
+		/* c_node_destroy() only releases what hangs off the node;
+		 * the heap allocated node itself has to be freed as well */
+		free(n);
+	}
+	/* entry and end are embedded in the shader and not on shader->nodes:
+	 * release the links they still hold (at least the entry<->end pair
+	 * created by c_shader_init()) before wiping the structure */
+	c_node_destroy(&shader->entry);
+	c_node_destroy(&shader->end);
+	memset(shader, 0, sizeof(struct c_shader));
+}
+
+/*
+ * Recursive step of the depth first walk: mark entry as visited and follow
+ * its successors, never stepping onto node and never revisiting a node.
+ */
+static void c_shader_dfs_without_rec(struct c_node *entry, struct c_node *node)
+{
+	struct c_node_link *edge;
+
+	if (entry->visited) {
+		return;
+	}
+	if (entry == node) {
+		return;
+	}
+
+	entry->visited = 1;
+	c_list_for_each(edge, &entry->successors) {
+		c_shader_dfs_without_rec(edge->node, node);
+	}
+}
+
+/*
+ * Clear the visited flag of every node, then walk the flow graph from the
+ * shader entry as if node were removed.  Afterwards visited is set exactly
+ * on the nodes that are still reachable.
+ */
+static void c_shader_dfs_without(struct c_shader *shader, struct c_node *node)
+{
+	struct c_node *cur;
+
+	c_list_for_each(cur, &shader->nodes) {
+		cur->visited = 0;
+	}
+	shader->end.visited = 0;
+	shader->entry.visited = 0;
+
+	c_shader_dfs_without_rec(&shader->entry, node);
+}
+
+static int c_shader_build_dominator_tree_rec(struct c_shader *shader, struct c_node *node)	/* sets node->parent and recurses into its predecessors; returns 0, -ENOMEM or -EINVAL */
+{
+	struct c_node_link *link, *nlink;
+	unsigned found = 0;
+	int r;
+
+	if (node->done)	/* already processed in this build */
+		return 0;
+	node->done = 1;
+	c_list_for_each(link, &node->predecessors) {
+		/* if we remove this predecessor can we still reach the current node ? */
+		c_shader_dfs_without(shader, link->node);
+		if (node->visited == 0) {
+			/* we were unable to visit the current node, thus the current
+			 * predecessor is the immediate dominator of node; as
+			 * there can be only one immediate dominator we break
+			 */
+			node->parent = link->node;
+			nlink = c_node_link_new(node);
+			if (nlink == NULL)
+				return -ENOMEM;
+			c_list_add_tail(nlink, &link->node->childs);
+			found = 1;
+			break;
+		}
+	}
+	/* this shouldn't happen, there should be at least 1 dominator for each node except the entry */
+	if (!found && node->opcode != C_OPCODE_ENTRY) {
+		fprintf(stderr, "invalid flow control graph node %p (%d) has no immediate dominator\n",
+			node, node->opcode);
+		return -EINVAL;
+	}
+	c_list_for_each(link, &node->predecessors) {	/* continue the walk backwards through every predecessor */
+		r = c_shader_build_dominator_tree_rec(shader, link->node);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
+/* Forget the dominator information (done flag, parent, child links) of node. */
+static void c_node_reset_dominator(struct c_node *node)
+{
+	struct c_node_link *link, *tmp;
+
+	node->done = 0;
+	node->parent = NULL;
+	c_list_for_each_safe(link, tmp, &node->childs) {
+		c_list_del(link);
+		free(link);
+	}
+}
+
+/*
+ * (Re)build the dominator tree of shader, walking backwards from the end
+ * node.  On return each processed node has parent pointing at the node
+ * chosen as its immediate dominator and is listed in that node's childs.
+ *
+ * All nodes, including the embedded entry and end nodes, are reset first so
+ * that the function can be called again after the flow graph has changed:
+ * without resetting entry/end the second call would return immediately, and
+ * without dropping the old child links they would be duplicated.
+ *
+ * Returns 0 on success, -ENOMEM or -EINVAL otherwise.
+ */
+int c_shader_build_dominator_tree(struct c_shader *shader)
+{
+	struct c_node *node;
+
+	c_node_reset_dominator(&shader->entry);
+	c_node_reset_dominator(&shader->end);
+	c_list_for_each(node, &shader->nodes) {
+		c_node_reset_dominator(node);
+	}
+	return c_shader_build_dominator_tree_rec(shader, &shader->end);
+}
diff --git a/src/gallium/drivers/r600/r600_compiler.h b/src/gallium/drivers/r600/r600_compiler.h
new file mode 100644
index 0000000000..3de19970c3
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_compiler.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef R600_COMPILER_H
+#define R600_COMPILER_H
+
+struct c_vector;
+
+/* a channel is one component of a vector; channels are the basic source/destination of each operation */
+struct c_channel {
+	struct c_channel	*next;		/**< list linkage, see the c_list_* macros */
+	struct c_channel	*prev;		/**< list linkage, see the c_list_* macros */
+	unsigned		vindex;		/**< index in vector X,Y,Z,W (0,1,2,3) */
+	unsigned		value;		/**< immediate value 32bits */
+	struct c_vector		*vector;	/**< vector to which it belongs */
+};
+
+/* In the GPU world operands are most of the time grouped into vectors
+ * of 4 components; this structure is mostly a handle used to group
+ * operands into the same vector.
+ */
+struct c_vector {
+	struct c_vector		*next;		/**< list linkage, see the c_list_* macros */
+	struct c_vector		*prev;		/**< list linkage, see the c_list_* macros */
+	unsigned		id;		/**< unique vector id */
+	unsigned		name;		/**< semantic name */
+	unsigned		file;		/**< operand file C_FILE_* */
+	int			sid;		/**< semantic id */
+	struct c_channel	*channel[4];	/**< operands */
+};
+
+/*
+ * Minimal intrusive, circular, doubly linked list.
+ *
+ * Works on any structure that has 'next' and 'prev' members pointing to its
+ * own type; the list head is an element of that same type.  An empty list is
+ * a head whose next/prev point at itself.
+ *
+ *   c_list_init(e)                  make e a one-element (empty) list
+ *   c_list_add(e, h)                insert e right after h (front of list)
+ *   c_list_add_tail(e, h)           insert e right before h (end of list)
+ *   c_list_del(e)                   unlink e and re-initialise it
+ *   c_list_for_each(p, h)           iterate p over the elements of h
+ *   c_list_for_each_from(p, s, h)   same, starting at element s
+ *   c_list_for_each_safe(p, n, h)   iterate; p may be unlinked in the body
+ *   c_list_empty(h)                 non-zero when h has no elements
+ *
+ * Every macro parameter is parenthesised at each use so that arbitrary
+ * expressions can be passed.  Arguments are still evaluated more than once:
+ * do not pass expressions with side effects.
+ */
+#define c_list_init(e) do { (e)->next = (e); (e)->prev = (e); } while(0)
+#define c_list_add(e, h) do { (e)->next = (h)->next; (e)->prev = (h);  (h)->next = (e); (e)->next->prev = (e); } while(0)
+#define c_list_add_tail(e, h) do { (e)->next = (h); (e)->prev = (h)->prev;  (h)->prev = (e); (e)->prev->next = (e); } while(0)
+#define c_list_del(e) do { (e)->next->prev = (e)->prev; (e)->prev->next = (e)->next; c_list_init(e); } while(0)
+#define c_list_for_each(p, h) for ((p) = (h)->next; (p) != (h); (p) = (p)->next)
+#define c_list_for_each_from(p, s, h) for ((p) = (s); (p) != (h); (p) = (p)->next)
+#define c_list_for_each_safe(p, n, h) for ((p) = (h)->next, (n) = (p)->next; (p) != (h); (p) = (n), (n) = (p)->next)
+#define c_list_empty(h) ((h)->next == (h))
+
+
+#define C_PROGRAM_TYPE_VS	0
+#define C_PROGRAM_TYPE_FS	1
+#define C_PROGRAM_TYPE_COUNT	2
+
+#define C_NODE_FLAG_ALU		1
+#define C_NODE_FLAG_FETCH	2
+
+#define C_SWIZZLE_X		0
+#define C_SWIZZLE_Y		1
+#define C_SWIZZLE_Z		2
+#define C_SWIZZLE_W		3
+#define C_SWIZZLE_0		4
+#define C_SWIZZLE_1		5
+#define C_SWIZZLE_D		6
+
+#define C_FILE_NULL		0
+#define C_FILE_CONSTANT		1
+#define C_FILE_INPUT		2
+#define C_FILE_OUTPUT		3
+#define C_FILE_TEMPORARY	4
+#define C_FILE_SAMPLER		5
+#define C_FILE_ADDRESS		6
+#define C_FILE_IMMEDIATE	7
+#define C_FILE_LOOP		8
+#define C_FILE_PREDICATE	9
+#define C_FILE_SYSTEM_VALUE	10
+#define C_FILE_RESOURCE		11
+#define C_FILE_COUNT		12
+
+#define C_SEMANTIC_POSITION	0
+#define C_SEMANTIC_COLOR	1
+#define C_SEMANTIC_BCOLOR	2  /**< back-face color */
+#define C_SEMANTIC_FOG		3
+#define C_SEMANTIC_PSIZE	4
+#define C_SEMANTIC_GENERIC	5
+#define C_SEMANTIC_NORMAL	6
+#define C_SEMANTIC_FACE		7
+#define C_SEMANTIC_EDGEFLAG	8
+#define C_SEMANTIC_PRIMID	9
+#define C_SEMANTIC_INSTANCEID	10
+#define C_SEMANTIC_VERTEXID	11
+#define C_SEMANTIC_COUNT	12 /**< number of semantic values */
+
+#define C_OPCODE_NOP		0
+#define C_OPCODE_MOV		1
+#define C_OPCODE_LIT		2
+#define C_OPCODE_RCP		3
+#define C_OPCODE_RSQ		4
+#define C_OPCODE_EXP		5
+#define C_OPCODE_LOG		6
+#define C_OPCODE_MUL		7
+#define C_OPCODE_ADD		8
+#define C_OPCODE_DP3		9
+#define C_OPCODE_DP4		10
+#define C_OPCODE_DST		11
+#define C_OPCODE_MIN		12
+#define C_OPCODE_MAX		13
+#define C_OPCODE_SLT		14
+#define C_OPCODE_SGE		15
+#define C_OPCODE_MAD		16
+#define C_OPCODE_SUB		17
+#define C_OPCODE_LRP		18
+#define C_OPCODE_CND		19
+/* gap */
+#define C_OPCODE_DP2A		21
+/* gap */
+#define C_OPCODE_FRC		24
+#define C_OPCODE_CLAMP		25
+#define C_OPCODE_FLR		26
+#define C_OPCODE_ROUND		27
+#define C_OPCODE_EX2		28
+#define C_OPCODE_LG2		29
+#define C_OPCODE_POW		30
+#define C_OPCODE_XPD		31
+/* gap */
+#define C_OPCODE_ABS		33
+#define C_OPCODE_RCC		34
+#define C_OPCODE_DPH		35
+#define C_OPCODE_COS		36
+#define C_OPCODE_DDX		37
+#define C_OPCODE_DDY		38
+#define C_OPCODE_KILP		39		/* predicated kill */
+#define C_OPCODE_PK2H		40
+#define C_OPCODE_PK2US		41
+#define C_OPCODE_PK4B		42
+#define C_OPCODE_PK4UB		43
+#define C_OPCODE_RFL		44
+#define C_OPCODE_SEQ		45
+#define C_OPCODE_SFL		46
+#define C_OPCODE_SGT		47
+#define C_OPCODE_SIN		48
+#define C_OPCODE_SLE		49
+#define C_OPCODE_SNE		50
+#define C_OPCODE_STR		51
+#define C_OPCODE_TEX		52
+#define C_OPCODE_TXD		53
+#define C_OPCODE_TXP		54
+#define C_OPCODE_UP2H		55
+#define C_OPCODE_UP2US		56
+#define C_OPCODE_UP4B		57
+#define C_OPCODE_UP4UB		58
+#define C_OPCODE_X2D		59
+#define C_OPCODE_ARA		60
+#define C_OPCODE_ARR		61
+#define C_OPCODE_BRA		62
+#define C_OPCODE_CAL		63
+#define C_OPCODE_RET		64
+#define C_OPCODE_SSG		65		/* SGN */
+#define C_OPCODE_CMP		66
+#define C_OPCODE_SCS		67
+#define C_OPCODE_TXB		68
+#define C_OPCODE_NRM		69
+#define C_OPCODE_DIV		70
+#define C_OPCODE_DP2		71
+#define C_OPCODE_TXL		72
+#define C_OPCODE_BRK		73
+#define C_OPCODE_IF		74
+#define C_OPCODE_BGNFOR		75
+#define C_OPCODE_REP		76
+#define C_OPCODE_ELSE		77
+#define C_OPCODE_ENDIF		78
+#define C_OPCODE_ENDFOR		79
+#define C_OPCODE_ENDREP		80
+#define C_OPCODE_PUSHA		81
+#define C_OPCODE_POPA		82
+#define C_OPCODE_CEIL		83
+#define C_OPCODE_I2F		84
+#define C_OPCODE_NOT		85
+#define C_OPCODE_TRUNC		86
+#define C_OPCODE_SHL		87
+/* gap */
+#define C_OPCODE_AND		89
+#define C_OPCODE_OR		90
+#define C_OPCODE_MOD		91
+#define C_OPCODE_XOR		92
+#define C_OPCODE_SAD		93
+#define C_OPCODE_TXF		94
+#define C_OPCODE_TXQ		95
+#define C_OPCODE_CONT		96
+#define C_OPCODE_EMIT		97
+#define C_OPCODE_ENDPRIM	98
+#define C_OPCODE_BGNLOOP	99
+#define C_OPCODE_BGNSUB		100
+#define C_OPCODE_ENDLOOP	101
+#define C_OPCODE_ENDSUB		102
+/* gap */
+#define C_OPCODE_NRM4		112
+#define C_OPCODE_CALLNZ		113
+#define C_OPCODE_IFC		114
+#define C_OPCODE_BREAKC		115
+#define C_OPCODE_KIL		116	/* conditional kill */
+#define C_OPCODE_END		117	/* aka HALT */
+/* gap */
+#define C_OPCODE_F2I		119
+#define C_OPCODE_IDIV		120
+#define C_OPCODE_IMAX		121
+#define C_OPCODE_IMIN		122
+#define C_OPCODE_INEG		123
+#define C_OPCODE_ISGE		124
+#define C_OPCODE_ISHR		125
+#define C_OPCODE_ISLT		126
+#define C_OPCODE_F2U		127
+#define C_OPCODE_U2F		128
+#define C_OPCODE_UADD		129
+#define C_OPCODE_UDIV		130
+#define C_OPCODE_UMAD		131
+#define C_OPCODE_UMAX		132
+#define C_OPCODE_UMIN		133
+#define C_OPCODE_UMOD		134
+#define C_OPCODE_UMUL		135
+#define C_OPCODE_USEQ		136
+#define C_OPCODE_USGE		137
+#define C_OPCODE_USHR		138
+#define C_OPCODE_USLT		139
+#define C_OPCODE_USNE		140
+#define C_OPCODE_SWITCH		141
+#define C_OPCODE_CASE		142
+#define C_OPCODE_DEFAULT	143
+#define C_OPCODE_ENDSWITCH	144
+#define C_OPCODE_VFETCH		145
+#define C_OPCODE_ENTRY		146
+#define C_OPCODE_ARL		147
+#define C_OPCODE_LAST		148
+
+#define C_OPERAND_FLAG_ABS		(1 << 0)
+#define C_OPERAND_FLAG_NEG		(1 << 1)
+
+struct c_operand {	/* one source or destination of a c_op */
+	struct c_vector		*vector;	/* vector the operand refers to */
+	unsigned		swizzle;	/* presumably a C_SWIZZLE_* value - TODO confirm against users */
+	unsigned		flag;		/* presumably C_OPERAND_FLAG_* bits - TODO confirm against users */
+};
+
+struct c_op {	/* a single operation: up to three inputs, one output */
+	unsigned		ninput;		/* presumably the number of input[] entries in use - TODO confirm */
+	struct c_operand	input[3];
+	struct c_operand	output;
+	unsigned		opcode;		/* presumably a C_OPCODE_* value - TODO confirm */
+};
+
+struct c_instruction {	/* list element holding up to five operations */
+	struct c_instruction	*next, *prev;	/* list linkage, see the c_list_* macros */
+	unsigned		nop;		/* presumably the number of op[] entries in use (not C_OPCODE_NOP) - TODO confirm */
+	struct c_op		op[5];
+};
+
+struct c_node;
+
+struct c_node_link {	/* list element referring to a node; used for the predecessors, successors and childs lists of c_node */
+	struct c_node_link	*next;	/* list linkage, see the c_list_* macros */
+	struct c_node_link	*prev;	/* list linkage, see the c_list_* macros */
+	struct c_node		*node;	/* the node this link refers to */
+};
+
+/**
+ * struct c_node
+ *
+ * @next:		all nodes are in a doubly linked list, this points to
+ * 			the next node
+ * @prev:		all nodes are in a doubly linked list, this points to
+ * 			the previous node
+ * @predecessors:	list of all predecessor nodes in the flow graph
+ * @successors:		list of all successor nodes in the flow graph
+ * @parent:		parent in the tree built by c_shader_build_dominator_tree()
+ * @childs:		children in the tree built by c_shader_build_dominator_tree()
+ */
+struct c_node {
+	struct c_node		*next, *prev;
+	struct c_node_link	predecessors;
+	struct c_node_link	successors;
+	struct c_node		*parent;
+	struct c_node_link	childs;
+	struct c_instruction	insts;		/* head of the node's instruction list */
+	unsigned		opcode;		/* C_OPCODE_ENTRY / C_OPCODE_END for the two nodes embedded in c_shader */
+	unsigned		visited;	/* scratch flag of the depth first walk in r600_compiler.c */
+	unsigned		done;		/* scratch flag of the dominator tree builder in r600_compiler.c */
+	void			*backend;	/* NOTE(review): not touched by r600_compiler.c; presumably backend private data - confirm */
+};
+
+struct c_file {	/* one register file (C_FILE_*) of a shader */
+	unsigned		nvectors;	/* number of vectors added to this file */
+	struct c_vector		vectors;	/* list head of the vectors in this file */
+};
+
+struct c_shader {	/* a whole program: register files plus flow graph */
+	unsigned			nvectors;	/* shader-wide vector counter, source of c_vector.id */
+	struct c_file			files[C_FILE_COUNT];	/* indexed by C_FILE_* */
+	struct c_node			nodes;		/* list head of the heap allocated nodes */
+	struct c_node			entry;		/* embedded entry node (C_OPCODE_ENTRY), not on the nodes list */
+	struct c_node			end;		/* embedded end node (C_OPCODE_END), not on the nodes list */
+	unsigned			type;		/* value given to c_shader_init(); presumably C_PROGRAM_TYPE_* - confirm */
+};
+
+int c_shader_init(struct c_shader *shader, unsigned type);	/* 0 on success, negative errno on failure */
+void c_shader_destroy(struct c_shader *shader);	/* shader must have been initialised with c_shader_init() */
+struct c_vector *c_shader_vector_new(struct c_shader *shader, unsigned file, unsigned name, int sid);	/* NULL on failure */
+int c_shader_build_dominator_tree(struct c_shader *shader);	/* 0 on success, negative errno on failure */
+void c_shader_dump(struct c_shader *shader);	/* not defined in r600_compiler.c; presumably in r600_compiler_dump.c - confirm */
+
+void c_node_init(struct c_node *node);
+int c_node_add_new_instruction(struct c_node *node, struct c_instruction *instruction);	/* appends a copy; 0 or -ENOMEM */
+int c_node_add_new_instruction_head(struct c_node *node, struct c_instruction *instruction);	/* prepends a copy; 0 or -ENOMEM */
+
+/* control flow graph functions */
+int c_node_cfg_link(struct c_node *predecessor, struct c_node *successor);	/* 0 or -ENOMEM */
+struct c_node *c_node_cfg_new_after(struct c_node *predecessor);	/* NOTE(review): no definition in r600_compiler.c - confirm one exists elsewhere */
+struct c_node *c_shader_cfg_new_node_after(struct c_shader *shader, struct c_node *predecessor);	/* NULL on failure */
+
+struct c_vector *c_vector_new(void);	/* NULL on failure */
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_compiler_dump.c b/src/gallium/drivers/r600/r600_compiler_dump.c
new file mode 100644
index 0000000000..485032088c
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_compiler_dump.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "r600_compiler.h"
+
+/*
+ * Printable swizzle names, looked up by swizzle value in c_node_dump().
+ * NOTE(review): the order presumably follows C_SWIZZLE_X..C_SWIZZLE_1 plus
+ * one trailing "discard" value - confirm against the enum.
+ */
+static const char *c_file_swz[] = {
+	"x",
+	"y",
+	"z",
+	"w",
+	"0",
+	"1",
+	".",
+};
+
+/*
+ * Printable register file names, looked up by a vector's file value in
+ * c_node_dump(). The lookup is not bounds checked.
+ * NOTE(review): the r600 backend also uses C_FILE_RESOURCE, which has no
+ * visible entry among these 11 names - confirm the table covers every
+ * C_FILE_* value.
+ */
+static const char *c_file_str[] = {
+	"NULL",
+	"CONSTANT",
+	"INPUT",
+	"OUTPUT",
+	"TEMPORARY",
+	"SAMPLER",
+	"ADDRESS",
+	"IMMEDIATE",
+	"LOOP",
+	"PREDICATE",
+	"SYSTEM_VALUE",
+};
+
+/*
+ * Printable semantic names. No function in this file references the table
+ * yet.
+ * NOTE(review): presumably indexed by C_SEMANTIC_*; the r600 backend uses
+ * C_SEMANTIC_VERTEXID, which has no visible entry here - confirm.
+ */
+static const char *c_semantic_str[] = {
+	"POSITION",
+	"COLOR",
+	"BCOLOR",
+	"FOG",
+	"PSIZE",
+	"GENERIC",
+	"NORMAL",
+	"FACE",
+	"EDGEFLAG",
+	"PRIMID",
+	"INSTANCEID",
+};
+
+/*
+ * Printable opcode names, looked up by opcode value in c_node_dump() for
+ * both node opcodes and operation opcodes. "(INVALID)" fills the opcode
+ * numbers that have no name. The lookup is not bounds checked, so the table
+ * has to stay as long as the opcode numbering.
+ */
+static const char *c_opcode_str[] = {
+	"ARL",
+	"MOV",
+	"LIT",
+	"RCP",
+	"RSQ",
+	"EXP",
+	"LOG",
+	"MUL",
+	"ADD",
+	"DP3",
+	"DP4",
+	"DST",
+	"MIN",
+	"MAX",
+	"SLT",
+	"SGE",
+	"MAD",
+	"SUB",
+	"LRP",
+	"CND",
+	"(INVALID)",
+	"DP2A",
+	"(INVALID)",
+	"(INVALID)",
+	"FRC",
+	"CLAMP",
+	"FLR",
+	"ROUND",
+	"EX2",
+	"LG2",
+	"POW",
+	"XPD",
+	"(INVALID)",
+	"ABS",
+	"RCC",
+	"DPH",
+	"COS",
+	"DDX",
+	"DDY",
+	"KILP",
+	"PK2H",
+	"PK2US",
+	"PK4B",
+	"PK4UB",
+	"RFL",
+	"SEQ",
+	"SFL",
+	"SGT",
+	"SIN",
+	"SLE",
+	"SNE",
+	"STR",
+	"TEX",
+	"TXD",
+	"TXP",
+	"UP2H",
+	"UP2US",
+	"UP4B",
+	"UP4UB",
+	"X2D",
+	"ARA",
+	"ARR",
+	"BRA",
+	"CAL",
+	"RET",
+	"SSG",
+	"CMP",
+	"SCS",
+	"TXB",
+	"NRM",
+	"DIV",
+	"DP2",
+	"TXL",
+	"BRK",
+	"IF",
+	"BGNFOR",
+	"REP",
+	"ELSE",
+	"ENDIF",
+	"ENDFOR",
+	"ENDREP",
+	"PUSHA",
+	"POPA",
+	"CEIL",
+	"I2F",
+	"NOT",
+	"TRUNC",
+	"SHL",
+	"(INVALID)",
+	"AND",
+	"OR",
+	"MOD",
+	"XOR",
+	"SAD",
+	"TXF",
+	"TXQ",
+	"CONT",
+	"EMIT",
+	"ENDPRIM",
+	"BGNLOOP",
+	"BGNSUB",
+	"ENDLOOP",
+	"ENDSUB",
+	"(INVALID)",
+	"(INVALID)",
+	"(INVALID)",
+	"(INVALID)",
+	"NOP",
+	"(INVALID)",
+	"(INVALID)",
+	"(INVALID)",
+	"(INVALID)",
+	"NRM4",
+	"CALLNZ",
+	"IFC",
+	"BREAKC",
+	"KIL",
+	"END",
+	"(INVALID)",
+	"F2I",
+	"IDIV",
+	"IMAX",
+	"IMIN",
+	"INEG",
+	"ISGE",
+	"ISHR",
+	"ISLT",
+	"F2U",
+	"U2F",
+	"UADD",
+	"UDIV",
+	"UMAD",
+	"UMAX",
+	"UMIN",
+	"UMOD",
+	"UMUL",
+	"USEQ",
+	"USGE",
+	"USHR",
+	"USLT",
+	"USNE",
+	"SWITCH",
+	"CASE",
+	"DEFAULT",
+	"ENDSWITCH",
+	"VFETCH",
+	"ENTRY",
+};
+
+/*
+ * Return entry @i of the string table @name. No bounds check is done; the
+ * caller has to pass an index that lies inside the table.
+ */
+static inline const char *c_get_name(const char *name[], unsigned i)
+{
+	const char **entry = name + i;
+
+	return *entry;
+}
+
+/* Write @indent blanks to stderr. */
+static void pindent(unsigned indent)
+{
+	while (indent-- > 0)
+		fputc(' ', stderr);
+}
+
+/*
+ * Print one node to stderr: a "# node <opcode>" header, then one line per
+ * operation of every instruction on the node's list. Each line starts with
+ * @indent blanks and shows operands as FILE[id][swizzle].
+ */
+static void c_node_dump(struct c_node *node, unsigned indent)
+{
+	struct c_instruction *inst;
+	unsigned slot, src;
+
+	pindent(indent);
+	fprintf(stderr, "# node %s\n", c_get_name(c_opcode_str, node->opcode));
+	c_list_for_each(inst, &node->insts) {
+		for (slot = 0; slot < inst->nop; slot++) {
+			pindent(indent);
+			/* opcode name followed by the destination operand */
+			fprintf(stderr, "%s %s[%d][%s]",
+				c_get_name(c_opcode_str, inst->op[slot].opcode),
+				c_get_name(c_file_str, inst->op[slot].output.vector->file),
+				inst->op[slot].output.vector->id,
+				c_get_name(c_file_swz, inst->op[slot].output.swizzle));
+			/* then every source operand, in the same notation */
+			for (src = 0; src < inst->op[slot].ninput; src++) {
+				fprintf(stderr, " %s[%d][%s]",
+					c_get_name(c_file_str, inst->op[slot].input[src].vector->file),
+					inst->op[slot].input[src].vector->id,
+					c_get_name(c_file_swz, inst->op[slot].input[src].swizzle));
+			}
+			fprintf(stderr, ";\n");
+		}
+	}
+}
+
+/*
+ * Dump @node, then recurse into the nodes on its childs list with the
+ * indentation increased by one.
+ */
+static void c_shader_dump_rec(struct c_shader *shader, struct c_node *node, unsigned indent)
+{
+	const unsigned child_indent = indent + 1;
+	struct c_node_link *child;
+
+	c_node_dump(node, indent);
+	c_list_for_each(child, &node->childs) {
+		c_shader_dump_rec(shader, child->node, child_indent);
+	}
+}
+
+/* Dump the whole shader to stderr, starting from its entry node. */
+void c_shader_dump(struct c_shader *shader)
+{
+	struct c_node *root = &shader->entry;
+
+	c_shader_dump_rec(shader, root, 0);
+}
diff --git a/src/gallium/drivers/r600/r600_compiler_r600.c b/src/gallium/drivers/r600/r600_compiler_r600.c
new file mode 100644
index 0000000000..14ea8ab6e8
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_compiler_r600.c
@@ -0,0 +1,891 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <util/u_format.h>
+#include "r600_screen.h"
+#include "r600_context.h"
+#include "r600_sq.h"
+
+
+/*
+ * One row of the opcode translation table: which r600 ALU instruction a
+ * compiler opcode maps to. The table itself is indexed by opcode value.
+ */
+struct r600_alu_instruction {
+	/* compiler opcode (C_OPCODE_* or a raw number for unnamed opcodes) */
+	unsigned			copcode;
+	/* r600 instruction, used as index into r600_instruction_info[] */
+	enum r600_instruction		instruction;
+};
+
+static int r600_shader_alu_translate(struct r600_shader *rshader,
+					struct r600_shader_node *node,
+					struct c_instruction *instruction);
+struct r600_alu_instruction r600_alu_instruction[C_OPCODE_LAST];
+struct r600_instruction_info r600_instruction_info[];
+
+/*
+ * For vertex shaders, prepend one four-operation VFETCH instruction per
+ * input vector to the entry node and move that vector from the INPUT file
+ * to the TEMPORARY file. Each fetch reads a newly created vertex id input
+ * vector and a newly created resource vector.
+ *
+ * Returns 0 on success (and for non vertex shaders), -ENOMEM when a vector
+ * cannot be created, or the error of c_node_add_new_instruction_head().
+ */
+int r600_shader_insert_fetch(struct c_shader *shader)
+{
+	static const unsigned fetch_swizzle[4] = {
+		C_SWIZZLE_X, C_SWIZZLE_Y, C_SWIZZLE_Z, C_SWIZZLE_W,
+	};
+	struct c_vector *vertex_id, *resource, *input, *next_input;
+	struct c_instruction fetch;
+	unsigned chan;
+	int r;
+
+	if (shader->type != C_PROGRAM_TYPE_VS)
+		return 0;
+	vertex_id = c_shader_vector_new(shader, C_FILE_INPUT, C_SEMANTIC_VERTEXID, -1);
+	if (vertex_id == NULL)
+		return -ENOMEM;
+	c_list_for_each_safe(input, next_input, &shader->files[C_FILE_INPUT].vectors) {
+		/* the vertex id itself stays an input */
+		if (input == vertex_id)
+			continue;
+		resource = c_shader_vector_new(shader, C_FILE_RESOURCE, C_SEMANTIC_GENERIC, -1);
+		if (resource == NULL)
+			return -ENOMEM;
+		memset(&fetch, 0, sizeof(struct c_instruction));
+		fetch.nop = 4;
+		/* one operation per destination channel, all with the same sources */
+		for (chan = 0; chan < 4; chan++) {
+			fetch.op[chan].opcode = C_OPCODE_VFETCH;
+			fetch.op[chan].ninput = 2;
+			fetch.op[chan].output.vector = input;
+			fetch.op[chan].output.swizzle = fetch_swizzle[chan];
+			fetch.op[chan].input[0].vector = vertex_id;
+			fetch.op[chan].input[1].vector = resource;
+		}
+		r = c_node_add_new_instruction_head(&shader->entry, &fetch);
+		if (r)
+			return r;
+		/* the fetched vector is a temporary from now on */
+		c_list_del(input);
+		shader->files[C_FILE_INPUT].nvectors--;
+		c_list_add_tail(input, &shader->files[C_FILE_TEMPORARY].vectors);
+		shader->files[C_FILE_TEMPORARY].nvectors++;
+		input->file = C_FILE_TEMPORARY;
+	}
+	return 0;
+}
+
+/*
+ * Release what an r600_shader owns: the register lookup table and its
+ * entries, every translated node with its vfetch and alu lists, and the
+ * bytecode buffer. A NULL @rshader is accepted. The structure itself is
+ * not freed.
+ */
+void r600_shader_cleanup(struct r600_shader *rshader)
+{
+	struct r600_shader_node *n, *nn;
+	struct r600_shader_vfetch *vf, *nvf;
+	struct r600_shader_alu *alu, *nalu;
+	int i;
+
+	if (rshader == NULL)
+		return;
+	if (rshader->gpr) {
+		for (i = 0; i < rshader->nvector; i++) {
+			free(rshader->gpr[i]);
+		}
+		free(rshader->gpr);
+		rshader->gpr = NULL;
+	}
+	c_list_for_each_safe(n, nn, &rshader->nodes) {
+		c_list_del(n);
+		c_list_for_each_safe(vf, nvf, &n->vfetch) {
+			c_list_del(vf);
+			free(vf);
+		}
+		c_list_for_each_safe(alu, nalu, &n->alu) {
+			c_list_del(alu);
+			free(alu);
+		}
+		free(n);
+	}
+	free(rshader->bcode);
+	/* clear the pointer, as done for gpr, so a second cleanup cannot double free */
+	rshader->bcode = NULL;
+}
+
+/*
+ * Emit the four bytecode dwords of one vertex fetch at index *@cid, record
+ * that index in the fetch and advance *@cid past them. @rnode is unused.
+ * Always returns 0.
+ */
+int r600_shader_vfetch_bytecode(struct r600_shader *rshader,
+				struct r600_shader_node *rnode,
+				struct r600_shader_vfetch *vfetch,
+				unsigned *cid)
+{
+	const unsigned base = *cid;
+
+	vfetch->cf_addr = base;
+	/* NOTE(review): SRC_SEL_X gets src[0].sel, like SRC_GPR - confirm .chan was not meant */
+	rshader->bcode[base + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vfetch->src[1].sel) |
+				S_SQ_VTX_WORD0_SRC_GPR(vfetch->src[0].sel) |
+				S_SQ_VTX_WORD0_SRC_SEL_X(vfetch->src[0].sel) |
+				S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F);
+	rshader->bcode[base + 1] = S_SQ_VTX_WORD1_DST_SEL_X(vfetch->dst[0].chan) |
+				S_SQ_VTX_WORD1_DST_SEL_Y(vfetch->dst[1].chan) |
+				S_SQ_VTX_WORD1_DST_SEL_Z(vfetch->dst[2].chan) |
+				S_SQ_VTX_WORD1_DST_SEL_W(vfetch->dst[3].chan) |
+				S_SQ_VTX_WORD1_USE_CONST_FIELDS(1) |
+				S_SQ_VTX_WORD1_GPR_DST_GPR(vfetch->dst[0].sel);
+	rshader->bcode[base + 2] = S_SQ_VTX_WORD2_MEGA_FETCH(1);
+	rshader->bcode[base + 3] = 0;
+	*cid = base + 4;
+	return 0;
+}
+
+/*
+ * Store the new per-resource formats and patch the destination swizzle of
+ * every vertex fetch in the bytecode to the swizzle of its resource's
+ * format. The old swizzle bits are cleared before the format is looked up,
+ * so a fetch with an unknown format is left with them cleared. Always
+ * returns 0.
+ */
+int r600_shader_update(struct r600_shader *rshader, enum pipe_format *resource_format)
+{
+	struct r600_shader_node *rnode;
+	struct r600_shader_vfetch *vfetch;
+	unsigned i;
+
+	memcpy(rshader->resource_format, resource_format,
+		rshader->nresource * sizeof(enum pipe_format));
+	c_list_for_each(rnode, &rshader->nodes) {
+		c_list_for_each(vfetch, &rnode->vfetch) {
+			const struct util_format_description *desc;
+
+			/* second dword of the fetch holds the DST_SEL fields */
+			i = vfetch->cf_addr + 1;
+			rshader->bcode[i] &= C_SQ_VTX_WORD1_DST_SEL_X &
+						C_SQ_VTX_WORD1_DST_SEL_Y &
+						C_SQ_VTX_WORD1_DST_SEL_Z &
+						C_SQ_VTX_WORD1_DST_SEL_W;
+			desc = util_format_description(resource_format[vfetch->src[1].sel]);
+			if (desc == NULL) {
+				fprintf(stderr, "%s unknown format %d\n", __func__, resource_format[vfetch->src[1].sel]);
+				continue;
+			}
+			/* WARNING so far TGSI swizzle match R600 ones */
+			rshader->bcode[i] |= S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) |
+						S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) |
+						S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) |
+						S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Create the backend copy of vector @v with hardware id @id and store it
+ * in the lookup table at index v->id.
+ *
+ * Returns 0 on success, -EINVAL when v->id does not fit the table, or
+ * -ENOMEM when the copy cannot be allocated.
+ */
+static int r600_shader_register_vector(struct r600_shader *rshader,
+					struct c_vector *v, unsigned id)
+{
+	struct c_vector *nv;
+
+	/* the table holds nvector slots; never write past it */
+	if ((unsigned)v->id >= (unsigned)rshader->nvector) {
+		fprintf(stderr, "%s:%d vector id %u out of range\n", __func__, __LINE__, (unsigned)v->id);
+		return -EINVAL;
+	}
+	nv = c_vector_new();
+	if (nv == NULL)
+		return -ENOMEM;
+	memcpy(nv, v, sizeof(struct c_vector));
+	nv->id = id;
+	rshader->gpr[v->id] = nv;
+	return 0;
+}
+
+/*
+ * Build the vector id -> hardware register lookup table of @rshader.
+ *
+ * Input vectors get the first GPR numbers, then outputs and temporaries
+ * follow. Constants are numbered from 256 and resources from 0. Immediate
+ * vectors get no entry. On success ngpr, nconstant and nresource hold the
+ * number of ids handed out.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or -EINVAL for a
+ * vector of an unsupported file or with an out of range id. On failure the
+ * partially filled table stays in @rshader, where r600_shader_cleanup()
+ * can free it.
+ */
+int r600_shader_register(struct r600_shader *rshader)
+{
+	struct c_vector *v;
+	unsigned tid, cid, rid, i;
+	int r;
+
+	rshader->nvector = rshader->cshader.nvectors;
+	rshader->gpr = calloc(rshader->nvector, sizeof(void*));
+	if (rshader->gpr == NULL)
+		return -ENOMEM;
+	tid = 0;
+	cid = 0;
+	rid = 0;
+	/* alloc input first */
+	c_list_for_each(v, &rshader->cshader.files[C_FILE_INPUT].vectors) {
+		r = r600_shader_register_vector(rshader, v, tid);
+		if (r)
+			return r;
+		tid++;
+	}
+	for (i = 0; i < C_FILE_COUNT; i++) {
+		if (i == C_FILE_INPUT || i == C_FILE_IMMEDIATE)
+			continue;
+		c_list_for_each(v, &rshader->cshader.files[i].vectors) {
+			switch (v->file) {
+			case C_FILE_OUTPUT:
+			case C_FILE_TEMPORARY:
+				r = r600_shader_register_vector(rshader, v, tid);
+				if (r)
+					return r;
+				tid++;
+				break;
+			case C_FILE_CONSTANT:
+				/* constant selectors start at 256 */
+				r = r600_shader_register_vector(rshader, v, cid + 256);
+				if (r)
+					return r;
+				cid++;
+				break;
+			case C_FILE_RESOURCE:
+				r = r600_shader_register_vector(rshader, v, rid);
+				if (r)
+					return r;
+				rid++;
+				break;
+			default:
+				fprintf(stderr, "%s:%d unsupported file %d\n", __func__, __LINE__, v->file);
+				return -EINVAL;
+			}
+		}
+	}
+	rshader->ngpr = tid;
+	rshader->nconstant = cid;
+	rshader->nresource = rid;
+	return 0;
+}
+
+/*
+ * Fill @operand with the hardware selector and channel for vector @v read
+ * or written through @swizzle.
+ *
+ * Selector values:
+ *   [0,127]	GPR[0..127]
+ *   [256,511]	cfile constants c[0..255]
+ *   248	SQ_ALU_SRC_0: special constant 0.0
+ *   249	SQ_ALU_SRC_1: special constant 1.0 float
+ *   250	SQ_ALU_SRC_1_INT: special constant 1 integer
+ *   251	SQ_ALU_SRC_M_1_INT: special constant -1 integer
+ *   252	SQ_ALU_SRC_0_5: special constant 0.5 float
+ *   253	SQ_ALU_SRC_LITERAL: literal constant
+ *   254	SQ_ALU_SRC_PV: previous vector result
+ *   255	SQ_ALU_SRC_PS: previous scalar result
+ *
+ * A NULL @v yields the constant 0.0 operand. Returns 0 on success and
+ * -EINVAL for a vector without a register or an invalid swizzle.
+ */
+int r600_shader_find_gpr(struct r600_shader *rshader, struct c_vector *v, unsigned swizzle,
+			struct r600_shader_operand *operand)
+{
+	struct c_vector *reg;
+
+	/* start out as the special constant 0.0 */
+	operand->vector = v;
+	operand->sel = 248;
+	operand->chan = 0;
+	operand->neg = 0;
+	operand->abs = 0;
+	if (v == NULL)
+		return 0;
+
+	if (v->file != C_FILE_IMMEDIATE) {
+		reg = rshader->gpr[v->id];
+		if (reg == NULL) {
+			fprintf(stderr, "%s %d unknown register\n", __FILE__, __LINE__);
+			return -EINVAL;
+		}
+		operand->sel = reg->id;
+	} else {
+		operand->sel = 253;
+	}
+	operand->chan = swizzle;
+
+	/* the 0 and 1 swizzles replace the register by a special constant */
+	if (swizzle == C_SWIZZLE_0 || swizzle == C_SWIZZLE_1) {
+		operand->sel = (swizzle == C_SWIZZLE_0) ? 248 : 249;
+		operand->chan = 0;
+		return 0;
+	}
+	if (swizzle == C_SWIZZLE_X || swizzle == C_SWIZZLE_Y ||
+	    swizzle == C_SWIZZLE_Z || swizzle == C_SWIZZLE_W)
+		return 0;
+
+	fprintf(stderr, "%s %d invalid swizzle %d\n", __FILE__, __LINE__, swizzle);
+	return -EINVAL;
+}
+
+/*
+ * Allocate an r600 node for compiler node @node, with empty vfetch and alu
+ * lists, and append it to the shader's node list. Returns NULL when the
+ * allocation fails.
+ */
+static struct r600_shader_node *r600_shader_new_node(struct r600_shader *rshader, struct c_node *node)
+{
+	struct r600_shader_node *created = CALLOC_STRUCT(r600_shader_node);
+
+	if (created == NULL)
+		return NULL;
+	c_list_init(&created->vfetch);
+	c_list_init(&created->alu);
+	created->node = node;
+	c_list_add_tail(created, &rshader->nodes);
+	return created;
+}
+
+/*
+ * Queue a vertex fetch for @instruction on @node.
+ *
+ * Only operation 0 of the instruction is inspected: its output vector is
+ * the destination, input 0 and input 1 become src[0] and src[1]. Nothing
+ * is done for a NULL instruction or one whose first operation is not
+ * C_OPCODE_VFETCH. When @node already holds ALU work a fresh r600 node for
+ * the same compiler node is started, since a node holds either fetches or
+ * ALU instructions.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error of
+ * r600_shader_find_gpr().
+ */
+static int r600_shader_add_vfetch(struct r600_shader *rshader,
+				struct r600_shader_node *node,
+				struct c_instruction *instruction)
+{
+	struct r600_shader_vfetch *vfetch;
+	struct r600_shader_node *rnode;
+	int r;
+
+	if (instruction == NULL)
+		return 0;
+	if (instruction->op[0].opcode != C_OPCODE_VFETCH)
+		return 0;
+	if (!c_list_empty(&node->alu)) {
+		/* the new node is linked into rshader->nodes, so it is not lost on error */
+		rnode = r600_shader_new_node(rshader, node->node);
+		if (rnode == NULL)
+			return -ENOMEM;
+		node = rnode;
+	}
+	vfetch = calloc(1, sizeof(struct r600_shader_vfetch));
+	if (vfetch == NULL)
+		return -ENOMEM;
+	r = r600_shader_find_gpr(rshader, instruction->op[0].output.vector, 0, &vfetch->dst[0]);
+	if (r == 0)
+		r = r600_shader_find_gpr(rshader, instruction->op[0].input[0].vector, 0, &vfetch->src[0]);
+	if (r == 0)
+		r = r600_shader_find_gpr(rshader, instruction->op[0].input[1].vector, 0, &vfetch->src[1]);
+	if (r) {
+		/* vfetch is not on any list yet, nobody else would free it */
+		free(vfetch);
+		return r;
+	}
+	vfetch->dst[0].chan = C_SWIZZLE_X;
+	vfetch->dst[1].chan = C_SWIZZLE_Y;
+	vfetch->dst[2].chan = C_SWIZZLE_Z;
+	vfetch->dst[3].chan = C_SWIZZLE_W;
+	c_list_add_tail(vfetch, &node->vfetch);
+	/* NOTE(review): r600_shader_node_place() adds 2 per fetch again - confirm intended */
+	node->nslot += 2;
+	return 0;
+}
+
+/*
+ * Create an r600 node for @node and translate each of its instructions.
+ * An instruction whose first operation is C_OPCODE_VFETCH becomes a vertex
+ * fetch, everything else goes to the ALU translator.
+ *
+ * Returns 0 on success, -ENOMEM when the node cannot be allocated, or the
+ * error of the failing translator.
+ */
+static int r600_node_translate(struct r600_shader *rshader, struct c_node *node)
+{
+	struct r600_shader_node *rnode = r600_shader_new_node(rshader, node);
+	struct c_instruction *inst;
+	int err;
+
+	if (rnode == NULL)
+		return -ENOMEM;
+	c_list_for_each(inst, &node->insts) {
+		if (inst->op[0].opcode == C_OPCODE_VFETCH) {
+			err = r600_shader_add_vfetch(rshader, rnode, inst);
+			if (err) {
+				fprintf(stderr, "%s %d vfetch failed\n", __func__, __LINE__);
+				return err;
+			}
+		} else {
+			err = r600_shader_alu_translate(rshader, rnode, inst);
+			if (err) {
+				fprintf(stderr, "%s %d alu failed\n", __func__, __LINE__);
+				return err;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * Translate @node and then, recursively, the nodes on its childs list.
+ * A node with opcode C_OPCODE_END is skipped together with its children.
+ * Returns 0 on success or the first translation error.
+ */
+int r600_shader_translate_rec(struct r600_shader *rshader, struct c_node *node)
+{
+	struct c_node_link *child;
+	int err;
+
+	if (node->opcode == C_OPCODE_END)
+		return 0;
+	err = r600_node_translate(rshader, node);
+	if (err != 0)
+		return err;
+	c_list_for_each(child, &node->childs) {
+		err = r600_shader_translate_rec(rshader, child->node);
+		if (err != 0)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Allocate an ALU group whose five slots are all NOPs and append it to the
+ * alu list of @node. Returns NULL when the allocation fails. @rshader is
+ * unused.
+ */
+static struct r600_shader_alu *r600_shader_insert_alu(struct r600_shader *rshader, struct r600_shader_node *node)
+{
+	struct r600_shader_alu *group = CALLOC_STRUCT(r600_shader_alu);
+	unsigned slot;
+
+	if (group == NULL)
+		return NULL;
+	/* slots 0..4 */
+	for (slot = 0; slot < 5; slot++) {
+		group->alu[slot].inst = INST_NOP;
+		group->alu[slot].opcode = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+	}
+	c_list_add_tail(group, &node->alu);
+	return group;
+}
+
+/*
+ * Translate one compiler instruction into a new ALU group on @node.
+ *
+ * Every operation of the instruction lands in the slot named by its output
+ * swizzle (x, y, z or w). Sources and destination are resolved through
+ * r600_shader_find_gpr(); values of immediate sources are copied into the
+ * group's literal array. When @node already holds vertex fetches a fresh
+ * r600 node for the same compiler node is started first.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL for an
+ * opcode without a hardware instruction or an invalid swizzle, or the
+ * error of r600_shader_find_gpr(). On error the partially filled group
+ * stays on the node's list, where r600_shader_cleanup() frees it.
+ */
+static int r600_shader_alu_translate(struct r600_shader *rshader,
+					struct r600_shader_node *node,
+					struct c_instruction *instruction)
+{
+	struct r600_shader_node *rnode;
+	struct r600_shader_alu *alu;
+	int i, j, r, litteral_lastcomp = -1;
+
+	if (!c_list_empty(&node->vfetch)) {
+		rnode = r600_shader_new_node(rshader, node->node);
+		if (rnode == NULL) {
+			fprintf(stderr, "%s %d new node failed\n", __func__, __LINE__);
+			return -ENOMEM;
+		}
+		node = rnode;
+	}
+
+	/* initialize alu */
+	alu = r600_shader_insert_alu(rshader, node);
+	if (alu == NULL) {
+		fprintf(stderr, "%s %d new alu failed\n", __func__, __LINE__);
+		return -ENOMEM;
+	}
+
+	/* check special operation like lit */
+
+	/* go through operation */
+	for (i = 0; i < instruction->nop; i++) {
+		struct r600_alu_instruction *ainfo;
+		struct r600_instruction_info *iinfo;
+		unsigned comp;
+
+		/* both tables are indexed directly, so range check first */
+		if (instruction->op[i].opcode >= C_OPCODE_LAST) {
+			fprintf(stderr, "%s %d invalid opcode\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		ainfo = &r600_alu_instruction[instruction->op[i].opcode];
+		/* the opcode table uses INST_COUNT for opcodes without an instruction */
+		if (ainfo->instruction >= INST_COUNT) {
+			fprintf(stderr, "%s %d unsupported opcode\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		iinfo = &r600_instruction_info[ainfo->instruction];
+
+		/* check that output is a valid component */
+		comp = instruction->op[i].output.swizzle;
+		switch (comp) {
+		case C_SWIZZLE_X:
+		case C_SWIZZLE_Y:
+		case C_SWIZZLE_Z:
+		case C_SWIZZLE_W:
+			break;
+		case C_SWIZZLE_0:
+		case C_SWIZZLE_1:
+		default:
+			fprintf(stderr, "%s %d invalid output\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		alu->alu[comp].inst = ainfo->instruction;
+		alu->alu[comp].opcode = iinfo->opcode;
+		alu->alu[comp].is_op3 = iinfo->is_op3;
+		for (j = 0; j < instruction->op[i].ninput; j++) {
+			r = r600_shader_find_gpr(rshader, instruction->op[i].input[j].vector,
+					instruction->op[i].input[j].swizzle, &alu->alu[comp].src[j]);
+			if (r) {
+				fprintf(stderr, "%s %d register failed\n", __FILE__, __LINE__);
+				return r;
+			}
+			/* find_gpr accepts a missing vector; it cannot be an immediate */
+			if (instruction->op[i].input[j].vector == NULL)
+				continue;
+			if (instruction->op[i].input[j].vector->file == C_FILE_IMMEDIATE) {
+				r = instruction->op[i].input[j].swizzle;
+				switch (r) {
+				case C_SWIZZLE_X:
+				case C_SWIZZLE_Y:
+				case C_SWIZZLE_Z:
+				case C_SWIZZLE_W:
+					break;
+				case C_SWIZZLE_0:
+				case C_SWIZZLE_1:
+				default:
+					fprintf(stderr, "%s %d invalid input\n", __func__, __LINE__);
+					return -EINVAL;
+				}
+				alu->literal[r] = instruction->op[i].input[j].vector->channel[r]->value;
+				if (r > litteral_lastcomp) {
+					litteral_lastcomp = r;
+				}
+			}
+		}
+		r = r600_shader_find_gpr(rshader, instruction->op[i].output.vector,
+				instruction->op[i].output.swizzle, &alu->alu[comp].dst);
+		if (r) {
+			fprintf(stderr, "%s %d register failed\n", __FILE__, __LINE__);
+			return r;
+		}
+	}
+	/* literals are counted in pairs: x/y need 2, z/w need 4 */
+	switch (litteral_lastcomp) {
+	case 0:
+	case 1:
+		alu->nliteral = 2;
+		break;
+	case 2:
+	case 3:
+		alu->nliteral = 4;
+		break;
+	case -1:
+	default:
+		break;
+	}
+	/*
+	 * Mark the highest used slot as last. Scan all five slots (0..4):
+	 * an operation's slot comes from its output swizzle, so it can be
+	 * higher than the operation count.
+	 */
+	for (i = 4; i >= 0; i--) {
+		if (alu->alu[i].inst != INST_NOP) {
+			alu->alu[i].last = 1;
+			alu->nalu = i + 1;
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Lay out the translated nodes: give every node a control flow id and a
+ * slot address, and compute the shader totals ncf, nslot and ndw.
+ *
+ * NOTE(review): node->nslot is only ever added to here, never reset, and
+ * r600_shader_add_vfetch() has already added 2 per fetch - confirm that
+ * fetches are not counted twice and that this is called once per shader.
+ */
+void r600_shader_node_place(struct r600_shader *rshader)
+{
+	struct r600_shader_node *node, *nnode;
+	struct r600_shader_alu *alu, *nalu;
+	struct r600_shader_vfetch *vfetch, *nvfetch;
+	unsigned cf_id = 0, cf_addr = 0;
+
+	rshader->ncf = 0;
+	rshader->nslot = 0;
+	c_list_for_each_safe(node, nnode, &rshader->nodes) {
+		/* each ALU group needs its used slots plus one slot per literal pair */
+		c_list_for_each_safe(alu, nalu, &node->alu) {
+			node->nslot += alu->nalu;
+			node->nslot += alu->nliteral >> 1;
+		}
+		node->nfetch = 0;
+		/* each fetch is counted as two slots */
+		c_list_for_each_safe(vfetch, nvfetch, &node->vfetch) {
+			node->nslot += 2;
+			node->nfetch += 1;
+		}
+		if (!c_list_empty(&node->vfetch)) {
+			/* fetch node need to be 16 bytes aligned*/
+			/* round cf_addr up to the next even value */
+			cf_addr += 1;
+			cf_addr &= 0xFFFFFFFEUL;
+		}
+		node->cf_id = cf_id;
+		node->cf_addr = cf_addr;
+		cf_id += 2;
+		cf_addr += node->nslot * 2;
+		rshader->ncf++;
+	}
+	rshader->nslot = cf_addr;
+	/* shift every node address by the final cf_id * 2 */
+	c_list_for_each_safe(node, nnode, &rshader->nodes) {
+		node->cf_addr += cf_id * 2;
+	}
+	/* one more control flow entry per output vector */
+	rshader->ncf += rshader->cshader.files[C_FILE_OUTPUT].nvectors;
+	rshader->ndw = rshader->ncf * 2 + rshader->nslot * 2;
+}
+
+/* Placeholder: does nothing with @rshader and always returns 0. */
+int r600_shader_legalize(struct r600_shader *rshader)
+{
+	return 0;
+}
+
+
+/*
+ * Rewrite every SLT operation of @node and of the nodes on its childs list
+ * as SGT with the first two inputs exchanged (a < b is the same as b > a).
+ * @shader is only passed along. Returns 0, or the first non zero result of
+ * a recursive call.
+ */
+static int r600_cshader_legalize_rec(struct c_shader *shader, struct c_node *node)
+{
+	struct c_node_link *child;
+	struct c_instruction *inst;
+	struct c_operand saved;
+	unsigned slot;
+	int err;
+
+	c_list_for_each(inst, &node->insts) {
+		for (slot = 0; slot < inst->nop; slot++) {
+			if (inst->op[slot].opcode != C_OPCODE_SLT)
+				continue;
+			inst->op[slot].opcode = C_OPCODE_SGT;
+			/* swap input 0 and input 1 */
+			saved = inst->op[slot].input[0];
+			inst->op[slot].input[0] = inst->op[slot].input[1];
+			inst->op[slot].input[1] = saved;
+		}
+	}
+	c_list_for_each(child, &node->childs) {
+		err = r600_cshader_legalize_rec(shader, child->node);
+		if (err != 0) {
+			return err;
+		}
+	}
+	return 0;
+}
+
+/* Legalize the whole shader for r600, starting at its entry node. */
+int r600_cshader_legalize(struct c_shader *shader)
+{
+	struct c_node *root = &shader->entry;
+
+	return r600_cshader_legalize_rec(shader, root);
+}
+
+
+/*
+ * Hardware encoding of each r600 ALU instruction. The ALU translator
+ * indexes this table with an enum r600_instruction value, so the rows have
+ * to stay in the order of that enum.
+ *
+ * Columns: instruction, hardware opcode, and two flags. The last flag is 1
+ * exactly for the rows that use an OP3 opcode.
+ * NOTE(review): the third column presumably marks instructions limited to
+ * the transcendental unit - confirm against struct r600_instruction_info.
+ */
+struct r600_instruction_info r600_instruction_info[] = {
+	{INST_ADD,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD,			0, 0},
+	{INST_MUL,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL,			0, 0},
+	{INST_MUL_IEEE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE,		0, 0},
+	{INST_MAX,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX,			0, 0},
+	{INST_MIN,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN,			0, 0},
+	{INST_MAX_DX10,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_DX10,		0, 0},
+	{INST_MIN_DX10,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_DX10,		0, 0},
+	{INST_SETE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE,			0, 0},
+	{INST_SETGT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT,			0, 0},
+	{INST_SETGE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE,			0, 0},
+	{INST_SETNE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE,			0, 0},
+	{INST_SETE_DX10,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_DX10,		0, 0},
+	{INST_SETGT_DX10,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_DX10,		0, 0},
+	{INST_SETGE_DX10,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_DX10,		0, 0},
+	{INST_SETNE_DX10,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_DX10,		0, 0},
+	{INST_FRACT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT,			0, 0},
+	{INST_TRUNC,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC,			0, 0},
+	{INST_CEIL,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL,			0, 0},
+	{INST_RNDNE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE,			0, 0},
+	{INST_FLOOR,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR,			0, 0},
+	{INST_MOVA,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA,			0, 0},
+	{INST_MOVA_FLOOR,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR,		0, 0},
+	{INST_MOVA_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT,		0, 0},
+	{INST_MOV,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV,			0, 0},
+	{INST_NOP,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP,			0, 0},
+	{INST_PRED_SETGT_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT,		0, 0},
+	{INST_PRED_SETGE_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT,		0, 0},
+	{INST_PRED_SETE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE,		0, 0},
+	{INST_PRED_SETGT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT,		0, 0},
+	{INST_PRED_SETGE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE,		0, 0},
+	{INST_PRED_SETNE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE,		0, 0},
+	{INST_PRED_SET_INV,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV,		0, 0},
+	{INST_PRED_SET_POP,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP,		0, 0},
+	{INST_PRED_SET_CLR,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR,		0, 0},
+	{INST_PRED_SET_RESTORE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE,	0, 0},
+	{INST_PRED_SETE_PUSH,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH,		0, 0},
+	{INST_PRED_SETGT_PUSH,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH,		0, 0},
+	{INST_PRED_SETGE_PUSH,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH,		0, 0},
+	{INST_PRED_SETNE_PUSH,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH,		0, 0},
+	{INST_KILLE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE,			0, 0},
+	{INST_KILLGT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT,			0, 0},
+	{INST_KILLGE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE,			0, 0},
+	{INST_KILLNE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE,			0, 0},
+	{INST_AND_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT,			0, 0},
+	{INST_OR_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT,			0, 0},
+	{INST_XOR_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT,			0, 0},
+	{INST_NOT_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT,			0, 0},
+	{INST_ADD_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT,			0, 0},
+	{INST_SUB_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT,			0, 0},
+	{INST_MAX_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT,			0, 0},
+	{INST_MIN_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT,			0, 0},
+	{INST_MAX_UINT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT,		0, 0},
+	{INST_MIN_UINT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT,		0, 0},
+	{INST_SETE_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT,		0, 0},
+	{INST_SETGT_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT,		0, 0},
+	{INST_SETGE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT,		0, 0},
+	{INST_SETNE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT,		0, 0},
+	{INST_SETGT_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT,		0, 0},
+	{INST_SETGE_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT,		0, 0},
+	{INST_KILLGT_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT,		0, 0},
+	{INST_KILLGE_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT,		0, 0},
+	{INST_PRED_SETE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT,		0, 0},
+	{INST_PRED_SETGT_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT,		0, 0},
+	{INST_PRED_SETGE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT,		0, 0},
+	{INST_PRED_SETNE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT,		0, 0},
+	{INST_KILLE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT,		0, 0},
+	{INST_KILLGT_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT,		0, 0},
+	{INST_KILLGE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT,		0, 0},
+	{INST_KILLNE_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT,		0, 0},
+	{INST_PRED_SETE_PUSH_INT,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT,	0, 0},
+	{INST_PRED_SETGT_PUSH_INT,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT,	0, 0},
+	{INST_PRED_SETGE_PUSH_INT,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT,	0, 0},
+	{INST_PRED_SETNE_PUSH_INT,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT,	0, 0},
+	{INST_PRED_SETLT_PUSH_INT,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT,	0, 0},
+	{INST_PRED_SETLE_PUSH_INT,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT,	0, 0},
+	{INST_DOT4,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4,			0, 0},
+	{INST_DOT4_IEEE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE,		0, 0},
+	{INST_CUBE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE,			0, 0},
+	{INST_MAX4,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4,			0, 0},
+	{INST_MOVA_GPR_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,		0, 0},
+	{INST_EXP_IEEE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE,		1, 0},
+	{INST_LOG_CLAMPED,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED,		1, 0},
+	{INST_LOG_IEEE,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE,		1, 0},
+	{INST_RECIP_CLAMPED,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED,		1, 0},
+	{INST_RECIP_FF,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF,		1, 0},
+	{INST_RECIP_IEEE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE,		1, 0},
+	{INST_RECIPSQRT_CLAMPED,	V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED,	1, 0},
+	{INST_RECIPSQRT_FF,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF,		1, 0},
+	{INST_RECIPSQRT_IEEE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE,		1, 0},
+	{INST_SQRT_IEEE,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE,		1, 0},
+	{INST_FLT_TO_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT,		1, 0},
+	{INST_INT_TO_FLT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT,		1, 0},
+	{INST_UINT_TO_FLT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT,		1, 0},
+	{INST_SIN,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN,			1, 0},
+	{INST_COS,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS,			1, 0},
+	{INST_ASHR_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT,		1, 0},
+	{INST_LSHR_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT,		1, 0},
+	{INST_LSHL_INT,			V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT,		1, 0},
+	{INST_MULLO_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT,		1, 0},
+	{INST_MULHI_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT,		1, 0},
+	{INST_MULLO_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT,		1, 0},
+	{INST_MULHI_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT,		1, 0},
+	{INST_RECIP_INT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT,		1, 0},
+	{INST_RECIP_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT,		1, 0},
+	{INST_FLT_TO_UINT,		V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT,		1, 0},
+	{INST_MUL_LIT,			V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT,			1, 1},
+	{INST_MUL_LIT_M2,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2,		1, 1},
+	{INST_MUL_LIT_M4,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4,		1, 1},
+	{INST_MUL_LIT_D2,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2,		1, 1},
+	{INST_MULADD,			V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD,			0, 1},
+	{INST_MULADD_M2,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_M2,		0, 1},
+	{INST_MULADD_M4,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_M4,		0, 1},
+	{INST_MULADD_D2,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_D2,		0, 1},
+	{INST_MULADD_IEEE,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE,		0, 1},
+	{INST_MULADD_IEEE_M2,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_M2,		0, 1},
+	{INST_MULADD_IEEE_M4,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_M4,		0, 1},
+	{INST_MULADD_IEEE_D2,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_D2,		0, 1},
+	{INST_CNDE,			V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE,			0, 1},
+	{INST_CNDGT,			V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT,			0, 1},
+	{INST_CNDGE,			V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE,			0, 1},
+	{INST_CNDE_INT,			V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT,		0, 1},
+	{INST_CNDGT_INT,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT,		0, 1},
+	{INST_CNDGE_INT,		V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT,		0, 1},
+};
+
+struct r600_alu_instruction r600_alu_instruction[C_OPCODE_LAST] = { /* pairs a C_OPCODE_* with an INST_* id; bare numbers sit at their own array index — NOTE(review): presumably the table is indexed by opcode value, confirm */
+	{C_OPCODE_NOP,		INST_NOP},
+	{C_OPCODE_MOV,		INST_MOV},
+	{C_OPCODE_LIT,		INST_NOP},
+	{C_OPCODE_RCP,		INST_RECIP_IEEE},
+	{C_OPCODE_RSQ,		INST_RECIPSQRT_IEEE},
+	{C_OPCODE_EXP,		INST_EXP_IEEE},
+	{C_OPCODE_LOG,		INST_LOG_IEEE},
+	{C_OPCODE_MUL,		INST_MUL},
+	{C_OPCODE_ADD,		INST_ADD},
+	{C_OPCODE_DP3,		INST_DOT4},
+	{C_OPCODE_DP4,		INST_DOT4},
+	{C_OPCODE_DST,		INST_NOP},
+	{C_OPCODE_MIN,		INST_MIN},
+	{C_OPCODE_MAX,		INST_MAX},
+	{C_OPCODE_SLT,		INST_NOP},
+	{C_OPCODE_SGE,		INST_NOP},
+	{C_OPCODE_MAD,		INST_MULADD},
+	{C_OPCODE_SUB,		INST_COUNT}, /* NOTE(review): INST_COUNT instead of a real INST_* — presumably marks an opcode lowered specially; confirm */
+	{C_OPCODE_LRP,		INST_NOP},
+	{C_OPCODE_CND,		INST_NOP},
+	{20,			INST_NOP},
+	{C_OPCODE_DP2A,		INST_NOP},
+	{22,			INST_NOP},
+	{23,			INST_NOP},
+	{C_OPCODE_FRC,		INST_NOP},
+	{C_OPCODE_CLAMP,	INST_NOP},
+	{C_OPCODE_FLR,		INST_NOP},
+	{C_OPCODE_ROUND,	INST_NOP},
+	{C_OPCODE_EX2,		INST_NOP},
+	{C_OPCODE_LG2,		INST_NOP},
+	{C_OPCODE_POW,		INST_NOP},
+	{C_OPCODE_XPD,		INST_NOP},
+	{32,			INST_NOP},
+	{C_OPCODE_ABS,		INST_COUNT}, /* NOTE(review): same INST_COUNT marker as C_OPCODE_SUB; confirm meaning */
+	{C_OPCODE_RCC,		INST_NOP},
+	{C_OPCODE_DPH,		INST_NOP},
+	{C_OPCODE_COS,		INST_COS},
+	{C_OPCODE_DDX,		INST_NOP},
+	{C_OPCODE_DDY,		INST_NOP},
+	{C_OPCODE_KILP,		INST_NOP},
+	{C_OPCODE_PK2H,		INST_NOP},
+	{C_OPCODE_PK2US,	INST_NOP},
+	{C_OPCODE_PK4B,		INST_NOP},
+	{C_OPCODE_PK4UB,	INST_NOP},
+	{C_OPCODE_RFL,		INST_NOP},
+	{C_OPCODE_SEQ,		INST_NOP},
+	{C_OPCODE_SFL,		INST_NOP},
+	{C_OPCODE_SGT,		INST_SETGT},
+	{C_OPCODE_SIN,		INST_SIN},
+	{C_OPCODE_SLE,		INST_NOP},
+	{C_OPCODE_SNE,		INST_NOP},
+	{C_OPCODE_STR,		INST_NOP},
+	{C_OPCODE_TEX,		INST_NOP},
+	{C_OPCODE_TXD,		INST_NOP},
+	{C_OPCODE_TXP,		INST_NOP},
+	{C_OPCODE_UP2H,		INST_NOP},
+	{C_OPCODE_UP2US,	INST_NOP},
+	{C_OPCODE_UP4B,		INST_NOP},
+	{C_OPCODE_UP4UB,	INST_NOP},
+	{C_OPCODE_X2D,		INST_NOP},
+	{C_OPCODE_ARA,		INST_NOP},
+	{C_OPCODE_ARR,		INST_NOP},
+	{C_OPCODE_BRA,		INST_NOP},
+	{C_OPCODE_CAL,		INST_NOP},
+	{C_OPCODE_RET,		INST_NOP},
+	{C_OPCODE_SSG,		INST_NOP},
+	{C_OPCODE_CMP,		INST_NOP},
+	{C_OPCODE_SCS,		INST_NOP},
+	{C_OPCODE_TXB,		INST_NOP},
+	{C_OPCODE_NRM,		INST_NOP},
+	{C_OPCODE_DIV,		INST_NOP},
+	{C_OPCODE_DP2,		INST_NOP},
+	{C_OPCODE_TXL,		INST_NOP},
+	{C_OPCODE_BRK,		INST_NOP},
+	{C_OPCODE_IF,		INST_NOP},
+	{C_OPCODE_BGNFOR,	INST_NOP},
+	{C_OPCODE_REP,		INST_NOP},
+	{C_OPCODE_ELSE,		INST_NOP},
+	{C_OPCODE_ENDIF,	INST_NOP},
+	{C_OPCODE_ENDFOR,	INST_NOP},
+	{C_OPCODE_ENDREP,	INST_NOP},
+	{C_OPCODE_PUSHA,	INST_NOP},
+	{C_OPCODE_POPA,		INST_NOP},
+	{C_OPCODE_CEIL,		INST_NOP},
+	{C_OPCODE_I2F,		INST_NOP},
+	{C_OPCODE_NOT,		INST_NOP},
+	{C_OPCODE_TRUNC,	INST_NOP},
+	{C_OPCODE_SHL,		INST_NOP},
+	{88,			INST_NOP},
+	{C_OPCODE_AND,		INST_NOP},
+	{C_OPCODE_OR,		INST_NOP},
+	{C_OPCODE_MOD,		INST_NOP},
+	{C_OPCODE_XOR,		INST_NOP},
+	{C_OPCODE_SAD,		INST_NOP},
+	{C_OPCODE_TXF,		INST_NOP},
+	{C_OPCODE_TXQ,		INST_NOP},
+	{C_OPCODE_CONT,		INST_NOP},
+	{C_OPCODE_EMIT,		INST_NOP},
+	{C_OPCODE_ENDPRIM,	INST_NOP},
+	{C_OPCODE_BGNLOOP,	INST_NOP},
+	{C_OPCODE_BGNSUB,	INST_NOP},
+	{C_OPCODE_ENDLOOP,	INST_NOP},
+	{C_OPCODE_ENDSUB,	INST_NOP},
+	{103,			INST_NOP},
+	{104,			INST_NOP},
+	{105,			INST_NOP},
+	{106,			INST_NOP},
+	{107,			INST_NOP},
+	{108,			INST_NOP},
+	{109,			INST_NOP},
+	{110,			INST_NOP},
+	{111,			INST_NOP},
+	{C_OPCODE_NRM4,		INST_NOP},
+	{C_OPCODE_CALLNZ,	INST_NOP},
+	{C_OPCODE_IFC,		INST_NOP},
+	{C_OPCODE_BREAKC,	INST_NOP},
+	{C_OPCODE_KIL,		INST_NOP},
+	{C_OPCODE_END,		INST_NOP},
+	{118,			INST_NOP},
+	{C_OPCODE_F2I,		INST_NOP},
+	{C_OPCODE_IDIV,		INST_NOP},
+	{C_OPCODE_IMAX,		INST_NOP},
+	{C_OPCODE_IMIN,		INST_NOP},
+	{C_OPCODE_INEG,		INST_NOP},
+	{C_OPCODE_ISGE,		INST_NOP},
+	{C_OPCODE_ISHR,		INST_NOP},
+	{C_OPCODE_ISLT,		INST_NOP},
+	{C_OPCODE_F2U,		INST_NOP},
+	{C_OPCODE_U2F,		INST_NOP},
+	{C_OPCODE_UADD,		INST_NOP},
+	{C_OPCODE_UDIV,		INST_NOP},
+	{C_OPCODE_UMAD,		INST_NOP},
+	{C_OPCODE_UMAX,		INST_NOP},
+	{C_OPCODE_UMIN,		INST_NOP},
+	{C_OPCODE_UMOD,		INST_NOP},
+	{C_OPCODE_UMUL,		INST_NOP},
+	{C_OPCODE_USEQ,		INST_NOP},
+	{C_OPCODE_USGE,		INST_NOP},
+	{C_OPCODE_USHR,		INST_NOP},
+	{C_OPCODE_USLT,		INST_NOP},
+	{C_OPCODE_USNE,		INST_NOP},
+	{C_OPCODE_SWITCH,	INST_NOP},
+	{C_OPCODE_CASE,		INST_NOP},
+	{C_OPCODE_DEFAULT,	INST_NOP},
+	{C_OPCODE_ENDSWITCH,	INST_NOP},
+	{C_OPCODE_VFETCH,	INST_NOP},
+	{C_OPCODE_ENTRY,	INST_NOP},
+	{C_OPCODE_ARL,		INST_NOP},
+};
diff --git a/src/gallium/drivers/r600/r600_compiler_r700.c b/src/gallium/drivers/r600/r600_compiler_r700.c
new file mode 100644
index 0000000000..809a57ae5c
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_compiler_r700.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include "r600_context.h"
+#include "r700_sq.h"
+
+static int r700_shader_cf_node_bytecode(struct r600_shader *rshader,
+					struct r600_shader_node *rnode,
+					unsigned *cid)
+{
+	unsigned slot = *cid; /* index of the first of the two CF dwords written for this node */
+
+	if (!rnode->nfetch) { /* no vertex fetch in this node: emit an ALU CF instruction */
+		rshader->bcode[slot] = S_SQ_CF_ALU_WORD0_ADDR(rnode->cf_addr >> 1);
+		rshader->bcode[slot + 1] = S_SQ_CF_ALU_WORD1_CF_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU) |
+					S_SQ_CF_ALU_WORD1_BARRIER(1) |
+					S_SQ_CF_ALU_WORD1_COUNT(rnode->nslot - 1);
+	} else { /* otherwise emit a VTX CF instruction covering the nfetch fetches */
+		rshader->bcode[slot] = S_SQ_CF_WORD0_ADDR(rnode->cf_addr >> 1);
+		rshader->bcode[slot + 1] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
+					S_SQ_CF_WORD1_BARRIER(1) |
+					S_SQ_CF_WORD1_COUNT(rnode->nfetch - 1);
+	}
+	*cid = slot + 2; /* hand the next free dword index back to the caller */
+	return 0;
+}
+
+static int r700_shader_cf_output_bytecode(struct r600_shader *rshader,
+						struct c_vector *v,
+						unsigned *cid,
+						unsigned end)
+{
+	struct r600_shader_operand out;
+	unsigned id = *cid;
+	int r;
+
+	r = r600_shader_find_gpr(rshader, v, 0, &out); /* out.sel receives the GPR exported below */
+	if (r)
+		return r;
+	rshader->bcode[id + 0] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(out.sel) | /* word 0: source register; type and array base are OR-ed in by the switch */
+				S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(3);
+	rshader->bcode[id + 1] = S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(0) | /* word 1: X,Y,Z,W selects 0..3 */
+		S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(1) |
+		S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(2) |
+		S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(3) |
+		S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(1) |
+		S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE) |
+		S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(end); /* end is supplied by the caller */
+	switch (v->name) {
+	case C_SEMANTIC_POSITION:
+		rshader->bcode[id + 0] |= S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(60) | /* position export uses array base 60 */
+			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
+		break;
+	case C_SEMANTIC_COLOR:
+		if (rshader->cshader.type == C_PROGRAM_TYPE_VS) { /* vertex shader colour: recorded in output[] and exported as a parameter */
+			rshader->output[rshader->noutput].gpr = out.sel;
+			rshader->output[rshader->noutput].sid = v->sid;
+			rshader->output[rshader->noutput].name = v->name;
+			rshader->bcode[id + 0] |= S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(rshader->noutput++) | /* noutput is post-incremented inside the macro argument */
+				S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
+		} else { /* any other program type: pixel export at array base 0 */
+			rshader->bcode[id + 0] |= S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(0) |
+				S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
+		}
+		break;
+	case C_SEMANTIC_GENERIC: /* same bookkeeping as the vertex-shader colour case */
+		rshader->output[rshader->noutput].gpr = out.sel;
+		rshader->output[rshader->noutput].sid = v->sid;
+		rshader->output[rshader->noutput].name = v->name;
+		rshader->bcode[id + 0] |= S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(rshader->noutput++) | /* NOTE(review): no bound check on output[] is visible here */
+			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
+		break;
+	default: /* every other semantic is rejected; *cid is left unchanged */
+		fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	*cid = id + 2; /* two dwords were written */
+	return 0;
+}
+
+static int r700_shader_alu_bytecode(struct r600_shader *rshader,
+					struct r600_shader_node *rnode,
+					struct r600_shader_inst *alu,
+					unsigned *cid)
+{
+	unsigned id = *cid;
+
+	/* Emit the two instruction dwords of one ALU slot at *cid and advance
+	 * *cid past them; always returns 0 (rnode is not used).
+	 * WORD0 is encoded identically for two- and three-operand forms, so
+	 * it is written once and src0/src1 negation is kept for both forms.
+	 * don't replace gpr by pv or ps for destination register
+	 */
+	rshader->bcode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
+				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
+				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
+				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
+				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
+				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+				S_SQ_ALU_WORD0_LAST(alu->last);
+	if (alu->is_op3) { /* three-operand form: third source with its own negate bit */
+		rshader->bcode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
+					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
+					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
+					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
+					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
+					S_SQ_ALU_WORD1_OP3_ALU_INST(alu->opcode) |
+					S_SQ_ALU_WORD1_BANK_SWIZZLE(0);
+	} else { /* two-operand form: abs modifiers, write mask always set */
+		rshader->bcode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
+					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
+					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
+					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
+					S_SQ_ALU_WORD1_OP2_WRITE_MASK(1) |
+					S_SQ_ALU_WORD1_OP2_ALU_INST(alu->opcode) |
+					S_SQ_ALU_WORD1_BANK_SWIZZLE(0);
+	}
+	*cid = id;
+	return 0;
+}
+
+int r700_shader_translate(struct r600_shader *rshader)
+{
+	struct c_shader *shader = &rshader->cshader;
+	struct r600_shader_node *rnode;
+	struct r600_shader_vfetch *vfetch;
+	struct r600_shader_alu *alu;
+	struct c_vector *v;
+	unsigned id, i, end;
+	int r;
+
+	r = r600_shader_register(rshader); /* step 1: register allocation */
+	if (r) {
+		fprintf(stderr, "%s %d register allocation failed\n", __FILE__, __LINE__);
+		return r;
+	}
+	r = r600_shader_translate_rec(rshader, &shader->entry); /* step 2: translate starting from the entry node */
+	if (r) {
+		fprintf(stderr, "%s %d translation failed\n", __FILE__, __LINE__);
+		return r;
+	}
+	r = r600_shader_legalize(rshader); /* step 3: legalize */
+	if (r) {
+		fprintf(stderr, "%s %d legalize failed\n", __FILE__, __LINE__);
+		return r;
+	}
+	r600_shader_node_place(rshader); /* NOTE(review): presumably assigns each node's cf_addr and rshader->ndw — confirm */
+	rshader->bcode = malloc(rshader->ndw * 4); /* ndw dwords of 4 bytes each */
+	if (rshader->bcode == NULL)
+		return -ENOMEM;
+	c_list_for_each(rnode, &rshader->nodes) { /* pass 1: each node's fetch/ALU words, starting at its cf_addr */
+		id = rnode->cf_addr;
+		c_list_for_each(vfetch, &rnode->vfetch) {
+			r = r600_shader_vfetch_bytecode(rshader, rnode, vfetch, &id);
+			if (r)
+				return r; /* NOTE(review): bcode stays allocated on this path — presumably freed with the shader; confirm */
+		}
+		c_list_for_each(alu, &rnode->alu) {
+			for (i = 0; i < alu->nalu; i++) {
+				r = r700_shader_alu_bytecode(rshader, rnode, &alu->alu[i], &id);
+				if (r)
+					return r;
+			}
+			for (i = 0; i < alu->nliteral; i++) { /* literal dwords follow the group's ALU words */
+				rshader->bcode[id++] = alu->literal[i];
+			}
+		}
+	}
+	id = 0; /* pass 2: control-flow words are written from the start of bcode */
+	c_list_for_each(rnode, &rshader->nodes) {
+		r = r700_shader_cf_node_bytecode(rshader, rnode, &id);
+		if (r)
+			return r;
+	}
+	c_list_for_each(v, &rshader->cshader.files[C_FILE_OUTPUT].vectors) { /* export words follow the per-node CF words */
+		end = 0;
+		if (v->next == &rshader->cshader.files[C_FILE_OUTPUT].vectors) /* last output vector: flag end of program */
+			end = 1;
+		r = r700_shader_cf_output_bytecode(rshader, v, &id, end);
+		if (r)
+			return r;
+	}
+	c_list_for_each(v, &rshader->cshader.files[C_FILE_INPUT].vectors) { /* record inputs: GPR numbers follow list order */
+		rshader->input[rshader->ninput].gpr = rshader->ninput;
+		rshader->input[rshader->ninput].sid = v->sid;
+		rshader->input[rshader->ninput].name = v->name;
+		rshader->ninput++;
+	}
+	return 0;
+}
diff --git a/src/gallium/drivers/r600/r600_compiler_tgsi.c b/src/gallium/drivers/r600/r600_compiler_tgsi.c
new file mode 100644
index 0000000000..172cf154a3
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_compiler_tgsi.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_scan.h>
+#include "r600_shader.h"
+#include "r600_context.h"
+
+struct tgsi_shader { /* working state for one TGSI -> c_shader translation */
+	struct c_vector				**v[TGSI_FILE_COUNT]; /* per TGSI file: table of vectors indexed by register index */
+	struct tgsi_shader_info			info; /* filled by tgsi_scan_shader() */
+	struct tgsi_parse_context		parser; /* token iterator; FullToken holds the current token */
+	const struct tgsi_token			*tokens; /* TGSI token stream being translated */
+	struct c_shader				*shader; /* shader being built */
+	struct c_node				*node; /* node that receives translated instructions */
+};
+
+static unsigned tgsi_file_to_c_file(unsigned file);
+static unsigned tgsi_sname_to_c_sname(unsigned sname);
+static int tgsi_opcode_to_c_opcode(unsigned opcode, unsigned *copcode);
+
+static int tgsi_shader_init(struct tgsi_shader *ts,
+				const struct tgsi_token *tokens,
+				struct c_shader *shader)
+{ /* Scan the tokens, start the parser and allocate one vector table per used TGSI file; returns 0 or -ENOMEM. */
+	int i;
+
+	ts->shader = shader;
+	ts->tokens = tokens;
+	tgsi_scan_shader(ts->tokens, &ts->info); /* fills info.file_count[] used for sizing below */
+	tgsi_parse_init(&ts->parser, ts->tokens);
+	/* ts->v[] has TGSI_FILE_COUNT slots: clear every one of them first so
+	 * tgsi_shader_destroy() can free() each slot even after a failure */
+	for (i = 0; i < TGSI_FILE_COUNT; i++)
+		ts->v[i] = NULL;
+	for (i = 0; i < TGSI_FILE_COUNT; i++) {
+		if (ts->info.file_count[i] > 0) {
+			ts->v[i] = calloc(ts->info.file_count[i], sizeof(void*)); /* zeroed: unused entries stay NULL */
+			if (ts->v[i] == NULL) {
+				fprintf(stderr, "%s:%d unsupported %d %d\n", __func__, __LINE__, i, ts->info.file_count[i]);
+				return -ENOMEM; /* tables allocated so far are left for tgsi_shader_destroy() */
+			}
+		}
+	}
+	return 0;
+}
+
+static void tgsi_shader_destroy(struct tgsi_shader *ts)
+{
+	int file = TGSI_FILE_COUNT;
+
+	/* release every per-file vector table, then the parser state */
+	while (file-- > 0)
+		free(ts->v[file]);
+	tgsi_parse_free(&ts->parser);
+}
+
+static int ntransform_declaration(struct tgsi_shader *ts)
+{ /* Create one c_vector per register of the current declaration token; returns 0, -EINVAL or -ENOMEM. */
+	struct tgsi_full_declaration *fd = &ts->parser.FullToken.FullDeclaration;
+	struct c_vector *v;
+	unsigned file;
+	unsigned name;
+	int sid;
+	int i;
+
+	if (fd->Declaration.Dimension) { /* dimensioned declarations are not handled */
+		fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	for (i = fd->Range.First ; i <= fd->Range.Last; i++) {
+		sid = i; /* defaults: generic semantic, semantic index = register index */
+		name = C_SEMANTIC_GENERIC;
+		file = tgsi_file_to_c_file(fd->Declaration.File);
+		if (file == C_FILE_NULL) { /* the converter returns a C_FILE_* value, so compare with C_FILE_NULL */
+			fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		if (fd->Declaration.Semantic) { /* explicit semantic overrides both; the whole range shares Semantic.Index */
+			name = tgsi_sname_to_c_sname(fd->Semantic.Name);
+			sid = fd->Semantic.Index;
+		}
+		v = c_shader_vector_new(ts->shader, file, name, sid);
+		if (v == NULL) {
+			fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+			return -ENOMEM;
+		}
+		ts->v[fd->Declaration.File][i] = v; /* stored under the TGSI file/index for later operand lookup */
+	}
+	return 0;
+}
+
+static int ntransform_immediate(struct tgsi_shader *ts)
+{ /* Create the c_vector for the current immediate token; returns 0, -EINVAL or -ENOMEM. */
+	struct tgsi_full_immediate *fd = &ts->parser.FullToken.FullImmediate;
+	struct c_vector *v;
+	unsigned i, slot, nimm = ts->info.file_count[TGSI_FILE_IMMEDIATE];
+
+	/* immediates are numbered in stream order: claim the first free slot */
+	for (slot = 0; slot < nimm; slot++) {
+		if (ts->v[TGSI_FILE_IMMEDIATE][slot] == NULL)
+			break;
+	}
+	if (fd->Immediate.DataType != TGSI_IMM_FLOAT32 || slot >= nimm) { /* non-float data, or more immediates than were scanned */
+		fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	v = c_shader_vector_new(ts->shader, C_FILE_IMMEDIATE, C_SEMANTIC_GENERIC, 0);
+	if (v == NULL) {
+		fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+		return -ENOMEM;
+	}
+	for (i = 0; i < 4; i++)
+		v->channel[i]->value = fd->u[i].Uint; /* copied as the raw 32-bit pattern */
+	ts->v[TGSI_FILE_IMMEDIATE][slot] = v; /* IMM[slot] can now be resolved by instruction operands */
+	return 0;
+}
+
+static int ntransform_instruction(struct tgsi_shader *ts)
+{ /* Translate the current TGSI instruction into one c_instruction with four per-channel ops appended to ts->node. */
+	struct tgsi_full_instruction *fi = &ts->parser.FullToken.FullInstruction;
+	struct c_shader *shader = ts->shader;
+	struct c_instruction instruction;
+	unsigned opcode;
+	int i, j, r;
+
+	if (fi->Instruction.NumDstRegs > 1) { /* the checks below reject every feature this translator does not handle */
+		fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	if (fi->Instruction.Saturate) {
+		fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	if (fi->Instruction.Predicate) {
+		fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	if (fi->Instruction.Label) {
+		fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	if (fi->Instruction.Texture) {
+		fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+	for (i = 0; i < fi->Instruction.NumSrcRegs; i++) { /* NOTE(review): Register.Negate is neither rejected here nor forwarded below */
+		if (fi->Src[i].Register.Indirect ||
+			fi->Src[i].Register.Dimension ||
+			fi->Src[i].Register.Absolute) {
+			fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+	}
+	for (i = 0; i < fi->Instruction.NumDstRegs; i++) {
+		if (fi->Dst[i].Register.Indirect || fi->Dst[i].Register.Dimension) {
+			fprintf(stderr, "%s %d unsupported\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+	}
+	r = tgsi_opcode_to_c_opcode(fi->Instruction.Opcode, &opcode);
+	if (r) {
+		fprintf(stderr, "%s:%d unsupported\n", __func__, __LINE__);
+		return r;
+	}
+	if (opcode == C_OPCODE_END) { /* END emits no instruction: it links the current node to the shader's end node */
+		return c_node_cfg_link(ts->node, &shader->end);
+	}
+	/* FIXME add flow instruction handling */
+	memset(&instruction, 0, sizeof(struct c_instruction));
+	instruction.nop = 0;
+	for (j = 0; j < 4; j++) { /* one scalar op per destination channel: j = 0..3 is X, Y, Z, W */
+		instruction.op[instruction.nop].opcode = opcode;
+		instruction.op[instruction.nop].ninput = fi->Instruction.NumSrcRegs;
+		for (i = 0; i < fi->Instruction.NumSrcRegs; i++) {
+			instruction.op[instruction.nop].input[i].vector = ts->v[fi->Src[i].Register.File][fi->Src[i].Register.Index];
+			switch (j) { /* pick the source swizzle that feeds channel j */
+			case 0:
+				instruction.op[instruction.nop].input[i].swizzle = fi->Src[i].Register.SwizzleX;
+				break;
+			case 1:
+				instruction.op[instruction.nop].input[i].swizzle = fi->Src[i].Register.SwizzleY;
+				break;
+			case 2:
+				instruction.op[instruction.nop].input[i].swizzle = fi->Src[i].Register.SwizzleZ;
+				break;
+			case 3:
+				instruction.op[instruction.nop].input[i].swizzle = fi->Src[i].Register.SwizzleW;
+				break;
+			default:
+				return -EINVAL;
+			}
+		}
+		instruction.op[instruction.nop].output.vector = ts->v[fi->Dst[0].Register.File][fi->Dst[0].Register.Index]; /* NOTE(review): Dst[0] is read even when NumDstRegs is 0 */
+		switch (j) { /* WriteMask holds one bit per channel (X=0x1, Y=0x2, Z=0x4, W=0x8); masked-off channels get C_SWIZZLE_D */
+		case 0:
+			instruction.op[instruction.nop].output.swizzle = (fi->Dst[0].Register.WriteMask & 0x1) ? C_SWIZZLE_X : C_SWIZZLE_D;
+			break;
+		case 1:
+			instruction.op[instruction.nop].output.swizzle = (fi->Dst[0].Register.WriteMask & 0x2) ? C_SWIZZLE_Y : C_SWIZZLE_D;
+			break;
+		case 2:
+			instruction.op[instruction.nop].output.swizzle = (fi->Dst[0].Register.WriteMask & 0x4) ? C_SWIZZLE_Z : C_SWIZZLE_D;
+			break;
+		case 3:
+			instruction.op[instruction.nop].output.swizzle = (fi->Dst[0].Register.WriteMask & 0x8) ? C_SWIZZLE_W : C_SWIZZLE_D;
+			break;
+		default:
+			return -EINVAL;
+		}
+		instruction.nop++;
+	}
+	return c_node_add_new_instruction(ts->node, &instruction);
+}
+
+int c_shader_from_tgsi(struct c_shader *shader, unsigned type,
+			const struct tgsi_token *tokens)
+{
+	struct tgsi_shader ts;
+	int err = 0;
+
+	/* Feed each TGSI token to its ntransform_*() helper; token types
+	 * without a helper are rejected with -EINVAL. The first failure
+	 * destroys the partially built shader and is returned as is.
+	 */
+	c_shader_init(shader, type);
+	err = tgsi_shader_init(&ts, tokens, shader);
+	if (err)
+		goto fail;
+	ts.shader = shader;
+	ts.node = &shader->entry;
+	while (!tgsi_parse_end_of_tokens(&ts.parser)) {
+		tgsi_parse_token(&ts.parser);
+		switch (ts.parser.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			err = ntransform_immediate(&ts);
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+			err = ntransform_declaration(&ts);
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			err = ntransform_instruction(&ts);
+			break;
+		default:
+			err = -EINVAL;
+			break;
+		}
+		if (err)
+			goto fail;
+	}
+	tgsi_shader_destroy(&ts);
+	return 0;
+fail:
+	c_shader_destroy(shader);
+	tgsi_shader_destroy(&ts);
+	return err;
+}
+
+static unsigned tgsi_file_to_c_file(unsigned file)
+{
+	switch (file) { /* map a TGSI_FILE_* register file to the C_FILE_* value of the same name */
+	case TGSI_FILE_CONSTANT:
+		return C_FILE_CONSTANT;
+	case TGSI_FILE_INPUT:
+		return C_FILE_INPUT;
+	case TGSI_FILE_OUTPUT:
+		return C_FILE_OUTPUT;
+	case TGSI_FILE_TEMPORARY:
+		return C_FILE_TEMPORARY;
+	case TGSI_FILE_SAMPLER:
+		return C_FILE_SAMPLER;
+	case TGSI_FILE_ADDRESS:
+		return C_FILE_ADDRESS;
+	case TGSI_FILE_IMMEDIATE:
+		return C_FILE_IMMEDIATE;
+	case TGSI_FILE_PREDICATE:
+		return C_FILE_PREDICATE;
+	case TGSI_FILE_SYSTEM_VALUE:
+		return C_FILE_SYSTEM_VALUE;
+	case TGSI_FILE_NULL:
+		return C_FILE_NULL;
+	default: /* unknown file: logged, then reported as C_FILE_NULL like TGSI_FILE_NULL */
+		fprintf(stderr, "%s:%d unsupported file %d\n", __func__, __LINE__, file);
+		return C_FILE_NULL;
+	}
+}
+
+static unsigned tgsi_sname_to_c_sname(unsigned sname)
+{
+	switch (sname) { /* map a TGSI_SEMANTIC_* name to the C_SEMANTIC_* value of the same name */
+	case TGSI_SEMANTIC_POSITION:
+		return C_SEMANTIC_POSITION;
+	case TGSI_SEMANTIC_COLOR:
+		return C_SEMANTIC_COLOR;
+	case TGSI_SEMANTIC_BCOLOR:
+		return C_SEMANTIC_BCOLOR;
+	case TGSI_SEMANTIC_FOG:
+		return C_SEMANTIC_FOG;
+	case TGSI_SEMANTIC_PSIZE:
+		return C_SEMANTIC_PSIZE;
+	case TGSI_SEMANTIC_GENERIC:
+		return C_SEMANTIC_GENERIC;
+	case TGSI_SEMANTIC_NORMAL:
+		return C_SEMANTIC_NORMAL;
+	case TGSI_SEMANTIC_FACE:
+		return C_SEMANTIC_FACE;
+	case TGSI_SEMANTIC_EDGEFLAG:
+		return C_SEMANTIC_EDGEFLAG;
+	case TGSI_SEMANTIC_PRIMID:
+		return C_SEMANTIC_PRIMID;
+	case TGSI_SEMANTIC_INSTANCEID:
+		return C_SEMANTIC_INSTANCEID;
+	default: /* unrecognised semantics fall back to GENERIC without any diagnostic */
+		return C_SEMANTIC_GENERIC;
+	}
+}
+
+static int tgsi_opcode_to_c_opcode(unsigned opcode, unsigned *copcode)
+{
+	switch (opcode) { /* name-for-name TGSI_OPCODE_* -> C_OPCODE_* mapping: a hit stores into *copcode and returns 0 */
+	case TGSI_OPCODE_MOV:
+		*copcode = C_OPCODE_MOV;
+		return 0;
+	case TGSI_OPCODE_MUL:
+		*copcode = C_OPCODE_MUL;
+		return 0;
+	case TGSI_OPCODE_MAD:
+		*copcode = C_OPCODE_MAD;
+		return 0;
+	case TGSI_OPCODE_END:
+		*copcode = C_OPCODE_END;
+		return 0;
+	case TGSI_OPCODE_ARL:
+		*copcode = C_OPCODE_ARL;
+		return 0;
+	case TGSI_OPCODE_LIT:
+		*copcode = C_OPCODE_LIT;
+		return 0;
+	case TGSI_OPCODE_RCP:
+		*copcode = C_OPCODE_RCP;
+		return 0;
+	case TGSI_OPCODE_RSQ:
+		*copcode = C_OPCODE_RSQ;
+		return 0;
+	case TGSI_OPCODE_EXP:
+		*copcode = C_OPCODE_EXP;
+		return 0;
+	case TGSI_OPCODE_LOG:
+		*copcode = C_OPCODE_LOG;
+		return 0;
+	case TGSI_OPCODE_ADD:
+		*copcode = C_OPCODE_ADD;
+		return 0;
+	case TGSI_OPCODE_DP3:
+		*copcode = C_OPCODE_DP3;
+		return 0;
+	case TGSI_OPCODE_DP4:
+		*copcode = C_OPCODE_DP4;
+		return 0;
+	case TGSI_OPCODE_DST:
+		*copcode = C_OPCODE_DST;
+		return 0;
+	case TGSI_OPCODE_MIN:
+		*copcode = C_OPCODE_MIN;
+		return 0;
+	case TGSI_OPCODE_MAX:
+		*copcode = C_OPCODE_MAX;
+		return 0;
+	case TGSI_OPCODE_SLT:
+		*copcode = C_OPCODE_SLT;
+		return 0;
+	case TGSI_OPCODE_SGE:
+		*copcode = C_OPCODE_SGE;
+		return 0;
+	case TGSI_OPCODE_SUB:
+		*copcode = C_OPCODE_SUB;
+		return 0;
+	case TGSI_OPCODE_LRP:
+		*copcode = C_OPCODE_LRP;
+		return 0;
+	case TGSI_OPCODE_CND:
+		*copcode = C_OPCODE_CND;
+		return 0;
+	case TGSI_OPCODE_DP2A:
+		*copcode = C_OPCODE_DP2A;
+		return 0;
+	case TGSI_OPCODE_FRC:
+		*copcode = C_OPCODE_FRC;
+		return 0;
+	case TGSI_OPCODE_CLAMP:
+		*copcode = C_OPCODE_CLAMP;
+		return 0;
+	case TGSI_OPCODE_FLR:
+		*copcode = C_OPCODE_FLR;
+		return 0;
+	case TGSI_OPCODE_ROUND:
+		*copcode = C_OPCODE_ROUND;
+		return 0;
+	case TGSI_OPCODE_EX2:
+		*copcode = C_OPCODE_EX2;
+		return 0;
+	case TGSI_OPCODE_LG2:
+		*copcode = C_OPCODE_LG2;
+		return 0;
+	case TGSI_OPCODE_POW:
+		*copcode = C_OPCODE_POW;
+		return 0;
+	case TGSI_OPCODE_XPD:
+		*copcode = C_OPCODE_XPD;
+		return 0;
+	case TGSI_OPCODE_ABS:
+		*copcode = C_OPCODE_ABS;
+		return 0;
+	case TGSI_OPCODE_RCC:
+		*copcode = C_OPCODE_RCC;
+		return 0;
+	case TGSI_OPCODE_DPH:
+		*copcode = C_OPCODE_DPH;
+		return 0;
+	case TGSI_OPCODE_COS:
+		*copcode = C_OPCODE_COS;
+		return 0;
+	case TGSI_OPCODE_DDX:
+		*copcode = C_OPCODE_DDX;
+		return 0;
+	case TGSI_OPCODE_DDY:
+		*copcode = C_OPCODE_DDY;
+		return 0;
+	case TGSI_OPCODE_KILP:
+		*copcode = C_OPCODE_KILP;
+		return 0;
+	case TGSI_OPCODE_PK2H:
+		*copcode = C_OPCODE_PK2H;
+		return 0;
+	case TGSI_OPCODE_PK2US:
+		*copcode = C_OPCODE_PK2US;
+		return 0;
+	case TGSI_OPCODE_PK4B:
+		*copcode = C_OPCODE_PK4B;
+		return 0;
+	case TGSI_OPCODE_PK4UB:
+		*copcode = C_OPCODE_PK4UB;
+		return 0;
+	case TGSI_OPCODE_RFL:
+		*copcode = C_OPCODE_RFL;
+		return 0;
+	case TGSI_OPCODE_SEQ:
+		*copcode = C_OPCODE_SEQ;
+		return 0;
+	case TGSI_OPCODE_SFL:
+		*copcode = C_OPCODE_SFL;
+		return 0;
+	case TGSI_OPCODE_SGT:
+		*copcode = C_OPCODE_SGT;
+		return 0;
+	case TGSI_OPCODE_SIN:
+		*copcode = C_OPCODE_SIN;
+		return 0;
+	case TGSI_OPCODE_SLE:
+		*copcode = C_OPCODE_SLE;
+		return 0;
+	case TGSI_OPCODE_SNE:
+		*copcode = C_OPCODE_SNE;
+		return 0;
+	case TGSI_OPCODE_STR:
+		*copcode = C_OPCODE_STR;
+		return 0;
+	case TGSI_OPCODE_TEX:
+		*copcode = C_OPCODE_TEX;
+		return 0;
+	case TGSI_OPCODE_TXD:
+		*copcode = C_OPCODE_TXD;
+		return 0;
+	case TGSI_OPCODE_TXP:
+		*copcode = C_OPCODE_TXP;
+		return 0;
+	case TGSI_OPCODE_UP2H:
+		*copcode = C_OPCODE_UP2H;
+		return 0;
+	case TGSI_OPCODE_UP2US:
+		*copcode = C_OPCODE_UP2US;
+		return 0;
+	case TGSI_OPCODE_UP4B:
+		*copcode = C_OPCODE_UP4B;
+		return 0;
+	case TGSI_OPCODE_UP4UB:
+		*copcode = C_OPCODE_UP4UB;
+		return 0;
+	case TGSI_OPCODE_X2D:
+		*copcode = C_OPCODE_X2D;
+		return 0;
+	case TGSI_OPCODE_ARA:
+		*copcode = C_OPCODE_ARA;
+		return 0;
+	case TGSI_OPCODE_ARR:
+		*copcode = C_OPCODE_ARR;
+		return 0;
+	case TGSI_OPCODE_BRA:
+		*copcode = C_OPCODE_BRA;
+		return 0;
+	case TGSI_OPCODE_CAL:
+		*copcode = C_OPCODE_CAL;
+		return 0;
+	case TGSI_OPCODE_RET:
+		*copcode = C_OPCODE_RET;
+		return 0;
+	case TGSI_OPCODE_SSG:
+		*copcode = C_OPCODE_SSG;
+		return 0;
+	case TGSI_OPCODE_CMP:
+		*copcode = C_OPCODE_CMP;
+		return 0;
+	case TGSI_OPCODE_SCS:
+		*copcode = C_OPCODE_SCS;
+		return 0;
+	case TGSI_OPCODE_TXB:
+		*copcode = C_OPCODE_TXB;
+		return 0;
+	case TGSI_OPCODE_NRM:
+		*copcode = C_OPCODE_NRM;
+		return 0;
+	case TGSI_OPCODE_DIV:
+		*copcode = C_OPCODE_DIV;
+		return 0;
+	case TGSI_OPCODE_DP2:
+		*copcode = C_OPCODE_DP2;
+		return 0;
+	case TGSI_OPCODE_TXL:
+		*copcode = C_OPCODE_TXL;
+		return 0;
+	case TGSI_OPCODE_BRK:
+		*copcode = C_OPCODE_BRK;
+		return 0;
+	case TGSI_OPCODE_IF:
+		*copcode = C_OPCODE_IF;
+		return 0;
+	case TGSI_OPCODE_ELSE:
+		*copcode = C_OPCODE_ELSE;
+		return 0;
+	case TGSI_OPCODE_ENDIF:
+		*copcode = C_OPCODE_ENDIF;
+		return 0;
+	case TGSI_OPCODE_PUSHA:
+		*copcode = C_OPCODE_PUSHA;
+		return 0;
+	case TGSI_OPCODE_POPA:
+		*copcode = C_OPCODE_POPA;
+		return 0;
+	case TGSI_OPCODE_CEIL:
+		*copcode = C_OPCODE_CEIL;
+		return 0;
+	case TGSI_OPCODE_I2F:
+		*copcode = C_OPCODE_I2F;
+		return 0;
+	case TGSI_OPCODE_NOT:
+		*copcode = C_OPCODE_NOT;
+		return 0;
+	case TGSI_OPCODE_TRUNC:
+		*copcode = C_OPCODE_TRUNC;
+		return 0;
+	case TGSI_OPCODE_SHL:
+		*copcode = C_OPCODE_SHL;
+		return 0;
+	case TGSI_OPCODE_AND:
+		*copcode = C_OPCODE_AND;
+		return 0;
+	case TGSI_OPCODE_OR:
+		*copcode = C_OPCODE_OR;
+		return 0;
+	case TGSI_OPCODE_MOD:
+		*copcode = C_OPCODE_MOD;
+		return 0;
+	case TGSI_OPCODE_XOR:
+		*copcode = C_OPCODE_XOR;
+		return 0;
+	case TGSI_OPCODE_SAD:
+		*copcode = C_OPCODE_SAD;
+		return 0;
+	case TGSI_OPCODE_TXF:
+		*copcode = C_OPCODE_TXF;
+		return 0;
+	case TGSI_OPCODE_TXQ:
+		*copcode = C_OPCODE_TXQ;
+		return 0;
+	case TGSI_OPCODE_CONT:
+		*copcode = C_OPCODE_CONT;
+		return 0;
+	case TGSI_OPCODE_EMIT:
+		*copcode = C_OPCODE_EMIT;
+		return 0;
+	case TGSI_OPCODE_ENDPRIM:
+		*copcode = C_OPCODE_ENDPRIM;
+		return 0;
+	case TGSI_OPCODE_BGNLOOP:
+		*copcode = C_OPCODE_BGNLOOP;
+		return 0;
+	case TGSI_OPCODE_BGNSUB:
+		*copcode = C_OPCODE_BGNSUB;
+		return 0;
+	case TGSI_OPCODE_ENDLOOP:
+		*copcode = C_OPCODE_ENDLOOP;
+		return 0;
+	case TGSI_OPCODE_ENDSUB:
+		*copcode = C_OPCODE_ENDSUB;
+		return 0;
+	case TGSI_OPCODE_NOP:
+		*copcode = C_OPCODE_NOP;
+		return 0;
+	case TGSI_OPCODE_NRM4:
+		*copcode = C_OPCODE_NRM4;
+		return 0;
+	case TGSI_OPCODE_CALLNZ:
+		*copcode = C_OPCODE_CALLNZ;
+		return 0;
+	case TGSI_OPCODE_IFC:
+		*copcode = C_OPCODE_IFC;
+		return 0;
+	case TGSI_OPCODE_BREAKC:
+		*copcode = C_OPCODE_BREAKC;
+		return 0;
+	case TGSI_OPCODE_KIL:
+		*copcode = C_OPCODE_KIL;
+		return 0;
+	case TGSI_OPCODE_F2I:
+		*copcode = C_OPCODE_F2I;
+		return 0;
+	case TGSI_OPCODE_IDIV:
+		*copcode = C_OPCODE_IDIV;
+		return 0;
+	case TGSI_OPCODE_IMAX:
+		*copcode = C_OPCODE_IMAX;
+		return 0;
+	case TGSI_OPCODE_IMIN:
+		*copcode = C_OPCODE_IMIN;
+		return 0;
+	case TGSI_OPCODE_INEG:
+		*copcode = C_OPCODE_INEG;
+		return 0;
+	case TGSI_OPCODE_ISGE:
+		*copcode = C_OPCODE_ISGE;
+		return 0;
+	case TGSI_OPCODE_ISHR:
+		*copcode = C_OPCODE_ISHR;
+		return 0;
+	case TGSI_OPCODE_ISLT:
+		*copcode = C_OPCODE_ISLT;
+		return 0;
+	case TGSI_OPCODE_F2U:
+		*copcode = C_OPCODE_F2U;
+		return 0;
+	case TGSI_OPCODE_U2F:
+		*copcode = C_OPCODE_U2F;
+		return 0;
+	case TGSI_OPCODE_UADD:
+		*copcode = C_OPCODE_UADD;
+		return 0;
+	case TGSI_OPCODE_UDIV:
+		*copcode = C_OPCODE_UDIV;
+		return 0;
+	case TGSI_OPCODE_UMAD:
+		*copcode = C_OPCODE_UMAD;
+		return 0;
+	case TGSI_OPCODE_UMAX:
+		*copcode = C_OPCODE_UMAX;
+		return 0;
+	case TGSI_OPCODE_UMIN:
+		*copcode = C_OPCODE_UMIN;
+		return 0;
+	case TGSI_OPCODE_UMOD:
+		*copcode = C_OPCODE_UMOD;
+		return 0;
+	case TGSI_OPCODE_UMUL:
+		*copcode = C_OPCODE_UMUL;
+		return 0;
+	case TGSI_OPCODE_USEQ:
+		*copcode = C_OPCODE_USEQ;
+		return 0;
+	case TGSI_OPCODE_USGE:
+		*copcode = C_OPCODE_USGE;
+		return 0;
+	case TGSI_OPCODE_USHR:
+		*copcode = C_OPCODE_USHR;
+		return 0;
+	case TGSI_OPCODE_USLT:
+		*copcode = C_OPCODE_USLT;
+		return 0;
+	case TGSI_OPCODE_USNE:
+		*copcode = C_OPCODE_USNE;
+		return 0;
+	case TGSI_OPCODE_SWITCH:
+		*copcode = C_OPCODE_SWITCH;
+		return 0;
+	case TGSI_OPCODE_CASE:
+		*copcode = C_OPCODE_CASE;
+		return 0;
+	case TGSI_OPCODE_DEFAULT:
+		*copcode = C_OPCODE_DEFAULT;
+		return 0;
+	case TGSI_OPCODE_ENDSWITCH:
+		*copcode = C_OPCODE_ENDSWITCH;
+		return 0;
+	default: /* no mapping: *copcode is left untouched and -EINVAL is returned */
+		fprintf(stderr, "%s:%d unsupported opcode %d\n", __func__, __LINE__, opcode);
+		return -EINVAL;
+	}
+}
diff --git a/src/gallium/drivers/r600/r600_context.c b/src/gallium/drivers/r600/r600_context.c
new file mode 100644
index 0000000000..0a7efe3bfb
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_context.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson
+ */
+#include <stdio.h>
+#include <util/u_inlines.h>
+#include <util/u_format.h>
+#include <util/u_memory.h>
+#include <util/u_blitter.h>
+#include "r600_resource.h"
+#include "r600_screen.h"
+#include "r600_context.h"
+
+static void r600_destroy_context(struct pipe_context *context)
+{	/* pipe_context::destroy hook: tears down what r600_create_context built. */
+	struct r600_context *rctx = r600_context(context);
+	util_blitter_destroy(rctx->blitter);	/* never NULL here: context creation fails without a blitter */
+	FREE(rctx);	/* NOTE(review): ctx, draw, cb_cntl and config are still not released here */
+}
+
+static void r600_flush(struct pipe_context *ctx, unsigned flags,
+			struct pipe_fence_handle **fence)
+{	/* pipe_context::flush hook; flags and fence are not used by this body. */
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_screen *rscreen = rctx->screen;
+	static int dc = 0;	/* number of completed flushes; function-static, so shared by all contexts */
+
+	if (radeon_ctx_pm4(rctx->ctx))	/* non-zero result: bail out and keep the current ctx */
+		return;
+	/* FIXME: dumping should be removed once the shader code supports
+	 * instructions without emitting bad code.
+	 */
+	if (!dc)	/* only the first flush is dumped */
+		radeon_ctx_dump_bof(rctx->ctx, "gallium.bof");
+#if 0	/* submission is compiled out for now */
+	radeon_ctx_submit(rctx->ctx);
+#endif
+	rctx->ctx = radeon_ctx_decref(rctx->ctx);	/* drop the reference to the finished ctx */
+	rctx->ctx = radeon_ctx(rscreen->rw);	/* start a new one; NOTE(review): result is not checked */
+	dc++;
+}
+
+struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv)
+{	/* Builds an r600_context for screen: hooks, blitter, default CB_CNTL/CONFIG state. NULL on failure. */
+	struct r600_context *rctx = CALLOC_STRUCT(r600_context);
+	struct r600_screen* rscreen = r600_screen(screen);
+
+	if (rctx == NULL)
+		return NULL;
+	rctx->context.winsys = rscreen->screen.winsys;
+	rctx->context.screen = screen;
+	rctx->context.priv = priv;
+	rctx->context.destroy = r600_destroy_context;
+	rctx->context.draw_arrays = r600_draw_arrays;
+	rctx->context.draw_elements = r600_draw_elements;
+	rctx->context.draw_range_elements = r600_draw_range_elements;
+	rctx->context.flush = r600_flush;
+
+	/* Easy accessing of screen/winsys. */
+	rctx->screen = rscreen;
+	rctx->rw = rscreen->rw;
+
+	r600_init_blit_functions(rctx);
+	r600_init_query_functions(rctx);
+	r600_init_state_functions(rctx);
+	r600_init_context_resource_functions(rctx);
+
+	rctx->blitter = util_blitter_create(&rctx->context);	/* called after the hooks above are installed */
+	if (rctx->blitter == NULL) {
+		FREE(rctx);
+		return NULL;
+	}
+
+	rctx->cb_cntl = radeon_state(rscreen->rw, R600_CB_CNTL_TYPE, R600_CB_CNTL);	/* NOTE(review): dereferenced below without a NULL check */
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_SHADER_MASK] = 0x0000000F;
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_TARGET_MASK] = 0x0000000F;
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_COLOR_CONTROL] = 0x00CC0000;
+	rctx->cb_cntl->states[R600_CB_CNTL__PA_SC_AA_CONFIG] = 0x00000000;
+	rctx->cb_cntl->states[R600_CB_CNTL__PA_SC_AA_SAMPLE_LOCS_MCTX] = 0x00000000;
+	rctx->cb_cntl->states[R600_CB_CNTL__PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX] = 0x00000000;
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_CLRCMP_CONTROL] = 0x01000000;
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_CLRCMP_SRC] = 0x00000000;
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_CLRCMP_DST] = 0x000000FF;
+	rctx->cb_cntl->states[R600_CB_CNTL__CB_CLRCMP_MSK] = 0xFFFFFFFF;
+	rctx->cb_cntl->states[R600_CB_CNTL__PA_SC_AA_MASK] = 0xFFFFFFFF;
+	radeon_state_pm4(rctx->cb_cntl);	/* NOTE(review): return value, if any, is ignored */
+
+	rctx->config = radeon_state(rscreen->rw, R600_CONFIG_TYPE, R600_CONFIG);	/* NOTE(review): dereferenced below without a NULL check */
+	rctx->config->states[R600_CONFIG__SQ_CONFIG] = 0xE400000C;
+	rctx->config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] = 0x403800C0;
+	rctx->config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] = 0x00003090;
+	rctx->config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] = 0x00800080;
+	rctx->config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00004000;
+	rctx->config->states[R600_CONFIG__TA_CNTL_AUX] = 0x07000002;
+	rctx->config->states[R600_CONFIG__VC_ENHANCE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__DB_DEBUG] = 0x00000000;
+	rctx->config->states[R600_CONFIG__DB_WATERMARKS] = 0x00420204;
+	rctx->config->states[R600_CONFIG__SX_MISC] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000001;
+	rctx->config->states[R600_CONFIG__CB_SHADER_CONTROL] = 0x00000003;
+	rctx->config->states[R600_CONFIG__SQ_ESGS_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_GSVS_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_ESTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_GSTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_VSTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_PSTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_FBUF_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_REDUC_RING_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__SQ_GS_VERT_ITEMSIZE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_OUTPUT_PATH_CNTL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_HOS_CNTL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_HOS_MAX_TESS_LEVEL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_HOS_MIN_TESS_LEVEL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_HOS_REUSE_DEPTH] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_PRIM_TYPE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_FIRST_DECR] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_DECR] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_VECT_0_CNTL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_VECT_1_CNTL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_VECT_0_FMT_CNTL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GROUP_VECT_1_FMT_CNTL] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_GS_MODE] = 0x00000000;
+	rctx->config->states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00514000;
+	rctx->config->states[R600_CONFIG__VGT_STRMOUT_EN] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_REUSE_OFF] = 0x00000001;
+	rctx->config->states[R600_CONFIG__VGT_VTX_CNT_EN] = 0x00000000;
+	rctx->config->states[R600_CONFIG__VGT_STRMOUT_BUFFER_EN] = 0x00000000;
+	radeon_state_pm4(rctx->config);	/* NOTE(review): return value, if any, is ignored */
+
+	rctx->ctx = radeon_ctx(rscreen->rw);	/* NOTE(review): not checked for failure */
+	rctx->draw = radeon_draw(rscreen->rw);	/* NOTE(review): not checked for failure */
+	return &rctx->context;
+}
diff --git a/src/gallium/drivers/r600/r600_context.h b/src/gallium/drivers/r600/r600_context.h
new file mode 100644
index 0000000000..f27ff58ed4
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_context.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef R600_CONTEXT_H
+#define R600_CONTEXT_H
+
+#include <pipe/p_state.h>
+#include <pipe/p_context.h>
+#include <tgsi/tgsi_scan.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_util.h>
+#include <util/u_blitter.h>
+#include "radeon.h"
+#include "r600_shader.h"
+
+/* XXX move this to a more appropriate place */
+struct r600_vertex_elements_state
+{
+	unsigned count;	/* number of entries of elements[] in use (loop bound in r600_draw_common) */
+	struct pipe_vertex_element elements[32];	/* fixed capacity of 32 entries */
+};
+
+struct r600_pipe_shader {
+	unsigned				type;	/* presumably the 'type' given to r600_pipe_shader_create — confirm */
+	struct r600_shader			shader;
+	struct radeon_bo			*bo;	/* presumably holds the shader code — confirm */
+	struct radeon_state			*state;	/* bound with radeon_draw_set() by r600_draw_common */
+};
+
+struct r600_context {
+	struct pipe_context		context;	/* must stay first: r600_context() casts pipe_context* to this struct */
+	struct r600_screen		*screen;	/* screen this context was created from */
+	struct radeon			*rw;	/* copy of screen->rw, set in r600_create_context */
+	struct radeon_ctx		*ctx;	/* current command context; replaced by r600_flush */
+	struct radeon_state		*cb_cntl;	/* built once in r600_create_context */
+	struct radeon_state		*db;	/* not set by r600_create_context; read by r600_draw_common */
+	struct radeon_state		*config;	/* built once in r600_create_context */
+	struct r600_pipe_shader		*ps_shader;
+	struct r600_pipe_shader		*vs_shader;
+	unsigned			flat_shade;
+	unsigned			nvertex_buffer;
+	struct r600_vertex_elements_state *vertex_elements;
+	struct pipe_vertex_buffer	vertex_buffer[PIPE_MAX_ATTRIBS];
+	struct blitter_context		*blitter;	/* from util_blitter_create() in r600_create_context */
+	struct pipe_stencil_ref		stencil_ref;
+	struct pipe_framebuffer_state	fb_state;
+	struct radeon_draw		*draw;	/* draw being assembled; r600_draw_common replaces it with a duplicate */
+	struct pipe_viewport_state	viewport;
+};
+
+/* Convenience cast: pipe_context is the first member of struct r600_context. */
+static INLINE struct r600_context *r600_context(struct pipe_context *pipe)
+{
+    return (struct r600_context*)pipe;
+}
+
+void r600_draw_arrays(struct pipe_context *ctx, unsigned mode,
+			unsigned start, unsigned count);	/* installed as pipe_context::draw_arrays */
+void r600_draw_elements(struct pipe_context *ctx,
+		struct pipe_resource *index_buffer,
+		unsigned index_size, int index_bias, unsigned mode,
+		unsigned start, unsigned count);	/* installed as pipe_context::draw_elements */
+void r600_draw_range_elements(struct pipe_context *ctx,
+		struct pipe_resource *index_buffer,
+		unsigned index_size, int index_bias, unsigned min_index,
+		unsigned max_index, unsigned mode,
+		unsigned start, unsigned count);	/* installed as pipe_context::draw_range_elements */
+
+void r600_init_blit_functions(struct r600_context *rctx);	/* called from r600_create_context */
+void r600_init_state_functions(struct r600_context *rctx);	/* called from r600_create_context */
+void r600_init_query_functions(struct r600_context* rctx);	/* installs the query hooks */
+struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv);	/* NULL on failure */
+
+void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *rpshader);
+struct r600_pipe_shader *r600_pipe_shader_create(struct pipe_context *ctx,
+						unsigned type,
+						const struct tgsi_token *tokens);
+int r600_pipe_shader_update(struct pipe_context *ctx, struct r600_pipe_shader *rpshader);	/* non-zero aborts the draw */
+
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_draw.c b/src/gallium/drivers/r600/r600_draw.c
new file mode 100644
index 0000000000..724fb6c988
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_draw.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <pipe/p_screen.h>
+#include <util/u_format.h>
+#include <util/u_math.h>
+#include <util/u_inlines.h>
+#include <util/u_memory.h>
+#include "r600_screen.h"
+#include "r600_context.h"
+#include "r600d.h"
+
+struct r600_draw {
+	struct pipe_context	*ctx;	/* context the draw was issued on */
+	struct radeon_state	*draw;	/* filled in by r600_draw_common; callers leave it unset */
+	struct radeon_state	*vgt;	/* filled in by r600_draw_common; callers leave it unset */
+	unsigned		mode;	/* PIPE_PRIM_* value, converted by r600_conv_pipe_prim */
+	unsigned		start;	/* written to VGT_INDX_OFFSET */
+	unsigned		count;	/* written to VGT_NUM_INDICES */
+	unsigned		index_size;	/* 0 (no index buffer), 2 or 4 */
+	struct pipe_resource	*index_buffer;	/* NULL for r600_draw_arrays */
+};
+
+static int r600_draw_common(struct r600_draw *draw)
+{	/* Shared body of the draw hooks: binds state, builds the draw and queues it. 0 on success, non-zero on failure. */
+	struct r600_context *rctx = r600_context(draw->ctx);
+	struct r600_screen *rscreen = rctx->screen;
+	struct radeon_state *vs_resource;
+	struct r600_buffer *rbuffer;
+	unsigned i, j, offset, format, prim;
+	u32 vgt_dma_index_type, vgt_draw_initiator;
+	int r;
+
+	switch (draw->index_size) {
+	case 2:	/* 2-byte indices */
+		vgt_draw_initiator = 0;
+		vgt_dma_index_type = 0;
+		break;
+	case 4:	/* 4-byte indices */
+		vgt_draw_initiator = 0;
+		vgt_dma_index_type = 1;
+		break;
+	case 0:	/* no index buffer (r600_draw_arrays) */
+		vgt_draw_initiator = 2;
+		vgt_dma_index_type = 0;
+		break;
+	default:
+		fprintf(stderr, "%s %d unsupported index size %d\n", __func__, __LINE__, draw->index_size);
+		return -EINVAL;
+	}
+	r = r600_conv_pipe_prim(draw->mode, &prim);
+	if (r)
+		return r;
+	/* rebuild vertex shader if input format changed */
+	r = r600_pipe_shader_update(draw->ctx, rctx->vs_shader);
+	if (r)
+		return r;
+	r = r600_pipe_shader_update(draw->ctx, rctx->ps_shader);
+	if (r)
+		return r;
+	r = radeon_draw_set(rctx->draw, rctx->vs_shader->state);	/* NOTE(review): vs_shader/ps_shader are assumed non-NULL */
+	if (r)
+		return r;
+	r = radeon_draw_set(rctx->draw, rctx->ps_shader->state);
+	if (r)
+		return r;
+	r = radeon_draw_set(rctx->draw, rctx->cb_cntl);
+	if (r)
+		return r;
+	r = radeon_draw_set(rctx->draw, rctx->db);
+	if (r)
+		return r;
+	r = radeon_draw_set(rctx->draw, rctx->config);
+	if (r)
+		return r;
+
+	for (i = 0 ; i < rctx->vertex_elements->count; i++) {	/* one VS resource state per vertex element */
+		j = rctx->vertex_elements->elements[i].vertex_buffer_index;
+		rbuffer = (struct r600_buffer*)rctx->vertex_buffer[j].buffer;	/* NOTE(review): j and the buffer are not validated */
+		offset = rctx->vertex_elements->elements[i].src_offset + rctx->vertex_buffer[j].buffer_offset;
+		r = r600_conv_pipe_format(rctx->vertex_elements->elements[i].src_format, &format);
+		if (r)
+			return r;
+		vs_resource = radeon_state(rscreen->rw, R600_VS_RESOURCE_TYPE, R600_VS_RESOURCE + i);
+		if (vs_resource == NULL)
+			return -ENOMEM;
+		vs_resource->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+		vs_resource->nbo = 1;
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD0] = offset;	/* NOTE(review): PS_RESOURCE word indices on a VS resource — presumably same layout; confirm */
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD1] = rbuffer->bo->size - offset;
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = S_038008_STRIDE(rctx->vertex_buffer[j].stride) |
+								S_038008_DATA_FORMAT(format);
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD3] = 0x00000000;
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD4] = 0x00000000;
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD5] = 0x00000000;
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD6] = 0xC0000000;
+		vs_resource->placement[0] = RADEON_GEM_DOMAIN_GTT;
+		vs_resource->placement[1] = RADEON_GEM_DOMAIN_GTT;
+		r = radeon_draw_set_new(rctx->draw, vs_resource);	/* NOTE(review): who releases vs_resource on failure is not visible here */
+		if (r)
+			return r;
+	}
+	/* FIXME start need to change winsys */
+	draw->draw = radeon_state(rscreen->rw, R600_DRAW_TYPE, R600_DRAW);
+	if (draw->draw == NULL)
+		return -ENOMEM;
+	draw->draw->states[R600_DRAW__VGT_NUM_INDICES] = draw->count;
+	draw->draw->states[R600_DRAW__VGT_DRAW_INITIATOR] = vgt_draw_initiator;
+	if (draw->index_buffer) {	/* indexed draws reference the index buffer's bo */
+		rbuffer = (struct r600_buffer*)draw->index_buffer;
+		draw->draw->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+		draw->draw->placement[0] = RADEON_GEM_DOMAIN_GTT;
+		draw->draw->placement[1] = RADEON_GEM_DOMAIN_GTT;
+		draw->draw->nbo = 1;
+	}
+	r = radeon_draw_set_new(rctx->draw, draw->draw);
+	if (r)
+		return r;
+	draw->vgt = radeon_state(rscreen->rw, R600_VGT_TYPE, R600_VGT);
+	if (draw->vgt == NULL)
+		return -ENOMEM;
+	draw->vgt->states[R600_VGT__VGT_PRIMITIVE_TYPE] = prim;
+	draw->vgt->states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF;	/* fixed range: no min/max index is passed in */
+	draw->vgt->states[R600_VGT__VGT_MIN_VTX_INDX] = 0x00000000;
+	draw->vgt->states[R600_VGT__VGT_INDX_OFFSET] = draw->start;
+	draw->vgt->states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_INDX] = 0x00000000;
+	draw->vgt->states[R600_VGT__VGT_DMA_INDEX_TYPE] = vgt_dma_index_type;
+	draw->vgt->states[R600_VGT__VGT_PRIMITIVEID_EN] = 0x00000000;
+	draw->vgt->states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001;
+	draw->vgt->states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_EN] = 0x00000000;
+	draw->vgt->states[R600_VGT__VGT_INSTANCE_STEP_RATE_0] = 0x00000000;
+	draw->vgt->states[R600_VGT__VGT_INSTANCE_STEP_RATE_1] = 0x00000000;
+	r = radeon_draw_set_new(rctx->draw, draw->vgt);
+	if (r)
+		return r;
+	/* FIXME */
+	r = radeon_ctx_set_draw_new(rctx->ctx, rctx->draw);	/* queue the assembled draw on the command context */
+	if (r)
+		return r;
+	rctx->draw = radeon_draw_duplicate(rctx->draw);	/* later draws use a duplicate; NOTE(review): result is not checked */
+	return 0;
+}
+
+void r600_draw_range_elements(struct pipe_context *ctx,
+		struct pipe_resource *index_buffer,
+		unsigned index_size, int index_bias, unsigned min_index,
+		unsigned max_index, unsigned mode,
+		unsigned start, unsigned count)
+{	/* pipe_context::draw_range_elements hook: indexed draw; only index_bias == 0 is supported. */
+	struct r600_draw draw;
+	assert(index_bias == 0);
+
+	draw.ctx = ctx;
+	draw.mode = mode;
+	draw.start = start;
+	draw.count = count;
+	draw.index_size = index_size;
+	draw.index_buffer = index_buffer;
+	/* min_index/max_index are unused: r600_draw_common programs a fixed vertex index range. */
+	r600_draw_common(&draw);	/* result dropped: this hook returns void */
+}
+
+void r600_draw_elements(struct pipe_context *ctx,
+		struct pipe_resource *index_buffer,
+		unsigned index_size, int index_bias, unsigned mode,
+		unsigned start, unsigned count)
+{	/* pipe_context::draw_elements hook: indexed draw; index_bias must be 0. */
+	struct r600_draw info;
+	assert(index_bias == 0);
+
+	info.index_buffer = index_buffer;
+	info.index_size = index_size;
+	info.count = count;
+	info.start = start;
+	info.mode = mode;
+	info.ctx = ctx;
+	r600_draw_common(&info);	/* result dropped: this hook returns void */
+}
+
+void r600_draw_arrays(struct pipe_context *ctx, unsigned mode,
+			unsigned start, unsigned count)
+{	/* pipe_context::draw_arrays hook: non-indexed draw of count vertices from start. */
+	struct r600_draw info;
+
+	info.index_buffer = NULL;	/* no index buffer ... */
+	info.index_size = 0;	/* ... selects the index_size 0 case in r600_draw_common */
+	info.count = count;
+	info.start = start;
+	info.mode = mode;
+	info.ctx = ctx;
+	r600_draw_common(&info);	/* result dropped: this hook returns void */
+}
diff --git a/src/gallium/drivers/r600/r600_helper.c b/src/gallium/drivers/r600/r600_helper.c
new file mode 100644
index 0000000000..e3175b627a
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_helper.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <util/u_inlines.h>
+#include "r600_screen.h"
+#include "r600d.h"
+
+int r600_conv_pipe_format(unsigned pformat, unsigned *format)
+{	/* Converts a PIPE_FORMAT_* value to a hardware format code in *format. 0 on success, -EINVAL if unsupported. */
+	switch (pformat) {
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+		*format = 0x30;	/* NOTE(review): raw code with no named constant — confirm against r600d.h */
+		return 0;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		*format = V_0280A0_COLOR_32_32_32_32_FLOAT;
+		return 0;
+	case PIPE_FORMAT_A8R8G8B8_UNORM:	/* all of these 8-bit four-channel variants share one code */
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R8G8B8X8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_USCALED:
+	case PIPE_FORMAT_R8G8B8A8_SNORM:
+	case PIPE_FORMAT_R8G8B8A8_SSCALED:
+		*format = V_0280A0_COLOR_8_8_8_8;
+		return 0;
+	case PIPE_FORMAT_L8_UNORM:	/* everything from here down shares the default (unsupported) path */
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_L16_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_Z32_UNORM:
+	case PIPE_FORMAT_Z32_FLOAT:
+	case PIPE_FORMAT_R64_FLOAT:
+	case PIPE_FORMAT_R64G64_FLOAT:
+	case PIPE_FORMAT_R64G64B64_FLOAT:
+	case PIPE_FORMAT_R64G64B64A64_FLOAT:
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R32_UNORM:
+	case PIPE_FORMAT_R32G32_UNORM:
+	case PIPE_FORMAT_R32G32B32_UNORM:
+	case PIPE_FORMAT_R32G32B32A32_UNORM:
+	case PIPE_FORMAT_R32_USCALED:
+	case PIPE_FORMAT_R32G32_USCALED:
+	case PIPE_FORMAT_R32G32B32_USCALED:
+	case PIPE_FORMAT_R32G32B32A32_USCALED:
+	case PIPE_FORMAT_R32_SNORM:
+	case PIPE_FORMAT_R32G32_SNORM:
+	case PIPE_FORMAT_R32G32B32_SNORM:
+	case PIPE_FORMAT_R32G32B32A32_SNORM:
+	case PIPE_FORMAT_R32_SSCALED:
+	case PIPE_FORMAT_R32G32_SSCALED:
+	case PIPE_FORMAT_R32G32B32_SSCALED:
+	case PIPE_FORMAT_R32G32B32A32_SSCALED:
+	case PIPE_FORMAT_R16_UNORM:
+	case PIPE_FORMAT_R16G16_UNORM:
+	case PIPE_FORMAT_R16G16B16_UNORM:
+	case PIPE_FORMAT_R16G16B16A16_UNORM:
+	case PIPE_FORMAT_R16_USCALED:
+	case PIPE_FORMAT_R16G16_USCALED:
+	case PIPE_FORMAT_R16G16B16_USCALED:
+	case PIPE_FORMAT_R16G16B16A16_USCALED:
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_R16G16_SNORM:
+	case PIPE_FORMAT_R16G16B16_SNORM:
+	case PIPE_FORMAT_R16G16B16A16_SNORM:
+	case PIPE_FORMAT_R16_SSCALED:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R8_USCALED:
+	case PIPE_FORMAT_R8G8_USCALED:
+	case PIPE_FORMAT_R8G8B8_USCALED:
+	case PIPE_FORMAT_R8_SNORM:
+	case PIPE_FORMAT_R8G8_SNORM:
+	case PIPE_FORMAT_R8G8B8_SNORM:
+	case PIPE_FORMAT_R8_SSCALED:
+	case PIPE_FORMAT_R8G8_SSCALED:
+	case PIPE_FORMAT_R8G8B8_SSCALED:
+	case PIPE_FORMAT_R32_FIXED:
+	case PIPE_FORMAT_R32G32_FIXED:
+	case PIPE_FORMAT_R32G32B32_FIXED:
+	case PIPE_FORMAT_R32G32B32A32_FIXED:
+	default:
+		fprintf(stderr, "%s:%d unsupported %d\n", __func__, __LINE__, pformat);	/* *format is left untouched */
+		return -EINVAL;
+	}
+}
+
+int r600_conv_pipe_prim(unsigned pprim, unsigned *prim)
+{	/* Converts a PIPE_PRIM_* value to its V_008958_DI_PT_* code in *prim. 0 on success, -EINVAL if unsupported. */
+	switch (pprim) {
+	case PIPE_PRIM_POINTS:
+		*prim = V_008958_DI_PT_POINTLIST;
+		return 0;
+	case PIPE_PRIM_LINES:
+		*prim = V_008958_DI_PT_LINELIST;
+		return 0;
+	case PIPE_PRIM_LINE_STRIP:
+		*prim = V_008958_DI_PT_LINESTRIP;
+		return 0;
+	case PIPE_PRIM_LINE_LOOP:
+		*prim = V_008958_DI_PT_LINELOOP;
+		return 0;
+	case PIPE_PRIM_TRIANGLES:
+		*prim = V_008958_DI_PT_TRILIST;
+		return 0;
+	case PIPE_PRIM_TRIANGLE_STRIP:
+		*prim = V_008958_DI_PT_TRISTRIP;
+		return 0;
+	case PIPE_PRIM_TRIANGLE_FAN:
+		*prim = V_008958_DI_PT_TRIFAN;
+		return 0;
+	case PIPE_PRIM_POLYGON:
+		*prim = V_008958_DI_PT_POLYGON;
+		return 0;
+	case PIPE_PRIM_QUADS:
+		*prim = V_008958_DI_PT_QUADLIST;
+		return 0;
+	case PIPE_PRIM_QUAD_STRIP:
+		*prim = V_008958_DI_PT_QUADSTRIP;
+		return 0;
+	default:
+		fprintf(stderr, "%s:%d unsupported %d\n", __func__, __LINE__, pprim);	/* *prim is left untouched */
+		return -EINVAL;
+	}
+}
diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c
new file mode 100644
index 0000000000..9b02ae680e
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson
+ */
+#include <util/u_inlines.h>
+#include <util/u_format.h>
+#include <util/u_memory.h>
+#include "r600_screen.h"
+#include "r600_context.h"
+
+static struct pipe_query *r600_create_query(struct pipe_context *pipe, unsigned query_type)
+{	/* Stub: no query object is ever created; both arguments are ignored. */
+	return NULL;
+}
+
+static void r600_destroy_query(struct pipe_context *pipe, struct pipe_query *query)
+{	/* Releases query with FREE(); r600_create_query in this file only ever returns NULL. */
+	FREE(query);
+}
+
+static void r600_begin_query(struct pipe_context *pipe, struct pipe_query *query)
+{	/* Stub: does nothing. */
+}
+
+static void r600_end_query(struct pipe_context *pipe, struct pipe_query *query)
+{	/* Stub: does nothing. */
+}
+
+static boolean r600_get_query_result(struct pipe_context *pipe,
+					struct pipe_query *query,
+					boolean wait, void *result)
+{	/* Stub: always returns TRUE and never writes through result. */
+	return TRUE;
+}
+
+void r600_init_query_functions(struct r600_context* rctx)
+{	/* Installs this file's query stubs as the context's query hooks. */
+	rctx->context.create_query = r600_create_query;
+	rctx->context.destroy_query = r600_destroy_query;
+	rctx->context.begin_query = r600_begin_query;
+	rctx->context.end_query = r600_end_query;
+	rctx->context.get_query_result = r600_get_query_result;
+}
diff --git a/src/gallium/drivers/r600/r600_resource.c b/src/gallium/drivers/r600/r600_resource.c
new file mode 100644
index 0000000000..d9aa1df04f
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_resource.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "r600_context.h"
+#include "r600_resource.h"
+#include "r600_screen.h"
+#include "r600_texture.h"
+
+static struct pipe_resource *
+r600_resource_create(struct pipe_screen *screen,
+                    const struct pipe_resource *templ)
+{
+   /* Buffers and textures are built by separate constructors. */
+   if (templ->target != PIPE_BUFFER)
+      return r600_texture_create(screen, templ);
+   return r600_buffer_create(screen, templ);
+}
+
+static struct pipe_resource *
+r600_resource_from_handle(struct pipe_screen * screen,
+			 const struct pipe_resource *templ,
+			 struct winsys_handle *whandle)
+{
+   /* Buffers cannot be imported from a handle; only textures are wrapped. */
+   if (templ->target == PIPE_BUFFER)
+      return NULL;
+   return r600_texture_from_handle(screen, templ, whandle);
+}
+
+void r600_init_context_resource_functions(struct r600_context *r600)
+{  /* Points the context's transfer and reference hooks at the u_*_vtbl helpers. */
+   r600->context.get_transfer = u_get_transfer_vtbl;
+   r600->context.transfer_map = u_transfer_map_vtbl;
+   r600->context.transfer_flush_region = u_transfer_flush_region_vtbl;
+   r600->context.transfer_unmap = u_transfer_unmap_vtbl;
+   r600->context.transfer_destroy = u_transfer_destroy_vtbl;
+   r600->context.transfer_inline_write = u_transfer_inline_write_vtbl;
+   r600->context.is_resource_referenced = u_is_resource_referenced_vtbl;
+}
+
+void r600_init_screen_resource_functions(struct r600_screen *r600screen)
+{  /* Installs the screen's resource hooks. */
+   r600screen->screen.resource_create = r600_resource_create;   /* buffer/texture dispatcher in this file */
+   r600screen->screen.resource_from_handle = r600_resource_from_handle;   /* textures only; NULL for buffers */
+   r600screen->screen.resource_get_handle = u_resource_get_handle_vtbl;
+   r600screen->screen.resource_destroy = u_resource_destroy_vtbl;
+   r600screen->screen.user_buffer_create = r600_user_buffer_create;
+}
diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h
new file mode 100644
index 0000000000..95084a371b
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_resource.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef R600_RESOURCE_H
+#define R600_RESOURCE_H
+
+struct r600_context;	/* forward declarations: only pointers are used below */
+struct r600_screen;
+
+void r600_init_context_resource_functions(struct r600_context *r600);	/* installs the context transfer hooks */
+void r600_init_screen_resource_functions(struct r600_screen *r600screen);	/* installs the screen resource hooks */
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_screen.c b/src/gallium/drivers/r600/r600_screen.c
new file mode 100644
index 0000000000..1d83383fd9
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_screen.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson
+ */
+#include <util/u_inlines.h>
+#include <util/u_format.h>
+#include <util/u_memory.h>
+#include "r600_resource.h"
+#include "r600_screen.h"
+#include "r600_texture.h"
+#include "r600_context.h"
+#include <stdio.h>
+
+static const char* r600_get_vendor(struct pipe_screen* pscreen)
+{
+	return "X.Org";	/* constant vendor string; pscreen is not consulted */
+}
+
+static const char* r600_get_name(struct pipe_screen* pscreen)
+{
+	return "R600/R700 (HD2XXX,HD3XXX,HD4XXX)";	/* one fixed name for every chip; pscreen is not consulted */
+}
+
+static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
+{
+	switch (param) {	/* integer caps: boolean caps answer 1/0, limits answer a count; pscreen is unused */
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+		return 16;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 1;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 1;
+	case PIPE_CAP_DUAL_SOURCE_BLEND:
+		return 1;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		/* FIXME some r6xx are buggy and can only do 4 */
+		return 8;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		/* FIXME not sure here */
+		return 13;	/* the same level count is reported for 2D, 3D and cube textures */
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+		return 1;
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+		/* FIXME allow this once infrastructure is there */
+		return 0;
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+		return 0;
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+		return 1;
+	case PIPE_CAP_SM3:
+		return 1;
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+		return 1;
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+		/* FIXME allow this */
+		return 0;
+	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+		return 1;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+		return 1;
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+		return 0;
+	default:	/* any cap not listed above is logged and reported as 0 */
+		debug_printf("r600: unknown param %d\n", param);
+		return 0;
+	}
+}
+
+static float r600_get_paramf(struct pipe_screen* pscreen, enum pipe_cap param)
+{
+	switch (param) {	/* floating-point limits; pscreen is unused */
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 8192.0f;	/* one shared limit for lines and points, aliased or not */
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 16.0f;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 16.0f;
+	default:	/* anything else is logged and reported as 0.0 */
+		debug_printf("r600: unsupported paramf %d\n", param);
+		return 0.0f;
+	}
+}
+
+static boolean r600_is_format_supported(struct pipe_screen* screen,
+					enum pipe_format format,
+					enum pipe_texture_target target,
+					unsigned sample_count,
+					unsigned bindings,
+					unsigned geom_flags)
+{
+	if (target >= PIPE_MAX_TEXTURE_TYPES) {	/* reject out-of-range texture targets first */
+		debug_printf("r600: unsupported texture type %d\n", target);
+		return FALSE;
+	}
+	switch (format) {	/* plain whitelist: sample_count, bindings and geom_flags are never examined */
+	case PIPE_FORMAT_B4G4R4A4_UNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_A8R8G8B8_SRGB:
+	case PIPE_FORMAT_R8G8B8A8_SRGB:
+	case PIPE_FORMAT_DXT1_RGB:
+	case PIPE_FORMAT_DXT1_RGBA:
+	case PIPE_FORMAT_DXT3_RGBA:
+	case PIPE_FORMAT_DXT5_RGBA:
+	case PIPE_FORMAT_UYVY:
+	case PIPE_FORMAT_L8_SRGB:
+	case PIPE_FORMAT_L8A8_SRGB:
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R8G8B8X8_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+	case PIPE_FORMAT_A8B8G8R8_SRGB:
+	case PIPE_FORMAT_B8G8R8A8_SRGB:
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+	case PIPE_FORMAT_Z32_UNORM:
+	case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+	case PIPE_FORMAT_Z24X8_UNORM:
+		return TRUE;	/* NOTE(review): same answer for every binding (sampler, render target, ...) - confirm this is intended */
+	default:
+		/* Unknown format... */
+		break;
+	}
+	return FALSE;
+}
+
+struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
+						struct pipe_resource *texture,
+						struct pipe_subresource sr,
+						unsigned usage,
+						const struct pipe_box *box)
+{
+	struct r600_transfer *xfer = CALLOC_STRUCT(r600_transfer);
+	struct r600_texture *tex = (struct r600_texture*)texture;
+
+	if (!xfer)
+		return NULL;
+	/* record the requested region; the transfer holds its own reference on the texture */
+	xfer->transfer.sr = sr;
+	xfer->transfer.box = *box;
+	xfer->transfer.usage = usage;
+	xfer->transfer.stride = tex->stride[sr.level];
+	pipe_resource_reference(&xfer->transfer.resource, texture);
+	xfer->offset = r600_texture_get_offset(tex, sr.level, box->z, sr.face);
+	return &xfer->transfer;
+}
+
+void r600_texture_transfer_destroy(struct pipe_context *ctx,
+				   struct pipe_transfer *trans)
+{
+	pipe_resource_reference(&trans->resource, NULL);	/* drop the reference taken in r600_texture_get_transfer */
+	FREE(trans);	/* trans is the first member of the r600_transfer allocated there, so this frees the whole object */
+}
+
+void* r600_texture_transfer_map(struct pipe_context *ctx,
+				struct pipe_transfer* transfer)
+{
+	struct r600_texture *tex = (struct r600_texture*)transfer->resource;
+	struct r600_transfer *xfer = (struct r600_transfer*)transfer;
+	enum pipe_format fmt = tex->b.b.format;
+	unsigned row_off, col_off;
+	char *base;
+
+	/* map the backing buffer; the buffer transfer handle is kept for the matching unmap */
+	base = pipe_buffer_map(ctx, tex->buffer, transfer->usage, &xfer->buffer_transfer);
+	if (base == NULL)
+		return NULL;
+
+	/* box origin in block units: rows scale by the stride, columns by the block size */
+	row_off = transfer->box.y / util_format_get_blockheight(fmt) * transfer->stride;
+	col_off = transfer->box.x / util_format_get_blockwidth(fmt) * util_format_get_blocksize(fmt);
+
+	return base + xfer->offset + row_off + col_off;
+}
+
+void r600_texture_transfer_unmap(struct pipe_context *ctx,
+				 struct pipe_transfer* transfer)
+{
+	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+	struct r600_texture *rtex = (struct r600_texture*)transfer->resource;
+
+	pipe_buffer_unmap(ctx, rtex->buffer, rtransfer->buffer_transfer);	/* undo the buffer mapping made by r600_texture_transfer_map */
+}
+
+static void r600_destroy_screen(struct pipe_screen* pscreen)
+{
+	struct r600_screen* rscreen = r600_screen(pscreen);
+
+	/* only the screen structure itself is released; a NULL screen is a no-op */
+	if (rscreen != NULL)
+		FREE(rscreen);
+}
+
+struct pipe_screen *radeon_create_screen(struct radeon *rw)
+{
+	struct r600_screen* rscreen;
+
+	rscreen = CALLOC_STRUCT(r600_screen);	/* zero-initialised, so callbacks not set below stay NULL */
+	if (rscreen == NULL) {
+		return NULL;
+	}
+	rscreen->rw = rw;	/* pointer is stored as-is; r600_destroy_screen does not release it */
+	rscreen->screen.winsys = (struct pipe_winsys*)rw;
+	rscreen->screen.destroy = r600_destroy_screen;
+	rscreen->screen.get_name = r600_get_name;
+	rscreen->screen.get_vendor = r600_get_vendor;
+	rscreen->screen.get_param = r600_get_param;
+	rscreen->screen.get_paramf = r600_get_paramf;
+	rscreen->screen.is_format_supported = r600_is_format_supported;
+	rscreen->screen.context_create = r600_create_context;
+	r600_init_screen_texture_functions(&rscreen->screen);	/* texture and resource callbacks are installed by their own modules */
+	r600_init_screen_resource_functions(rscreen);
+	return &rscreen->screen;	/* callers get the embedded pipe_screen; r600_screen() converts back */
+}
diff --git a/src/gallium/drivers/r600/r600_screen.h b/src/gallium/drivers/r600/r600_screen.h
new file mode 100644
index 0000000000..0a0286d96b
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_screen.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef R600_SCREEN_H
+#define R600_SCREEN_H
+
+#include <pipe/p_state.h>
+#include <pipe/p_screen.h>
+#include <pipebuffer/pb_buffer.h>
+#include <xf86drm.h>
+#include <radeon_drm.h>
+#include "radeon.h"
+#include "util/u_transfer.h"
+
+/* Texture transfer: a pipe_transfer extended with what is needed to map a texture through its buffer. */
+struct r600_transfer {
+	/* Base class; must stay first because pipe_transfer pointers are cast back to r600_transfer. */
+	struct pipe_transfer		transfer;
+	/* Buffer transfer returned by pipe_buffer_map() on the backing buffer; needed again for the unmap. */
+	struct pipe_transfer		*buffer_transfer;
+	unsigned			offset;	/* byte offset inside the mapped buffer, from r600_texture_get_offset() */
+};
+
+struct r600_buffer {
+	struct u_resource		b;	/* base resource; presumably kept first so resource pointers can be cast - confirm in r600_buffer.c */
+	struct radeon_bo		*bo;	/* winsys buffer object */
+	u32				domain;	/* presumably a RADEON_GEM_DOMAIN_* placement - TODO confirm */
+	u32				flink;	/* presumably the GEM flink name used for sharing - TODO confirm */
+	struct pb_buffer		*pb;	/* NOTE(review): use not visible from this header */
+};
+
+struct r600_screen {
+	struct pipe_screen		screen;	/* base class; must stay first, r600_screen() casts pipe_screen* to this type */
+	struct radeon			*rw;	/* winsys handle passed to radeon_create_screen() */
+};
+
+static INLINE struct r600_screen *r600_screen(struct pipe_screen *screen)
+{
+	return (struct r600_screen*)screen;	/* plain downcast: struct r600_screen starts with its pipe_screen */
+}
+
+/* Buffer functions. */
+struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
+					 const struct pipe_resource *templ);
+struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen,
+					      void *ptr, unsigned bytes,
+					      unsigned bind);
+unsigned r600_buffer_is_referenced_by_cs(struct pipe_context *context,
+					 struct pipe_resource *buf,
+					 unsigned face, unsigned level);
+struct pipe_resource *r600_buffer_from_handle(struct pipe_screen *screen,
+					      struct winsys_handle *whandle);
+
+/* Texture transfer functions (implemented in r600_screen.c). */
+struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
+						struct pipe_resource *texture,
+						struct pipe_subresource sr,
+						unsigned usage,
+						const struct pipe_box *box);
+void r600_texture_transfer_destroy(struct pipe_context *ctx,
+				   struct pipe_transfer *trans);
+void* r600_texture_transfer_map(struct pipe_context *ctx,
+				struct pipe_transfer* transfer);
+void r600_texture_transfer_unmap(struct pipe_context *ctx,
+				 struct pipe_transfer* transfer);
+
+
+/* helpers */
+int r600_conv_pipe_format(unsigned pformat, unsigned *format);
+int r600_conv_pipe_prim(unsigned pprim, unsigned *prim);
+
+union r600_float_to_u32_u {	/* overlays a float and a u32 so the float's bit pattern can be read back */
+	u32	u;
+	float	f;
+};
+
+static INLINE u32 r600_float_to_u32(float f)	/* INLINE (not bare inline) to match r600_screen() in this header */
+{
+	union r600_float_to_u32_u c;
+
+	c.f = f;	/* store as float ... */
+	return c.u;	/* ... and return the same bits as an integer; no numeric conversion happens */
+}
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
new file mode 100644
index 0000000000..6b29d33379
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <util/u_inlines.h>
+#include <util/u_format.h>
+#include <util/u_memory.h>
+#include <tgsi/tgsi_dump.h>
+#include "r600_screen.h"
+#include "r600_context.h"
+#include "r600d.h"
+
+static int r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader *rpshader)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_shader *rshader = &rpshader->shader;
+	struct radeon_state *state;
+	unsigned i, tmp;
+
+	rpshader->state = radeon_state_decref(rpshader->state);	/* release the previous state, if any, before building a new one */
+	state = radeon_state(rscreen->rw, R600_VS_SHADER_TYPE, R600_VS_SHADER);
+	if (state == NULL)
+		return -ENOMEM;
+	for (i = 0; i < rshader->noutput; i += 4) {	/* four output semantic ids are packed per SPI_VS_OUT_ID register, one per byte */
+		tmp = rshader->output[i].sid;
+		tmp |= rshader->output[i + 1].sid << 8;	/* NOTE(review): entries past noutput are read when it is not a multiple of 4 */
+		tmp |= rshader->output[i + 2].sid << 16;
+		tmp |= rshader->output[i + 3].sid << 24;
+		state->states[R600_VS_SHADER__SPI_VS_OUT_ID_0 + i / 4] = tmp;
+	}
+	state->states[R600_VS_SHADER__SPI_VS_OUT_CONFIG] = S_0286C4_VS_EXPORT_COUNT(rshader->noutput - 1);	/* NOTE(review): wraps around if noutput is 0 - confirm it cannot be */
+	state->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = S_028868_NUM_GPRS(rshader->ngpr);
+	rpshader->state = state;
+	rpshader->state->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo);	/* the shader bo is attached twice, each with its own reference */
+	rpshader->state->bo[1] = radeon_bo_incref(rscreen->rw, rpshader->bo);
+	rpshader->state->nbo = 2;
+	rpshader->state->placement[0] = RADEON_GEM_DOMAIN_GTT;	/* NOTE(review): two bos attached but only placement[0] is set - confirm */
+	return radeon_state_pm4(state);	/* result of the pm4 build is handed straight back to the caller */
+}
+
+static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader *rpshader)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_shader *rshader = &rpshader->shader;
+	struct radeon_state *state;
+	unsigned i, tmp;
+
+	rpshader->state = radeon_state_decref(rpshader->state);	/* release the previous state, if any, before building a new one */
+	state = radeon_state(rscreen->rw, R600_PS_SHADER_TYPE, R600_PS_SHADER);
+	if (state == NULL)
+		return -ENOMEM;
+	for (i = 0; i < rshader->ninput; i++) {	/* one SPI_PS_INPUT_CNTL register per shader input */
+		tmp = S_028644_SEMANTIC(rshader->input[i].sid);
+		tmp |= S_028644_SEL_CENTROID(1);
+		tmp |= S_028644_FLAT_SHADE(rshader->flat_shade);	/* flat_shade is copied from the context in r600_pipe_shader() */
+		state->states[R600_PS_SHADER__SPI_PS_INPUT_CNTL_0 + i] = tmp;
+	}
+	state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] = S_0286CC_NUM_INTERP(rshader->ninput) |
+							S_0286CC_PERSP_GRADIENT_ENA(1);
+	state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_1] = 0x00000000;
+	state->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = S_028868_NUM_GPRS(rshader->ngpr);
+	state->states[R600_PS_SHADER__SQ_PGM_EXPORTS_PS] = 0x00000002;	/* NOTE(review): raw register value, meaning not visible here */
+	rpshader->state = state;
+	rpshader->state->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo);	/* the state keeps its own reference on the shader bo */
+	rpshader->state->nbo = 1;
+	rpshader->state->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	return radeon_state_pm4(state);	/* result of the pm4 build is handed straight back to the caller */
+}
+
+/* Upload rshader->bcode into a fresh buffer object, then rebuild the hw state
+ * for the shader type. Fails with -ENOMEM, the map error, or -EINVAL. */
+static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *rpshader)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_shader *rshader = &rpshader->shader;
+	int r;
+
+	/* copy new shader */
+	radeon_bo_decref(rscreen->rw, rpshader->bo);
+	rpshader->bo = radeon_bo(rscreen->rw, 0, rshader->ndw * 4, 4096, NULL);
+	if (rpshader->bo == NULL)
+		return -ENOMEM;
+	r = radeon_bo_map(rscreen->rw, rpshader->bo);
+	if (r) {	/* nothing valid to copy into: drop the bo and report the failure */
+		radeon_bo_decref(rscreen->rw, rpshader->bo);
+		rpshader->bo = NULL;
+		return r;
+	}
+	memcpy(rpshader->bo->data, rshader->bcode, rshader->ndw * 4);
+	radeon_bo_unmap(rscreen->rw, rpshader->bo);
+	/* build state */
+	rshader->flat_shade = rctx->flat_shade;
+	switch (rpshader->type) {
+	case C_PROGRAM_TYPE_VS:
+		return r600_pipe_shader_vs(ctx, rpshader);
+	case C_PROGRAM_TYPE_FS:
+		return r600_pipe_shader_ps(ctx, rpshader);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Build an r600_pipe_shader from TGSI tokens; returns NULL (partial shader destroyed) on failure. */
+struct r600_pipe_shader *r600_pipe_shader_create(struct pipe_context *ctx, unsigned type, const struct tgsi_token *tokens)
+{
+	struct r600_pipe_shader *rpshader = CALLOC_STRUCT(r600_pipe_shader);
+	struct r600_shader *rshader;
+	int r;
+
+	if (rpshader == NULL)
+		return NULL;
+	rshader = &rpshader->shader;	/* taken only once rpshader is known to be non-NULL */
+	rpshader->type = type;
+	c_list_init(&rshader->nodes);
+	fprintf(stderr, "<<\n");	/* every compiled shader is traced on stderr between "<<" and ">>" */
+	tgsi_dump(tokens, 0);
+	fprintf(stderr, "--------------------------------------------------------------\n");
+	r = c_shader_from_tgsi(&rshader->cshader, type, tokens);
+	if (r) {
+		r600_pipe_shader_destroy(ctx, rpshader);
+		fprintf(stderr, "ERROR(%s %d)>>\n\n", __func__, __LINE__);
+		return NULL;
+	}
+	r = r600_shader_insert_fetch(&rshader->cshader);
+	if (r) {
+		r600_pipe_shader_destroy(ctx, rpshader);
+		fprintf(stderr, "ERROR(%s %d)>>\n\n", __func__, __LINE__);
+		return NULL;
+	}
+	r = c_shader_build_dominator_tree(&rshader->cshader);
+	if (r) {
+		r600_pipe_shader_destroy(ctx, rpshader);
+		fprintf(stderr, "ERROR(%s %d)>>\n\n", __func__, __LINE__);
+		return NULL;
+	}
+	c_shader_dump(&rshader->cshader);
+	r = r600_cshader_legalize(&rshader->cshader);
+	if (r) {
+		r600_pipe_shader_destroy(ctx, rpshader);
+		fprintf(stderr, "ERROR(%s %d)>>\n\n", __func__, __LINE__);
+		return NULL;
+	}
+	r = r700_shader_translate(rshader);
+	if (r) {
+		r600_pipe_shader_destroy(ctx, rpshader);
+		fprintf(stderr, "ERROR(%s %d)>>\n\n", __func__, __LINE__);
+		return NULL;
+	}
+#if 0
+	fprintf(stderr, "--------------------------------------------------------------\n");
+	for (int i = 0; i < rshader->ndw; i++) {
+		fprintf(stderr, "0x%08X\n", rshader->bcode[i]);
+	}
+#endif
+	fprintf(stderr, ">>\n\n");
+	return rpshader;
+}
+
+void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *rpshader)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+
+	if (rpshader == NULL)
+		return;
+	radeon_state_decref(rpshader->state);	/* release the hw state built by r600_pipe_shader_vs/ps; may still be NULL */
+	radeon_bo_decref(rscreen->rw, rpshader->bo);	/* release the bytecode buffer */
+	r600_shader_cleanup(&rpshader->shader);
+	FREE(rpshader);
+}
+
+/*
+ * Gather the formats of the bound vertex elements (vertex shaders only), hand
+ * them to r600_shader_update() and then rebuild the hw state.
+ */
+int r600_pipe_shader_update(struct pipe_context *ctx, struct r600_pipe_shader *rpshader)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	enum pipe_format formats[160];
+	struct r600_shader *shader;
+	unsigned count = 0, e;
+	int ret;
+
+	if (rpshader == NULL)
+		return -EINVAL;
+	shader = &rpshader->shader;
+	/* other shader types contribute no resources */
+	if (rpshader->type == C_PROGRAM_TYPE_VS) {
+		for (e = 0; e < rctx->vertex_elements->count; e++)
+			formats[count++] = rctx->vertex_elements->elements[e].src_format;
+	}
+	/* there should be enough input */
+	if (count < shader->nresource)
+		return -EINVAL;
+	/* FIXME compare resources */
+	ret = r600_shader_update(shader, formats);
+	if (ret != 0)
+		return ret;
+	return r600_pipe_shader(ctx, rpshader);
+}
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
new file mode 100644
index 0000000000..7d30ca79d1
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef R600_SHADER_H
+#define R600_SHADER_H
+
+#include "r600_compiler.h"
+#include "radeon.h"
+
+struct r600_shader_operand {
+	struct c_vector			*vector;	/* compiler vector this operand refers to */
+	unsigned			sel;	/* presumably the register/selector index - TODO confirm */
+	unsigned			chan;	/* presumably the channel within sel - TODO confirm */
+	unsigned			neg;	/* presumably a negate modifier flag - TODO confirm */
+	unsigned			abs;	/* presumably an absolute-value modifier flag - TODO confirm */
+};
+
+struct r600_shader_vfetch {
+	struct r600_shader_vfetch	*next;	/* list links */
+	struct r600_shader_vfetch	*prev;
+	unsigned			cf_addr;	/* NOTE(review): looks like the fetch's position in the byte code - confirm */
+	struct r600_shader_operand	src[2];
+	struct r600_shader_operand	dst[4];	/* presumably one destination per channel - TODO confirm */
+};
+
+struct r600_shader_inst {
+	unsigned			is_op3;	/* presumably set for three-source instructions - TODO confirm */
+	unsigned			opcode;
+	unsigned			inst;	/* presumably an enum r600_instruction value - TODO confirm */
+	struct r600_shader_operand	src[3];	/* up to three source operands */
+	struct r600_shader_operand	dst;
+	unsigned			last;	/* presumably marks the last instruction of a group - TODO confirm */
+};
+
+struct r600_shader_alu {
+	struct r600_shader_alu		*next;	/* list links */
+	struct r600_shader_alu		*prev;
+	unsigned			nalu;	/* presumably the number of used entries in alu[] - TODO confirm */
+	unsigned			nliteral;	/* presumably the number of used entries in literal[] - TODO confirm */
+	unsigned			nconstant;
+	struct r600_shader_inst		alu[5];	/* at most five instructions per group */
+	u32				literal[4];	/* at most four literal dwords per group */
+};
+
+struct r600_shader_node {
+	struct r600_shader_node		*next;
+	struct r600_shader_node		*prev;
+	unsigned			cf_id;		/**< cf index (in dw) in byte code */
+	unsigned			cf_addr;	/**< instruction index (in dw) in byte code */
+	unsigned			nslot;		/**< number of slots (2 dw each) needed by this node */
+	unsigned			nfetch;		/**< presumably the number of vfetch instructions - TODO confirm */
+	struct c_node			*node;		/**< compiler node from which this node originates */
+	struct r600_shader_vfetch	vfetch;		/**< list of vfetch instructions */
+	struct r600_shader_alu		alu;		/**< list of alu instructions */
+};
+
+struct r600_shader_io {
+	unsigned	name;	/* presumably the TGSI semantic name - TODO confirm */
+	unsigned	gpr;	/* presumably the GPR holding this input/output - TODO confirm */
+	int		sid;	/* semantic id programmed into the SPI input/output registers by r600_shader.c */
+};
+
+struct r600_shader {
+	unsigned			stack_size;		/**< stack size needed by this shader */
+	unsigned			ngpr;			/**< number of GPR needed by this shader */
+	unsigned			nconstant;		/**< number of constants used by this shader */
+	unsigned			nresource;		/**< number of resources used by this shader */
+	unsigned			noutput;		/**< number of used entries in output[] */
+	unsigned			ninput;			/**< number of used entries in input[] */
+	unsigned			nvector;
+	unsigned			ncf;			/**< total number of cf clauses */
+	unsigned			nslot;			/**< total number of slots (2 dw) */
+	unsigned			flat_shade;		/**< are we flat shading */
+	struct r600_shader_node		nodes;			/**< list of nodes */
+	struct r600_shader_io		input[32];
+	struct r600_shader_io		output[32];
+	/* TODO replace GPR by some better register allocator */
+	struct c_vector			**gpr;
+	unsigned			ndw;			/**< byte code size in dw */
+	u32				*bcode;			/**< byte code */
+	enum pipe_format		resource_format[160];	/**< format of each resource; r600_pipe_shader_update() uses the same 160 bound */
+	struct c_shader			cshader;
+};
+
+/* r600_shader helpers */
+void r600_shader_cleanup(struct r600_shader *rshader);
+int r600_shader_register(struct r600_shader *rshader);
+int r600_shader_node(struct r600_shader *shader);
+void r600_shader_node_place(struct r600_shader *rshader);
+int r600_shader_find_gpr(struct r600_shader *rshader, struct c_vector *v, unsigned swizzle,
+			struct r600_shader_operand *operand);
+int r600_shader_vfetch_bytecode(struct r600_shader *rshader,
+				struct r600_shader_node *rnode,
+				struct r600_shader_vfetch *vfetch,
+				unsigned *cid);
+int r600_shader_update(struct r600_shader *rshader,
+			enum pipe_format *resource_format);
+int r600_shader_legalize(struct r600_shader *rshader);
+int r600_cshader_legalize(struct c_shader *shader);
+
+int r700_shader_translate(struct r600_shader *rshader);
+
+/* TGSI import and translation */
+int c_shader_from_tgsi(struct c_shader *shader, unsigned type,
+			const struct tgsi_token *tokens);
+int r600_shader_translate_rec(struct r600_shader *rshader, struct c_node *node);
+int r600_shader_insert_fetch(struct c_shader *shader);
+
+enum r600_instruction {	/* dense 0-based instruction ids; NOTE(review): presumably table indices rather than hw opcodes, since r600_instruction_info carries a separate opcode - confirm */
+	INST_ADD			= 0,
+	INST_MUL			= 1,
+	INST_MUL_IEEE			= 2,
+	INST_MAX			= 3,
+	INST_MIN			= 4,
+	INST_MAX_DX10			= 5,
+	INST_MIN_DX10			= 6,
+	INST_SETE			= 7,
+	INST_SETGT			= 8,
+	INST_SETGE			= 9,
+	INST_SETNE			= 10,
+	INST_SETE_DX10			= 11,
+	INST_SETGT_DX10			= 12,
+	INST_SETGE_DX10			= 13,
+	INST_SETNE_DX10			= 14,
+	INST_FRACT			= 15,
+	INST_TRUNC			= 16,
+	INST_CEIL			= 17,
+	INST_RNDNE			= 18,
+	INST_FLOOR			= 19,
+	INST_MOVA			= 20,
+	INST_MOVA_FLOOR			= 21,
+	INST_MOVA_INT			= 22,
+	INST_MOV			= 23,
+	INST_NOP			= 24,
+	INST_PRED_SETGT_UINT		= 25,
+	INST_PRED_SETGE_UINT		= 26,
+	INST_PRED_SETE			= 27,
+	INST_PRED_SETGT			= 28,
+	INST_PRED_SETGE			= 29,
+	INST_PRED_SETNE			= 30,
+	INST_PRED_SET_INV		= 31,
+	INST_PRED_SET_POP		= 32,
+	INST_PRED_SET_CLR		= 33,
+	INST_PRED_SET_RESTORE		= 34,
+	INST_PRED_SETE_PUSH		= 35,
+	INST_PRED_SETGT_PUSH		= 36,
+	INST_PRED_SETGE_PUSH		= 37,
+	INST_PRED_SETNE_PUSH		= 38,
+	INST_KILLE			= 39,
+	INST_KILLGT			= 40,
+	INST_KILLGE			= 41,
+	INST_KILLNE			= 42,
+	INST_AND_INT			= 43,
+	INST_OR_INT			= 44,
+	INST_XOR_INT			= 45,
+	INST_NOT_INT			= 46,
+	INST_ADD_INT			= 47,
+	INST_SUB_INT			= 48,
+	INST_MAX_INT			= 49,
+	INST_MIN_INT			= 50,
+	INST_MAX_UINT			= 51,
+	INST_MIN_UINT			= 52,
+	INST_SETE_INT			= 53,
+	INST_SETGT_INT			= 54,
+	INST_SETGE_INT			= 55,
+	INST_SETNE_INT			= 56,
+	INST_SETGT_UINT			= 57,
+	INST_SETGE_UINT			= 58,
+	INST_KILLGT_UINT		= 59,
+	INST_KILLGE_UINT		= 60,
+	INST_PRED_SETE_INT		= 61,
+	INST_PRED_SETGT_INT		= 62,
+	INST_PRED_SETGE_INT		= 63,
+	INST_PRED_SETNE_INT		= 64,
+	INST_KILLE_INT			= 65,
+	INST_KILLGT_INT			= 66,
+	INST_KILLGE_INT			= 67,
+	INST_KILLNE_INT			= 68,
+	INST_PRED_SETE_PUSH_INT		= 69,
+	INST_PRED_SETGT_PUSH_INT	= 70,
+	INST_PRED_SETGE_PUSH_INT	= 71,
+	INST_PRED_SETNE_PUSH_INT	= 72,
+	INST_PRED_SETLT_PUSH_INT	= 73,
+	INST_PRED_SETLE_PUSH_INT	= 74,
+	INST_DOT4			= 75,
+	INST_DOT4_IEEE			= 76,
+	INST_CUBE			= 77,
+	INST_MAX4			= 78,
+	INST_MOVA_GPR_INT		= 79,
+	INST_EXP_IEEE			= 80,
+	INST_LOG_CLAMPED		= 81,
+	INST_LOG_IEEE			= 82,
+	INST_RECIP_CLAMPED		= 83,
+	INST_RECIP_FF			= 84,
+	INST_RECIP_IEEE			= 85,
+	INST_RECIPSQRT_CLAMPED		= 86,
+	INST_RECIPSQRT_FF		= 87,
+	INST_RECIPSQRT_IEEE		= 88,
+	INST_SQRT_IEEE			= 89,
+	INST_FLT_TO_INT			= 90,
+	INST_INT_TO_FLT			= 91,
+	INST_UINT_TO_FLT		= 92,
+	INST_SIN			= 93,
+	INST_COS			= 94,
+	INST_ASHR_INT			= 95,
+	INST_LSHR_INT			= 96,
+	INST_LSHL_INT			= 97,
+	INST_MULLO_INT			= 98,
+	INST_MULHI_INT			= 99,
+	INST_MULLO_UINT			= 100,
+	INST_MULHI_UINT			= 101,
+	INST_RECIP_INT			= 102,
+	INST_RECIP_UINT			= 103,
+	INST_FLT_TO_UINT		= 104,
+	INST_MUL_LIT			= 105,
+	INST_MUL_LIT_M2			= 106,
+	INST_MUL_LIT_M4			= 107,
+	INST_MUL_LIT_D2			= 108,
+	INST_MULADD			= 109,
+	INST_MULADD_M2			= 110,
+	INST_MULADD_M4			= 111,
+	INST_MULADD_D2			= 112,
+	INST_MULADD_IEEE		= 113,
+	INST_MULADD_IEEE_M2		= 114,
+	INST_MULADD_IEEE_M4		= 115,
+	INST_MULADD_IEEE_D2		= 116,
+	INST_CNDE			= 117,
+	INST_CNDGT			= 118,
+	INST_CNDGE			= 119,
+	INST_CNDE_INT			= 120,
+	INST_CNDGT_INT			= 121,
+	INST_CNDGE_INT			= 122,
+	INST_COUNT	/* number of instructions (123); must stay last */
+};
+
+struct r600_instruction_info {
+	enum r600_instruction		instruction;	/* which instruction this entry describes */
+	unsigned			opcode;	/* presumably the hw encoding for that instruction - TODO confirm */
+	unsigned			is_trans;	/* presumably set when the instruction needs the transcendental unit - TODO confirm */
+	unsigned			is_op3;	/* presumably set for three-source instructions - TODO confirm */
+};
+
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
new file mode 100644
index 0000000000..71aa09719e
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -0,0 +1,606 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#ifndef R600_SQ_H
+#define R600_SQ_H
+
+#define P_SQ_CF_WORD0 /* SQ_CF_WORD0 accessors: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_CF_WORD0_ADDR(x)                                      (((x) & 0xFFFFFFFF) << 0)
+#define   G_SQ_CF_WORD0_ADDR(x)                                      (((x) >> 0) & 0xFFFFFFFF)
+#define   C_SQ_CF_WORD0_ADDR                                         0x00000000 /* ADDR spans all 32 bits, so clearing it keeps nothing */
+#define P_SQ_CF_WORD1 /* SQ_CF_WORD1 accessors: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it, V_ are CF_INST values */
+#define   S_SQ_CF_WORD1_POP_COUNT(x)                                 (((x) & 0x7) << 0)
+#define   G_SQ_CF_WORD1_POP_COUNT(x)                                 (((x) >> 0) & 0x7)
+#define   C_SQ_CF_WORD1_POP_COUNT                                    0xFFFFFFF8
+#define   S_SQ_CF_WORD1_CF_CONST(x)                                  (((x) & 0x1F) << 3)
+#define   G_SQ_CF_WORD1_CF_CONST(x)                                  (((x) >> 3) & 0x1F)
+#define   C_SQ_CF_WORD1_CF_CONST                                     0xFFFFFF07
+#define   S_SQ_CF_WORD1_COND(x)                                      (((x) & 0x3) << 8)
+#define   G_SQ_CF_WORD1_COND(x)                                      (((x) >> 8) & 0x3)
+#define   C_SQ_CF_WORD1_COND                                         0xFFFFFCFF
+#define   S_SQ_CF_WORD1_COUNT(x)                                     (((x) & 0x7) << 10)
+#define   G_SQ_CF_WORD1_COUNT(x)                                     (((x) >> 10) & 0x7)
+#define   C_SQ_CF_WORD1_COUNT                                        0xFFFFE3FF
+#define   S_SQ_CF_WORD1_CALL_COUNT(x)                                (((x) & 0x3F) << 13)
+#define   G_SQ_CF_WORD1_CALL_COUNT(x)                                (((x) >> 13) & 0x3F)
+#define   C_SQ_CF_WORD1_CALL_COUNT                                   0xFFF81FFF
+#define   S_SQ_CF_WORD1_END_OF_PROGRAM(x)                            (((x) & 0x1) << 21)
+#define   G_SQ_CF_WORD1_END_OF_PROGRAM(x)                            (((x) >> 21) & 0x1)
+#define   C_SQ_CF_WORD1_END_OF_PROGRAM                               0xFFDFFFFF
+#define   S_SQ_CF_WORD1_VALID_PIXEL_MODE(x)                          (((x) & 0x1) << 22)
+#define   G_SQ_CF_WORD1_VALID_PIXEL_MODE(x)                          (((x) >> 22) & 0x1)
+#define   C_SQ_CF_WORD1_VALID_PIXEL_MODE                             0xFFBFFFFF
+#define   S_SQ_CF_WORD1_CF_INST(x)                                   (((x) & 0x7F) << 23)
+#define   G_SQ_CF_WORD1_CF_INST(x)                                   (((x) >> 23) & 0x7F)
+#define   C_SQ_CF_WORD1_CF_INST                                      0xC07FFFFF
+#define     V_SQ_CF_WORD1_SQ_CF_INST_NOP                             0x00000000
+#define     V_SQ_CF_WORD1_SQ_CF_INST_TEX                             0x00000001
+#define     V_SQ_CF_WORD1_SQ_CF_INST_VTX                             0x00000002
+#define     V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC                          0x00000003
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START                      0x00000004
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END                        0x00000005
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10                 0x00000006
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL                0x00000007
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE                   0x00000008
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK                      0x00000009
+#define     V_SQ_CF_WORD1_SQ_CF_INST_JUMP                            0x0000000A
+#define     V_SQ_CF_WORD1_SQ_CF_INST_PUSH                            0x0000000B
+#define     V_SQ_CF_WORD1_SQ_CF_INST_PUSH_ELSE                       0x0000000C
+#define     V_SQ_CF_WORD1_SQ_CF_INST_ELSE                            0x0000000D
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP                             0x0000000E
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP_JUMP                        0x0000000F
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP_PUSH                        0x00000010
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP_PUSH_ELSE                   0x00000011
+#define     V_SQ_CF_WORD1_SQ_CF_INST_CALL                            0x00000012
+#define     V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS                         0x00000013
+#define     V_SQ_CF_WORD1_SQ_CF_INST_RETURN                          0x00000014
+#define     V_SQ_CF_WORD1_SQ_CF_INST_EMIT_VERTEX                     0x00000015
+#define     V_SQ_CF_WORD1_SQ_CF_INST_EMIT_CUT_VERTEX                 0x00000016
+#define     V_SQ_CF_WORD1_SQ_CF_INST_CUT_VERTEX                      0x00000017
+#define     V_SQ_CF_WORD1_SQ_CF_INST_KILL                            0x00000018
+#define   S_SQ_CF_WORD1_WHOLE_QUAD_MODE(x)                           (((x) & 0x1) << 30)
+#define   G_SQ_CF_WORD1_WHOLE_QUAD_MODE(x)                           (((x) >> 30) & 0x1)
+#define   C_SQ_CF_WORD1_WHOLE_QUAD_MODE                              0xBFFFFFFF
+#define   S_SQ_CF_WORD1_BARRIER(x)                                   (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 overflows a signed int (undefined behaviour) */
+#define   G_SQ_CF_WORD1_BARRIER(x)                                   (((x) >> 31) & 0x1)
+#define   C_SQ_CF_WORD1_BARRIER                                      0x7FFFFFFF
+#define P_SQ_CF_ALU_WORD0 /* SQ_CF_ALU_WORD0 accessors: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_CF_ALU_WORD0_ALU_ADDR(x)                              (((x) & 0x3FFFFF) << 0)
+#define   G_SQ_CF_ALU_WORD0_ALU_ADDR(x)                              (((x) >> 0) & 0x3FFFFF)
+#define   C_SQ_CF_ALU_WORD0_ALU_ADDR                                 0xFFC00000
+#define   S_SQ_CF_ALU_WORD0_KCACHE_BANK0(x)                          (((x) & 0xF) << 22)
+#define   G_SQ_CF_ALU_WORD0_KCACHE_BANK0(x)                          (((x) >> 22) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_KCACHE_BANK0                             0xFC3FFFFF
+#define   S_SQ_CF_ALU_WORD0_KCACHE_BANK1(x)                          (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD0_KCACHE_BANK1(x)                          (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_KCACHE_BANK1                             0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((unsigned)(x) & 0x3) << 30) /* unsigned cast: values 2 and 3 reach bit 31, which overflows a signed int */
+#define   G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_KCACHE_MODE0                             0x3FFFFFFF
+#define P_SQ_CF_ALU_WORD1 /* SQ_CF_ALU_WORD1 accessors: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it, V_ are CF_ALU_INST values */
+#define   S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) & 0x3) << 0)
+#define   G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) >> 0) & 0x3)
+#define   C_SQ_CF_ALU_WORD1_KCACHE_MODE1                             0xFFFFFFFC
+#define   S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(x)                          (((x) & 0xFF) << 2)
+#define   G_SQ_CF_ALU_WORD1_KCACHE_ADDR0(x)                          (((x) >> 2) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_KCACHE_ADDR0                             0xFFFFFC03
+#define   S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(x)                          (((x) & 0xFF) << 10)
+#define   G_SQ_CF_ALU_WORD1_KCACHE_ADDR1(x)                          (((x) >> 10) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_KCACHE_ADDR1                             0xFFFC03FF
+#define   S_SQ_CF_ALU_WORD1_ALU_COUNT(x)                             (((x) & 0x7F) << 18)
+#define   G_SQ_CF_ALU_WORD1_ALU_COUNT(x)                             (((x) >> 18) & 0x7F)
+#define   C_SQ_CF_ALU_WORD1_ALU_COUNT                                0xFE03FFFF
+#define   S_SQ_CF_ALU_WORD1_USES_WATERFALL(x)                        (((x) & 0x1) << 25)
+#define   G_SQ_CF_ALU_WORD1_USES_WATERFALL(x)                        (((x) >> 25) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_USES_WATERFALL                           0xFDFFFFFF
+#define   S_SQ_CF_ALU_WORD1_CF_ALU_INST(x)                           (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD1_CF_ALU_INST(x)                           (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD1_CF_ALU_INST                              0xC3FFFFFF
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU                         0x00000008
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE             0x00000009
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER               0x0000000A
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER              0x0000000B
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_CONTINUE                0x0000000D
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_BREAK                   0x0000000E
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_ELSE_AFTER              0x0000000F
+#define   S_SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE(x)                       (((x) & 0x1) << 30)
+#define   G_SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE(x)                       (((x) >> 30) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE                          0xBFFFFFFF
+#define   S_SQ_CF_ALU_WORD1_BARRIER(x)                               (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 overflows a signed int (undefined behaviour) */
+#define   G_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_BARRIER                                  0x7FFFFFFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD0 /* SQ_CF_ALLOC_EXPORT_WORD0 accessors: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it, V_ are TYPE values */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) & 0x1FFF) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) >> 0) & 0x1FFF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE                      0xFFFFE000
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(x)                         (((x) & 0x3) << 13)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(x)                         (((x) >> 13) & 0x3)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_TYPE                            0xFFFF9FFF
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL               0x00000000
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS                 0x00000001
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM               0x00000002
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_SX                  0x00000003
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) & 0x7F) << 15)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) >> 15) & 0x7F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR                          0xFFC07FFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_REL(x)                       (((x) & 0x1) << 22)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_REL(x)                       (((x) >> 22) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_REL                          0xFFBFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(x)                    (((x) & 0x7F) << 23)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(x)                    (((x) >> 23) & 0x7F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR                       0xC07FFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(x)                    (((unsigned)(x) & 0x3) << 30) /* unsigned cast: values 2 and 3 reach bit 31, which overflows a signed int */
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(x)                    (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE                       0x3FFFFFFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD1 /* SQ_CF_ALLOC_EXPORT_WORD1 accessors (bits 17-31): S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it, V_ are CF_INST values */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(x)                  (((x) & 0xF) << 17)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(x)                  (((x) >> 17) & 0xF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT                     0xFFE1FFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(x)               (((x) & 0x1) << 21)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(x)               (((x) >> 21) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM                  0xFFDFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE(x)             (((x) & 0x1) << 22)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE(x)             (((x) >> 22) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE                0xFFBFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(x)                      (((x) & 0x7F) << 23)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(x)                      (((x) >> 23) & 0x7F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST                         0xC07FFFFF
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0        0x00000020
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1        0x00000021
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2        0x00000022
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3        0x00000023
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_SCRATCH        0x00000024
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_REDUCTION      0x00000025
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_RING           0x00000026
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT             0x00000027
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE        0x00000028
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE(x)              (((x) & 0x1) << 30)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE(x)              (((x) >> 30) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE                 0xBFFFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(x)                      (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 overflows a signed int (undefined behaviour) */
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(x)                      (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER                         0x7FFFFFFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD1_BUF /* _BUF accessors for bits 0-15: S_ packs, G_ extracts, C_ clears. NOTE(review): presumably the low half of export word 1 for buffer-type exports - confirm */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(x)               (((x) & 0xFFF) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(x)               (((x) >> 0) & 0xFFF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE                  0xFFFFF000
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(x)                (((x) & 0xF) << 12)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(x)                (((x) >> 12) & 0xF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK                   0xFFFF0FFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ /* _SWIZ accessors: four 3-bit selectors in bits 0-11; S_ packs, G_ extracts, C_ clears. NOTE(review): presumably the low half of export word 1 for swizzled exports - confirm */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(x)                   (((x) & 0x7) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(x)                   (((x) >> 0) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X                      0xFFFFFFF8
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(x)                   (((x) & 0x7) << 3)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(x)                   (((x) >> 3) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y                      0xFFFFFFC7
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(x)                   (((x) & 0x7) << 6)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(x)                   (((x) >> 6) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z                      0xFFFFFE3F
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(x)                   (((x) & 0x7) << 9)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(x)                   (((x) >> 9) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W                      0xFFFFF1FF
+#define P_SQ_ALU_WORD0 /* SQ_ALU_WORD0 accessors: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_ALU_WORD0_SRC0_SEL(x)                                 (((x) & 0x1FF) << 0)
+#define   G_SQ_ALU_WORD0_SRC0_SEL(x)                                 (((x) >> 0) & 0x1FF)
+#define   C_SQ_ALU_WORD0_SRC0_SEL                                    0xFFFFFE00
+#define   S_SQ_ALU_WORD0_SRC0_REL(x)                                 (((x) & 0x1) << 9)
+#define   G_SQ_ALU_WORD0_SRC0_REL(x)                                 (((x) >> 9) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC0_REL                                    0xFFFFFDFF
+#define   S_SQ_ALU_WORD0_SRC0_CHAN(x)                                (((x) & 0x3) << 10)
+#define   G_SQ_ALU_WORD0_SRC0_CHAN(x)                                (((x) >> 10) & 0x3)
+#define   C_SQ_ALU_WORD0_SRC0_CHAN                                   0xFFFFF3FF
+#define   S_SQ_ALU_WORD0_SRC0_NEG(x)                                 (((x) & 0x1) << 12)
+#define   G_SQ_ALU_WORD0_SRC0_NEG(x)                                 (((x) >> 12) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC0_NEG                                    0xFFFFEFFF
+#define   S_SQ_ALU_WORD0_SRC1_SEL(x)                                 (((x) & 0x1FF) << 13)
+#define   G_SQ_ALU_WORD0_SRC1_SEL(x)                                 (((x) >> 13) & 0x1FF)
+#define   C_SQ_ALU_WORD0_SRC1_SEL                                    0xFFC01FFF
+#define   S_SQ_ALU_WORD0_SRC1_REL(x)                                 (((x) & 0x1) << 22)
+#define   G_SQ_ALU_WORD0_SRC1_REL(x)                                 (((x) >> 22) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC1_REL                                    0xFFBFFFFF
+#define   S_SQ_ALU_WORD0_SRC1_CHAN(x)                                (((x) & 0x3) << 23)
+#define   G_SQ_ALU_WORD0_SRC1_CHAN(x)                                (((x) >> 23) & 0x3)
+#define   C_SQ_ALU_WORD0_SRC1_CHAN                                   0xFE7FFFFF
+#define   S_SQ_ALU_WORD0_SRC1_NEG(x)                                 (((x) & 0x1) << 25)
+#define   G_SQ_ALU_WORD0_SRC1_NEG(x)                                 (((x) >> 25) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC1_NEG                                    0xFDFFFFFF
+#define   S_SQ_ALU_WORD0_INDEX_MODE(x)                               (((x) & 0x7) << 26)
+#define   G_SQ_ALU_WORD0_INDEX_MODE(x)                               (((x) >> 26) & 0x7)
+#define   C_SQ_ALU_WORD0_INDEX_MODE                                  0xE3FFFFFF
+#define   S_SQ_ALU_WORD0_PRED_SEL(x)                                 (((x) & 0x3) << 29)
+#define   G_SQ_ALU_WORD0_PRED_SEL(x)                                 (((x) >> 29) & 0x3)
+#define   C_SQ_ALU_WORD0_PRED_SEL                                    0x9FFFFFFF
+#define   S_SQ_ALU_WORD0_LAST(x)                                     (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 overflows a signed int (undefined behaviour) */
+#define   G_SQ_ALU_WORD0_LAST(x)                                     (((x) >> 31) & 0x1)
+#define   C_SQ_ALU_WORD0_LAST                                        0x7FFFFFFF
+#define P_SQ_ALU_WORD1 /* SQ_ALU_WORD1 accessors (bits 15-31): S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_ALU_WORD1_ENCODING(x)                                 (((x) & 0x7) << 15)
+#define   G_SQ_ALU_WORD1_ENCODING(x)                                 (((x) >> 15) & 0x7)
+#define   C_SQ_ALU_WORD1_ENCODING                                    0xFFFC7FFF
+#define   S_SQ_ALU_WORD1_BANK_SWIZZLE(x)                             (((x) & 0x7) << 18)
+#define   G_SQ_ALU_WORD1_BANK_SWIZZLE(x)                             (((x) >> 18) & 0x7)
+#define   C_SQ_ALU_WORD1_BANK_SWIZZLE                                0xFFE3FFFF
+#define   S_SQ_ALU_WORD1_DST_GPR(x)                                  (((x) & 0x7F) << 21)
+#define   G_SQ_ALU_WORD1_DST_GPR(x)                                  (((x) >> 21) & 0x7F)
+#define   C_SQ_ALU_WORD1_DST_GPR                                     0xF01FFFFF
+#define   S_SQ_ALU_WORD1_DST_REL(x)                                  (((x) & 0x1) << 28)
+#define   G_SQ_ALU_WORD1_DST_REL(x)                                  (((x) >> 28) & 0x1)
+#define   C_SQ_ALU_WORD1_DST_REL                                     0xEFFFFFFF
+#define   S_SQ_ALU_WORD1_DST_CHAN(x)                                 (((x) & 0x3) << 29)
+#define   G_SQ_ALU_WORD1_DST_CHAN(x)                                 (((x) >> 29) & 0x3)
+#define   C_SQ_ALU_WORD1_DST_CHAN                                    0x9FFFFFFF
+#define   S_SQ_ALU_WORD1_CLAMP(x)                                    (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 overflows a signed int (undefined behaviour) */
+#define   G_SQ_ALU_WORD1_CLAMP(x)                                    (((x) >> 31) & 0x1)
+#define   C_SQ_ALU_WORD1_CLAMP                                       0x7FFFFFFF
+#define P_SQ_ALU_WORD1_OP2 /* _OP2 accessors for bits 0-17: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it, V_ are ALU_INST values (not contiguous) */
+#define   S_SQ_ALU_WORD1_OP2_SRC0_ABS(x)                             (((x) & 0x1) << 0)
+#define   G_SQ_ALU_WORD1_OP2_SRC0_ABS(x)                             (((x) >> 0) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_SRC0_ABS                                0xFFFFFFFE
+#define   S_SQ_ALU_WORD1_OP2_SRC1_ABS(x)                             (((x) & 0x1) << 1)
+#define   G_SQ_ALU_WORD1_OP2_SRC1_ABS(x)                             (((x) >> 1) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_SRC1_ABS                                0xFFFFFFFD
+#define   S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(x)                  (((x) & 0x1) << 2)
+#define   G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(x)                  (((x) >> 2) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK                     0xFFFFFFFB
+#define   S_SQ_ALU_WORD1_OP2_UPDATE_PRED(x)                          (((x) & 0x1) << 3)
+#define   G_SQ_ALU_WORD1_OP2_UPDATE_PRED(x)                          (((x) >> 3) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_UPDATE_PRED                             0xFFFFFFF7
+#define   S_SQ_ALU_WORD1_OP2_WRITE_MASK(x)                           (((x) & 0x1) << 4)
+#define   G_SQ_ALU_WORD1_OP2_WRITE_MASK(x)                           (((x) >> 4) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_WRITE_MASK                              0xFFFFFFEF
+#define   S_SQ_ALU_WORD1_OP2_FOG_MERGE(x)                            (((x) & 0x1) << 5)
+#define   G_SQ_ALU_WORD1_OP2_FOG_MERGE(x)                            (((x) >> 5) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_FOG_MERGE                               0xFFFFFFDF
+#define   S_SQ_ALU_WORD1_OP2_OMOD(x)                                 (((x) & 0x3) << 6)
+#define   G_SQ_ALU_WORD1_OP2_OMOD(x)                                 (((x) >> 6) & 0x3)
+#define   C_SQ_ALU_WORD1_OP2_OMOD                                    0xFFFFFF3F
+#define   S_SQ_ALU_WORD1_OP2_ALU_INST(x)                             (((x) & 0x3FF) << 8) /* bits 8-17; NOTE(review): overlaps SQ_ALU_WORD1_ENCODING (bits 15-17) - confirm intended */
+#define   G_SQ_ALU_WORD1_OP2_ALU_INST(x)                             (((x) >> 8) & 0x3FF)
+#define   C_SQ_ALU_WORD1_OP2_ALU_INST                                0xFFFC00FF
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD                       0x00000000
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL                       0x00000001
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE                  0x00000002
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX                       0x00000003
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN                       0x00000004
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_DX10                  0x00000005
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_DX10                  0x00000006
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE                      0x00000008
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT                     0x00000009
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE                     0x0000000A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE                     0x0000000B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_DX10                 0x0000000C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_DX10                0x0000000D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_DX10                0x0000000E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_DX10                0x0000000F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT                     0x00000010
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC                     0x00000011
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL                      0x00000012
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE                     0x00000013
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR                     0x00000014
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA                      0x00000015
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR                0x00000016
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT                  0x00000018
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV                       0x00000019
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP                       0x0000001A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT           0x0000001E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT           0x0000001F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE                 0x00000020
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT                0x00000021
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE                0x00000022
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE                0x00000023
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV              0x00000024
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP              0x00000025
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR              0x00000026
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE          0x00000027
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH            0x00000028
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH           0x00000029
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH           0x0000002A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH           0x0000002B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE                     0x0000002C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT                    0x0000002D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE                    0x0000002E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE                    0x0000002F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT                   0x00000030
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT                    0x00000031
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT                   0x00000032
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT                   0x00000033
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT                   0x00000034
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT                   0x00000035
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT                   0x00000036
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT                   0x00000037
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT                  0x00000038
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT                  0x00000039
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT                  0x0000003A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT                 0x0000003B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT                 0x0000003C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT                 0x0000003D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT                0x0000003E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT                0x0000003F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT               0x00000040
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT               0x00000041
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT             0x00000042
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT            0x00000043
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT            0x00000044
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT            0x00000045
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT                 0x00000046
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT                0x00000047
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT                0x00000048
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT                0x00000049
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT        0x0000004A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT       0x0000004B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT       0x0000004C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT       0x0000004D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT       0x0000004E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT       0x0000004F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4                      0x00000050
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE                 0x00000051
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE                      0x00000052
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4                      0x00000053
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT              0x00000060
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE                  0x00000061
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED               0x00000062
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE                  0x00000063
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED             0x00000064
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF                  0x00000065
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE                0x00000066
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED         0x00000067
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF              0x00000068
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE            0x00000069
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE                 0x0000006A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT                0x0000006B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT                0x0000006C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT               0x0000006D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN                       0x0000006E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS                       0x0000006F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT                  0x00000070
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT                  0x00000071
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT                  0x00000072
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT                 0x00000073
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT                 0x00000074
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT                0x00000075
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT                0x00000076
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT                 0x00000077
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT                0x00000078
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT               0x00000079
+#define P_SQ_ALU_WORD1_OP3 /* _OP3 accessors for bits 0-17: S_ packs a field value, G_ extracts it, C_ is the AND-mask that clears it, V_ are ALU_INST values (not contiguous) */
+#define   S_SQ_ALU_WORD1_OP3_SRC2_SEL(x)                             (((x) & 0x1FF) << 0)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_SEL(x)                             (((x) >> 0) & 0x1FF)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_SEL                                0xFFFFFE00
+#define   S_SQ_ALU_WORD1_OP3_SRC2_REL(x)                             (((x) & 0x1) << 9)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_REL(x)                             (((x) >> 9) & 0x1)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_REL                                0xFFFFFDFF
+#define   S_SQ_ALU_WORD1_OP3_SRC2_CHAN(x)                            (((x) & 0x3) << 10)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_CHAN(x)                            (((x) >> 10) & 0x3)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_CHAN                               0xFFFFF3FF
+#define   S_SQ_ALU_WORD1_OP3_SRC2_NEG(x)                             (((x) & 0x1) << 12)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_NEG(x)                             (((x) >> 12) & 0x1)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_NEG                                0xFFFFEFFF
+#define   S_SQ_ALU_WORD1_OP3_ALU_INST(x)                             (((x) & 0x1F) << 13) /* bits 13-17; NOTE(review): overlaps SQ_ALU_WORD1_ENCODING (bits 15-17) - confirm intended */
+#define   G_SQ_ALU_WORD1_OP3_ALU_INST(x)                             (((x) >> 13) & 0x1F)
+#define   C_SQ_ALU_WORD1_OP3_ALU_INST                                0xFFFC1FFF
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT                   0x0000000C
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2                0x0000000D
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4                0x0000000E
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2                0x0000000F
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD                    0x00000010
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_M2                 0x00000011
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_M4                 0x00000012
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_D2                 0x00000013
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE               0x00000014
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_M2            0x00000015
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_M4            0x00000016
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_D2            0x00000017
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE                      0x00000018
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT                     0x00000019
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE                     0x0000001A
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT                  0x0000001C
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT                 0x0000001D
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT                 0x0000001E
+#define P_SQ_VTX_WORD0 /* fields: VTX_INST[4:0] FETCH_TYPE[6:5] FETCH_WHOLE_QUAD[7] BUFFER_ID[15:8] SRC_GPR[22:16] SRC_REL[23] SRC_SEL_X[25:24] MEGA_FETCH_COUNT[31:26]; S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_VTX_WORD0_VTX_INST(x)                                 (((x) & 0x1F) << 0)
+#define   G_SQ_VTX_WORD0_VTX_INST(x)                                 (((x) >> 0) & 0x1F)
+#define   C_SQ_VTX_WORD0_VTX_INST                                    0xFFFFFFE0
+#define   S_SQ_VTX_WORD0_FETCH_TYPE(x)                               (((x) & 0x3) << 5)
+#define   G_SQ_VTX_WORD0_FETCH_TYPE(x)                               (((x) >> 5) & 0x3)
+#define   C_SQ_VTX_WORD0_FETCH_TYPE                                  0xFFFFFF9F
+#define   S_SQ_VTX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) & 0x1) << 7)
+#define   G_SQ_VTX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) >> 7) & 0x1)
+#define   C_SQ_VTX_WORD0_FETCH_WHOLE_QUAD                            0xFFFFFF7F
+#define   S_SQ_VTX_WORD0_BUFFER_ID(x)                                (((x) & 0xFF) << 8)
+#define   G_SQ_VTX_WORD0_BUFFER_ID(x)                                (((x) >> 8) & 0xFF)
+#define   C_SQ_VTX_WORD0_BUFFER_ID                                   0xFFFF00FF
+#define   S_SQ_VTX_WORD0_SRC_GPR(x)                                  (((x) & 0x7F) << 16)
+#define   G_SQ_VTX_WORD0_SRC_GPR(x)                                  (((x) >> 16) & 0x7F)
+#define   C_SQ_VTX_WORD0_SRC_GPR                                     0xFF80FFFF
+#define   S_SQ_VTX_WORD0_SRC_REL(x)                                  (((x) & 0x1) << 23)
+#define   G_SQ_VTX_WORD0_SRC_REL(x)                                  (((x) >> 23) & 0x1)
+#define   C_SQ_VTX_WORD0_SRC_REL                                     0xFF7FFFFF
+#define   S_SQ_VTX_WORD0_SRC_SEL_X(x)                                (((x) & 0x3) << 24)
+#define   G_SQ_VTX_WORD0_SRC_SEL_X(x)                                (((x) >> 24) & 0x3)
+#define   C_SQ_VTX_WORD0_SRC_SEL_X                                   0xFCFFFFFF
+#define   S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(x)                         (((x) & 0x3F) << 26)
+#define   G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(x)                         (((x) >> 26) & 0x3F)
+#define   C_SQ_VTX_WORD0_MEGA_FETCH_COUNT                            0x03FFFFFF
+#define P_SQ_VTX_WORD1 /* fields: DST_SEL_X[11:9] DST_SEL_Y[14:12] DST_SEL_Z[17:15] DST_SEL_W[20:18] USE_CONST_FIELDS[21] DATA_FORMAT[27:22] NUM_FORMAT_ALL[29:28] FORMAT_COMP_ALL[30] SRF_MODE_ALL[31]; S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_VTX_WORD1_DST_SEL_X(x)                                (((x) & 0x7) << 9)
+#define   G_SQ_VTX_WORD1_DST_SEL_X(x)                                (((x) >> 9) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_X                                   0xFFFFF1FF
+#define   S_SQ_VTX_WORD1_DST_SEL_Y(x)                                (((x) & 0x7) << 12)
+#define   G_SQ_VTX_WORD1_DST_SEL_Y(x)                                (((x) >> 12) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_Y                                   0xFFFF8FFF
+#define   S_SQ_VTX_WORD1_DST_SEL_Z(x)                                (((x) & 0x7) << 15)
+#define   G_SQ_VTX_WORD1_DST_SEL_Z(x)                                (((x) >> 15) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_Z                                   0xFFFC7FFF
+#define   S_SQ_VTX_WORD1_DST_SEL_W(x)                                (((x) & 0x7) << 18)
+#define   G_SQ_VTX_WORD1_DST_SEL_W(x)                                (((x) >> 18) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_W                                   0xFFE3FFFF
+#define   S_SQ_VTX_WORD1_USE_CONST_FIELDS(x)                         (((x) & 0x1) << 21)
+#define   G_SQ_VTX_WORD1_USE_CONST_FIELDS(x)                         (((x) >> 21) & 0x1)
+#define   C_SQ_VTX_WORD1_USE_CONST_FIELDS                            0xFFDFFFFF
+#define   S_SQ_VTX_WORD1_DATA_FORMAT(x)                              (((x) & 0x3F) << 22)
+#define   G_SQ_VTX_WORD1_DATA_FORMAT(x)                              (((x) >> 22) & 0x3F)
+#define   C_SQ_VTX_WORD1_DATA_FORMAT                                 0xF03FFFFF
+#define   S_SQ_VTX_WORD1_NUM_FORMAT_ALL(x)                           (((x) & 0x3) << 28)
+#define   G_SQ_VTX_WORD1_NUM_FORMAT_ALL(x)                           (((x) >> 28) & 0x3)
+#define   C_SQ_VTX_WORD1_NUM_FORMAT_ALL                              0xCFFFFFFF
+#define   S_SQ_VTX_WORD1_FORMAT_COMP_ALL(x)                          (((x) & 0x1) << 30)
+#define   G_SQ_VTX_WORD1_FORMAT_COMP_ALL(x)                          (((x) >> 30) & 0x1)
+#define   C_SQ_VTX_WORD1_FORMAT_COMP_ALL                             0xBFFFFFFF
+#define   S_SQ_VTX_WORD1_SRF_MODE_ALL(x)                             (((x) & 0x1) << 31)
+#define   G_SQ_VTX_WORD1_SRF_MODE_ALL(x)                             (((x) >> 31) & 0x1)
+#define   C_SQ_VTX_WORD1_SRF_MODE_ALL                                0x7FFFFFFF
+#define P_SQ_VTX_WORD1_GPR /* fields: DST_GPR[6:0] DST_REL[7]; S_ packs a value into its field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_VTX_WORD1_GPR_DST_GPR(x)                              (((x) & 0x7F) << 0)
+#define   G_SQ_VTX_WORD1_GPR_DST_GPR(x)                              (((x) >> 0) & 0x7F)
+#define   C_SQ_VTX_WORD1_GPR_DST_GPR                                 0xFFFFFF80
+#define   S_SQ_VTX_WORD1_GPR_DST_REL(x)                              (((x) & 0x1) << 7)
+#define   G_SQ_VTX_WORD1_GPR_DST_REL(x)                              (((x) >> 7) & 0x1)
+#define   C_SQ_VTX_WORD1_GPR_DST_REL                                 0xFFFFFF7F
+#define P_SQ_VTX_WORD1_SEM /* single field SEMANTIC_ID[7:0]; S_ packs a value into it, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_VTX_WORD1_SEM_SEMANTIC_ID(x)                          (((x) & 0xFF) << 0)
+#define   G_SQ_VTX_WORD1_SEM_SEMANTIC_ID(x)                          (((x) >> 0) & 0xFF)
+#define   C_SQ_VTX_WORD1_SEM_SEMANTIC_ID                             0xFFFFFF00
+#define P_SQ_VTX_WORD2 /* fields: OFFSET[15:0] ENDIAN_SWAP[17:16] CONST_BUF_NO_STRIDE[18] MEGA_FETCH[19] ALT_CONST[20]; S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_VTX_WORD2_OFFSET(x)                                   (((x) & 0xFFFF) << 0)
+#define   G_SQ_VTX_WORD2_OFFSET(x)                                   (((x) >> 0) & 0xFFFF)
+#define   C_SQ_VTX_WORD2_OFFSET                                      0xFFFF0000
+#define   S_SQ_VTX_WORD2_ENDIAN_SWAP(x)                              (((x) & 0x3) << 16)
+#define   G_SQ_VTX_WORD2_ENDIAN_SWAP(x)                              (((x) >> 16) & 0x3)
+#define   C_SQ_VTX_WORD2_ENDIAN_SWAP                                 0xFFFCFFFF
+#define   S_SQ_VTX_WORD2_CONST_BUF_NO_STRIDE(x)                      (((x) & 0x1) << 18)
+#define   G_SQ_VTX_WORD2_CONST_BUF_NO_STRIDE(x)                      (((x) >> 18) & 0x1)
+#define   C_SQ_VTX_WORD2_CONST_BUF_NO_STRIDE                         0xFFFBFFFF
+#define   S_SQ_VTX_WORD2_MEGA_FETCH(x)                               (((x) & 0x1) << 19)
+#define   G_SQ_VTX_WORD2_MEGA_FETCH(x)                               (((x) >> 19) & 0x1)
+#define   C_SQ_VTX_WORD2_MEGA_FETCH                                  0xFFF7FFFF
+#define   S_SQ_VTX_WORD2_ALT_CONST(x)                                (((x) & 0x1) << 20)
+#define   G_SQ_VTX_WORD2_ALT_CONST(x)                                (((x) >> 20) & 0x1)
+#define   C_SQ_VTX_WORD2_ALT_CONST                                   0xFFEFFFFF
+#define P_SQ_TEX_WORD0 /* fields: TEX_INST[4:0] BC_FRAC_MODE[5] FETCH_WHOLE_QUAD[7] RESOURCE_ID[15:8] SRC_GPR[22:16] SRC_REL[23] ALT_CONST[24] (bit 6 has no macro here); S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_TEX_WORD0_TEX_INST(x)                                 (((x) & 0x1F) << 0)
+#define   G_SQ_TEX_WORD0_TEX_INST(x)                                 (((x) >> 0) & 0x1F)
+#define   C_SQ_TEX_WORD0_TEX_INST                                    0xFFFFFFE0
+#define   S_SQ_TEX_WORD0_BC_FRAC_MODE(x)                             (((x) & 0x1) << 5)
+#define   G_SQ_TEX_WORD0_BC_FRAC_MODE(x)                             (((x) >> 5) & 0x1)
+#define   C_SQ_TEX_WORD0_BC_FRAC_MODE                                0xFFFFFFDF
+#define   S_SQ_TEX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) & 0x1) << 7)
+#define   G_SQ_TEX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) >> 7) & 0x1)
+#define   C_SQ_TEX_WORD0_FETCH_WHOLE_QUAD                            0xFFFFFF7F
+#define   S_SQ_TEX_WORD0_RESOURCE_ID(x)                              (((x) & 0xFF) << 8)
+#define   G_SQ_TEX_WORD0_RESOURCE_ID(x)                              (((x) >> 8) & 0xFF)
+#define   C_SQ_TEX_WORD0_RESOURCE_ID                                 0xFFFF00FF
+#define   S_SQ_TEX_WORD0_SRC_GPR(x)                                  (((x) & 0x7F) << 16)
+#define   G_SQ_TEX_WORD0_SRC_GPR(x)                                  (((x) >> 16) & 0x7F)
+#define   C_SQ_TEX_WORD0_SRC_GPR                                     0xFF80FFFF
+#define   S_SQ_TEX_WORD0_SRC_REL(x)                                  (((x) & 0x1) << 23)
+#define   G_SQ_TEX_WORD0_SRC_REL(x)                                  (((x) >> 23) & 0x1)
+#define   C_SQ_TEX_WORD0_SRC_REL                                     0xFF7FFFFF
+#define   S_SQ_TEX_WORD0_ALT_CONST(x)                                (((x) & 0x1) << 24)
+#define   G_SQ_TEX_WORD0_ALT_CONST(x)                                (((x) >> 24) & 0x1)
+#define   C_SQ_TEX_WORD0_ALT_CONST                                   0xFEFFFFFF
+#define P_SQ_TEX_WORD1 /* fields: DST_GPR[6:0] DST_REL[7] DST_SEL_X[11:9] DST_SEL_Y[14:12] DST_SEL_Z[17:15] DST_SEL_W[20:18] LOD_BIAS[27:21] COORD_TYPE_X[28] _Y[29] _Z[30] _W[31]; S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_TEX_WORD1_DST_GPR(x)                                  (((x) & 0x7F) << 0)
+#define   G_SQ_TEX_WORD1_DST_GPR(x)                                  (((x) >> 0) & 0x7F)
+#define   C_SQ_TEX_WORD1_DST_GPR                                     0xFFFFFF80
+#define   S_SQ_TEX_WORD1_DST_REL(x)                                  (((x) & 0x1) << 7)
+#define   G_SQ_TEX_WORD1_DST_REL(x)                                  (((x) >> 7) & 0x1)
+#define   C_SQ_TEX_WORD1_DST_REL                                     0xFFFFFF7F
+#define   S_SQ_TEX_WORD1_DST_SEL_X(x)                                (((x) & 0x7) << 9)
+#define   G_SQ_TEX_WORD1_DST_SEL_X(x)                                (((x) >> 9) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_X                                   0xFFFFF1FF
+#define   S_SQ_TEX_WORD1_DST_SEL_Y(x)                                (((x) & 0x7) << 12)
+#define   G_SQ_TEX_WORD1_DST_SEL_Y(x)                                (((x) >> 12) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_Y                                   0xFFFF8FFF
+#define   S_SQ_TEX_WORD1_DST_SEL_Z(x)                                (((x) & 0x7) << 15)
+#define   G_SQ_TEX_WORD1_DST_SEL_Z(x)                                (((x) >> 15) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_Z                                   0xFFFC7FFF
+#define   S_SQ_TEX_WORD1_DST_SEL_W(x)                                (((x) & 0x7) << 18)
+#define   G_SQ_TEX_WORD1_DST_SEL_W(x)                                (((x) >> 18) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_W                                   0xFFE3FFFF
+#define   S_SQ_TEX_WORD1_LOD_BIAS(x)                                 (((x) & 0x7F) << 21)
+#define   G_SQ_TEX_WORD1_LOD_BIAS(x)                                 (((x) >> 21) & 0x7F)
+#define   C_SQ_TEX_WORD1_LOD_BIAS                                    0xF01FFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_X(x)                             (((x) & 0x1) << 28)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_X(x)                             (((x) >> 28) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_X                                0xEFFFFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_Y(x)                             (((x) & 0x1) << 29)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_Y(x)                             (((x) >> 29) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_Y                                0xDFFFFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_Z(x)                             (((x) & 0x1) << 30)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_Z(x)                             (((x) >> 30) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_Z                                0xBFFFFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_W(x)                             (((x) & 0x1) << 31)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_W(x)                             (((x) >> 31) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_W                                0x7FFFFFFF
+#define P_SQ_TEX_WORD2 /* fields: OFFSET_X[4:0] OFFSET_Y[9:5] OFFSET_Z[14:10] SAMPLER_ID[19:15] SRC_SEL_X[22:20] SRC_SEL_Y[25:23] SRC_SEL_Z[28:26] SRC_SEL_W[31:29]; S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_TEX_WORD2_OFFSET_X(x)                                 (((x) & 0x1F) << 0)
+#define   G_SQ_TEX_WORD2_OFFSET_X(x)                                 (((x) >> 0) & 0x1F)
+#define   C_SQ_TEX_WORD2_OFFSET_X                                    0xFFFFFFE0
+#define   S_SQ_TEX_WORD2_OFFSET_Y(x)                                 (((x) & 0x1F) << 5)
+#define   G_SQ_TEX_WORD2_OFFSET_Y(x)                                 (((x) >> 5) & 0x1F)
+#define   C_SQ_TEX_WORD2_OFFSET_Y                                    0xFFFFFC1F
+#define   S_SQ_TEX_WORD2_OFFSET_Z(x)                                 (((x) & 0x1F) << 10)
+#define   G_SQ_TEX_WORD2_OFFSET_Z(x)                                 (((x) >> 10) & 0x1F)
+#define   C_SQ_TEX_WORD2_OFFSET_Z                                    0xFFFF83FF
+#define   S_SQ_TEX_WORD2_SAMPLER_ID(x)                               (((x) & 0x1F) << 15)
+#define   G_SQ_TEX_WORD2_SAMPLER_ID(x)                               (((x) >> 15) & 0x1F)
+#define   C_SQ_TEX_WORD2_SAMPLER_ID                                  0xFFF07FFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_X(x)                                (((x) & 0x7) << 20)
+#define   G_SQ_TEX_WORD2_SRC_SEL_X(x)                                (((x) >> 20) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_X                                   0xFF8FFFFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_Y(x)                                (((x) & 0x7) << 23)
+#define   G_SQ_TEX_WORD2_SRC_SEL_Y(x)                                (((x) >> 23) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_Y                                   0xFC7FFFFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_Z(x)                                (((x) & 0x7) << 26)
+#define   G_SQ_TEX_WORD2_SRC_SEL_Z(x)                                (((x) >> 26) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_Z                                   0xE3FFFFFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_W(x)                                (((x) & 0x7) << 29)
+#define   G_SQ_TEX_WORD2_SRC_SEL_W(x)                                (((x) >> 29) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_W                                   0x1FFFFFFF
+#define P_SQ_ALU_WORD1_OP2_V2 /* fields: SRC0_ABS[0] SRC1_ABS[1] UPDATE_EXECUTE_MASK[2] UPDATE_PRED[3] WRITE_MASK[4] OMOD[6:5] ALU_INST[17:7]; S_ packs, G_ extracts, C_ is the clear mask */
+#define   S_SQ_ALU_WORD1_OP2_V2_SRC0_ABS(x)                          (((x) & 0x1) << 0)
+#define   G_SQ_ALU_WORD1_OP2_V2_SRC0_ABS(x)                          (((x) >> 0) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_SRC0_ABS                             0xFFFFFFFE
+#define   S_SQ_ALU_WORD1_OP2_V2_SRC1_ABS(x)                          (((x) & 0x1) << 1)
+#define   G_SQ_ALU_WORD1_OP2_V2_SRC1_ABS(x)                          (((x) >> 1) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_SRC1_ABS                             0xFFFFFFFD
+#define   S_SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK(x)               (((x) & 0x1) << 2)
+#define   G_SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK(x)               (((x) >> 2) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK                  0xFFFFFFFB
+#define   S_SQ_ALU_WORD1_OP2_V2_UPDATE_PRED(x)                       (((x) & 0x1) << 3)
+#define   G_SQ_ALU_WORD1_OP2_V2_UPDATE_PRED(x)                       (((x) >> 3) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_UPDATE_PRED                          0xFFFFFFF7
+#define   S_SQ_ALU_WORD1_OP2_V2_WRITE_MASK(x)                        (((x) & 0x1) << 4)
+#define   G_SQ_ALU_WORD1_OP2_V2_WRITE_MASK(x)                        (((x) >> 4) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_WRITE_MASK                           0xFFFFFFEF
+#define   S_SQ_ALU_WORD1_OP2_V2_OMOD(x)                              (((x) & 0x3) << 5)
+#define   G_SQ_ALU_WORD1_OP2_V2_OMOD(x)                              (((x) >> 5) & 0x3)
+#define   C_SQ_ALU_WORD1_OP2_V2_OMOD                                 0xFFFFFF9F
+#define   S_SQ_ALU_WORD1_OP2_V2_ALU_INST(x)                          (((x) & 0x7FF) << 7)
+#define   G_SQ_ALU_WORD1_OP2_V2_ALU_INST(x)                          (((x) >> 7) & 0x7FF)
+#define   C_SQ_ALU_WORD1_OP2_V2_ALU_INST                             0xFFFC007F
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
new file mode 100644
index 0000000000..4150f88785
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#include <stdio.h>
+#include <util/u_inlines.h>
+#include <util/u_format.h>
+#include <util/u_memory.h>
+#include "r600_screen.h"
+#include "r600_texture.h"
+#include "r600_context.h"
+#include "r600d.h"
+
+
+/* Delete hook: drops one reference on the radeon_state passed as the opaque handle. */
+static void r600_delete_state(struct pipe_context *ctx, void *state)
+{
+	/* ctx is not used; state is treated as a struct radeon_state */
+	radeon_state_decref((struct radeon_state *)state);
+}
+
+/* Build a blend CSO from fixed register values; the pipe_blend_state argument is not read. */
+static void *r600_create_blend_state(struct pipe_context *ctx,
+					const struct pipe_blend_state *state)
+{
+	struct radeon_state *blend;
+
+	blend = radeon_state(r600_screen(ctx->screen)->rw, R600_BLEND_TYPE, R600_BLEND);
+	if (!blend)
+		return NULL;
+	blend->states[R600_BLEND__CB_BLEND_RED] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND_GREEN] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND_BLUE] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND_ALPHA] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND0_CONTROL] = 0x00010001;	/* only non-zero entry */
+	blend->states[R600_BLEND__CB_BLEND1_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND2_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND3_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND4_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND5_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND6_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND7_CONTROL] = 0x00000000;
+	blend->states[R600_BLEND__CB_BLEND_CONTROL] = 0x00000000;
+	if (radeon_state_pm4(blend)) {
+		radeon_state_decref(blend);	/* radeon_state_pm4() reported failure: drop our reference */
+		blend = NULL;
+	}
+	return blend;
+}
+
+/* Bind hook: passes the blend CSO to radeon_draw_set() on the context's draw. */
+static void r600_bind_blend_state(struct pipe_context *ctx, void *state)
+{
+	radeon_draw_set(r600_context(ctx)->draw, state);
+}
+
+static void r600_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *color)
+{
+	/* Empty stub: the blend color is accepted and ignored. */
+}
+
+static void r600_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state)
+{
+	/* Empty stub: the clip state is accepted and ignored. */
+}
+
+/* Program CB0 from cbufs[0], rebuild rctx->db from zsbuf (if any) and cache *state in rctx->fb_state. */
+static void r600_set_framebuffer_state(struct pipe_context *ctx,
+					const struct pipe_framebuffer_state *state)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_texture *rtex;
+	struct r600_buffer *rbuffer;
+	struct radeon_state *rstate;
+	unsigned level, pitch, slice;
+
+	if (state->nr_cbufs == 0 || state->cbufs[0] == NULL)
+		return;	/* fix: cbufs[0] used to be dereferenced even with no color buffer; bail out like the other failure paths */
+	level = state->cbufs[0]->level;
+	rstate = radeon_state(rscreen->rw, R600_CB0_TYPE, R600_CB0);
+	if (rstate == NULL)
+		return;
+	rtex = (struct r600_texture*)state->cbufs[0]->texture;
+	rbuffer = (struct r600_buffer*)rtex->buffer;
+	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+	rstate->bo[1] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+	rstate->bo[2] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;	/* NOTE(review): even indices only - presumably two placement entries per bo; confirm */
+	rstate->placement[2] = RADEON_GEM_DOMAIN_GTT;
+	rstate->placement[4] = RADEON_GEM_DOMAIN_GTT;
+	rstate->nbo = 3;	/* the same color bo is referenced once per bo slot */
+	pitch = rtex->pitch[level] / 8 - 1;
+	slice = rtex->pitch[level] * state->cbufs[0]->height / 64 - 1;
+	rstate->states[R600_CB0__CB_COLOR0_BASE] = 0x00000000;
+	rstate->states[R600_CB0__CB_COLOR0_INFO] = 0x08110068;
+	rstate->states[R600_CB0__CB_COLOR0_SIZE] = S_028060_PITCH_TILE_MAX(pitch) | S_028060_SLICE_TILE_MAX(slice);
+	rstate->states[R600_CB0__CB_COLOR0_VIEW] = 0x00000000;
+	rstate->states[R600_CB0__CB_COLOR0_FRAG] = 0x00000000;
+	rstate->states[R600_CB0__CB_COLOR0_TILE] = 0x00000000;
+	rstate->states[R600_CB0__CB_COLOR0_MASK] = 0x00000000;
+	if (radeon_state_pm4(rstate)) {
+		radeon_state_decref(rstate);
+		return;
+	}
+	radeon_draw_set_new(rctx->draw, rstate);
+	rctx->db = radeon_state_decref(rctx->db);	/* release the previous depth state before rebuilding it */
+	if (state->zsbuf) {
+		rtex = (struct r600_texture*)state->zsbuf->texture;
+		rbuffer = (struct r600_buffer*)rtex->buffer;
+		rctx->db = radeon_state(rscreen->rw, R600_DB_TYPE, R600_DB);
+		if (rctx->db == NULL)
+			return;	/* NOTE(review): rctx->fb_state is left stale on this path, as in the original */
+		rctx->db->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+		rctx->db->nbo = 1;
+		rctx->db->placement[0] = RADEON_GEM_DOMAIN_VRAM;
+		level = state->zsbuf->level;
+		pitch = rtex->pitch[level] / 8 - 1;
+		slice = rtex->pitch[level] * state->zsbuf->height / 64 - 1;
+		rctx->db->states[R600_DB__DB_DEPTH_BASE] = 0x00000000;
+		rctx->db->states[R600_DB__DB_DEPTH_INFO] = 0x00010006;
+		rctx->db->states[R600_DB__DB_DEPTH_VIEW] = 0x00000000;
+		rctx->db->states[R600_DB__DB_PREFETCH_LIMIT] = (state->zsbuf->height / 8) - 1;
+		rctx->db->states[R600_DB__DB_DEPTH_SIZE] = S_028000_PITCH_TILE_MAX(pitch) | S_028000_SLICE_TILE_MAX(slice);
+	} else
+		rctx->db = NULL;
+	rctx->fb_state = *state;
+}
+
+/* Fragment shader CSO: forwards shader->tokens to r600_pipe_shader_create() with C_PROGRAM_TYPE_FS. */
+static void *r600_create_fs_state(struct pipe_context *ctx, const struct pipe_shader_state *shader)
+{
+	return r600_pipe_shader_create(ctx, C_PROGRAM_TYPE_FS, shader->tokens);
+}
+
+/* Bind hook: remembers the fragment shader handle in the context. */
+static void r600_bind_fs_state(struct pipe_context *ctx, void *state)
+{
+	/* only the pointer is stored; nothing is validated or emitted here */
+	r600_context(ctx)->ps_shader = state;
+}
+
+/* Vertex shader CSO: forwards shader->tokens to r600_pipe_shader_create() with C_PROGRAM_TYPE_VS. */
+static void *r600_create_vs_state(struct pipe_context *ctx, const struct pipe_shader_state *shader)
+{
+	return r600_pipe_shader_create(ctx, C_PROGRAM_TYPE_VS, shader->tokens);
+}
+
+/* Bind hook: remembers the vertex shader handle in the context. */
+static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
+{
+	/* only the pointer is stored; nothing is validated or emitted here */
+	r600_context(ctx)->vs_shader = state;
+}
+
+static void r600_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_poly_stipple *state)
+{
+	/* Empty stub: the stipple pattern is accepted and ignored. */
+}
+
+/* Build a rasterizer CSO from fixed register values; only state->flatshade is consumed. */
+static void *r600_create_rs_state(struct pipe_context *ctx,
+					const struct pipe_rasterizer_state *state)
+{
+	struct radeon_state *rs;
+
+	/* NOTE(review): the context is modified at create time, before any bind and even if creation fails */
+	r600_context(ctx)->flat_shade = state->flatshade;
+	rs = radeon_state(r600_screen(ctx->screen)->rw, R600_RASTERIZER_TYPE, R600_RASTERIZER);
+	if (!rs)
+		return NULL;
+	rs->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] = 0x00000001;
+	rs->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] = 0x00080000;
+	rs->states[R600_RASTERIZER__PA_CL_VS_OUT_CNTL] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_CL_NANINF_CNTL] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_POINT_SIZE] = 0x00080008;
+	rs->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_LINE_CNTL] = 0x00000008;
+	rs->states[R600_RASTERIZER__PA_SC_LINE_STIPPLE] = 0x00000005;
+	rs->states[R600_RASTERIZER__PA_SC_MPASS_PS_CNTL] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SC_LINE_CNTL] = 0x00000400;
+	rs->states[R600_RASTERIZER__PA_CL_GB_VERT_CLIP_ADJ] = 0x3F800000;	/* 0x3F800000 is the bit pattern of 1.0f */
+	rs->states[R600_RASTERIZER__PA_CL_GB_VERT_DISC_ADJ] = 0x3F800000;
+	rs->states[R600_RASTERIZER__PA_CL_GB_HORZ_CLIP_ADJ] = 0x3F800000;
+	rs->states[R600_RASTERIZER__PA_CL_GB_HORZ_DISC_ADJ] = 0x3F800000;
+	rs->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_DB_FMT_CNTL] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_CLAMP] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_FRONT_SCALE] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_FRONT_OFFSET] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_SCALE] = 0x00000000;
+	rs->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_OFFSET] = 0x00000000;
+	if (radeon_state_pm4(rs)) {
+		radeon_state_decref(rs);	/* radeon_state_pm4() reported failure: drop our reference */
+		rs = NULL;
+	}
+	return rs;
+}
+
+/* Bind hook: passes the rasterizer CSO to radeon_draw_set() on the context's draw. */
+static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
+{
+	radeon_draw_set(r600_context(ctx)->draw, state);
+}
+
+static void *r600_create_sampler_state(struct pipe_context *ctx, const struct pipe_sampler_state *state)
+{
+	/* Stub: no sampler CSO is built, the caller always receives NULL. */
+	return NULL;
+}
+
+static void r600_bind_sampler_states(struct pipe_context *ctx, unsigned count, void **states)
+{
+	/* Empty stub: the sampler states are accepted and ignored. */
+}
+
+/* Allocate a sampler view that is a plain struct copy of *templ; returns NULL if the allocation fails. */
+static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *ctx,
+				struct pipe_resource *texture, const struct pipe_sampler_view *templ)
+{
+	struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
+	if (view != NULL)	/* fix: the copy used to write through a failed allocation */
+		*view = *templ;
+	return view;	/* NOTE(review): the texture argument is unused and no reference is taken here - confirm ownership */
+}
+
+/* Destroy hook: releases the view allocation with FREE(); ctx is unused. */
+static void r600_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *view)
+{
+	FREE(view);
+}
+
+static void r600_set_fragment_sampler_views(struct pipe_context *ctx, unsigned count,
+					    struct pipe_sampler_view **views)
+{
+	/* Empty stub: the fragment sampler views are accepted and ignored. */
+}
+
+static void r600_set_vertex_sampler_views(struct pipe_context *ctx, unsigned count,
+					  struct pipe_sampler_view **views)
+{
+	/* Empty stub: the vertex sampler views are accepted and ignored. */
+}
+
+/* Write one scissor rectangle into every TL/BR register pair of the scissor state and queue it on the draw. */
+static void r600_set_scissor_state(struct pipe_context *ctx,
+					const struct pipe_scissor_state *state)
+{
+	struct radeon_state *sc = radeon_state(r600_screen(ctx->screen)->rw, R600_SCISSOR_TYPE, R600_SCISSOR);
+	u32 tl, br;
+
+	if (!sc)
+		return;
+	/* both corners are packed once and then reused for every register pair below */
+	tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny) | S_028240_WINDOW_OFFSET_DISABLE(1);
+	br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy);
+	sc->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_WINDOW_OFFSET] = 0x00000000;
+	sc->states[R600_SCISSOR__PA_SC_WINDOW_SCISSOR_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_WINDOW_SCISSOR_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_RULE] = 0x0000FFFF;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_0_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_0_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_1_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_1_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_2_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_2_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_3_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_CLIPRECT_3_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_EDGERULE] = 0xAAAAAAAA;
+	sc->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_BR] = br;
+	sc->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_TL] = tl;
+	sc->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_BR] = br;
+	if (radeon_state_pm4(sc)) {
+		radeon_state_decref(sc);
+		return;
+	}
+	/* hand the finished state to the context's draw */
+	radeon_draw_set_new(r600_context(ctx)->draw, sc);
+}
+
+/* Program the viewport scale/translate registers, queue the state and remember *state in the context. */
+static void r600_set_viewport_state(struct pipe_context *ctx,
+					const struct pipe_viewport_state *state)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct radeon_state *vp;
+
+	vp = radeon_state(r600_screen(ctx->screen)->rw, R600_VIEWPORT_TYPE, R600_VIEWPORT);
+	if (!vp)
+		return;
+	vp->states[R600_VIEWPORT__PA_SC_VPORT_ZMIN_0] = 0x00000000;
+	vp->states[R600_VIEWPORT__PA_SC_VPORT_ZMAX_0] = 0x3F800000;	/* bit pattern of 1.0f */
+	vp->states[R600_VIEWPORT__PA_CL_VPORT_XSCALE_0] = r600_float_to_u32(state->scale[0]);
+	vp->states[R600_VIEWPORT__PA_CL_VPORT_YSCALE_0] = r600_float_to_u32(state->scale[1]);
+	vp->states[R600_VIEWPORT__PA_CL_VPORT_ZSCALE_0] = r600_float_to_u32(state->scale[2]);
+	vp->states[R600_VIEWPORT__PA_CL_VPORT_XOFFSET_0] = r600_float_to_u32(state->translate[0]);
+	vp->states[R600_VIEWPORT__PA_CL_VPORT_YOFFSET_0] = r600_float_to_u32(state->translate[1]);
+	vp->states[R600_VIEWPORT__PA_CL_VPORT_ZOFFSET_0] = r600_float_to_u32(state->translate[2]);
+	vp->states[R600_VIEWPORT__PA_CL_VTE_CNTL] = 0x0000043F;
+	if (radeon_state_pm4(vp)) {
+		radeon_state_decref(vp);
+		return;	/* rctx->viewport keeps its previous value on this path */
+	}
+	radeon_draw_set_new(rctx->draw, vp);
+	rctx->viewport = *state;
+}
+
+/* Copy the caller's vertex buffer descriptors into the context and record how many there are. */
+static void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
+					const struct pipe_vertex_buffer *buffers)
+{
+	struct r600_context *rctx = r600_context(ctx);
+
+	rctx->nvertex_buffer = count;	/* NOTE(review): count is not checked against the size of rctx->vertex_buffer */
+	memcpy(rctx->vertex_buffer, buffers, count * sizeof(*buffers));
+}
+
+
+/* Copy count vertex elements into a newly allocated CSO; returns NULL when the allocation fails. */
+static void *r600_create_vertex_elements_state(struct pipe_context *ctx, unsigned count, const struct pipe_vertex_element *elements)
+{
+	struct r600_vertex_elements_state *v = CALLOC_STRUCT(r600_vertex_elements_state);
+	assert(count < 32);	/* debug-only bound; presumably tied to the capacity of v->elements - confirm */
+	if (v != NULL) {	/* fix: v used to be written through without checking the allocation */
+		v->count = count;
+		memcpy(v->elements, elements, count * sizeof(struct pipe_vertex_element));
+	}
+	return v;
+}
+
+/* Bind hook: records the vertex elements CSO pointer in the context. */
+static void r600_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_context *rctx = r600_context(ctx);
+
+	rctx->vertex_elements = (struct r600_vertex_elements_state *)state;
+}
+
+static void r600_delete_vertex_elements_state(struct pipe_context *ctx, void *state)
+{
+	FREE((struct r600_vertex_elements_state *)state);	/* releases the CSO allocation; ctx is unused */
+}
+
+/* Build a depth/stencil/alpha CSO; only depth.enabled, depth.writemask and depth.func are read from state. */
+static void *r600_create_dsa_state(struct pipe_context *ctx,
+					const struct pipe_depth_stencil_alpha_state *state)
+{
+	struct radeon_state *dsa;
+	unsigned db_depth_control = 0x00700700;	/* fixed base bits, depth fields are OR-ed in below */
+
+	dsa = radeon_state(r600_screen(ctx->screen)->rw, R600_DSA_TYPE, R600_DSA);
+	if (!dsa)
+		return NULL;
+	db_depth_control |= S_028800_Z_ENABLE(state->depth.enabled) |
+			S_028800_Z_WRITE_ENABLE(state->depth.writemask) | S_028800_ZFUNC(state->depth.func);
+	dsa->states[R600_DSA__DB_STENCIL_CLEAR] = 0x00000000;
+	dsa->states[R600_DSA__DB_DEPTH_CLEAR] = 0x3F800000;	/* bit pattern of 1.0f */
+	dsa->states[R600_DSA__SX_ALPHA_TEST_CONTROL] = 0x00000000;
+	dsa->states[R600_DSA__DB_STENCILREFMASK] = 0xFFFFFF00;
+	dsa->states[R600_DSA__DB_STENCILREFMASK_BF] = 0xFFFFFF00;
+	dsa->states[R600_DSA__SX_ALPHA_REF] = 0x00000000;
+	dsa->states[R600_DSA__SPI_FOG_FUNC_SCALE] = 0x00000000;
+	dsa->states[R600_DSA__SPI_FOG_FUNC_BIAS] = 0x00000000;
+	dsa->states[R600_DSA__SPI_FOG_CNTL] = 0x00000000;
+	dsa->states[R600_DSA__DB_DEPTH_CONTROL] = db_depth_control;
+	dsa->states[R600_DSA__DB_SHADER_CONTROL] = 0x00000210;
+	dsa->states[R600_DSA__DB_RENDER_CONTROL] = 0x00000060;
+	dsa->states[R600_DSA__DB_RENDER_OVERRIDE] = 0x0000002A;
+	dsa->states[R600_DSA__DB_SRESULTS_COMPARE_STATE1] = 0x00000000;
+	dsa->states[R600_DSA__DB_PRELOAD_CONTROL] = 0x00000000;
+	dsa->states[R600_DSA__DB_ALPHA_TO_MASK] = 0x0000AA00;
+	if (radeon_state_pm4(dsa)) {
+		radeon_state_decref(dsa);	/* radeon_state_pm4() reported failure: drop our reference */
+		dsa = NULL;
+	}
+	return dsa;
+}
+
+/*
+ * Bind a depth/stencil/alpha CSO by handing it to the context's current
+ * draw object.
+ */
+static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct radeon_state *dsa = state;
+
+	radeon_draw_set(rctx->draw, dsa);
+}
+
+/*
+ * Upload the contents of a constant buffer as per-constant state objects.
+ *
+ * ctx:    context whose draw object receives the constants.
+ * shader: PIPE_SHADER_VERTEX or PIPE_SHADER_FRAGMENT; any other stage is
+ *         reported on stderr and ignored.
+ * index:  constant buffer slot (not used by this implementation).
+ * buffer: source buffer; NULL or zero-sized buffers are ignored.
+ *
+ * The buffer is read in 16-byte units (four 32-bit words), one state
+ * object per unit. On any failure the upload stops early; the buffer
+ * mapping is always released before returning.
+ */
+static void r600_set_constant_buffer(struct pipe_context *ctx,
+				     uint shader, uint index,
+				     struct pipe_resource *buffer)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_context *rctx = r600_context(ctx);
+	unsigned nconstant = 0, i, type, id;
+	struct radeon_state *rstate;
+	struct pipe_transfer *transfer;
+	u32 *ptr;
+
+	switch (shader) {
+	case PIPE_SHADER_VERTEX:
+		id = R600_VS_CONSTANT;
+		type = R600_VS_CONSTANT_TYPE;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		id = R600_PS_CONSTANT;
+		type = R600_PS_CONSTANT_TYPE;
+		break;
+	default:
+		fprintf(stderr, "%s:%d unsupported %d\n", __func__, __LINE__, shader);
+		return;
+	}
+	if (buffer && buffer->width0 > 0) {
+		nconstant = buffer->width0 / 16;
+		ptr = pipe_buffer_map(ctx, buffer, PIPE_TRANSFER_READ, &transfer);
+		if (ptr == NULL)
+			return;
+		for (i = 0; i < nconstant; i++) {
+			rstate = radeon_state(rscreen->rw, type, id + i);
+			if (rstate == NULL)
+				break;
+			/* NOTE(review): the PS register indices are used for both
+			 * shader stages - presumably the VS layout is identical;
+			 * confirm against the state tables. */
+			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT0_0] = ptr[i * 4 + 0];
+			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT1_0] = ptr[i * 4 + 1];
+			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT2_0] = ptr[i * 4 + 2];
+			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT3_0] = ptr[i * 4 + 3];
+			if (radeon_state_pm4(rstate)) {
+				/* not handed to the draw yet: drop our reference */
+				radeon_state_decref(rstate);
+				break;
+			}
+			if (radeon_draw_set_new(rctx->draw, rstate))
+				break;
+		}
+		/* reached on success and on every failure path above, so the
+		 * mapping is never left dangling */
+		pipe_buffer_unmap(ctx, buffer, transfer);
+	}
+}
+
+/*
+ * Remember the stencil reference values in the context. The structure is
+ * copied by value; nothing is emitted to the hardware here.
+ */
+static void r600_set_stencil_ref(struct pipe_context *ctx,
+				const struct pipe_stencil_ref *sr)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct pipe_stencil_ref *dst = &rctx->stencil_ref;
+
+	*dst = *sr;
+}
+
+/*
+ * pipe_context::set_sample_mask hook. Currently a no-op: the requested
+ * sample mask is ignored.
+ */
+static void r600_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+}
+
+/*
+ * Install the state-related entry points of this file into the
+ * pipe_context embedded in `rctx`. Most CSO kinds share r600_delete_state
+ * as their destructor; vertex elements use their own.
+ */
+void r600_init_state_functions(struct r600_context *rctx)
+{
+	struct pipe_context *pipe = &rctx->context;
+
+	/* blend */
+	pipe->create_blend_state = r600_create_blend_state;
+	pipe->bind_blend_state = r600_bind_blend_state;
+	pipe->delete_blend_state = r600_delete_state;
+	pipe->set_blend_color = r600_set_blend_color;
+
+	/* depth / stencil / alpha */
+	pipe->create_depth_stencil_alpha_state = r600_create_dsa_state;
+	pipe->bind_depth_stencil_alpha_state = r600_bind_dsa_state;
+	pipe->delete_depth_stencil_alpha_state = r600_delete_state;
+	pipe->set_stencil_ref = r600_set_stencil_ref;
+
+	/* rasterizer */
+	pipe->create_rasterizer_state = r600_create_rs_state;
+	pipe->bind_rasterizer_state = r600_bind_rs_state;
+	pipe->delete_rasterizer_state = r600_delete_state;
+
+	/* shaders */
+	pipe->create_fs_state = r600_create_fs_state;
+	pipe->bind_fs_state = r600_bind_fs_state;
+	pipe->delete_fs_state = r600_delete_state;
+	pipe->create_vs_state = r600_create_vs_state;
+	pipe->bind_vs_state = r600_bind_vs_state;
+	pipe->delete_vs_state = r600_delete_state;
+
+	/* samplers and sampler views */
+	pipe->create_sampler_state = r600_create_sampler_state;
+	pipe->bind_fragment_sampler_states = r600_bind_sampler_states;
+	pipe->bind_vertex_sampler_states = r600_bind_sampler_states;
+	pipe->delete_sampler_state = r600_delete_state;
+	pipe->create_sampler_view = r600_create_sampler_view;
+	pipe->sampler_view_destroy = r600_sampler_view_destroy;
+	pipe->set_fragment_sampler_views = r600_set_fragment_sampler_views;
+	pipe->set_vertex_sampler_views = r600_set_vertex_sampler_views;
+
+	/* vertex input */
+	pipe->set_vertex_buffers = r600_set_vertex_buffers;
+	pipe->create_vertex_elements_state = r600_create_vertex_elements_state;
+	pipe->bind_vertex_elements_state = r600_bind_vertex_elements_state;
+	pipe->delete_vertex_elements_state = r600_delete_vertex_elements_state;
+
+	/* immediate ("set") state */
+	pipe->set_sample_mask = r600_set_sample_mask;
+	pipe->set_clip_state = r600_set_clip_state;
+	pipe->set_constant_buffer = r600_set_constant_buffer;
+	pipe->set_framebuffer_state = r600_set_framebuffer_state;
+	pipe->set_polygon_stipple = r600_set_polygon_stipple;
+	pipe->set_scissor_state = r600_set_scissor_state;
+	pipe->set_viewport_state = r600_set_viewport_state;
+}
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
new file mode 100644
index 0000000000..7d94bbe510
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson
+ */
+#include <pipe/p_screen.h>
+#include <util/u_format.h>
+#include <util/u_math.h>
+#include <util/u_inlines.h>
+#include <util/u_memory.h>
+#include "state_tracker/drm_api.h"
+#include "r600_screen.h"
+#include "r600_texture.h"
+
+extern struct u_resource_vtbl r600_texture_vtbl;
+
+/*
+ * Return the offset of one image inside the texture's storage.
+ *
+ * The result is the start of mip `level` plus, for 3D textures, `zslice`
+ * layers or, for cube maps, `face` layers of that level. The selector
+ * that does not apply to the texture target is asserted to be zero.
+ */
+unsigned long r600_texture_get_offset(struct r600_texture *rtex, unsigned level, unsigned zslice, unsigned face)
+{
+	const unsigned long level_start = rtex->offset[level];
+
+	if (rtex->b.b.target == PIPE_TEXTURE_3D) {
+		assert(face == 0);
+		return level_start + zslice * rtex->layer_size[level];
+	}
+	if (rtex->b.b.target == PIPE_TEXTURE_CUBE) {
+		assert(zslice == 0);
+		return level_start + face * rtex->layer_size[level];
+	}
+	/* every other target has a single image per level */
+	assert(zslice == 0 && face == 0);
+	return level_start;
+}
+
+/*
+ * Lay out all mip levels of `rtex` back to back and fill in the per-level
+ * tables (offset, layer_size, pitch, stride) plus the total size.
+ *
+ * Row strides are rounded up to 32 bytes, and so is the size of every
+ * level. A cube map stores six layers per level, any other target stores
+ * one layer per (minified) depth slice. `rscreen` is not used.
+ */
+static void r600_setup_miptree(struct r600_screen *rscreen, struct r600_texture *rtex)
+{
+	struct pipe_resource *ptex = &rtex->b.b;
+	unsigned long level, width, height, row_bytes, slice_bytes, level_bytes;
+	unsigned long cursor = 0;
+
+	for (level = 0; level <= ptex->last_level; level++) {
+		width = u_minify(ptex->width0, level);
+		height = u_minify(ptex->height0, level);
+		row_bytes = align(util_format_get_stride(ptex->format, width), 32);
+		slice_bytes = row_bytes * height;
+		level_bytes = slice_bytes *
+			(ptex->target == PIPE_TEXTURE_CUBE ? 6 : u_minify(ptex->depth0, level));
+
+		rtex->offset[level] = cursor;
+		rtex->layer_size[level] = slice_bytes;
+		rtex->stride[level] = row_bytes;
+		/* pitch is the stride expressed in format blocks */
+		rtex->pitch[level] = row_bytes / util_format_get_blocksize(ptex->format);
+
+		cursor += align(level_bytes, 32);
+	}
+	rtex->size = cursor;
+}
+
+/*
+ * Create a texture resource described by `templ`.
+ *
+ * The mip tree layout is computed first; the storage is then obtained by
+ * asking the screen for a PIPE_BUFFER resource of the resulting size,
+ * with the template's usage and bind flags.
+ *
+ * Returns the embedded pipe_resource, or NULL if either allocation fails.
+ */
+struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
+					  const struct pipe_resource *templ)
+{
+	struct r600_texture *rtex = CALLOC_STRUCT(r600_texture);
+	struct r600_screen *rscreen = r600_screen(screen);
+	struct pipe_resource backing;
+
+	if (rtex == NULL)
+		return NULL;
+
+	/* start from the caller's template, then take ownership fields over */
+	rtex->b.b = *templ;
+	rtex->b.b.screen = screen;
+	rtex->b.vtbl = &r600_texture_vtbl;
+	pipe_reference_init(&rtex->b.b.reference, 1);
+	r600_setup_miptree(rscreen, rtex);
+
+	/* describe a plain byte buffer big enough for the whole mip tree */
+	memset(&backing, 0, sizeof(backing));
+	backing.target = PIPE_BUFFER;
+	backing.format = PIPE_FORMAT_R8_UNORM;
+	backing.usage = templ->usage;
+	backing.bind = templ->bind;
+	backing.width0 = rtex->size;
+	backing.height0 = 1;
+	backing.depth0 = 1;
+
+	rtex->buffer = screen->resource_create(screen, &backing);
+	if (rtex->buffer != NULL)
+		return &rtex->b.b;
+
+	FREE(rtex);
+	return NULL;
+}
+
+/*
+ * Destroy a texture: release the reference held on its backing buffer
+ * (taken in r600_texture_create() / r600_texture_from_handle()) and free
+ * the texture object itself. `screen` is unused.
+ */
+static void r600_texture_destroy(struct pipe_screen *screen,
+				 struct pipe_resource *ptex)
+{
+	struct r600_texture *rtex = (struct r600_texture*)ptex;
+
+	/* without this the backing buffer would outlive every texture */
+	pipe_resource_reference(&rtex->buffer, NULL);
+	FREE(rtex);
+}
+
+/*
+ * Create a surface that views one image (face / mip level / z slice) of
+ * `texture`.
+ *
+ * The surface takes a reference on the texture and records the offset of
+ * the selected image as computed by r600_texture_get_offset(). `flags` is
+ * stored as the surface usage. Returns NULL if allocation fails.
+ */
+static struct pipe_surface *r600_get_tex_surface(struct pipe_screen *screen,
+						struct pipe_resource *texture,
+						unsigned face, unsigned level,
+						unsigned zslice, unsigned flags)
+{
+	struct r600_texture *rtex = (struct r600_texture*)texture;
+	struct pipe_surface *surf;
+
+	surf = CALLOC_STRUCT(pipe_surface);
+	if (!surf)
+		return NULL;
+
+	pipe_reference_init(&surf->reference, 1);
+	/* stores `texture` in surf->texture and bumps its reference count */
+	pipe_resource_reference(&surf->texture, texture);
+
+	surf->format = texture->format;
+	surf->width = u_minify(texture->width0, level);
+	surf->height = u_minify(texture->height0, level);
+	surf->usage = flags;
+	surf->face = face;
+	surf->level = level;
+	surf->zslice = zslice;
+	surf->offset = r600_texture_get_offset(rtex, level, zslice, face);
+	return surf;
+}
+
+/*
+ * Destroy a surface created by r600_get_tex_surface(): drop the texture
+ * reference it holds, then free the surface object.
+ */
+static void r600_tex_surface_destroy(struct pipe_surface *surface)
+{
+	pipe_resource_reference(&surface->texture, NULL);
+	FREE(surface);
+}
+
+/*
+ * Wrap an externally shared buffer (winsys handle) in a texture object.
+ *
+ * screen:  screen the texture belongs to.
+ * base:    template; only non-mipmapped 2D textures with depth 1 are
+ *          accepted.
+ * whandle: handle to import; its stride becomes the level-0 stride.
+ *
+ * Returns the new resource, or NULL if the template is unsupported, the
+ * handle cannot be imported, or allocation fails. No buffer reference is
+ * leaked on any failure path.
+ */
+struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
+					       const struct pipe_resource *base,
+					       struct winsys_handle *whandle)
+{
+	struct pipe_resource *buffer;
+	struct r600_texture *rtex;
+
+	/* Support only 2D textures without mipmaps; check this before
+	 * importing so a rejected template never acquires a buffer. */
+	if (base->target != PIPE_TEXTURE_2D || base->depth0 != 1 || base->last_level != 0)
+		return NULL;
+
+	buffer = r600_buffer_from_handle(screen, whandle);
+	if (buffer == NULL) {
+		return NULL;
+	}
+
+	rtex = CALLOC_STRUCT(r600_texture);
+	if (rtex == NULL) {
+		/* give back the reference taken by the import */
+		pipe_resource_reference(&buffer, NULL);
+		return NULL;
+	}
+
+	/* one ref already taken */
+	rtex->buffer = buffer;
+
+	rtex->b.b = *base;
+	rtex->b.vtbl = &r600_texture_vtbl;
+	pipe_reference_init(&rtex->b.b.reference, 1);
+	rtex->b.b.screen = screen;
+	rtex->stride_override = whandle->stride;
+	rtex->pitch[0] = whandle->stride / util_format_get_blocksize(base->format);
+	rtex->stride[0] = whandle->stride;
+	rtex->offset[0] = 0;
+	rtex->size = align(rtex->stride[0] * base->height0, 32);
+
+	return &rtex->b.b;
+}
+
+/*
+ * Export a texture as a winsys handle.
+ *
+ * The level-0 stride is written to `whandle` and the backing buffer is
+ * then exported through the winsys. Returns FALSE when `texture` is NULL
+ * or when the winsys fails to produce a handle, TRUE otherwise.
+ */
+static boolean r600_texture_get_handle(struct pipe_screen* screen,
+				       struct pipe_resource *texture,
+				       struct winsys_handle *whandle)
+{
+	struct r600_screen *rscreen = r600_screen(screen);
+	struct r600_texture* rtex = (struct r600_texture*)texture;
+
+	if (!rtex) {
+		return FALSE;
+	}
+
+	whandle->stride = rtex->stride[0];
+
+	/* propagate the winsys result instead of always claiming success */
+	return r600_buffer_get_handle(rscreen->rw, rtex->buffer, whandle);
+}
+
+/*
+ * is_resource_referenced hook for textures: the question is forwarded to
+ * the texture's backing buffer via r600_buffer_is_referenced_by_cs(), and
+ * that function's result is returned unchanged.
+ */
+static unsigned int r600_texture_is_referenced(struct pipe_context *context,
+						struct pipe_resource *texture,
+						unsigned face, unsigned level)
+{
+	struct r600_texture *rtex = (struct r600_texture*)texture;
+
+	return r600_buffer_is_referenced_by_cs(context, rtex->buffer, face, level);
+}
+
+/*
+ * Resource vtable for textures; r600_texture_create() and
+ * r600_texture_from_handle() store its address in rtex->b.vtbl.
+ * The initializer is positional: the trailing comment on each entry names
+ * the slot it fills, so the order must follow struct u_resource_vtbl.
+ */
+struct u_resource_vtbl r600_texture_vtbl =
+{
+	r600_texture_get_handle,	/* get_handle */
+	r600_texture_destroy,		/* resource_destroy */
+	r600_texture_is_referenced,	/* is_resource_referenced */
+	r600_texture_get_transfer,	/* get_transfer */
+	r600_texture_transfer_destroy,	/* transfer_destroy */
+	r600_texture_transfer_map,	/* transfer_map */
+	u_default_transfer_flush_region,/* transfer_flush_region */
+	r600_texture_transfer_unmap,	/* transfer_unmap */
+	u_default_transfer_inline_write	/* transfer_inline_write */
+};
+
+/*
+ * Hook the surface creation/destruction callbacks of this file into
+ * `screen`.
+ */
+void r600_init_screen_texture_functions(struct pipe_screen *screen)
+{
+	screen->tex_surface_destroy = r600_tex_surface_destroy;
+	screen->get_tex_surface = r600_get_tex_surface;
+}
diff --git a/src/gallium/drivers/r600/r600_texture.h b/src/gallium/drivers/r600/r600_texture.h
new file mode 100644
index 0000000000..9bc08d6b04
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_texture.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef R600_TEXTURE_H
+#define R600_TEXTURE_H
+
+#include <pipe/p_state.h>
+
+/*
+ * Driver-private texture object. `b` is the embedded generic resource;
+ * the arrays are indexed by mip level.
+ */
+struct r600_texture {
+	struct u_resource		b;
+	/* start of each level within the texture storage */
+	unsigned long			offset[PIPE_MAX_TEXTURE_LEVELS];
+	/* row stride divided by the format block size */
+	unsigned long			pitch[PIPE_MAX_TEXTURE_LEVELS];
+	/* row stride as returned by util_format_get_stride(), 32-aligned */
+	unsigned long			stride[PIPE_MAX_TEXTURE_LEVELS];
+	/* stride * height of the level: size of one face / z slice */
+	unsigned long			layer_size[PIPE_MAX_TEXTURE_LEVELS];
+	/* stride imposed by an imported winsys handle (0 otherwise) */
+	unsigned long			stride_override;
+	/* total size of the storage covering all levels */
+	unsigned long			size;
+	/* backing buffer resource; the texture holds one reference */
+	struct pipe_resource		*buffer;
+};
+
+/* Create a texture (and its backing buffer) from a template; NULL on failure. */
+struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
+					  const struct pipe_resource *templ);
+/* Offset of the image selected by level / zslice / face inside the texture storage. */
+unsigned long r600_texture_get_offset(struct r600_texture *rtex, unsigned level, unsigned zslice, unsigned face);
+/* Wrap a shared winsys handle in a texture; only non-mipmapped 2D is accepted. */
+struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
+					       const struct pipe_resource *base,
+					       struct winsys_handle *whandle);
+/* Install the surface create/destroy callbacks into the screen. */
+void r600_init_screen_texture_functions(struct pipe_screen *screen);
+
+/* This should be implemented by winsys. */
+/* Fills `whandle` for `buf`; the boolean result reports success. */
+boolean r600_buffer_get_handle(struct radeon *rw,
+			       struct pipe_resource *buf,
+			       struct winsys_handle *whandle);
+
+#endif
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
new file mode 100644
index 0000000000..d2c7248ff2
--- /dev/null
+++ b/src/gallium/drivers/r600/r600d.h
@@ -0,0 +1,677 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#ifndef R600D_H
+#define R600D_H
+
+/*
+ * Type-3 packet opcodes. Each value fits the 8-bit IT_OPCODE field and is
+ * meant to be passed as the `op` argument of the PKT3() header macro.
+ * The list is in ascending order except PKT3_INDIRECT_BUFFER (0x32).
+ */
+#define PKT3_NOP                               0x10
+#define PKT3_INDIRECT_BUFFER_END               0x17
+#define PKT3_SET_PREDICATION                   0x20
+#define PKT3_REG_RMW                           0x21
+#define PKT3_COND_EXEC                         0x22
+#define PKT3_PRED_EXEC                         0x23
+#define PKT3_START_3D_CMDBUF                   0x24
+#define PKT3_DRAW_INDEX_2                      0x27
+#define PKT3_CONTEXT_CONTROL                   0x28
+#define PKT3_DRAW_INDEX_IMMD_BE                0x29
+#define PKT3_INDEX_TYPE                        0x2A
+#define PKT3_DRAW_INDEX                        0x2B
+#define PKT3_DRAW_INDEX_AUTO                   0x2D
+#define PKT3_DRAW_INDEX_IMMD                   0x2E
+#define PKT3_NUM_INSTANCES                     0x2F
+#define PKT3_STRMOUT_BUFFER_UPDATE             0x34
+#define PKT3_INDIRECT_BUFFER_MP                0x38
+#define PKT3_MEM_SEMAPHORE                     0x39
+#define PKT3_MPEG_INDEX                        0x3A
+#define PKT3_WAIT_REG_MEM                      0x3C
+#define PKT3_MEM_WRITE                         0x3D
+#define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_CP_INTERRUPT                      0x40
+#define PKT3_SURFACE_SYNC                      0x43
+#define PKT3_ME_INITIALIZE                     0x44
+#define PKT3_COND_WRITE                        0x45
+#define PKT3_EVENT_WRITE                       0x46
+#define PKT3_EVENT_WRITE_EOP                   0x47
+#define PKT3_ONE_REG_WRITE                     0x57
+#define PKT3_SET_CONFIG_REG                    0x68
+#define PKT3_SET_CONTEXT_REG                   0x69
+#define PKT3_SET_ALU_CONST                     0x6A
+#define PKT3_SET_BOOL_CONST                    0x6B
+#define PKT3_SET_LOOP_CONST                    0x6C
+#define PKT3_SET_RESOURCE                      0x6D
+#define PKT3_SET_SAMPLER                       0x6E
+#define PKT3_SET_CTL_CONST                     0x6F
+#define PKT3_SURFACE_BASE_UPDATE               0x73
+
+/*
+ * Packet header field helpers. For every field:
+ *   *_S(x) masks x to the field width and shifts it into position,
+ *   *_G(x) extracts the field from a header word,
+ *   *_C    is the AND-mask that clears the field.
+ * Fields: type in bits 31:30, count in bits 29:16, type-0 base index in
+ * bits 15:0, type-3 opcode in bits 15:8.
+ */
+#define PKT_TYPE_S(x)                   (((x) & 0x3) << 30)
+#define PKT_TYPE_G(x)                   (((x) >> 30) & 0x3)
+#define PKT_TYPE_C                      0x3FFFFFFF
+#define PKT_COUNT_S(x)                  (((x) & 0x3FFF) << 16)
+#define PKT_COUNT_G(x)                  (((x) >> 16) & 0x3FFF)
+#define PKT_COUNT_C                     0xC000FFFF
+#define PKT0_BASE_INDEX_S(x)            (((x) & 0xFFFF) << 0)
+#define PKT0_BASE_INDEX_G(x)            (((x) >> 0) & 0xFFFF)
+#define PKT0_BASE_INDEX_C               0xFFFF0000
+#define PKT3_IT_OPCODE_S(x)             (((x) & 0xFF) << 8)
+#define PKT3_IT_OPCODE_G(x)             (((x) >> 8) & 0xFF)
+#define PKT3_IT_OPCODE_C                0xFFFF00FF
+/* Build a complete type-0 header from a base index and a count. */
+#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
+/* Build a complete type-3 header from an opcode (PKT3_*) and a count. */
+#define PKT3(op, count) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count))
+
+/* Registers */
+/*
+ * Naming scheme used for every register below:
+ *   R_<offset>_<NAME>   register offset,
+ *   S_<offset>_<FIELD>  mask a value to the field width and shift it in,
+ *   G_<offset>_<FIELD>  extract the field from a register value,
+ *   C_<offset>_<FIELD>  AND-mask that clears the field,
+ *   V_<offset>_<VALUE>  named value, listed under the field it follows.
+ */
+/* CB_COLOR0_INFO: FORMAT is bits 7:2, ARRAY_MODE bits 11:8. */
+#define R_0280A0_CB_COLOR0_INFO                      0x0280A0
+#define   S_0280A0_ENDIAN(x)                           (((x) & 0x3) << 0)
+#define   G_0280A0_ENDIAN(x)                           (((x) >> 0) & 0x3)
+#define   C_0280A0_ENDIAN                              0xFFFFFFFC
+#define   S_0280A0_FORMAT(x)                           (((x) & 0x3F) << 2)
+#define   G_0280A0_FORMAT(x)                           (((x) >> 2) & 0x3F)
+#define   C_0280A0_FORMAT                              0xFFFFFF03
+#define     V_0280A0_COLOR_INVALID                     0x00000000
+#define     V_0280A0_COLOR_8                           0x00000001
+#define     V_0280A0_COLOR_4_4                         0x00000002
+#define     V_0280A0_COLOR_3_3_2                       0x00000003
+#define     V_0280A0_COLOR_16                          0x00000005
+#define     V_0280A0_COLOR_16_FLOAT                    0x00000006
+#define     V_0280A0_COLOR_8_8                         0x00000007
+#define     V_0280A0_COLOR_5_6_5                       0x00000008
+#define     V_0280A0_COLOR_6_5_5                       0x00000009
+#define     V_0280A0_COLOR_1_5_5_5                     0x0000000A
+#define     V_0280A0_COLOR_4_4_4_4                     0x0000000B
+#define     V_0280A0_COLOR_5_5_5_1                     0x0000000C
+#define     V_0280A0_COLOR_32                          0x0000000D
+#define     V_0280A0_COLOR_32_FLOAT                    0x0000000E
+#define     V_0280A0_COLOR_16_16                       0x0000000F
+#define     V_0280A0_COLOR_16_16_FLOAT                 0x00000010
+#define     V_0280A0_COLOR_8_24                        0x00000011
+#define     V_0280A0_COLOR_8_24_FLOAT                  0x00000012
+#define     V_0280A0_COLOR_24_8                        0x00000013
+#define     V_0280A0_COLOR_24_8_FLOAT                  0x00000014
+#define     V_0280A0_COLOR_10_11_11                    0x00000015
+#define     V_0280A0_COLOR_10_11_11_FLOAT              0x00000016
+#define     V_0280A0_COLOR_11_11_10                    0x00000017
+#define     V_0280A0_COLOR_11_11_10_FLOAT              0x00000018
+#define     V_0280A0_COLOR_2_10_10_10                  0x00000019
+#define     V_0280A0_COLOR_8_8_8_8                     0x0000001A
+#define     V_0280A0_COLOR_10_10_10_2                  0x0000001B
+#define     V_0280A0_COLOR_X24_8_32_FLOAT              0x0000001C
+#define     V_0280A0_COLOR_32_32                       0x0000001D
+#define     V_0280A0_COLOR_32_32_FLOAT                 0x0000001E
+#define     V_0280A0_COLOR_16_16_16_16                 0x0000001F
+#define     V_0280A0_COLOR_16_16_16_16_FLOAT           0x00000020
+#define     V_0280A0_COLOR_32_32_32_32                 0x00000022
+#define     V_0280A0_COLOR_32_32_32_32_FLOAT           0x00000023
+#define   S_0280A0_ARRAY_MODE(x)                       (((x) & 0xF) << 8)
+#define   G_0280A0_ARRAY_MODE(x)                       (((x) >> 8) & 0xF)
+#define   C_0280A0_ARRAY_MODE                          0xFFFFF0FF
+#define     V_0280A0_ARRAY_LINEAR_GENERAL              0x00000000
+#define     V_0280A0_ARRAY_LINEAR_ALIGNED              0x00000001
+#define     V_0280A0_ARRAY_1D_TILED_THIN1              0x00000002
+#define     V_0280A0_ARRAY_2D_TILED_THIN1              0x00000004
+#define   S_0280A0_NUMBER_TYPE(x)                      (((x) & 0x7) << 12)
+#define   G_0280A0_NUMBER_TYPE(x)                      (((x) >> 12) & 0x7)
+#define   C_0280A0_NUMBER_TYPE                         0xFFFF8FFF
+#define   S_0280A0_READ_SIZE(x)                        (((x) & 0x1) << 15)
+#define   G_0280A0_READ_SIZE(x)                        (((x) >> 15) & 0x1)
+#define   C_0280A0_READ_SIZE                           0xFFFF7FFF
+#define   S_0280A0_COMP_SWAP(x)                        (((x) & 0x3) << 16)
+#define   G_0280A0_COMP_SWAP(x)                        (((x) >> 16) & 0x3)
+#define   C_0280A0_COMP_SWAP                           0xFFFCFFFF
+#define   S_0280A0_TILE_MODE(x)                        (((x) & 0x3) << 18)
+#define   G_0280A0_TILE_MODE(x)                        (((x) >> 18) & 0x3)
+#define   C_0280A0_TILE_MODE                           0xFFF3FFFF
+#define   S_0280A0_BLEND_CLAMP(x)                      (((x) & 0x1) << 20)
+#define   G_0280A0_BLEND_CLAMP(x)                      (((x) >> 20) & 0x1)
+#define   C_0280A0_BLEND_CLAMP                         0xFFEFFFFF
+#define   S_0280A0_CLEAR_COLOR(x)                      (((x) & 0x1) << 21)
+#define   G_0280A0_CLEAR_COLOR(x)                      (((x) >> 21) & 0x1)
+#define   C_0280A0_CLEAR_COLOR                         0xFFDFFFFF
+#define   S_0280A0_BLEND_BYPASS(x)                     (((x) & 0x1) << 22)
+#define   G_0280A0_BLEND_BYPASS(x)                     (((x) >> 22) & 0x1)
+#define   C_0280A0_BLEND_BYPASS                        0xFFBFFFFF
+#define   S_0280A0_BLEND_FLOAT32(x)                    (((x) & 0x1) << 23)
+#define   G_0280A0_BLEND_FLOAT32(x)                    (((x) >> 23) & 0x1)
+#define   C_0280A0_BLEND_FLOAT32                       0xFF7FFFFF
+#define   S_0280A0_SIMPLE_FLOAT(x)                     (((x) & 0x1) << 24)
+#define   G_0280A0_SIMPLE_FLOAT(x)                     (((x) >> 24) & 0x1)
+#define   C_0280A0_SIMPLE_FLOAT                        0xFEFFFFFF
+#define   S_0280A0_ROUND_MODE(x)                       (((x) & 0x1) << 25)
+#define   G_0280A0_ROUND_MODE(x)                       (((x) >> 25) & 0x1)
+#define   C_0280A0_ROUND_MODE                          0xFDFFFFFF
+#define   S_0280A0_TILE_COMPACT(x)                     (((x) & 0x1) << 26)
+#define   G_0280A0_TILE_COMPACT(x)                     (((x) >> 26) & 0x1)
+#define   C_0280A0_TILE_COMPACT                        0xFBFFFFFF
+#define   S_0280A0_SOURCE_FORMAT(x)                    (((x) & 0x1) << 27)
+#define   G_0280A0_SOURCE_FORMAT(x)                    (((x) >> 27) & 0x1)
+#define   C_0280A0_SOURCE_FORMAT                       0xF7FFFFFF
+/* CB_COLOR0_SIZE: PITCH_TILE_MAX in bits 9:0, SLICE_TILE_MAX in bits 29:10. */
+#define R_028060_CB_COLOR0_SIZE                      0x028060
+#define   S_028060_PITCH_TILE_MAX(x)                   (((x) & 0x3FF) << 0)
+#define   G_028060_PITCH_TILE_MAX(x)                   (((x) >> 0) & 0x3FF)
+#define   C_028060_PITCH_TILE_MAX                      0xFFFFFC00
+#define   S_028060_SLICE_TILE_MAX(x)                   (((x) & 0xFFFFF) << 10)
+#define   G_028060_SLICE_TILE_MAX(x)                   (((x) >> 10) & 0xFFFFF)
+#define   C_028060_SLICE_TILE_MAX                      0xC00003FF
+/*
+ * DB_DEPTH_CONTROL: single-bit enables in bits 0..2 and 7, followed by
+ * 3-bit function/operation fields. The *_BF fields occupy bits 20 and up.
+ */
+#define R_028800_DB_DEPTH_CONTROL                    0x028800
+#define   S_028800_STENCIL_ENABLE(x)                   (((x) & 0x1) << 0)
+#define   G_028800_STENCIL_ENABLE(x)                   (((x) >> 0) & 0x1)
+#define   C_028800_STENCIL_ENABLE                      0xFFFFFFFE
+#define   S_028800_Z_ENABLE(x)                         (((x) & 0x1) << 1)
+#define   G_028800_Z_ENABLE(x)                         (((x) >> 1) & 0x1)
+#define   C_028800_Z_ENABLE                            0xFFFFFFFD
+#define   S_028800_Z_WRITE_ENABLE(x)                   (((x) & 0x1) << 2)
+#define   G_028800_Z_WRITE_ENABLE(x)                   (((x) >> 2) & 0x1)
+#define   C_028800_Z_WRITE_ENABLE                      0xFFFFFFFB
+#define   S_028800_ZFUNC(x)                            (((x) & 0x7) << 4)
+#define   G_028800_ZFUNC(x)                            (((x) >> 4) & 0x7)
+#define   C_028800_ZFUNC                               0xFFFFFF8F
+#define   S_028800_BACKFACE_ENABLE(x)                  (((x) & 0x1) << 7)
+#define   G_028800_BACKFACE_ENABLE(x)                  (((x) >> 7) & 0x1)
+#define   C_028800_BACKFACE_ENABLE                     0xFFFFFF7F
+#define   S_028800_STENCILFUNC(x)                      (((x) & 0x7) << 8)
+#define   G_028800_STENCILFUNC(x)                      (((x) >> 8) & 0x7)
+#define   C_028800_STENCILFUNC                         0xFFFFF8FF
+#define   S_028800_STENCILFAIL(x)                      (((x) & 0x7) << 11)
+#define   G_028800_STENCILFAIL(x)                      (((x) >> 11) & 0x7)
+#define   C_028800_STENCILFAIL                         0xFFFFC7FF
+#define   S_028800_STENCILZPASS(x)                     (((x) & 0x7) << 14)
+#define   G_028800_STENCILZPASS(x)                     (((x) >> 14) & 0x7)
+#define   C_028800_STENCILZPASS                        0xFFFE3FFF
+#define   S_028800_STENCILZFAIL(x)                     (((x) & 0x7) << 17)
+#define   G_028800_STENCILZFAIL(x)                     (((x) >> 17) & 0x7)
+#define   C_028800_STENCILZFAIL                        0xFFF1FFFF
+#define   S_028800_STENCILFUNC_BF(x)                   (((x) & 0x7) << 20)
+#define   G_028800_STENCILFUNC_BF(x)                   (((x) >> 20) & 0x7)
+#define   C_028800_STENCILFUNC_BF                      0xFF8FFFFF
+#define   S_028800_STENCILFAIL_BF(x)                   (((x) & 0x7) << 23)
+#define   G_028800_STENCILFAIL_BF(x)                   (((x) >> 23) & 0x7)
+#define   C_028800_STENCILFAIL_BF                      0xFC7FFFFF
+#define   S_028800_STENCILZPASS_BF(x)                  (((x) & 0x7) << 26)
+#define   G_028800_STENCILZPASS_BF(x)                  (((x) >> 26) & 0x7)
+#define   C_028800_STENCILZPASS_BF                     0xE3FFFFFF
+#define   S_028800_STENCILZFAIL_BF(x)                  (((x) & 0x7) << 29)
+#define   G_028800_STENCILZFAIL_BF(x)                  (((x) >> 29) & 0x7)
+#define   C_028800_STENCILZFAIL_BF                     0x1FFFFFFF
+/*
+ * DB_DEPTH_INFO: FORMAT is bits 2:0 (V_028010_DEPTH_* values),
+ * ARRAY_MODE bits 18:15, ZRANGE_PRECISION bit 31.
+ */
+#define R_028010_DB_DEPTH_INFO                       0x028010
+#define   S_028010_FORMAT(x)                           (((x) & 0x7) << 0)
+#define   G_028010_FORMAT(x)                           (((x) >> 0) & 0x7)
+#define   C_028010_FORMAT                              0xFFFFFFF8
+#define     V_028010_DEPTH_INVALID                     0x00000000
+#define     V_028010_DEPTH_16                          0x00000001
+#define     V_028010_DEPTH_X8_24                       0x00000002
+#define     V_028010_DEPTH_8_24                        0x00000003
+#define     V_028010_DEPTH_X8_24_FLOAT                 0x00000004
+#define     V_028010_DEPTH_8_24_FLOAT                  0x00000005
+#define     V_028010_DEPTH_32_FLOAT                    0x00000006
+#define     V_028010_DEPTH_X24_8_32_FLOAT              0x00000007
+#define   S_028010_READ_SIZE(x)                        (((x) & 0x1) << 3)
+#define   G_028010_READ_SIZE(x)                        (((x) >> 3) & 0x1)
+#define   C_028010_READ_SIZE                           0xFFFFFFF7
+#define   S_028010_ARRAY_MODE(x)                       (((x) & 0xF) << 15)
+#define   G_028010_ARRAY_MODE(x)                       (((x) >> 15) & 0xF)
+#define   C_028010_ARRAY_MODE                          0xFFF87FFF
+#define   S_028010_TILE_SURFACE_ENABLE(x)              (((x) & 0x1) << 25)
+#define   G_028010_TILE_SURFACE_ENABLE(x)              (((x) >> 25) & 0x1)
+#define   C_028010_TILE_SURFACE_ENABLE                 0xFDFFFFFF
+#define   S_028010_TILE_COMPACT(x)                     (((x) & 0x1) << 26)
+#define   G_028010_TILE_COMPACT(x)                     (((x) >> 26) & 0x1)
+#define   C_028010_TILE_COMPACT                        0xFBFFFFFF
+#define   S_028010_ZRANGE_PRECISION(x)                 (((x) & 0x1) << 31)
+#define   G_028010_ZRANGE_PRECISION(x)                 (((x) >> 31) & 0x1)
+#define   C_028010_ZRANGE_PRECISION                    0x7FFFFFFF
+/* DB_DEPTH_SIZE: PITCH_TILE_MAX in bits 9:0, SLICE_TILE_MAX in bits 29:10. */
+#define R_028000_DB_DEPTH_SIZE                       0x028000
+#define   S_028000_PITCH_TILE_MAX(x)                   (((x) & 0x3FF) << 0)
+#define   G_028000_PITCH_TILE_MAX(x)                   (((x) >> 0) & 0x3FF)
+#define   C_028000_PITCH_TILE_MAX                      0xFFFFFC00
+#define   S_028000_SLICE_TILE_MAX(x)                   (((x) & 0xFFFFF) << 10)
+#define   G_028000_SLICE_TILE_MAX(x)                   (((x) >> 10) & 0xFFFFF)
+#define   C_028000_SLICE_TILE_MAX                      0xC00003FF
+/* DB_DEPTH_VIEW: SLICE_START in bits 10:0, SLICE_MAX in bits 23:13. */
+#define R_028004_DB_DEPTH_VIEW                       0x028004
+#define   S_028004_SLICE_START(x)                      (((x) & 0x7FF) << 0)
+#define   G_028004_SLICE_START(x)                      (((x) >> 0) & 0x7FF)
+#define   C_028004_SLICE_START                         0xFFFFF800
+#define   S_028004_SLICE_MAX(x)                        (((x) & 0x7FF) << 13)
+#define   G_028004_SLICE_MAX(x)                        (((x) >> 13) & 0x7FF)
+#define   C_028004_SLICE_MAX                           0xFF001FFF
+/*
+ * DB_HTILE_SURFACE: six single-bit flags in bits 5:0, then two 6-bit
+ * prefetch fields (width in bits 11:6, height in bits 17:12).
+ */
+#define R_028D24_DB_HTILE_SURFACE                    0x028D24
+#define   S_028D24_HTILE_WIDTH(x)                      (((x) & 0x1) << 0)
+#define   G_028D24_HTILE_WIDTH(x)                      (((x) >> 0) & 0x1)
+#define   C_028D24_HTILE_WIDTH                         0xFFFFFFFE
+#define   S_028D24_HTILE_HEIGHT(x)                     (((x) & 0x1) << 1)
+#define   G_028D24_HTILE_HEIGHT(x)                     (((x) >> 1) & 0x1)
+#define   C_028D24_HTILE_HEIGHT                        0xFFFFFFFD
+#define   S_028D24_LINEAR(x)                           (((x) & 0x1) << 2)
+#define   G_028D24_LINEAR(x)                           (((x) >> 2) & 0x1)
+#define   C_028D24_LINEAR                              0xFFFFFFFB
+#define   S_028D24_FULL_CACHE(x)                       (((x) & 0x1) << 3)
+#define   G_028D24_FULL_CACHE(x)                       (((x) >> 3) & 0x1)
+#define   C_028D24_FULL_CACHE                          0xFFFFFFF7
+#define   S_028D24_HTILE_USES_PRELOAD_WIN(x)           (((x) & 0x1) << 4)
+#define   G_028D24_HTILE_USES_PRELOAD_WIN(x)           (((x) >> 4) & 0x1)
+#define   C_028D24_HTILE_USES_PRELOAD_WIN              0xFFFFFFEF
+#define   S_028D24_PRELOAD(x)                          (((x) & 0x1) << 5)
+#define   G_028D24_PRELOAD(x)                          (((x) >> 5) & 0x1)
+#define   C_028D24_PRELOAD                             0xFFFFFFDF
+#define   S_028D24_PREFETCH_WIDTH(x)                   (((x) & 0x3F) << 6)
+#define   G_028D24_PREFETCH_WIDTH(x)                   (((x) >> 6) & 0x3F)
+#define   C_028D24_PREFETCH_WIDTH                      0xFFFFF03F
+#define   S_028D24_PREFETCH_HEIGHT(x)                  (((x) & 0x3F) << 12)
+#define   G_028D24_PREFETCH_HEIGHT(x)                  (((x) >> 12) & 0x3F)
+#define   C_028D24_PREFETCH_HEIGHT                     0xFFFC0FFF
+#define R_028D34_DB_PREFETCH_LIMIT                   0x028D34 /* numeric tag 028D34 in the names below equals this value (presumably the register offset); S_ packs the field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028D34_DEPTH_HEIGHT_TILE_MAX(x)            (((x) & 0x3FF) << 0) /* bits 9:0 */
+#define   G_028D34_DEPTH_HEIGHT_TILE_MAX(x)            (((x) >> 0) & 0x3FF)
+#define   C_028D34_DEPTH_HEIGHT_TILE_MAX               0xFFFFFC00
+#define R_028D10_DB_RENDER_OVERRIDE                  0x028D10 /* numeric tag 028D10 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028D10_FORCE_HIZ_ENABLE(x)                 (((x) & 0x3) << 0) /* bits 1:0 */
+#define   G_028D10_FORCE_HIZ_ENABLE(x)                 (((x) >> 0) & 0x3)
+#define   C_028D10_FORCE_HIZ_ENABLE                    0xFFFFFFFC
+#define   S_028D10_FORCE_HIS_ENABLE0(x)                (((x) & 0x3) << 2) /* bits 3:2 */
+#define   G_028D10_FORCE_HIS_ENABLE0(x)                (((x) >> 2) & 0x3)
+#define   C_028D10_FORCE_HIS_ENABLE0                   0xFFFFFFF3
+#define   S_028D10_FORCE_HIS_ENABLE1(x)                (((x) & 0x3) << 4) /* bits 5:4 */
+#define   G_028D10_FORCE_HIS_ENABLE1(x)                (((x) >> 4) & 0x3)
+#define   C_028D10_FORCE_HIS_ENABLE1                   0xFFFFFFCF
+#define   S_028D10_FORCE_SHADER_Z_ORDER(x)             (((x) & 0x1) << 6) /* bit 6 */
+#define   G_028D10_FORCE_SHADER_Z_ORDER(x)             (((x) >> 6) & 0x1)
+#define   C_028D10_FORCE_SHADER_Z_ORDER                0xFFFFFFBF
+#define   S_028D10_FAST_Z_DISABLE(x)                   (((x) & 0x1) << 7) /* bit 7 */
+#define   G_028D10_FAST_Z_DISABLE(x)                   (((x) >> 7) & 0x1)
+#define   C_028D10_FAST_Z_DISABLE                      0xFFFFFF7F
+#define   S_028D10_FAST_STENCIL_DISABLE(x)             (((x) & 0x1) << 8) /* bit 8 */
+#define   G_028D10_FAST_STENCIL_DISABLE(x)             (((x) >> 8) & 0x1)
+#define   C_028D10_FAST_STENCIL_DISABLE                0xFFFFFEFF
+#define   S_028D10_NOOP_CULL_DISABLE(x)                (((x) & 0x1) << 9) /* bit 9 */
+#define   G_028D10_NOOP_CULL_DISABLE(x)                (((x) >> 9) & 0x1)
+#define   C_028D10_NOOP_CULL_DISABLE                   0xFFFFFDFF
+#define   S_028D10_FORCE_COLOR_KILL(x)                 (((x) & 0x1) << 10) /* bit 10 */
+#define   G_028D10_FORCE_COLOR_KILL(x)                 (((x) >> 10) & 0x1)
+#define   C_028D10_FORCE_COLOR_KILL                    0xFFFFFBFF
+#define   S_028D10_FORCE_Z_READ(x)                     (((x) & 0x1) << 11) /* bit 11 */
+#define   G_028D10_FORCE_Z_READ(x)                     (((x) >> 11) & 0x1)
+#define   C_028D10_FORCE_Z_READ                        0xFFFFF7FF
+#define   S_028D10_FORCE_STENCIL_READ(x)               (((x) & 0x1) << 12) /* bit 12 */
+#define   G_028D10_FORCE_STENCIL_READ(x)               (((x) >> 12) & 0x1)
+#define   C_028D10_FORCE_STENCIL_READ                  0xFFFFEFFF
+#define   S_028D10_FORCE_FULL_Z_RANGE(x)               (((x) & 0x3) << 13) /* bits 14:13 */
+#define   G_028D10_FORCE_FULL_Z_RANGE(x)               (((x) >> 13) & 0x3)
+#define   C_028D10_FORCE_FULL_Z_RANGE                  0xFFFF9FFF
+#define   S_028D10_FORCE_QC_SMASK_CONFLICT(x)          (((x) & 0x1) << 15) /* bit 15 */
+#define   G_028D10_FORCE_QC_SMASK_CONFLICT(x)          (((x) >> 15) & 0x1)
+#define   C_028D10_FORCE_QC_SMASK_CONFLICT             0xFFFF7FFF
+#define   S_028D10_DISABLE_VIEWPORT_CLAMP(x)           (((x) & 0x1) << 16) /* bit 16 */
+#define   G_028D10_DISABLE_VIEWPORT_CLAMP(x)           (((x) >> 16) & 0x1)
+#define   C_028D10_DISABLE_VIEWPORT_CLAMP              0xFFFEFFFF
+#define   S_028D10_IGNORE_SC_ZRANGE(x)                 (((x) & 0x1) << 17) /* bit 17 */
+#define   G_028D10_IGNORE_SC_ZRANGE(x)                 (((x) >> 17) & 0x1)
+#define   C_028D10_IGNORE_SC_ZRANGE                    0xFFFDFFFF
+#define R_028A40_VGT_GS_MODE                         0x028A40 /* numeric tag 028A40 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028A40_MODE(x)                             (((x) & 0x3) << 0) /* bits 1:0 */
+#define   G_028A40_MODE(x)                             (((x) >> 0) & 0x3)
+#define   C_028A40_MODE                                0xFFFFFFFC
+#define   S_028A40_ES_PASSTHRU(x)                      (((x) & 0x1) << 2) /* bit 2 */
+#define   G_028A40_ES_PASSTHRU(x)                      (((x) >> 2) & 0x1)
+#define   C_028A40_ES_PASSTHRU                         0xFFFFFFFB
+#define   S_028A40_CUT_MODE(x)                         (((x) & 0x3) << 3) /* bits 4:3 */
+#define   G_028A40_CUT_MODE(x)                         (((x) >> 3) & 0x3)
+#define   C_028A40_CUT_MODE                            0xFFFFFFE7
+#define R_008040_WAIT_UNTIL                          0x008040 /* numeric tag 008040 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_008040_WAIT_CP_DMA_IDLE(x)                 (((x) & 0x1) << 8) /* bit 8 */
+#define   G_008040_WAIT_CP_DMA_IDLE(x)                 (((x) >> 8) & 0x1)
+#define   C_008040_WAIT_CP_DMA_IDLE                    0xFFFFFEFF
+#define   S_008040_WAIT_CMDFIFO(x)                     (((x) & 0x1) << 10) /* bit 10 */
+#define   G_008040_WAIT_CMDFIFO(x)                     (((x) >> 10) & 0x1)
+#define   C_008040_WAIT_CMDFIFO                        0xFFFFFBFF
+#define   S_008040_WAIT_2D_IDLE(x)                     (((x) & 0x1) << 14) /* bit 14 */
+#define   G_008040_WAIT_2D_IDLE(x)                     (((x) >> 14) & 0x1)
+#define   C_008040_WAIT_2D_IDLE                        0xFFFFBFFF
+#define   S_008040_WAIT_3D_IDLE(x)                     (((x) & 0x1) << 15) /* bit 15 */
+#define   G_008040_WAIT_3D_IDLE(x)                     (((x) >> 15) & 0x1)
+#define   C_008040_WAIT_3D_IDLE                        0xFFFF7FFF
+#define   S_008040_WAIT_2D_IDLECLEAN(x)                (((x) & 0x1) << 16) /* bit 16 */
+#define   G_008040_WAIT_2D_IDLECLEAN(x)                (((x) >> 16) & 0x1)
+#define   C_008040_WAIT_2D_IDLECLEAN                   0xFFFEFFFF
+#define   S_008040_WAIT_3D_IDLECLEAN(x)                (((x) & 0x1) << 17) /* bit 17 */
+#define   G_008040_WAIT_3D_IDLECLEAN(x)                (((x) >> 17) & 0x1)
+#define   C_008040_WAIT_3D_IDLECLEAN                   0xFFFDFFFF
+#define   S_008040_WAIT_EXTERN_SIG(x)                  (((x) & 0x1) << 19) /* bit 19 */
+#define   G_008040_WAIT_EXTERN_SIG(x)                  (((x) >> 19) & 0x1)
+#define   C_008040_WAIT_EXTERN_SIG                     0xFFF7FFFF
+#define   S_008040_CMDFIFO_ENTRIES(x)                  (((x) & 0x1F) << 20) /* bits 24:20 */
+#define   G_008040_CMDFIFO_ENTRIES(x)                  (((x) >> 20) & 0x1F)
+#define   C_008040_CMDFIFO_ENTRIES                     0xFE0FFFFF
+#define R_0286CC_SPI_PS_IN_CONTROL_0                 0x0286CC /* numeric tag 0286CC in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_0286CC_NUM_INTERP(x)                       (((x) & 0x3F) << 0) /* bits 5:0 */
+#define   G_0286CC_NUM_INTERP(x)                       (((x) >> 0) & 0x3F)
+#define   C_0286CC_NUM_INTERP                          0xFFFFFFC0
+#define   S_0286CC_POSITION_ENA(x)                     (((x) & 0x1) << 8) /* bit 8 */
+#define   G_0286CC_POSITION_ENA(x)                     (((x) >> 8) & 0x1)
+#define   C_0286CC_POSITION_ENA                        0xFFFFFEFF
+#define   S_0286CC_POSITION_CENTROID(x)                (((x) & 0x1) << 9) /* bit 9 */
+#define   G_0286CC_POSITION_CENTROID(x)                (((x) >> 9) & 0x1)
+#define   C_0286CC_POSITION_CENTROID                   0xFFFFFDFF
+#define   S_0286CC_POSITION_ADDR(x)                    (((x) & 0x1F) << 10) /* bits 14:10 */
+#define   G_0286CC_POSITION_ADDR(x)                    (((x) >> 10) & 0x1F)
+#define   C_0286CC_POSITION_ADDR                       0xFFFF83FF
+#define   S_0286CC_PARAM_GEN(x)                        (((x) & 0xF) << 15) /* bits 18:15 */
+#define   G_0286CC_PARAM_GEN(x)                        (((x) >> 15) & 0xF)
+#define   C_0286CC_PARAM_GEN                           0xFFF87FFF
+#define   S_0286CC_PARAM_GEN_ADDR(x)                   (((x) & 0x7F) << 19) /* bits 25:19 */
+#define   G_0286CC_PARAM_GEN_ADDR(x)                   (((x) >> 19) & 0x7F)
+#define   C_0286CC_PARAM_GEN_ADDR                      0xFC07FFFF
+#define   S_0286CC_BARYC_SAMPLE_CNTL(x)                (((x) & 0x3) << 26) /* bits 27:26 */
+#define   G_0286CC_BARYC_SAMPLE_CNTL(x)                (((x) >> 26) & 0x3)
+#define   C_0286CC_BARYC_SAMPLE_CNTL                   0xF3FFFFFF
+#define   S_0286CC_PERSP_GRADIENT_ENA(x)               (((x) & 0x1) << 28) /* bit 28 */
+#define   G_0286CC_PERSP_GRADIENT_ENA(x)               (((x) >> 28) & 0x1)
+#define   C_0286CC_PERSP_GRADIENT_ENA                  0xEFFFFFFF
+#define   S_0286CC_LINEAR_GRADIENT_ENA(x)              (((x) & 0x1) << 29) /* bit 29 */
+#define   G_0286CC_LINEAR_GRADIENT_ENA(x)              (((x) >> 29) & 0x1)
+#define   C_0286CC_LINEAR_GRADIENT_ENA                 0xDFFFFFFF
+#define   S_0286CC_POSITION_SAMPLE(x)                  (((x) & 0x1) << 30) /* bit 30 */
+#define   G_0286CC_POSITION_SAMPLE(x)                  (((x) >> 30) & 0x1)
+#define   C_0286CC_POSITION_SAMPLE                     0xBFFFFFFF
+#define   S_0286CC_BARYC_AT_SAMPLE_ENA(x)              (((unsigned)(x) & 0x1) << 31) /* bit 31; cast to unsigned so shifting into the top bit is not signed-int overflow */
+#define   G_0286CC_BARYC_AT_SAMPLE_ENA(x)              (((x) >> 31) & 0x1)
+#define   C_0286CC_BARYC_AT_SAMPLE_ENA                 0x7FFFFFFF
+#define R_0286D0_SPI_PS_IN_CONTROL_1                 0x0286D0 /* numeric tag 0286D0 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_0286D0_GEN_INDEX_PIX(x)                    (((x) & 0x1) << 0) /* bit 0 */
+#define   G_0286D0_GEN_INDEX_PIX(x)                    (((x) >> 0) & 0x1)
+#define   C_0286D0_GEN_INDEX_PIX                       0xFFFFFFFE
+#define   S_0286D0_GEN_INDEX_PIX_ADDR(x)               (((x) & 0x7F) << 1) /* bits 7:1 */
+#define   G_0286D0_GEN_INDEX_PIX_ADDR(x)               (((x) >> 1) & 0x7F)
+#define   C_0286D0_GEN_INDEX_PIX_ADDR                  0xFFFFFF01
+#define   S_0286D0_FRONT_FACE_ENA(x)                   (((x) & 0x1) << 8) /* bit 8 */
+#define   G_0286D0_FRONT_FACE_ENA(x)                   (((x) >> 8) & 0x1)
+#define   C_0286D0_FRONT_FACE_ENA                      0xFFFFFEFF
+#define   S_0286D0_FRONT_FACE_CHAN(x)                  (((x) & 0x3) << 9) /* bits 10:9 */
+#define   G_0286D0_FRONT_FACE_CHAN(x)                  (((x) >> 9) & 0x3)
+#define   C_0286D0_FRONT_FACE_CHAN                     0xFFFFF9FF
+#define   S_0286D0_FRONT_FACE_ALL_BITS(x)              (((x) & 0x1) << 11) /* bit 11 */
+#define   G_0286D0_FRONT_FACE_ALL_BITS(x)              (((x) >> 11) & 0x1)
+#define   C_0286D0_FRONT_FACE_ALL_BITS                 0xFFFFF7FF
+#define   S_0286D0_FRONT_FACE_ADDR(x)                  (((x) & 0x1F) << 12) /* bits 16:12 */
+#define   G_0286D0_FRONT_FACE_ADDR(x)                  (((x) >> 12) & 0x1F)
+#define   C_0286D0_FRONT_FACE_ADDR                     0xFFFE0FFF
+#define   S_0286D0_FOG_ADDR(x)                         (((x) & 0x7F) << 17) /* bits 23:17 */
+#define   G_0286D0_FOG_ADDR(x)                         (((x) >> 17) & 0x7F)
+#define   C_0286D0_FOG_ADDR                            0xFF01FFFF
+#define   S_0286D0_FIXED_PT_POSITION_ENA(x)            (((x) & 0x1) << 24) /* bit 24 */
+#define   G_0286D0_FIXED_PT_POSITION_ENA(x)            (((x) >> 24) & 0x1)
+#define   C_0286D0_FIXED_PT_POSITION_ENA               0xFEFFFFFF
+#define   S_0286D0_FIXED_PT_POSITION_ADDR(x)           (((x) & 0x1F) << 25) /* bits 29:25 */
+#define   G_0286D0_FIXED_PT_POSITION_ADDR(x)           (((x) >> 25) & 0x1F)
+#define   C_0286D0_FIXED_PT_POSITION_ADDR              0xC1FFFFFF
+#define R_0286C4_SPI_VS_OUT_CONFIG                   0x0286C4 /* numeric tag 0286C4 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_0286C4_VS_PER_COMPONENT(x)                 (((x) & 0x1) << 0) /* bit 0 */
+#define   G_0286C4_VS_PER_COMPONENT(x)                 (((x) >> 0) & 0x1)
+#define   C_0286C4_VS_PER_COMPONENT                    0xFFFFFFFE
+#define   S_0286C4_VS_EXPORT_COUNT(x)                  (((x) & 0x1F) << 1) /* bits 5:1 */
+#define   G_0286C4_VS_EXPORT_COUNT(x)                  (((x) >> 1) & 0x1F)
+#define   C_0286C4_VS_EXPORT_COUNT                     0xFFFFFFC1
+#define   S_0286C4_VS_EXPORTS_FOG(x)                   (((x) & 0x1) << 8) /* bit 8 */
+#define   G_0286C4_VS_EXPORTS_FOG(x)                   (((x) >> 8) & 0x1)
+#define   C_0286C4_VS_EXPORTS_FOG                      0xFFFFFEFF
+#define   S_0286C4_VS_OUT_FOG_VEC_ADDR(x)              (((x) & 0x1F) << 9) /* bits 13:9 */
+#define   G_0286C4_VS_OUT_FOG_VEC_ADDR(x)              (((x) >> 9) & 0x1F)
+#define   C_0286C4_VS_OUT_FOG_VEC_ADDR                 0xFFFFC1FF
+#define R_028240_PA_SC_GENERIC_SCISSOR_TL            0x028240 /* numeric tag 028240 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028240_TL_X(x)                             (((x) & 0x3FFF) << 0) /* bits 13:0 */
+#define   G_028240_TL_X(x)                             (((x) >> 0) & 0x3FFF)
+#define   C_028240_TL_X                                0xFFFFC000
+#define   S_028240_TL_Y(x)                             (((x) & 0x3FFF) << 16) /* bits 29:16 */
+#define   G_028240_TL_Y(x)                             (((x) >> 16) & 0x3FFF)
+#define   C_028240_TL_Y                                0xC000FFFF
+#define   S_028240_WINDOW_OFFSET_DISABLE(x)            (((unsigned)(x) & 0x1) << 31) /* bit 31; cast to unsigned so shifting into the top bit is not signed-int overflow */
+#define   G_028240_WINDOW_OFFSET_DISABLE(x)            (((x) >> 31) & 0x1)
+#define   C_028240_WINDOW_OFFSET_DISABLE               0x7FFFFFFF
+#define R_028244_PA_SC_GENERIC_SCISSOR_BR            0x028244 /* numeric tag 028244 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028244_BR_X(x)                             (((x) & 0x3FFF) << 0) /* bits 13:0 */
+#define   G_028244_BR_X(x)                             (((x) >> 0) & 0x3FFF)
+#define   C_028244_BR_X                                0xFFFFC000
+#define   S_028244_BR_Y(x)                             (((x) & 0x3FFF) << 16) /* bits 29:16 */
+#define   G_028244_BR_Y(x)                             (((x) >> 16) & 0x3FFF)
+#define   C_028244_BR_Y                                0xC000FFFF
+#define R_028030_PA_SC_SCREEN_SCISSOR_TL             0x028030 /* numeric tag 028030 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028030_TL_X(x)                             (((x) & 0x7FFF) << 0) /* bits 14:0 */
+#define   G_028030_TL_X(x)                             (((x) >> 0) & 0x7FFF)
+#define   C_028030_TL_X                                0xFFFF8000
+#define   S_028030_TL_Y(x)                             (((x) & 0x7FFF) << 16) /* bits 30:16 */
+#define   G_028030_TL_Y(x)                             (((x) >> 16) & 0x7FFF)
+#define   C_028030_TL_Y                                0x8000FFFF
+#define R_028034_PA_SC_SCREEN_SCISSOR_BR             0x028034 /* numeric tag 028034 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028034_BR_X(x)                             (((x) & 0x7FFF) << 0) /* bits 14:0 */
+#define   G_028034_BR_X(x)                             (((x) >> 0) & 0x7FFF)
+#define   C_028034_BR_X                                0xFFFF8000
+#define   S_028034_BR_Y(x)                             (((x) & 0x7FFF) << 16) /* bits 30:16 */
+#define   G_028034_BR_Y(x)                             (((x) >> 16) & 0x7FFF)
+#define   C_028034_BR_Y                                0x8000FFFF
+#define R_028204_PA_SC_WINDOW_SCISSOR_TL             0x028204 /* numeric tag 028204 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028204_TL_X(x)                             (((x) & 0x3FFF) << 0) /* bits 13:0 */
+#define   G_028204_TL_X(x)                             (((x) >> 0) & 0x3FFF)
+#define   C_028204_TL_X                                0xFFFFC000
+#define   S_028204_TL_Y(x)                             (((x) & 0x3FFF) << 16) /* bits 29:16 */
+#define   G_028204_TL_Y(x)                             (((x) >> 16) & 0x3FFF)
+#define   C_028204_TL_Y                                0xC000FFFF
+#define   S_028204_WINDOW_OFFSET_DISABLE(x)            (((unsigned)(x) & 0x1) << 31) /* bit 31; cast to unsigned so shifting into the top bit is not signed-int overflow */
+#define   G_028204_WINDOW_OFFSET_DISABLE(x)            (((x) >> 31) & 0x1)
+#define   C_028204_WINDOW_OFFSET_DISABLE               0x7FFFFFFF
+#define R_028208_PA_SC_WINDOW_SCISSOR_BR             0x028208 /* numeric tag 028208 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028208_BR_X(x)                             (((x) & 0x3FFF) << 0) /* bits 13:0 */
+#define   G_028208_BR_X(x)                             (((x) >> 0) & 0x3FFF)
+#define   C_028208_BR_X                                0xFFFFC000
+#define   S_028208_BR_Y(x)                             (((x) & 0x3FFF) << 16) /* bits 29:16 */
+#define   G_028208_BR_Y(x)                             (((x) >> 16) & 0x3FFF)
+#define   C_028208_BR_Y                                0xC000FFFF
+#define R_0287F0_VGT_DRAW_INITIATOR                  0x0287F0 /* numeric tag 0287F0 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_0287F0_SOURCE_SELECT(x)                    (((x) & 0x3) << 0) /* bits 1:0 */
+#define   G_0287F0_SOURCE_SELECT(x)                    (((x) >> 0) & 0x3)
+#define   C_0287F0_SOURCE_SELECT                       0xFFFFFFFC
+#define   S_0287F0_MAJOR_MODE(x)                       (((x) & 0x3) << 2) /* bits 3:2 */
+#define   G_0287F0_MAJOR_MODE(x)                       (((x) >> 2) & 0x3)
+#define   C_0287F0_MAJOR_MODE                          0xFFFFFFF3
+#define   S_0287F0_SPRITE_EN(x)                        (((x) & 0x1) << 4) /* bit 4 */
+#define   G_0287F0_SPRITE_EN(x)                        (((x) >> 4) & 0x1)
+#define   C_0287F0_SPRITE_EN                           0xFFFFFFEF
+#define   S_0287F0_NOT_EOP(x)                          (((x) & 0x1) << 5) /* bit 5 */
+#define   G_0287F0_NOT_EOP(x)                          (((x) >> 5) & 0x1)
+#define   C_0287F0_NOT_EOP                             0xFFFFFFDF
+#define   S_0287F0_USE_OPAQUE(x)                       (((x) & 0x1) << 6) /* bit 6 */
+#define   G_0287F0_USE_OPAQUE(x)                       (((x) >> 6) & 0x1)
+#define   C_0287F0_USE_OPAQUE                          0xFFFFFFBF
+#define R_038008_SQ_VTX_CONSTANT_WORD2_0             0x038008 /* numeric tag 038008 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_038008_BASE_ADDRESS_HI(x)                  (((x) & 0xFF) << 0) /* bits 7:0 */
+#define   G_038008_BASE_ADDRESS_HI(x)                  (((x) >> 0) & 0xFF)
+#define   C_038008_BASE_ADDRESS_HI                     0xFFFFFF00
+#define   S_038008_STRIDE(x)                           (((x) & 0x7FF) << 8) /* bits 18:8 */
+#define   G_038008_STRIDE(x)                           (((x) >> 8) & 0x7FF)
+#define   C_038008_STRIDE                              0xFFF800FF
+#define   S_038008_CLAMP_X(x)                          (((x) & 0x1) << 19) /* bit 19 */
+#define   G_038008_CLAMP_X(x)                          (((x) >> 19) & 0x1)
+#define   C_038008_CLAMP_X                             0xFFF7FFFF
+#define   S_038008_DATA_FORMAT(x)                      (((x) & 0x3F) << 20) /* bits 25:20; the V_038008_COLOR_* constants below are presumably its encodings */
+#define   G_038008_DATA_FORMAT(x)                      (((x) >> 20) & 0x3F)
+#define   C_038008_DATA_FORMAT                         0xFC0FFFFF
+#define     V_038008_COLOR_INVALID                     0x00000000
+#define     V_038008_COLOR_8                           0x00000001
+#define     V_038008_COLOR_4_4                         0x00000002
+#define     V_038008_COLOR_3_3_2                       0x00000003
+#define     V_038008_COLOR_16                          0x00000005
+#define     V_038008_COLOR_16_FLOAT                    0x00000006
+#define     V_038008_COLOR_8_8                         0x00000007
+#define     V_038008_COLOR_5_6_5                       0x00000008
+#define     V_038008_COLOR_6_5_5                       0x00000009
+#define     V_038008_COLOR_1_5_5_5                     0x0000000A
+#define     V_038008_COLOR_4_4_4_4                     0x0000000B
+#define     V_038008_COLOR_5_5_5_1                     0x0000000C
+#define     V_038008_COLOR_32                          0x0000000D
+#define     V_038008_COLOR_32_FLOAT                    0x0000000E
+#define     V_038008_COLOR_16_16                       0x0000000F
+#define     V_038008_COLOR_16_16_FLOAT                 0x00000010
+#define     V_038008_COLOR_8_24                        0x00000011
+#define     V_038008_COLOR_8_24_FLOAT                  0x00000012
+#define     V_038008_COLOR_24_8                        0x00000013
+#define     V_038008_COLOR_24_8_FLOAT                  0x00000014
+#define     V_038008_COLOR_10_11_11                    0x00000015
+#define     V_038008_COLOR_10_11_11_FLOAT              0x00000016
+#define     V_038008_COLOR_11_11_10                    0x00000017
+#define     V_038008_COLOR_11_11_10_FLOAT              0x00000018
+#define     V_038008_COLOR_2_10_10_10                  0x00000019
+#define     V_038008_COLOR_8_8_8_8                     0x0000001A
+#define     V_038008_COLOR_10_10_10_2                  0x0000001B
+#define     V_038008_COLOR_X24_8_32_FLOAT              0x0000001C
+#define     V_038008_COLOR_32_32                       0x0000001D
+#define     V_038008_COLOR_32_32_FLOAT                 0x0000001E
+#define     V_038008_COLOR_16_16_16_16                 0x0000001F
+#define     V_038008_COLOR_16_16_16_16_FLOAT           0x00000020
+#define     V_038008_COLOR_32_32_32_32                 0x00000022
+#define     V_038008_COLOR_32_32_32_32_FLOAT           0x00000023
+#define   S_038008_NUM_FORMAT_ALL(x)                   (((x) & 0x3) << 26) /* bits 27:26 */
+#define   G_038008_NUM_FORMAT_ALL(x)                   (((x) >> 26) & 0x3)
+#define   C_038008_NUM_FORMAT_ALL                      0xF3FFFFFF
+#define   S_038008_FORMAT_COMP_ALL(x)                  (((x) & 0x1) << 28) /* bit 28 */
+#define   G_038008_FORMAT_COMP_ALL(x)                  (((x) >> 28) & 0x1)
+#define   C_038008_FORMAT_COMP_ALL                     0xEFFFFFFF
+#define   S_038008_SRF_MODE_ALL(x)                     (((x) & 0x1) << 29) /* bit 29 */
+#define   G_038008_SRF_MODE_ALL(x)                     (((x) >> 29) & 0x1)
+#define   C_038008_SRF_MODE_ALL                        0xDFFFFFFF
+#define   S_038008_ENDIAN_SWAP(x)                      (((unsigned)(x) & 0x3) << 30) /* bits 31:30; cast to unsigned so values 2 and 3 do not overflow a signed int when shifted */
+#define   G_038008_ENDIAN_SWAP(x)                      (((x) >> 30) & 0x3)
+#define   C_038008_ENDIAN_SWAP                         0x3FFFFFFF
+#define R_008958_VGT_PRIMITIVE_TYPE                  0x008958 /* numeric tag 008958 in the names below equals this value (presumably the register offset); S_ packs the field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_008958_PRIM_TYPE(x)                        (((x) & 0x3F) << 0) /* bits 5:0; the V_008958_DI_PT_* constants below are presumably its encodings */
+#define   G_008958_PRIM_TYPE(x)                        (((x) >> 0) & 0x3F)
+#define   C_008958_PRIM_TYPE                           0xFFFFFFC0
+#define     V_008958_DI_PT_NONE                        0x00000000
+#define     V_008958_DI_PT_POINTLIST                   0x00000001
+#define     V_008958_DI_PT_LINELIST                    0x00000002
+#define     V_008958_DI_PT_LINESTRIP                   0x00000003
+#define     V_008958_DI_PT_TRILIST                     0x00000004
+#define     V_008958_DI_PT_TRIFAN                      0x00000005
+#define     V_008958_DI_PT_TRISTRIP                    0x00000006
+#define     V_008958_DI_PT_UNUSED_0                    0x00000007
+#define     V_008958_DI_PT_UNUSED_1                    0x00000008
+#define     V_008958_DI_PT_UNUSED_2                    0x00000009
+#define     V_008958_DI_PT_LINELIST_ADJ                0x0000000A
+#define     V_008958_DI_PT_LINESTRIP_ADJ               0x0000000B
+#define     V_008958_DI_PT_TRILIST_ADJ                 0x0000000C
+#define     V_008958_DI_PT_TRISTRIP_ADJ                0x0000000D
+#define     V_008958_DI_PT_UNUSED_3                    0x0000000E
+#define     V_008958_DI_PT_UNUSED_4                    0x0000000F
+#define     V_008958_DI_PT_TRI_WITH_WFLAGS             0x00000010
+#define     V_008958_DI_PT_RECTLIST                    0x00000011
+#define     V_008958_DI_PT_LINELOOP                    0x00000012
+#define     V_008958_DI_PT_QUADLIST                    0x00000013
+#define     V_008958_DI_PT_QUADSTRIP                   0x00000014
+#define     V_008958_DI_PT_POLYGON                     0x00000015
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V0        0x00000016
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V1        0x00000017
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V2        0x00000018
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V3        0x00000019
+#define     V_008958_DI_PT_2D_FILL_RECT_LIST           0x0000001A
+#define     V_008958_DI_PT_2D_LINE_STRIP               0x0000001B
+#define     V_008958_DI_PT_2D_TRI_STRIP                0x0000001C
+#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868 /* numeric tag 028868 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028868_NUM_GPRS(x)                         (((x) & 0xFF) << 0) /* bits 7:0 */
+#define   G_028868_NUM_GPRS(x)                         (((x) >> 0) & 0xFF)
+#define   C_028868_NUM_GPRS                            0xFFFFFF00
+#define   S_028868_STACK_SIZE(x)                       (((x) & 0xFF) << 8) /* bits 15:8 */
+#define   G_028868_STACK_SIZE(x)                       (((x) >> 8) & 0xFF)
+#define   C_028868_STACK_SIZE                          0xFFFF00FF
+#define   S_028868_DX10_CLAMP(x)                       (((x) & 0x1) << 21) /* bit 21 */
+#define   G_028868_DX10_CLAMP(x)                       (((x) >> 21) & 0x1)
+#define   C_028868_DX10_CLAMP                          0xFFDFFFFF
+#define   S_028868_FETCH_CACHE_LINES(x)                (((x) & 0x7) << 24) /* bits 26:24 */
+#define   G_028868_FETCH_CACHE_LINES(x)                (((x) >> 24) & 0x7)
+#define   C_028868_FETCH_CACHE_LINES                   0xF8FFFFFF
+#define   S_028868_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28) /* bit 28 */
+#define   G_028868_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
+#define   C_028868_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850 /* numeric tag 028850 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028850_NUM_GPRS(x)                         (((x) & 0xFF) << 0) /* bits 7:0 */
+#define   G_028850_NUM_GPRS(x)                         (((x) >> 0) & 0xFF)
+#define   C_028850_NUM_GPRS                            0xFFFFFF00
+#define   S_028850_STACK_SIZE(x)                       (((x) & 0xFF) << 8) /* bits 15:8 */
+#define   G_028850_STACK_SIZE(x)                       (((x) >> 8) & 0xFF)
+#define   C_028850_STACK_SIZE                          0xFFFF00FF
+#define   S_028850_DX10_CLAMP(x)                       (((x) & 0x1) << 21) /* bit 21 */
+#define   G_028850_DX10_CLAMP(x)                       (((x) >> 21) & 0x1)
+#define   C_028850_DX10_CLAMP                          0xFFDFFFFF
+#define   S_028850_FETCH_CACHE_LINES(x)                (((x) & 0x7) << 24) /* bits 26:24 */
+#define   G_028850_FETCH_CACHE_LINES(x)                (((x) >> 24) & 0x7)
+#define   C_028850_FETCH_CACHE_LINES                   0xF8FFFFFF
+#define   S_028850_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28) /* bit 28 */
+#define   G_028850_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
+#define   C_028850_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define   S_028850_CLAMP_CONSTS(x)                     (((unsigned)(x) & 0x1) << 31) /* bit 31; cast to unsigned so shifting into the top bit is not signed-int overflow */
+#define   G_028850_CLAMP_CONSTS(x)                     (((x) >> 31) & 0x1)
+#define   C_028850_CLAMP_CONSTS                        0x7FFFFFFF
+#define R_028644_SPI_PS_INPUT_CNTL_0                 0x028644 /* numeric tag 028644 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_028644_SEMANTIC(x)                         (((x) & 0xFF) << 0) /* bits 7:0 */
+#define   G_028644_SEMANTIC(x)                         (((x) >> 0) & 0xFF)
+#define   C_028644_SEMANTIC                            0xFFFFFF00
+#define   S_028644_DEFAULT_VAL(x)                      (((x) & 0x3) << 8) /* bits 9:8 */
+#define   G_028644_DEFAULT_VAL(x)                      (((x) >> 8) & 0x3)
+#define   C_028644_DEFAULT_VAL                         0xFFFFFCFF
+#define   S_028644_FLAT_SHADE(x)                       (((x) & 0x1) << 10) /* bit 10 */
+#define   G_028644_FLAT_SHADE(x)                       (((x) >> 10) & 0x1)
+#define   C_028644_FLAT_SHADE                          0xFFFFFBFF
+#define   S_028644_SEL_CENTROID(x)                     (((x) & 0x1) << 11) /* bit 11 */
+#define   G_028644_SEL_CENTROID(x)                     (((x) >> 11) & 0x1)
+#define   C_028644_SEL_CENTROID                        0xFFFFF7FF
+#define   S_028644_SEL_LINEAR(x)                       (((x) & 0x1) << 12) /* bit 12 */
+#define   G_028644_SEL_LINEAR(x)                       (((x) >> 12) & 0x1)
+#define   C_028644_SEL_LINEAR                          0xFFFFEFFF
+#define   S_028644_CYL_WRAP(x)                         (((x) & 0xF) << 13) /* bits 16:13 */
+#define   G_028644_CYL_WRAP(x)                         (((x) >> 13) & 0xF)
+#define   C_028644_CYL_WRAP                            0xFFFE1FFF
+#define   S_028644_PT_SPRITE_TEX(x)                    (((x) & 0x1) << 17) /* bit 17 */
+#define   G_028644_PT_SPRITE_TEX(x)                    (((x) >> 17) & 0x1)
+#define   C_028644_PT_SPRITE_TEX                       0xFFFDFFFF
+#define   S_028644_SEL_SAMPLE(x)                       (((x) & 0x1) << 18) /* bit 18 */
+#define   G_028644_SEL_SAMPLE(x)                       (((x) >> 18) & 0x1)
+#define   C_028644_SEL_SAMPLE                          0xFFFBFFFF
+#define R_0286D4_SPI_INTERP_CONTROL_0                0x0286D4 /* numeric tag 0286D4 in the names below equals this value (presumably the register offset); S_ packs a field, G_ extracts it, C_ is its clear (AND) mask */
+#define   S_0286D4_FLAT_SHADE_ENA(x)                   (((x) & 0x1) << 0) /* bit 0 */
+#define   G_0286D4_FLAT_SHADE_ENA(x)                   (((x) >> 0) & 0x1)
+#define   C_0286D4_FLAT_SHADE_ENA                      0xFFFFFFFE
+#define   S_0286D4_PNT_SPRITE_ENA(x)                   (((x) & 0x1) << 1) /* bit 1 */
+#define   G_0286D4_PNT_SPRITE_ENA(x)                   (((x) >> 1) & 0x1)
+#define   C_0286D4_PNT_SPRITE_ENA                      0xFFFFFFFD
+#define   S_0286D4_PNT_SPRITE_OVRD_X(x)                (((x) & 0x7) << 2) /* bits 4:2 */
+#define   G_0286D4_PNT_SPRITE_OVRD_X(x)                (((x) >> 2) & 0x7)
+#define   C_0286D4_PNT_SPRITE_OVRD_X                   0xFFFFFFE3
+#define   S_0286D4_PNT_SPRITE_OVRD_Y(x)                (((x) & 0x7) << 5) /* bits 7:5 */
+#define   G_0286D4_PNT_SPRITE_OVRD_Y(x)                (((x) >> 5) & 0x7)
+#define   C_0286D4_PNT_SPRITE_OVRD_Y                   0xFFFFFF1F
+#define   S_0286D4_PNT_SPRITE_OVRD_Z(x)                (((x) & 0x7) << 8) /* bits 10:8 */
+#define   G_0286D4_PNT_SPRITE_OVRD_Z(x)                (((x) >> 8) & 0x7)
+#define   C_0286D4_PNT_SPRITE_OVRD_Z                   0xFFFFF8FF
+#define   S_0286D4_PNT_SPRITE_OVRD_W(x)                (((x) & 0x7) << 11) /* bits 13:11 */
+#define   G_0286D4_PNT_SPRITE_OVRD_W(x)                (((x) >> 11) & 0x7)
+#define   C_0286D4_PNT_SPRITE_OVRD_W                   0xFFFFC7FF
+#define   S_0286D4_PNT_SPRITE_TOP_1(x)                 (((x) & 0x1) << 14) /* bit 14 */
+#define   G_0286D4_PNT_SPRITE_TOP_1(x)                 (((x) >> 14) & 0x1)
+#define   C_0286D4_PNT_SPRITE_TOP_1                    0xFFFFBFFF
+
+#endif
diff --git a/src/gallium/drivers/r600/r700_sq.h b/src/gallium/drivers/r600/r700_sq.h
new file mode 100644
index 0000000000..8266af6d1f
--- /dev/null
+++ b/src/gallium/drivers/r600/r700_sq.h
@@ -0,0 +1,609 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#ifndef R700_SQ_H
+#define R700_SQ_H
+
+#define P_SQ_CF_WORD0 /* empty marker macro heading the SQ_CF_WORD0 field group; S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_CF_WORD0_ADDR(x)                                      (((x) & 0xFFFFFFFF) << 0) /* ADDR spans all 32 bits */
+#define   G_SQ_CF_WORD0_ADDR(x)                                      (((x) >> 0) & 0xFFFFFFFF)
+#define   C_SQ_CF_WORD0_ADDR                                         0x00000000
+#define P_SQ_CF_WORD1 /* empty marker macro heading the SQ_CF_WORD1 field group; S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_CF_WORD1_POP_COUNT(x)                                 (((x) & 0x7) << 0)
+#define   G_SQ_CF_WORD1_POP_COUNT(x)                                 (((x) >> 0) & 0x7)
+#define   C_SQ_CF_WORD1_POP_COUNT                                    0xFFFFFFF8
+#define   S_SQ_CF_WORD1_CF_CONST(x)                                  (((x) & 0x1F) << 3)
+#define   G_SQ_CF_WORD1_CF_CONST(x)                                  (((x) >> 3) & 0x1F)
+#define   C_SQ_CF_WORD1_CF_CONST                                     0xFFFFFF07
+#define   S_SQ_CF_WORD1_COND(x)                                      (((x) & 0x3) << 8)
+#define   G_SQ_CF_WORD1_COND(x)                                      (((x) >> 8) & 0x3)
+#define   C_SQ_CF_WORD1_COND                                         0xFFFFFCFF
+#define   S_SQ_CF_WORD1_COUNT(x)                                     (((x) & 0x7) << 10)
+#define   G_SQ_CF_WORD1_COUNT(x)                                     (((x) >> 10) & 0x7)
+#define   C_SQ_CF_WORD1_COUNT                                        0xFFFFE3FF
+#define   S_SQ_CF_WORD1_CALL_COUNT(x)                                (((x) & 0x3F) << 13)
+#define   G_SQ_CF_WORD1_CALL_COUNT(x)                                (((x) >> 13) & 0x3F)
+#define   C_SQ_CF_WORD1_CALL_COUNT                                   0xFFF81FFF
+#define   S_SQ_CF_WORD1_END_OF_PROGRAM(x)                            (((x) & 0x1) << 21)
+#define   G_SQ_CF_WORD1_END_OF_PROGRAM(x)                            (((x) >> 21) & 0x1)
+#define   C_SQ_CF_WORD1_END_OF_PROGRAM                               0xFFDFFFFF
+#define   S_SQ_CF_WORD1_VALID_PIXEL_MODE(x)                          (((x) & 0x1) << 22)
+#define   G_SQ_CF_WORD1_VALID_PIXEL_MODE(x)                          (((x) >> 22) & 0x1)
+#define   C_SQ_CF_WORD1_VALID_PIXEL_MODE                             0xFFBFFFFF
+#define   S_SQ_CF_WORD1_CF_INST(x)                                   (((x) & 0x7F) << 23)
+#define   G_SQ_CF_WORD1_CF_INST(x)                                   (((x) >> 23) & 0x7F)
+#define   C_SQ_CF_WORD1_CF_INST                                      0xC07FFFFF
+#define     V_SQ_CF_WORD1_SQ_CF_INST_NOP                             0x00000000 /* V_ values: presumably the encodings accepted by the CF_INST field - confirm */
+#define     V_SQ_CF_WORD1_SQ_CF_INST_TEX                             0x00000001
+#define     V_SQ_CF_WORD1_SQ_CF_INST_VTX                             0x00000002
+#define     V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC                          0x00000003
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START                      0x00000004
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END                        0x00000005
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10                 0x00000006
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL                0x00000007
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE                   0x00000008
+#define     V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK                      0x00000009
+#define     V_SQ_CF_WORD1_SQ_CF_INST_JUMP                            0x0000000A
+#define     V_SQ_CF_WORD1_SQ_CF_INST_PUSH                            0x0000000B
+#define     V_SQ_CF_WORD1_SQ_CF_INST_PUSH_ELSE                       0x0000000C
+#define     V_SQ_CF_WORD1_SQ_CF_INST_ELSE                            0x0000000D
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP                             0x0000000E
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP_JUMP                        0x0000000F
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP_PUSH                        0x00000010
+#define     V_SQ_CF_WORD1_SQ_CF_INST_POP_PUSH_ELSE                   0x00000011
+#define     V_SQ_CF_WORD1_SQ_CF_INST_CALL                            0x00000012
+#define     V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS                         0x00000013
+#define     V_SQ_CF_WORD1_SQ_CF_INST_RETURN                          0x00000014
+#define     V_SQ_CF_WORD1_SQ_CF_INST_EMIT_VERTEX                     0x00000015
+#define     V_SQ_CF_WORD1_SQ_CF_INST_EMIT_CUT_VERTEX                 0x00000016
+#define     V_SQ_CF_WORD1_SQ_CF_INST_CUT_VERTEX                      0x00000017
+#define     V_SQ_CF_WORD1_SQ_CF_INST_KILL                            0x00000018
+#define   S_SQ_CF_WORD1_WHOLE_QUAD_MODE(x)                           (((x) & 0x1) << 30)
+#define   G_SQ_CF_WORD1_WHOLE_QUAD_MODE(x)                           (((x) >> 30) & 0x1)
+#define   C_SQ_CF_WORD1_WHOLE_QUAD_MODE                              0xBFFFFFFF
+#define   S_SQ_CF_WORD1_BARRIER(x)                                   (((unsigned)(x) & 0x1) << 31) /* cast: shifting a signed 1 into bit 31 is undefined behaviour */
+#define   G_SQ_CF_WORD1_BARRIER(x)                                   (((x) >> 31) & 0x1)
+#define   C_SQ_CF_WORD1_BARRIER                                      0x7FFFFFFF
+#define   S_SQ_CF_WORD1_COUNT_3(x)                                   (((x) & 0x1) << 19) /* bit 19; listed out of bit order (presumably an extra COUNT bit - confirm) */
+#define   G_SQ_CF_WORD1_COUNT_3(x)                                   (((x) >> 19) & 0x1)
+#define   C_SQ_CF_WORD1_COUNT_3                                      0xFFF7FFFF
+#define P_SQ_CF_ALU_WORD0 /* empty marker macro heading the SQ_CF_ALU_WORD0 field group; S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_CF_ALU_WORD0_ADDR(x)                                  (((x) & 0x3FFFFF) << 0)
+#define   G_SQ_CF_ALU_WORD0_ADDR(x)                                  (((x) >> 0) & 0x3FFFFF)
+#define   C_SQ_CF_ALU_WORD0_ADDR                                     0xFFC00000
+#define   S_SQ_CF_ALU_WORD0_KCACHE_BANK0(x)                          (((x) & 0xF) << 22)
+#define   G_SQ_CF_ALU_WORD0_KCACHE_BANK0(x)                          (((x) >> 22) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_KCACHE_BANK0                             0xFC3FFFFF
+#define   S_SQ_CF_ALU_WORD0_KCACHE_BANK1(x)                          (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD0_KCACHE_BANK1(x)                          (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_KCACHE_BANK1                             0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((unsigned)(x) & 0x3) << 30) /* cast: values 2 and 3 reach bit 31, which overflows a signed int shift */
+#define   G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_KCACHE_MODE0                             0x3FFFFFFF
+#define P_SQ_CF_ALU_WORD1
+#define   S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) & 0x3) << 0)
+#define   G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) >> 0) & 0x3)
+#define   C_SQ_CF_ALU_WORD1_KCACHE_MODE1                             0xFFFFFFFC
+#define   S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(x)                          (((x) & 0xFF) << 2)
+#define   G_SQ_CF_ALU_WORD1_KCACHE_ADDR0(x)                          (((x) >> 2) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_KCACHE_ADDR0                             0xFFFFFC03
+#define   S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(x)                          (((x) & 0xFF) << 10)
+#define   G_SQ_CF_ALU_WORD1_KCACHE_ADDR1(x)                          (((x) >> 10) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_KCACHE_ADDR1                             0xFFFC03FF
+#define   S_SQ_CF_ALU_WORD1_COUNT(x)                                 (((x) & 0x7F) << 18)
+#define   G_SQ_CF_ALU_WORD1_COUNT(x)                                 (((x) >> 18) & 0x7F)
+#define   C_SQ_CF_ALU_WORD1_COUNT                                    0xFE03FFFF
+#define   S_SQ_CF_ALU_WORD1_USES_WATERFALL(x)                        (((x) & 0x1) << 25)
+#define   G_SQ_CF_ALU_WORD1_USES_WATERFALL(x)                        (((x) >> 25) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_USES_WATERFALL                           0xFDFFFFFF
+#define   S_SQ_CF_ALU_WORD1_CF_INST(x)                               (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD1_CF_INST(x)                               (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD1_CF_INST                                  0xC3FFFFFF
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU                         0x00000008
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE             0x00000009
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER               0x0000000A
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER              0x0000000B
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_CONTINUE                0x0000000D
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_BREAK                   0x0000000E
+#define     V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_ELSE_AFTER              0x0000000F
+#define   S_SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE(x)                       (((x) & 0x1) << 30)
+#define   G_SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE(x)                       (((x) >> 30) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE                          0xBFFFFFFF
+#define   S_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) & 0x1) << 31)
+#define   G_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_BARRIER                                  0x7FFFFFFF
+#define   S_SQ_CF_ALU_WORD1_ALT_CONST(x)                             (((x) & 0x1) << 25)
+#define   G_SQ_CF_ALU_WORD1_ALT_CONST(x)                             (((x) >> 25) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_ALT_CONST                                0xFDFFFFFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD0 /* empty marker macro heading the SQ_CF_ALLOC_EXPORT_WORD0 field group; S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) & 0x1FFF) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) >> 0) & 0x1FFF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE                      0xFFFFE000
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(x)                         (((x) & 0x3) << 13)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(x)                         (((x) >> 13) & 0x3)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_TYPE                            0xFFFF9FFF
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL               0x00000000 /* V_ values: presumably the encodings accepted by the TYPE field - confirm */
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS                 0x00000001
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM               0x00000002
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_SX                  0x00000003
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) & 0x7F) << 15)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) >> 15) & 0x7F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR                          0xFFC07FFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_REL(x)                       (((x) & 0x1) << 22)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_REL(x)                       (((x) >> 22) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_REL                          0xFFBFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(x)                    (((x) & 0x7F) << 23)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(x)                    (((x) >> 23) & 0x7F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR                       0xC07FFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(x)                    (((unsigned)(x) & 0x3) << 30) /* cast: values 2 and 3 reach bit 31, which overflows a signed int shift */
+#define   G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(x)                    (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE                       0x3FFFFFFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD1
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(x)                  (((x) & 0xF) << 17)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(x)                  (((x) >> 17) & 0xF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT                     0xFFE1FFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(x)               (((x) & 0x1) << 21)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(x)               (((x) >> 21) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM                  0xFFDFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE(x)             (((x) & 0x1) << 22)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE(x)             (((x) >> 22) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE                0xFFBFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(x)                      (((x) & 0x7F) << 23)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(x)                      (((x) >> 23) & 0x7F)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST                         0xC07FFFFF
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0        0x00000020
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1        0x00000021
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2        0x00000022
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3        0x00000023
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_SCRATCH        0x00000024
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_REDUCTION      0x00000025
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_RING           0x00000026
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT             0x00000027
+#define     V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE        0x00000028
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE(x)              (((x) & 0x1) << 30)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE(x)              (((x) >> 30) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE                 0xBFFFFFFF
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(x)                      (((x) & 0x1) << 31)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(x)                      (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER                         0x7FFFFFFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD1_BUF /* empty marker macro heading the WORD1_BUF field group (low 16 bits only); S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(x)               (((x) & 0xFFF) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(x)               (((x) >> 0) & 0xFFF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE                  0xFFFFF000
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(x)                (((x) & 0xF) << 12)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(x)                (((x) >> 12) & 0xF)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK                   0xFFFF0FFF
+#define P_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ /* empty marker macro heading the WORD1_SWIZ field group (four 3-bit selectors in bits 0..11); S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(x)                   (((x) & 0x7) << 0)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(x)                   (((x) >> 0) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X                      0xFFFFFFF8
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(x)                   (((x) & 0x7) << 3)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(x)                   (((x) >> 3) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y                      0xFFFFFFC7
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(x)                   (((x) & 0x7) << 6)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(x)                   (((x) >> 6) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z                      0xFFFFFE3F
+#define   S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(x)                   (((x) & 0x7) << 9)
+#define   G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(x)                   (((x) >> 9) & 0x7)
+#define   C_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W                      0xFFFFF1FF
+#define P_SQ_ALU_WORD0
+#define   S_SQ_ALU_WORD0_SRC0_SEL(x)                                 (((x) & 0x1FF) << 0)
+#define   G_SQ_ALU_WORD0_SRC0_SEL(x)                                 (((x) >> 0) & 0x1FF)
+#define   C_SQ_ALU_WORD0_SRC0_SEL                                    0xFFFFFE00
+#define   S_SQ_ALU_WORD0_SRC0_REL(x)                                 (((x) & 0x1) << 9)
+#define   G_SQ_ALU_WORD0_SRC0_REL(x)                                 (((x) >> 9) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC0_REL                                    0xFFFFFDFF
+#define   S_SQ_ALU_WORD0_SRC0_CHAN(x)                                (((x) & 0x3) << 10)
+#define   G_SQ_ALU_WORD0_SRC0_CHAN(x)                                (((x) >> 10) & 0x3)
+#define   C_SQ_ALU_WORD0_SRC0_CHAN                                   0xFFFFF3FF
+#define   S_SQ_ALU_WORD0_SRC0_NEG(x)                                 (((x) & 0x1) << 12)
+#define   G_SQ_ALU_WORD0_SRC0_NEG(x)                                 (((x) >> 12) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC0_NEG                                    0xFFFFEFFF
+#define   S_SQ_ALU_WORD0_SRC1_SEL(x)                                 (((x) & 0x1FF) << 13)
+#define   G_SQ_ALU_WORD0_SRC1_SEL(x)                                 (((x) >> 13) & 0x1FF)
+#define   C_SQ_ALU_WORD0_SRC1_SEL                                    0xFFC01FFF
+#define   S_SQ_ALU_WORD0_SRC1_REL(x)                                 (((x) & 0x1) << 22)
+#define   G_SQ_ALU_WORD0_SRC1_REL(x)                                 (((x) >> 22) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC1_REL                                    0xFFBFFFFF
+#define   S_SQ_ALU_WORD0_SRC1_CHAN(x)                                (((x) & 0x3) << 23)
+#define   G_SQ_ALU_WORD0_SRC1_CHAN(x)                                (((x) >> 23) & 0x3)
+#define   C_SQ_ALU_WORD0_SRC1_CHAN                                   0xFE7FFFFF
+#define   S_SQ_ALU_WORD0_SRC1_NEG(x)                                 (((x) & 0x1) << 25)
+#define   G_SQ_ALU_WORD0_SRC1_NEG(x)                                 (((x) >> 25) & 0x1)
+#define   C_SQ_ALU_WORD0_SRC1_NEG                                    0xFDFFFFFF
+#define   S_SQ_ALU_WORD0_INDEX_MODE(x)                               (((x) & 0x7) << 26)
+#define   G_SQ_ALU_WORD0_INDEX_MODE(x)                               (((x) >> 26) & 0x7)
+#define   C_SQ_ALU_WORD0_INDEX_MODE                                  0xE3FFFFFF
+#define   S_SQ_ALU_WORD0_PRED_SEL(x)                                 (((x) & 0x3) << 29)
+#define   G_SQ_ALU_WORD0_PRED_SEL(x)                                 (((x) >> 29) & 0x3)
+#define   C_SQ_ALU_WORD0_PRED_SEL                                    0x9FFFFFFF
+#define   S_SQ_ALU_WORD0_LAST(x)                                     (((x) & 0x1) << 31)
+#define   G_SQ_ALU_WORD0_LAST(x)                                     (((x) >> 31) & 0x1)
+#define   C_SQ_ALU_WORD0_LAST                                        0x7FFFFFFF
+#define P_SQ_ALU_WORD1
+#define   S_SQ_ALU_WORD1_ENCODING(x)                                 (((x) & 0x7) << 15)
+#define   G_SQ_ALU_WORD1_ENCODING(x)                                 (((x) >> 15) & 0x7)
+#define   C_SQ_ALU_WORD1_ENCODING                                    0xFFFC7FFF
+#define   S_SQ_ALU_WORD1_BANK_SWIZZLE(x)                             (((x) & 0x7) << 18)
+#define   G_SQ_ALU_WORD1_BANK_SWIZZLE(x)                             (((x) >> 18) & 0x7)
+#define   C_SQ_ALU_WORD1_BANK_SWIZZLE                                0xFFE3FFFF
+#define   S_SQ_ALU_WORD1_DST_GPR(x)                                  (((x) & 0x7F) << 21)
+#define   G_SQ_ALU_WORD1_DST_GPR(x)                                  (((x) >> 21) & 0x7F)
+#define   C_SQ_ALU_WORD1_DST_GPR                                     0xF01FFFFF
+#define   S_SQ_ALU_WORD1_DST_REL(x)                                  (((x) & 0x1) << 28)
+#define   G_SQ_ALU_WORD1_DST_REL(x)                                  (((x) >> 28) & 0x1)
+#define   C_SQ_ALU_WORD1_DST_REL                                     0xEFFFFFFF
+#define   S_SQ_ALU_WORD1_DST_CHAN(x)                                 (((x) & 0x3) << 29)
+#define   G_SQ_ALU_WORD1_DST_CHAN(x)                                 (((x) >> 29) & 0x3)
+#define   C_SQ_ALU_WORD1_DST_CHAN                                    0x9FFFFFFF
+#define   S_SQ_ALU_WORD1_CLAMP(x)                                    (((x) & 0x1) << 31)
+#define   G_SQ_ALU_WORD1_CLAMP(x)                                    (((x) >> 31) & 0x1)
+#define   C_SQ_ALU_WORD1_CLAMP                                       0x7FFFFFFF
+#define P_SQ_ALU_WORD1_OP2 /* empty marker macro heading the SQ_ALU_WORD1_OP2 field group (bits 0..17); S_ packs, G_ extracts, C_ is the field-cleared mask */
+#define   S_SQ_ALU_WORD1_OP2_SRC0_ABS(x)                             (((x) & 0x1) << 0)
+#define   G_SQ_ALU_WORD1_OP2_SRC0_ABS(x)                             (((x) >> 0) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_SRC0_ABS                                0xFFFFFFFE
+#define   S_SQ_ALU_WORD1_OP2_SRC1_ABS(x)                             (((x) & 0x1) << 1)
+#define   G_SQ_ALU_WORD1_OP2_SRC1_ABS(x)                             (((x) >> 1) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_SRC1_ABS                                0xFFFFFFFD
+#define   S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(x)                  (((x) & 0x1) << 2)
+#define   G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(x)                  (((x) >> 2) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK                     0xFFFFFFFB
+#define   S_SQ_ALU_WORD1_OP2_UPDATE_PRED(x)                          (((x) & 0x1) << 3)
+#define   G_SQ_ALU_WORD1_OP2_UPDATE_PRED(x)                          (((x) >> 3) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_UPDATE_PRED                             0xFFFFFFF7
+#define   S_SQ_ALU_WORD1_OP2_WRITE_MASK(x)                           (((x) & 0x1) << 4)
+#define   G_SQ_ALU_WORD1_OP2_WRITE_MASK(x)                           (((x) >> 4) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_WRITE_MASK                              0xFFFFFFEF
+#define   S_SQ_ALU_WORD1_OP2_OMOD(x)                                 (((x) & 0x3) << 5)
+#define   G_SQ_ALU_WORD1_OP2_OMOD(x)                                 (((x) >> 5) & 0x3)
+#define   C_SQ_ALU_WORD1_OP2_OMOD                                    0xFFFFFF9F
+#define   S_SQ_ALU_WORD1_OP2_ALU_INST(x)                             (((x) & 0x7FF) << 7) /* 11-bit field, bits 7..17 */
+#define   G_SQ_ALU_WORD1_OP2_ALU_INST(x)                             (((x) >> 7) & 0x7FF)
+#define   C_SQ_ALU_WORD1_OP2_ALU_INST                                0xFFFC007F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD                       0x00000000 /* V_ values: presumably the opcodes accepted by the ALU_INST field - confirm; the numbering has gaps */
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL                       0x00000001
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE                  0x00000002
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX                       0x00000003
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN                       0x00000004
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_DX10                  0x00000005
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_DX10                  0x00000006
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE                      0x00000008
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT                     0x00000009
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE                     0x0000000A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE                     0x0000000B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_DX10                 0x0000000C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_DX10                0x0000000D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_DX10                0x0000000E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_DX10                0x0000000F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT                     0x00000010
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC                     0x00000011
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL                      0x00000012
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE                     0x00000013
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR                     0x00000014
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA                      0x00000015
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR                0x00000016
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT                  0x00000018
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV                       0x00000019
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP                       0x0000001A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT           0x0000001E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT           0x0000001F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE                 0x00000020
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT                0x00000021
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE                0x00000022
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE                0x00000023
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV              0x00000024
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP              0x00000025
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR              0x00000026
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE          0x00000027
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH            0x00000028
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH           0x00000029
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH           0x0000002A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH           0x0000002B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE                     0x0000002C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT                    0x0000002D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE                    0x0000002E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE                    0x0000002F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT                   0x00000030
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT                    0x00000031
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT                   0x00000032
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT                   0x00000033
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT                   0x00000034
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT                   0x00000035
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT                   0x00000036
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT                   0x00000037
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT                  0x00000038
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT                  0x00000039
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT                  0x0000003A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT                 0x0000003B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT                 0x0000003C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT                 0x0000003D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT                0x0000003E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT                0x0000003F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT               0x00000040
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT               0x00000041
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT             0x00000042
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT            0x00000043
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT            0x00000044
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT            0x00000045
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT                 0x00000046
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT                0x00000047
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT                0x00000048
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT                0x00000049
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT        0x0000004A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT       0x0000004B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT       0x0000004C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT       0x0000004D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT       0x0000004E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT       0x0000004F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4                      0x00000050
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE                 0x00000051
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE                      0x00000052
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4                      0x00000053
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT              0x00000060
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE                  0x00000061
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED               0x00000062
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE                  0x00000063
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED             0x00000064
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF                  0x00000065
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE                0x00000066
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED         0x00000067
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF              0x00000068
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE            0x00000069
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE                 0x0000006A
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT                0x0000006B
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT                0x0000006C
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT               0x0000006D
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN                       0x0000006E
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS                       0x0000006F
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT                  0x00000070
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT                  0x00000071
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT                  0x00000072
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT                 0x00000073
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT                 0x00000074
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT                0x00000075
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT                0x00000076
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT                 0x00000077
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT                0x00000078
+#define     V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT               0x00000079
+#define P_SQ_ALU_WORD1_OP3 /* SQ_ALU_WORD1_OP3 fields: S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_ALU_WORD1_OP3_SRC2_SEL(x)                             (((x) & 0x1FF) << 0)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_SEL(x)                             (((x) >> 0) & 0x1FF)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_SEL                                0xFFFFFE00
+#define   S_SQ_ALU_WORD1_OP3_SRC2_REL(x)                             (((x) & 0x1) << 9)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_REL(x)                             (((x) >> 9) & 0x1)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_REL                                0xFFFFFDFF
+#define   S_SQ_ALU_WORD1_OP3_SRC2_CHAN(x)                            (((x) & 0x3) << 10)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_CHAN(x)                            (((x) >> 10) & 0x3)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_CHAN                               0xFFFFF3FF
+#define   S_SQ_ALU_WORD1_OP3_SRC2_NEG(x)                             (((x) & 0x1) << 12)
+#define   G_SQ_ALU_WORD1_OP3_SRC2_NEG(x)                             (((x) >> 12) & 0x1)
+#define   C_SQ_ALU_WORD1_OP3_SRC2_NEG                                0xFFFFEFFF
+#define   S_SQ_ALU_WORD1_OP3_ALU_INST(x)                             (((x) & 0x1F) << 13) /* 5-bit field at bits 13..17 */
+#define   G_SQ_ALU_WORD1_OP3_ALU_INST(x)                             (((x) >> 13) & 0x1F)
+#define   C_SQ_ALU_WORD1_OP3_ALU_INST                                0xFFFC1FFF
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT                   0x0000000C /* V_ values all fit in 5 bits; NOTE(review): presumably the ALU_INST encodings -- confirm */
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2                0x0000000D
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4                0x0000000E
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2                0x0000000F
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD                    0x00000010
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_M2                 0x00000011
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_M4                 0x00000012
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_D2                 0x00000013
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE               0x00000014
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_M2            0x00000015
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_M4            0x00000016
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_IEEE_D2            0x00000017
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE                      0x00000018
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT                     0x00000019
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE                     0x0000001A
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT                  0x0000001C /* 0x1B has no name in this table */
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT                 0x0000001D
+#define     V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT                 0x0000001E
+#define P_SQ_VTX_WORD0 /* SQ_VTX_WORD0 fields: S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_VTX_WORD0_VTX_INST(x)                                 (((x) & 0x1F) << 0)
+#define   G_SQ_VTX_WORD0_VTX_INST(x)                                 (((x) >> 0) & 0x1F)
+#define   C_SQ_VTX_WORD0_VTX_INST                                    0xFFFFFFE0
+#define   S_SQ_VTX_WORD0_FETCH_TYPE(x)                               (((x) & 0x3) << 5)
+#define   G_SQ_VTX_WORD0_FETCH_TYPE(x)                               (((x) >> 5) & 0x3)
+#define   C_SQ_VTX_WORD0_FETCH_TYPE                                  0xFFFFFF9F
+#define   S_SQ_VTX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) & 0x1) << 7)
+#define   G_SQ_VTX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) >> 7) & 0x1)
+#define   C_SQ_VTX_WORD0_FETCH_WHOLE_QUAD                            0xFFFFFF7F
+#define   S_SQ_VTX_WORD0_BUFFER_ID(x)                                (((x) & 0xFF) << 8)
+#define   G_SQ_VTX_WORD0_BUFFER_ID(x)                                (((x) >> 8) & 0xFF)
+#define   C_SQ_VTX_WORD0_BUFFER_ID                                   0xFFFF00FF
+#define   S_SQ_VTX_WORD0_SRC_GPR(x)                                  (((x) & 0x7F) << 16)
+#define   G_SQ_VTX_WORD0_SRC_GPR(x)                                  (((x) >> 16) & 0x7F)
+#define   C_SQ_VTX_WORD0_SRC_GPR                                     0xFF80FFFF
+#define   S_SQ_VTX_WORD0_SRC_REL(x)                                  (((x) & 0x1) << 23)
+#define   G_SQ_VTX_WORD0_SRC_REL(x)                                  (((x) >> 23) & 0x1)
+#define   C_SQ_VTX_WORD0_SRC_REL                                     0xFF7FFFFF
+#define   S_SQ_VTX_WORD0_SRC_SEL_X(x)                                (((x) & 0x3) << 24)
+#define   G_SQ_VTX_WORD0_SRC_SEL_X(x)                                (((x) >> 24) & 0x3)
+#define   C_SQ_VTX_WORD0_SRC_SEL_X                                   0xFCFFFFFF
+#define   S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(x)                         (((unsigned)(x) & 0x3F) << 26) /* unsigned cast: 0x3F << 26 does not fit in a signed int */
+#define   G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(x)                         (((x) >> 26) & 0x3F)
+#define   C_SQ_VTX_WORD0_MEGA_FETCH_COUNT                            0x03FFFFFF
+#define P_SQ_VTX_WORD1 /* SQ_VTX_WORD1 fields (bits 9..31): S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_VTX_WORD1_DST_SEL_X(x)                                (((x) & 0x7) << 9)
+#define   G_SQ_VTX_WORD1_DST_SEL_X(x)                                (((x) >> 9) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_X                                   0xFFFFF1FF
+#define   S_SQ_VTX_WORD1_DST_SEL_Y(x)                                (((x) & 0x7) << 12)
+#define   G_SQ_VTX_WORD1_DST_SEL_Y(x)                                (((x) >> 12) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_Y                                   0xFFFF8FFF
+#define   S_SQ_VTX_WORD1_DST_SEL_Z(x)                                (((x) & 0x7) << 15)
+#define   G_SQ_VTX_WORD1_DST_SEL_Z(x)                                (((x) >> 15) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_Z                                   0xFFFC7FFF
+#define   S_SQ_VTX_WORD1_DST_SEL_W(x)                                (((x) & 0x7) << 18)
+#define   G_SQ_VTX_WORD1_DST_SEL_W(x)                                (((x) >> 18) & 0x7)
+#define   C_SQ_VTX_WORD1_DST_SEL_W                                   0xFFE3FFFF
+#define   S_SQ_VTX_WORD1_USE_CONST_FIELDS(x)                         (((x) & 0x1) << 21)
+#define   G_SQ_VTX_WORD1_USE_CONST_FIELDS(x)                         (((x) >> 21) & 0x1)
+#define   C_SQ_VTX_WORD1_USE_CONST_FIELDS                            0xFFDFFFFF
+#define   S_SQ_VTX_WORD1_DATA_FORMAT(x)                              (((x) & 0x3F) << 22)
+#define   G_SQ_VTX_WORD1_DATA_FORMAT(x)                              (((x) >> 22) & 0x3F)
+#define   C_SQ_VTX_WORD1_DATA_FORMAT                                 0xF03FFFFF
+#define   S_SQ_VTX_WORD1_NUM_FORMAT_ALL(x)                           (((x) & 0x3) << 28)
+#define   G_SQ_VTX_WORD1_NUM_FORMAT_ALL(x)                           (((x) >> 28) & 0x3)
+#define   C_SQ_VTX_WORD1_NUM_FORMAT_ALL                              0xCFFFFFFF
+#define   S_SQ_VTX_WORD1_FORMAT_COMP_ALL(x)                          (((x) & 0x1) << 30)
+#define   G_SQ_VTX_WORD1_FORMAT_COMP_ALL(x)                          (((x) >> 30) & 0x1)
+#define   C_SQ_VTX_WORD1_FORMAT_COMP_ALL                             0xBFFFFFFF
+#define   S_SQ_VTX_WORD1_SRF_MODE_ALL(x)                             (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 does not fit in a signed int */
+#define   G_SQ_VTX_WORD1_SRF_MODE_ALL(x)                             (((x) >> 31) & 0x1)
+#define   C_SQ_VTX_WORD1_SRF_MODE_ALL                                0x7FFFFFFF
+#define P_SQ_VTX_WORD1_GPR /* fields in bits 0..7; NOTE(review): presumably one of two layouts for the low byte of SQ_VTX_WORD1 -- confirm */
+#define   S_SQ_VTX_WORD1_GPR_DST_GPR(x)                              (((x) & 0x7F) << 0)
+#define   G_SQ_VTX_WORD1_GPR_DST_GPR(x)                              (((x) >> 0) & 0x7F)
+#define   C_SQ_VTX_WORD1_GPR_DST_GPR                                 0xFFFFFF80
+#define   S_SQ_VTX_WORD1_GPR_DST_REL(x)                              (((x) & 0x1) << 7)
+#define   G_SQ_VTX_WORD1_GPR_DST_REL(x)                              (((x) >> 7) & 0x1)
+#define   C_SQ_VTX_WORD1_GPR_DST_REL                                 0xFFFFFF7F
+#define P_SQ_VTX_WORD1_SEM /* single 8-bit field in bits 0..7, the same bits the SQ_VTX_WORD1_GPR macros use */
+#define   S_SQ_VTX_WORD1_SEM_SEMANTIC_ID(x)                          (((x) & 0xFF) << 0)
+#define   G_SQ_VTX_WORD1_SEM_SEMANTIC_ID(x)                          (((x) >> 0) & 0xFF)
+#define   C_SQ_VTX_WORD1_SEM_SEMANTIC_ID                             0xFFFFFF00
+#define P_SQ_VTX_WORD2 /* SQ_VTX_WORD2 fields (bits 0..20): S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_VTX_WORD2_OFFSET(x)                                   (((x) & 0xFFFF) << 0)
+#define   G_SQ_VTX_WORD2_OFFSET(x)                                   (((x) >> 0) & 0xFFFF)
+#define   C_SQ_VTX_WORD2_OFFSET                                      0xFFFF0000
+#define   S_SQ_VTX_WORD2_ENDIAN_SWAP(x)                              (((x) & 0x3) << 16)
+#define   G_SQ_VTX_WORD2_ENDIAN_SWAP(x)                              (((x) >> 16) & 0x3)
+#define   C_SQ_VTX_WORD2_ENDIAN_SWAP                                 0xFFFCFFFF
+#define   S_SQ_VTX_WORD2_CONST_BUF_NO_STRIDE(x)                      (((x) & 0x1) << 18)
+#define   G_SQ_VTX_WORD2_CONST_BUF_NO_STRIDE(x)                      (((x) >> 18) & 0x1)
+#define   C_SQ_VTX_WORD2_CONST_BUF_NO_STRIDE                         0xFFFBFFFF
+#define   S_SQ_VTX_WORD2_MEGA_FETCH(x)                               (((x) & 0x1) << 19)
+#define   G_SQ_VTX_WORD2_MEGA_FETCH(x)                               (((x) >> 19) & 0x1)
+#define   C_SQ_VTX_WORD2_MEGA_FETCH                                  0xFFF7FFFF
+#define   S_SQ_VTX_WORD2_ALT_CONST(x)                                (((x) & 0x1) << 20)
+#define   G_SQ_VTX_WORD2_ALT_CONST(x)                                (((x) >> 20) & 0x1)
+#define   C_SQ_VTX_WORD2_ALT_CONST                                   0xFFEFFFFF
+#define P_SQ_TEX_WORD0 /* SQ_TEX_WORD0 fields: S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_TEX_WORD0_TEX_INST(x)                                 (((x) & 0x1F) << 0)
+#define   G_SQ_TEX_WORD0_TEX_INST(x)                                 (((x) >> 0) & 0x1F)
+#define   C_SQ_TEX_WORD0_TEX_INST                                    0xFFFFFFE0
+#define   S_SQ_TEX_WORD0_BC_FRAC_MODE(x)                             (((x) & 0x1) << 5)
+#define   G_SQ_TEX_WORD0_BC_FRAC_MODE(x)                             (((x) >> 5) & 0x1)
+#define   C_SQ_TEX_WORD0_BC_FRAC_MODE                                0xFFFFFFDF
+#define   S_SQ_TEX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) & 0x1) << 7) /* bit 6 has no accessor in this table */
+#define   G_SQ_TEX_WORD0_FETCH_WHOLE_QUAD(x)                         (((x) >> 7) & 0x1)
+#define   C_SQ_TEX_WORD0_FETCH_WHOLE_QUAD                            0xFFFFFF7F
+#define   S_SQ_TEX_WORD0_RESOURCE_ID(x)                              (((x) & 0xFF) << 8)
+#define   G_SQ_TEX_WORD0_RESOURCE_ID(x)                              (((x) >> 8) & 0xFF)
+#define   C_SQ_TEX_WORD0_RESOURCE_ID                                 0xFFFF00FF
+#define   S_SQ_TEX_WORD0_SRC_GPR(x)                                  (((x) & 0x7F) << 16)
+#define   G_SQ_TEX_WORD0_SRC_GPR(x)                                  (((x) >> 16) & 0x7F)
+#define   C_SQ_TEX_WORD0_SRC_GPR                                     0xFF80FFFF
+#define   S_SQ_TEX_WORD0_SRC_REL(x)                                  (((x) & 0x1) << 23)
+#define   G_SQ_TEX_WORD0_SRC_REL(x)                                  (((x) >> 23) & 0x1)
+#define   C_SQ_TEX_WORD0_SRC_REL                                     0xFF7FFFFF
+#define   S_SQ_TEX_WORD0_ALT_CONST(x)                                (((x) & 0x1) << 24)
+#define   G_SQ_TEX_WORD0_ALT_CONST(x)                                (((x) >> 24) & 0x1)
+#define   C_SQ_TEX_WORD0_ALT_CONST                                   0xFEFFFFFF
+#define P_SQ_TEX_WORD1 /* SQ_TEX_WORD1 fields: S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_TEX_WORD1_DST_GPR(x)                                  (((x) & 0x7F) << 0)
+#define   G_SQ_TEX_WORD1_DST_GPR(x)                                  (((x) >> 0) & 0x7F)
+#define   C_SQ_TEX_WORD1_DST_GPR                                     0xFFFFFF80
+#define   S_SQ_TEX_WORD1_DST_REL(x)                                  (((x) & 0x1) << 7)
+#define   G_SQ_TEX_WORD1_DST_REL(x)                                  (((x) >> 7) & 0x1)
+#define   C_SQ_TEX_WORD1_DST_REL                                     0xFFFFFF7F
+#define   S_SQ_TEX_WORD1_DST_SEL_X(x)                                (((x) & 0x7) << 9)
+#define   G_SQ_TEX_WORD1_DST_SEL_X(x)                                (((x) >> 9) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_X                                   0xFFFFF1FF
+#define   S_SQ_TEX_WORD1_DST_SEL_Y(x)                                (((x) & 0x7) << 12)
+#define   G_SQ_TEX_WORD1_DST_SEL_Y(x)                                (((x) >> 12) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_Y                                   0xFFFF8FFF
+#define   S_SQ_TEX_WORD1_DST_SEL_Z(x)                                (((x) & 0x7) << 15)
+#define   G_SQ_TEX_WORD1_DST_SEL_Z(x)                                (((x) >> 15) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_Z                                   0xFFFC7FFF
+#define   S_SQ_TEX_WORD1_DST_SEL_W(x)                                (((x) & 0x7) << 18)
+#define   G_SQ_TEX_WORD1_DST_SEL_W(x)                                (((x) >> 18) & 0x7)
+#define   C_SQ_TEX_WORD1_DST_SEL_W                                   0xFFE3FFFF
+#define   S_SQ_TEX_WORD1_LOD_BIAS(x)                                 (((x) & 0x7F) << 21)
+#define   G_SQ_TEX_WORD1_LOD_BIAS(x)                                 (((x) >> 21) & 0x7F)
+#define   C_SQ_TEX_WORD1_LOD_BIAS                                    0xF01FFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_X(x)                             (((x) & 0x1) << 28)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_X(x)                             (((x) >> 28) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_X                                0xEFFFFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_Y(x)                             (((x) & 0x1) << 29)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_Y(x)                             (((x) >> 29) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_Y                                0xDFFFFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_Z(x)                             (((x) & 0x1) << 30)
+#define   G_SQ_TEX_WORD1_COORD_TYPE_Z(x)                             (((x) >> 30) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_Z                                0xBFFFFFFF
+#define   S_SQ_TEX_WORD1_COORD_TYPE_W(x)                             (((unsigned)(x) & 0x1) << 31) /* unsigned cast: 1 << 31 does not fit in a signed int */
+#define   G_SQ_TEX_WORD1_COORD_TYPE_W(x)                             (((x) >> 31) & 0x1)
+#define   C_SQ_TEX_WORD1_COORD_TYPE_W                                0x7FFFFFFF
+#define P_SQ_TEX_WORD2 /* SQ_TEX_WORD2 fields: S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_TEX_WORD2_OFFSET_X(x)                                 (((x) & 0x1F) << 0)
+#define   G_SQ_TEX_WORD2_OFFSET_X(x)                                 (((x) >> 0) & 0x1F)
+#define   C_SQ_TEX_WORD2_OFFSET_X                                    0xFFFFFFE0
+#define   S_SQ_TEX_WORD2_OFFSET_Y(x)                                 (((x) & 0x1F) << 5)
+#define   G_SQ_TEX_WORD2_OFFSET_Y(x)                                 (((x) >> 5) & 0x1F)
+#define   C_SQ_TEX_WORD2_OFFSET_Y                                    0xFFFFFC1F
+#define   S_SQ_TEX_WORD2_OFFSET_Z(x)                                 (((x) & 0x1F) << 10)
+#define   G_SQ_TEX_WORD2_OFFSET_Z(x)                                 (((x) >> 10) & 0x1F)
+#define   C_SQ_TEX_WORD2_OFFSET_Z                                    0xFFFF83FF
+#define   S_SQ_TEX_WORD2_SAMPLER_ID(x)                               (((x) & 0x1F) << 15)
+#define   G_SQ_TEX_WORD2_SAMPLER_ID(x)                               (((x) >> 15) & 0x1F)
+#define   C_SQ_TEX_WORD2_SAMPLER_ID                                  0xFFF07FFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_X(x)                                (((x) & 0x7) << 20)
+#define   G_SQ_TEX_WORD2_SRC_SEL_X(x)                                (((x) >> 20) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_X                                   0xFF8FFFFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_Y(x)                                (((x) & 0x7) << 23)
+#define   G_SQ_TEX_WORD2_SRC_SEL_Y(x)                                (((x) >> 23) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_Y                                   0xFC7FFFFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_Z(x)                                (((x) & 0x7) << 26)
+#define   G_SQ_TEX_WORD2_SRC_SEL_Z(x)                                (((x) >> 26) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_Z                                   0xE3FFFFFF
+#define   S_SQ_TEX_WORD2_SRC_SEL_W(x)                                (((unsigned)(x) & 0x7) << 29) /* unsigned cast: 0x7 << 29 does not fit in a signed int */
+#define   G_SQ_TEX_WORD2_SRC_SEL_W(x)                                (((x) >> 29) & 0x7)
+#define   C_SQ_TEX_WORD2_SRC_SEL_W                                   0x1FFFFFFF
+#define P_SQ_ALU_WORD1_OP2_V2 /* SQ_ALU_WORD1_OP2_V2 fields (bits 0..17): S_ packs a value into the field, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_SQ_ALU_WORD1_OP2_V2_SRC0_ABS(x)                          (((x) & 0x1) << 0)
+#define   G_SQ_ALU_WORD1_OP2_V2_SRC0_ABS(x)                          (((x) >> 0) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_SRC0_ABS                             0xFFFFFFFE
+#define   S_SQ_ALU_WORD1_OP2_V2_SRC1_ABS(x)                          (((x) & 0x1) << 1)
+#define   G_SQ_ALU_WORD1_OP2_V2_SRC1_ABS(x)                          (((x) >> 1) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_SRC1_ABS                             0xFFFFFFFD
+#define   S_SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK(x)               (((x) & 0x1) << 2)
+#define   G_SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK(x)               (((x) >> 2) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK                  0xFFFFFFFB
+#define   S_SQ_ALU_WORD1_OP2_V2_UPDATE_PRED(x)                       (((x) & 0x1) << 3)
+#define   G_SQ_ALU_WORD1_OP2_V2_UPDATE_PRED(x)                       (((x) >> 3) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_UPDATE_PRED                          0xFFFFFFF7
+#define   S_SQ_ALU_WORD1_OP2_V2_WRITE_MASK(x)                        (((x) & 0x1) << 4)
+#define   G_SQ_ALU_WORD1_OP2_V2_WRITE_MASK(x)                        (((x) >> 4) & 0x1)
+#define   C_SQ_ALU_WORD1_OP2_V2_WRITE_MASK                           0xFFFFFFEF
+#define   S_SQ_ALU_WORD1_OP2_V2_OMOD(x)                              (((x) & 0x3) << 5)
+#define   G_SQ_ALU_WORD1_OP2_V2_OMOD(x)                              (((x) >> 5) & 0x3)
+#define   C_SQ_ALU_WORD1_OP2_V2_OMOD                                 0xFFFFFF9F
+#define   S_SQ_ALU_WORD1_OP2_V2_ALU_INST(x)                          (((x) & 0x7FF) << 7) /* 11-bit field at bits 7..17 */
+#define   G_SQ_ALU_WORD1_OP2_V2_ALU_INST(x)                          (((x) >> 7) & 0x7FF)
+#define   C_SQ_ALU_WORD1_OP2_V2_ALU_INST                             0xFFFC007F
+
+#endif
diff --git a/src/gallium/drivers/r600/radeon.h b/src/gallium/drivers/r600/radeon.h
new file mode 100644
index 0000000000..ec94b112d6
--- /dev/null
+++ b/src/gallium/drivers/r600/radeon.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright © 2009 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef RADEON_H
+#define RADEON_H
+
+#define RADEON_CTX_MAX_PM4	(64 * 1024 / 4) /* 64 KiB in 4-byte units = 16384; NOTE(review): presumably the PM4 dword limit per context -- confirm */
+
+#include <stdint.h>
+
+typedef uint64_t		u64; /* short aliases for the <stdint.h> fixed-width unsigned types */
+typedef uint32_t		u32;
+typedef uint16_t		u16;
+typedef uint8_t			u8;
+
+struct radeon; /* forward declaration only; used through pointers in the prototypes below */
+
+struct pipe_screen *radeon_create_screen(struct radeon *rw); /* takes a struct radeon pointer, returns a struct pipe_screen pointer */
+
+enum radeon_family { /* values are implicit and sequential, so the order of the enumerators defines them */
+	CHIP_UNKNOWN, /* 0 */
+	CHIP_R100,
+	CHIP_RV100,
+	CHIP_RS100,
+	CHIP_RV200,
+	CHIP_RS200,
+	CHIP_R200,
+	CHIP_RV250,
+	CHIP_RS300,
+	CHIP_RV280,
+	CHIP_R300,
+	CHIP_R350,
+	CHIP_RV350,
+	CHIP_RV380,
+	CHIP_R420,
+	CHIP_R423,
+	CHIP_RV410,
+	CHIP_RS400,
+	CHIP_RS480,
+	CHIP_RS600,
+	CHIP_RS690,
+	CHIP_RS740,
+	CHIP_RV515,
+	CHIP_R520,
+	CHIP_RV530,
+	CHIP_RV560,
+	CHIP_RV570,
+	CHIP_R580,
+	CHIP_R600,
+	CHIP_RV610,
+	CHIP_RV630,
+	CHIP_RV670,
+	CHIP_RV620,
+	CHIP_RV635,
+	CHIP_RS780,
+	CHIP_RS880,
+	CHIP_RV770,
+	CHIP_RV730,
+	CHIP_RV710,
+	CHIP_RV740,
+	CHIP_CEDAR,
+	CHIP_REDWOOD,
+	CHIP_JUNIPER,
+	CHIP_CYPRESS,
+	CHIP_HEMLOCK,
+	CHIP_LAST, /* 45: one past the last chip entry, i.e. the number of entries before it */
+};
+
+/*
+ * Buffer object: struct radeon_bo and the prototypes that take or return one.
+ */
+struct radeon_bo {
+	unsigned			refcount; /* NOTE(review): presumably adjusted by radeon_bo_incref/decref -- confirm */
+	unsigned			handle;
+	unsigned			size;
+	unsigned			alignment;
+	unsigned			map_count; /* NOTE(review): presumably tracked by radeon_bo_map/unmap -- confirm */
+	void				*data;
+};
+struct radeon_bo *radeon_bo(struct radeon *radeon, unsigned handle, /* function shares its name with the struct tag; C keeps tags in a separate namespace */
+			unsigned size, unsigned alignment, void *ptr);
+int radeon_bo_map(struct radeon *radeon, struct radeon_bo *bo);
+void radeon_bo_unmap(struct radeon *radeon, struct radeon_bo *bo);
+struct radeon_bo *radeon_bo_incref(struct radeon *radeon, struct radeon_bo *bo);
+struct radeon_bo *radeon_bo_decref(struct radeon *radeon, struct radeon_bo *bo);
+int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo);
+
+/*
+ * State object: struct radeon_state and the prototypes that take or return one.
+ */
+struct radeon_state {
+	struct radeon			*radeon;
+	unsigned			refcount;
+	unsigned			type;
+	unsigned			id;
+	unsigned			nstates; /* NOTE(review): presumably the element count of states[] -- confirm */
+	u32				*states;
+	unsigned			npm4;
+	unsigned			cpm4;
+	u32				pm4_crc;
+	u32				*pm4;
+	u32				nimmd;
+	u32				*immd;
+	unsigned			nbo; /* NOTE(review): presumably how many bo[] slots are in use -- confirm */
+	struct radeon_bo		*bo[4]; /* fixed capacity of 4, same as bo_dirty[] */
+	unsigned			nreloc; /* NOTE(review): presumably how many reloc slots are in use -- confirm */
+	unsigned			reloc_pm4_id[8]; /* the three 8-entry arrays below share one capacity */
+	unsigned			reloc_bo_id[8];
+	u32				placement[8];
+	unsigned			bo_dirty[4];
+};
+
+struct radeon_state *radeon_state(struct radeon *radeon, u32 type, u32 id); /* function shares its name with the struct tag */
+struct radeon_state *radeon_state_incref(struct radeon_state *state);
+struct radeon_state *radeon_state_decref(struct radeon_state *state);
+int radeon_state_pm4(struct radeon_state *state);
+
+/*
+ * Draw object: struct radeon_draw and the prototypes that take or return one.
+ */
+struct radeon_draw {
+	unsigned			refcount;
+	struct radeon			*radeon;
+	unsigned			nstate; /* NOTE(review): presumably the element count of state[] -- confirm */
+	struct radeon_state		**state; /* pointer to an array of radeon_state pointers */
+	unsigned			cpm4;
+};
+
+struct radeon_draw *radeon_draw(struct radeon *radeon); /* function shares its name with the struct tag */
+struct radeon_draw *radeon_draw_duplicate(struct radeon_draw *draw);
+struct radeon_draw *radeon_draw_incref(struct radeon_draw *draw);
+struct radeon_draw *radeon_draw_decref(struct radeon_draw *draw);
+int radeon_draw_set(struct radeon_draw *draw, struct radeon_state *state);
+int radeon_draw_set_new(struct radeon_draw *draw, struct radeon_state *state);
+int radeon_draw_check(struct radeon_draw *draw);
+
+struct radeon_ctx *radeon_ctx(struct radeon *radeon); /* context API; struct radeon_ctx is used only through pointers in these prototypes */
+struct radeon_ctx *radeon_ctx_decref(struct radeon_ctx *ctx);
+struct radeon_ctx *radeon_ctx_incref(struct radeon_ctx *ctx);
+int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw);
+int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw);
+int radeon_ctx_pm4(struct radeon_ctx *ctx);
+int radeon_ctx_submit(struct radeon_ctx *ctx);
+void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file); /* NOTE(review): file is presumably an output path -- confirm */
+
+/*
+ * R600/R700 state table: each state has a base id (R600_<NAME>) and a type number (R600_<NAME>_TYPE).
+ */
+
+#define R600_NSTATE				1273 /* one more than the highest base id below (R600_DRAW = 1272) */
+#define R600_NTYPE				25 /* one more than the highest type below (R600_DRAW_TYPE = 24) */
+
+#define R600_CONFIG				0
+#define R600_CONFIG_TYPE				0
+#define R600_CB_CNTL				1
+#define R600_CB_CNTL_TYPE				1
+#define R600_RASTERIZER				2
+#define R600_RASTERIZER_TYPE				2
+#define R600_VIEWPORT				3
+#define R600_VIEWPORT_TYPE				3
+#define R600_SCISSOR				4
+#define R600_SCISSOR_TYPE				4
+#define R600_BLEND				5
+#define R600_BLEND_TYPE				5
+#define R600_DSA				6
+#define R600_DSA_TYPE				6
+#define R600_VS_SHADER				7
+#define R600_VS_SHADER_TYPE				7
+#define R600_PS_SHADER				8
+#define R600_PS_SHADER_TYPE				8
+#define R600_PS_CONSTANT				9 /* constant ranges span 256 ids each (9, 265) */
+#define R600_PS_CONSTANT_TYPE				9
+#define R600_VS_CONSTANT				265
+#define R600_VS_CONSTANT_TYPE				10
+#define R600_PS_RESOURCE				521 /* resource ranges span 160 ids each (521, 681, 841, 1001) */
+#define R600_PS_RESOURCE_TYPE				11
+#define R600_VS_RESOURCE				681
+#define R600_VS_RESOURCE_TYPE				12
+#define R600_FS_RESOURCE				841
+#define R600_FS_RESOURCE_TYPE				13
+#define R600_GS_RESOURCE				1001
+#define R600_GS_RESOURCE_TYPE				14
+#define R600_PS_SAMPLER				1161 /* sampler and sampler-border ranges span 18 ids each (1161 .. 1251) */
+#define R600_PS_SAMPLER_TYPE				15
+#define R600_VS_SAMPLER				1179
+#define R600_VS_SAMPLER_TYPE				16
+#define R600_GS_SAMPLER				1197
+#define R600_GS_SAMPLER_TYPE				17
+#define R600_PS_SAMPLER_BORDER				1215
+#define R600_PS_SAMPLER_BORDER_TYPE				18
+#define R600_VS_SAMPLER_BORDER				1233
+#define R600_VS_SAMPLER_BORDER_TYPE				19
+#define R600_GS_SAMPLER_BORDER				1251
+#define R600_GS_SAMPLER_BORDER_TYPE				20
+#define R600_CB0				1269 /* from here on each state has a single id */
+#define R600_CB0_TYPE				21
+#define R600_DB				1270
+#define R600_DB_TYPE				22
+#define R600_VGT				1271
+#define R600_VGT_TYPE				23
+#define R600_DRAW				1272
+#define R600_DRAW_TYPE				24
+/* R600_CONFIG: entry indices 0..40; R600_CONFIG_SIZE (41) is the entry count. NOTE(review): R600_CONFIG_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_CONFIG__SQ_CONFIG			0
+#define R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1			1
+#define R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2			2
+#define R600_CONFIG__SQ_THREAD_RESOURCE_MGMT			3
+#define R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1			4
+#define R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2			5
+#define R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ			6
+#define R600_CONFIG__TA_CNTL_AUX			7
+#define R600_CONFIG__VC_ENHANCE			8
+#define R600_CONFIG__DB_DEBUG			9
+#define R600_CONFIG__DB_WATERMARKS			10
+#define R600_CONFIG__SX_MISC			11
+#define R600_CONFIG__SPI_THREAD_GROUPING			12
+#define R600_CONFIG__CB_SHADER_CONTROL			13
+#define R600_CONFIG__SQ_ESGS_RING_ITEMSIZE			14
+#define R600_CONFIG__SQ_GSVS_RING_ITEMSIZE			15
+#define R600_CONFIG__SQ_ESTMP_RING_ITEMSIZE			16
+#define R600_CONFIG__SQ_GSTMP_RING_ITEMSIZE			17
+#define R600_CONFIG__SQ_VSTMP_RING_ITEMSIZE			18
+#define R600_CONFIG__SQ_PSTMP_RING_ITEMSIZE			19
+#define R600_CONFIG__SQ_FBUF_RING_ITEMSIZE			20
+#define R600_CONFIG__SQ_REDUC_RING_ITEMSIZE			21
+#define R600_CONFIG__SQ_GS_VERT_ITEMSIZE			22
+#define R600_CONFIG__VGT_OUTPUT_PATH_CNTL			23
+#define R600_CONFIG__VGT_HOS_CNTL			24
+#define R600_CONFIG__VGT_HOS_MAX_TESS_LEVEL			25
+#define R600_CONFIG__VGT_HOS_MIN_TESS_LEVEL			26
+#define R600_CONFIG__VGT_HOS_REUSE_DEPTH			27
+#define R600_CONFIG__VGT_GROUP_PRIM_TYPE			28
+#define R600_CONFIG__VGT_GROUP_FIRST_DECR			29
+#define R600_CONFIG__VGT_GROUP_DECR			30
+#define R600_CONFIG__VGT_GROUP_VECT_0_CNTL			31
+#define R600_CONFIG__VGT_GROUP_VECT_1_CNTL			32
+#define R600_CONFIG__VGT_GROUP_VECT_0_FMT_CNTL			33
+#define R600_CONFIG__VGT_GROUP_VECT_1_FMT_CNTL			34
+#define R600_CONFIG__VGT_GS_MODE			35
+#define R600_CONFIG__PA_SC_MODE_CNTL			36
+#define R600_CONFIG__VGT_STRMOUT_EN			37
+#define R600_CONFIG__VGT_REUSE_OFF			38
+#define R600_CONFIG__VGT_VTX_CNT_EN			39
+#define R600_CONFIG__VGT_STRMOUT_BUFFER_EN			40
+#define R600_CONFIG_SIZE				41
+#define R600_CONFIG_PM4				128
+/* R600_CB_CNTL: entry indices 0..17; R600_CB_CNTL_SIZE (18) is the entry count. NOTE(review): R600_CB_CNTL_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_CB_CNTL__CB_CLEAR_RED			0
+#define R600_CB_CNTL__CB_CLEAR_GREEN			1
+#define R600_CB_CNTL__CB_CLEAR_BLUE			2
+#define R600_CB_CNTL__CB_CLEAR_ALPHA			3
+#define R600_CB_CNTL__CB_SHADER_MASK			4
+#define R600_CB_CNTL__CB_TARGET_MASK			5
+#define R600_CB_CNTL__CB_FOG_RED			6
+#define R600_CB_CNTL__CB_FOG_GREEN			7
+#define R600_CB_CNTL__CB_FOG_BLUE			8
+#define R600_CB_CNTL__CB_COLOR_CONTROL			9
+#define R600_CB_CNTL__PA_SC_AA_CONFIG			10
+#define R600_CB_CNTL__PA_SC_AA_SAMPLE_LOCS_MCTX			11
+#define R600_CB_CNTL__PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX			12
+#define R600_CB_CNTL__CB_CLRCMP_CONTROL			13
+#define R600_CB_CNTL__CB_CLRCMP_SRC			14
+#define R600_CB_CNTL__CB_CLRCMP_DST			15
+#define R600_CB_CNTL__CB_CLRCMP_MSK			16
+#define R600_CB_CNTL__PA_SC_AA_MASK			17
+#define R600_CB_CNTL_SIZE				18
+#define R600_CB_CNTL_PM4				128
+/* R600_RASTERIZER: entry indices 0..20; R600_RASTERIZER_SIZE (21) is the entry count. NOTE(review): R600_RASTERIZER_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_RASTERIZER__SPI_INTERP_CONTROL_0			0
+#define R600_RASTERIZER__PA_CL_CLIP_CNTL			1
+#define R600_RASTERIZER__PA_SU_SC_MODE_CNTL			2
+#define R600_RASTERIZER__PA_CL_VS_OUT_CNTL			3
+#define R600_RASTERIZER__PA_CL_NANINF_CNTL			4
+#define R600_RASTERIZER__PA_SU_POINT_SIZE			5
+#define R600_RASTERIZER__PA_SU_POINT_MINMAX			6
+#define R600_RASTERIZER__PA_SU_LINE_CNTL			7
+#define R600_RASTERIZER__PA_SC_LINE_STIPPLE			8
+#define R600_RASTERIZER__PA_SC_MPASS_PS_CNTL			9
+#define R600_RASTERIZER__PA_SC_LINE_CNTL			10
+#define R600_RASTERIZER__PA_CL_GB_VERT_CLIP_ADJ			11
+#define R600_RASTERIZER__PA_CL_GB_VERT_DISC_ADJ			12
+#define R600_RASTERIZER__PA_CL_GB_HORZ_CLIP_ADJ			13
+#define R600_RASTERIZER__PA_CL_GB_HORZ_DISC_ADJ			14
+#define R600_RASTERIZER__PA_SU_POLY_OFFSET_DB_FMT_CNTL			15
+#define R600_RASTERIZER__PA_SU_POLY_OFFSET_CLAMP			16
+#define R600_RASTERIZER__PA_SU_POLY_OFFSET_FRONT_SCALE			17
+#define R600_RASTERIZER__PA_SU_POLY_OFFSET_FRONT_OFFSET			18
+#define R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_SCALE			19
+#define R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_OFFSET			20
+#define R600_RASTERIZER_SIZE				21
+#define R600_RASTERIZER_PM4				128
+/* R600_VIEWPORT: entry indices 0..8; R600_VIEWPORT_SIZE (9) is the entry count. NOTE(review): R600_VIEWPORT_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_VIEWPORT__PA_SC_VPORT_ZMIN_0			0
+#define R600_VIEWPORT__PA_SC_VPORT_ZMAX_0			1
+#define R600_VIEWPORT__PA_CL_VPORT_XSCALE_0			2
+#define R600_VIEWPORT__PA_CL_VPORT_YSCALE_0			3
+#define R600_VIEWPORT__PA_CL_VPORT_ZSCALE_0			4
+#define R600_VIEWPORT__PA_CL_VPORT_XOFFSET_0			5
+#define R600_VIEWPORT__PA_CL_VPORT_YOFFSET_0			6
+#define R600_VIEWPORT__PA_CL_VPORT_ZOFFSET_0			7
+#define R600_VIEWPORT__PA_CL_VTE_CNTL			8
+#define R600_VIEWPORT_SIZE				9
+#define R600_VIEWPORT_PM4				128
+/* R600_SCISSOR: entry indices 0..18; R600_SCISSOR_SIZE (19) is the entry count. NOTE(review): R600_SCISSOR_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_SCISSOR__PA_SC_SCREEN_SCISSOR_TL			0
+#define R600_SCISSOR__PA_SC_SCREEN_SCISSOR_BR			1
+#define R600_SCISSOR__PA_SC_WINDOW_OFFSET			2
+#define R600_SCISSOR__PA_SC_WINDOW_SCISSOR_TL			3
+#define R600_SCISSOR__PA_SC_WINDOW_SCISSOR_BR			4
+#define R600_SCISSOR__PA_SC_CLIPRECT_RULE			5
+#define R600_SCISSOR__PA_SC_CLIPRECT_0_TL			6
+#define R600_SCISSOR__PA_SC_CLIPRECT_0_BR			7
+#define R600_SCISSOR__PA_SC_CLIPRECT_1_TL			8
+#define R600_SCISSOR__PA_SC_CLIPRECT_1_BR			9
+#define R600_SCISSOR__PA_SC_CLIPRECT_2_TL			10
+#define R600_SCISSOR__PA_SC_CLIPRECT_2_BR			11
+#define R600_SCISSOR__PA_SC_CLIPRECT_3_TL			12
+#define R600_SCISSOR__PA_SC_CLIPRECT_3_BR			13
+#define R600_SCISSOR__PA_SC_EDGERULE			14
+#define R600_SCISSOR__PA_SC_GENERIC_SCISSOR_TL			15
+#define R600_SCISSOR__PA_SC_GENERIC_SCISSOR_BR			16
+#define R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_TL			17
+#define R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_BR			18
+#define R600_SCISSOR_SIZE				19
+#define R600_SCISSOR_PM4				128
+/* R600_BLEND: entry indices 0..12; R600_BLEND_SIZE (13) is the entry count. NOTE(review): R600_BLEND_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_BLEND__CB_BLEND_RED			0
+#define R600_BLEND__CB_BLEND_GREEN			1
+#define R600_BLEND__CB_BLEND_BLUE			2
+#define R600_BLEND__CB_BLEND_ALPHA			3
+#define R600_BLEND__CB_BLEND0_CONTROL			4
+#define R600_BLEND__CB_BLEND1_CONTROL			5
+#define R600_BLEND__CB_BLEND2_CONTROL			6
+#define R600_BLEND__CB_BLEND3_CONTROL			7
+#define R600_BLEND__CB_BLEND4_CONTROL			8
+#define R600_BLEND__CB_BLEND5_CONTROL			9
+#define R600_BLEND__CB_BLEND6_CONTROL			10
+#define R600_BLEND__CB_BLEND7_CONTROL			11
+#define R600_BLEND__CB_BLEND_CONTROL			12
+#define R600_BLEND_SIZE				13
+#define R600_BLEND_PM4				128
+/* R600_DSA: entry indices 0..15; R600_DSA_SIZE (16) is the entry count. NOTE(review): R600_DSA_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_DSA__DB_STENCIL_CLEAR			0
+#define R600_DSA__DB_DEPTH_CLEAR			1
+#define R600_DSA__SX_ALPHA_TEST_CONTROL			2
+#define R600_DSA__DB_STENCILREFMASK			3
+#define R600_DSA__DB_STENCILREFMASK_BF			4
+#define R600_DSA__SX_ALPHA_REF			5
+#define R600_DSA__SPI_FOG_FUNC_SCALE			6
+#define R600_DSA__SPI_FOG_FUNC_BIAS			7
+#define R600_DSA__SPI_FOG_CNTL			8
+#define R600_DSA__DB_DEPTH_CONTROL			9
+#define R600_DSA__DB_SHADER_CONTROL			10
+#define R600_DSA__DB_RENDER_CONTROL			11
+#define R600_DSA__DB_RENDER_OVERRIDE			12
+#define R600_DSA__DB_SRESULTS_COMPARE_STATE1			13
+#define R600_DSA__DB_PRELOAD_CONTROL			14
+#define R600_DSA__DB_ALPHA_TO_MASK			15
+#define R600_DSA_SIZE				16
+#define R600_DSA_PM4				128
+/* R600_VS_SHADER: entry indices 0..48 (32 SQ_VTX_SEMANTIC, 10 SPI_VS_OUT_ID, 7 others); R600_VS_SHADER_SIZE (49) is the entry count. NOTE(review): R600_VS_SHADER_PM4 is presumably a PM4 size bound -- confirm */
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_0			0
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_1			1
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_2			2
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_3			3
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_4			4
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_5			5
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_6			6
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_7			7
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_8			8
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_9			9
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_10			10
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_11			11
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_12			12
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_13			13
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_14			14
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_15			15
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_16			16
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_17			17
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_18			18
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_19			19
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_20			20
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_21			21
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_22			22
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_23			23
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_24			24
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_25			25
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_26			26
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_27			27
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_28			28
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_29			29
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_30			30
+#define R600_VS_SHADER__SQ_VTX_SEMANTIC_31			31
+#define R600_VS_SHADER__SPI_VS_OUT_ID_0			32
+#define R600_VS_SHADER__SPI_VS_OUT_ID_1			33
+#define R600_VS_SHADER__SPI_VS_OUT_ID_2			34
+#define R600_VS_SHADER__SPI_VS_OUT_ID_3			35
+#define R600_VS_SHADER__SPI_VS_OUT_ID_4			36
+#define R600_VS_SHADER__SPI_VS_OUT_ID_5			37
+#define R600_VS_SHADER__SPI_VS_OUT_ID_6			38
+#define R600_VS_SHADER__SPI_VS_OUT_ID_7			39
+#define R600_VS_SHADER__SPI_VS_OUT_ID_8			40
+#define R600_VS_SHADER__SPI_VS_OUT_ID_9			41
+#define R600_VS_SHADER__SPI_VS_OUT_CONFIG			42
+#define R600_VS_SHADER__SQ_PGM_START_VS			43
+#define R600_VS_SHADER__SQ_PGM_RESOURCES_VS			44
+#define R600_VS_SHADER__SQ_PGM_START_FS			45
+#define R600_VS_SHADER__SQ_PGM_RESOURCES_FS			46
+#define R600_VS_SHADER__SQ_PGM_CF_OFFSET_VS			47
+#define R600_VS_SHADER__SQ_PGM_CF_OFFSET_FS			48
+#define R600_VS_SHADER_SIZE				49
+#define R600_VS_SHADER_PM4				128
+/*
+ * R600_PS_SHADER
+ *
+ * Each R600_PS_SHADER__<NAME> value is a consecutive index starting at 0;
+ * R600_PS_SHADER_SIZE (39) is the number of indices listed.
+ * NOTE(review): the indices presumably select register slots of the pixel
+ * shader state group, and R600_PS_SHADER_PM4 (128) presumably sizes its PM4
+ * command storage -- confirm against the code that consumes these macros.
+ */
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_0			0
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_1			1
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_2			2
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_3			3
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_4			4
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_5			5
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_6			6
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_7			7
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_8			8
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_9			9
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_10			10
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_11			11
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_12			12
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_13			13
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_14			14
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_15			15
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_16			16
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_17			17
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_18			18
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_19			19
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_20			20
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_21			21
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_22			22
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_23			23
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_24			24
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_25			25
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_26			26
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_27			27
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_28			28
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_29			29
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_30			30
+#define R600_PS_SHADER__SPI_PS_INPUT_CNTL_31			31
+#define R600_PS_SHADER__SPI_PS_IN_CONTROL_0			32
+#define R600_PS_SHADER__SPI_PS_IN_CONTROL_1			33
+#define R600_PS_SHADER__SPI_INPUT_Z			34
+#define R600_PS_SHADER__SQ_PGM_START_PS			35
+#define R600_PS_SHADER__SQ_PGM_RESOURCES_PS			36
+#define R600_PS_SHADER__SQ_PGM_EXPORTS_PS			37
+#define R600_PS_SHADER__SQ_PGM_CF_OFFSET_PS			38
+#define R600_PS_SHADER_SIZE				39
+#define R600_PS_SHADER_PM4				128
+/*
+ * R600_PS_CONSTANT
+ *
+ * Consecutive indices 0..3, one per SQ_ALU_CONSTANT{0..3}_0 name;
+ * R600_PS_CONSTANT_SIZE (4) is the number of indices listed.
+ * NOTE(review): presumably the four components of one pixel shader ALU
+ * constant; meaning of R600_PS_CONSTANT_PM4 (128) to be confirmed.
+ */
+#define R600_PS_CONSTANT__SQ_ALU_CONSTANT0_0			0
+#define R600_PS_CONSTANT__SQ_ALU_CONSTANT1_0			1
+#define R600_PS_CONSTANT__SQ_ALU_CONSTANT2_0			2
+#define R600_PS_CONSTANT__SQ_ALU_CONSTANT3_0			3
+#define R600_PS_CONSTANT_SIZE				4
+#define R600_PS_CONSTANT_PM4				128
+/*
+ * R600_VS_CONSTANT
+ *
+ * Consecutive indices 0..3, one per SQ_ALU_CONSTANT{0..3}_256 name;
+ * R600_VS_CONSTANT_SIZE (4) is the number of indices listed.
+ * NOTE(review): presumably the four components of one vertex shader ALU
+ * constant; meaning of R600_VS_CONSTANT_PM4 (128) to be confirmed.
+ */
+#define R600_VS_CONSTANT__SQ_ALU_CONSTANT0_256			0
+#define R600_VS_CONSTANT__SQ_ALU_CONSTANT1_256			1
+#define R600_VS_CONSTANT__SQ_ALU_CONSTANT2_256			2
+#define R600_VS_CONSTANT__SQ_ALU_CONSTANT3_256			3
+#define R600_VS_CONSTANT_SIZE				4
+#define R600_VS_CONSTANT_PM4				128
+/*
+ * R600_PS_RESOURCE
+ *
+ * Consecutive indices 0..6, one per RESOURCE0_WORD{0..6} name;
+ * R600_PS_RESOURCE_SIZE (7) is the number of indices listed.
+ * NOTE(review): presumably the words of one pixel shader resource
+ * descriptor; meaning of R600_PS_RESOURCE_PM4 (128) to be confirmed.
+ */
+#define R600_PS_RESOURCE__RESOURCE0_WORD0			0
+#define R600_PS_RESOURCE__RESOURCE0_WORD1			1
+#define R600_PS_RESOURCE__RESOURCE0_WORD2			2
+#define R600_PS_RESOURCE__RESOURCE0_WORD3			3
+#define R600_PS_RESOURCE__RESOURCE0_WORD4			4
+#define R600_PS_RESOURCE__RESOURCE0_WORD5			5
+#define R600_PS_RESOURCE__RESOURCE0_WORD6			6
+#define R600_PS_RESOURCE_SIZE				7
+#define R600_PS_RESOURCE_PM4				128
+/*
+ * R600_VS_RESOURCE
+ *
+ * Consecutive indices 0..6, one per RESOURCE160_WORD{0..6} name;
+ * R600_VS_RESOURCE_SIZE (7) is the number of indices listed.
+ * NOTE(review): presumably the words of one vertex shader resource
+ * descriptor; meaning of R600_VS_RESOURCE_PM4 (128) to be confirmed.
+ */
+#define R600_VS_RESOURCE__RESOURCE160_WORD0			0
+#define R600_VS_RESOURCE__RESOURCE160_WORD1			1
+#define R600_VS_RESOURCE__RESOURCE160_WORD2			2
+#define R600_VS_RESOURCE__RESOURCE160_WORD3			3
+#define R600_VS_RESOURCE__RESOURCE160_WORD4			4
+#define R600_VS_RESOURCE__RESOURCE160_WORD5			5
+#define R600_VS_RESOURCE__RESOURCE160_WORD6			6
+#define R600_VS_RESOURCE_SIZE				7
+#define R600_VS_RESOURCE_PM4				128
+/*
+ * R600_FS_RESOURCE
+ *
+ * Consecutive indices 0..6, one per RESOURCE320_WORD{0..6} name;
+ * R600_FS_RESOURCE_SIZE (7) is the number of indices listed.
+ * NOTE(review): presumably the words of one fetch shader resource
+ * descriptor; meaning of R600_FS_RESOURCE_PM4 (128) to be confirmed.
+ */
+#define R600_FS_RESOURCE__RESOURCE320_WORD0			0
+#define R600_FS_RESOURCE__RESOURCE320_WORD1			1
+#define R600_FS_RESOURCE__RESOURCE320_WORD2			2
+#define R600_FS_RESOURCE__RESOURCE320_WORD3			3
+#define R600_FS_RESOURCE__RESOURCE320_WORD4			4
+#define R600_FS_RESOURCE__RESOURCE320_WORD5			5
+#define R600_FS_RESOURCE__RESOURCE320_WORD6			6
+#define R600_FS_RESOURCE_SIZE				7
+#define R600_FS_RESOURCE_PM4				128
+/*
+ * R600_GS_RESOURCE
+ *
+ * Consecutive indices 0..6, one per RESOURCE336_WORD{0..6} name;
+ * R600_GS_RESOURCE_SIZE (7) is the number of indices listed.
+ * NOTE(review): presumably the words of one geometry shader resource
+ * descriptor; meaning of R600_GS_RESOURCE_PM4 (128) to be confirmed.
+ */
+#define R600_GS_RESOURCE__RESOURCE336_WORD0			0
+#define R600_GS_RESOURCE__RESOURCE336_WORD1			1
+#define R600_GS_RESOURCE__RESOURCE336_WORD2			2
+#define R600_GS_RESOURCE__RESOURCE336_WORD3			3
+#define R600_GS_RESOURCE__RESOURCE336_WORD4			4
+#define R600_GS_RESOURCE__RESOURCE336_WORD5			5
+#define R600_GS_RESOURCE__RESOURCE336_WORD6			6
+#define R600_GS_RESOURCE_SIZE				7
+#define R600_GS_RESOURCE_PM4				128
+/*
+ * R600_PS_SAMPLER
+ *
+ * Consecutive indices 0..2, one per SQ_TEX_SAMPLER_WORD{0..2}_0 name;
+ * R600_PS_SAMPLER_SIZE (3) is the number of indices listed.
+ * NOTE(review): meaning of R600_PS_SAMPLER_PM4 (128) to be confirmed.
+ */
+#define R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD0_0			0
+#define R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD1_0			1
+#define R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD2_0			2
+#define R600_PS_SAMPLER_SIZE				3
+#define R600_PS_SAMPLER_PM4				128
+/*
+ * R600_VS_SAMPLER
+ *
+ * Consecutive indices 0..2, one per SQ_TEX_SAMPLER_WORD{0..2}_18 name;
+ * R600_VS_SAMPLER_SIZE (3) is the number of indices listed.
+ * NOTE(review): meaning of R600_VS_SAMPLER_PM4 (128) to be confirmed.
+ */
+#define R600_VS_SAMPLER__SQ_TEX_SAMPLER_WORD0_18			0
+#define R600_VS_SAMPLER__SQ_TEX_SAMPLER_WORD1_18			1
+#define R600_VS_SAMPLER__SQ_TEX_SAMPLER_WORD2_18			2
+#define R600_VS_SAMPLER_SIZE				3
+#define R600_VS_SAMPLER_PM4				128
+/*
+ * R600_GS_SAMPLER
+ *
+ * Consecutive indices 0..2, one per SQ_TEX_SAMPLER_WORD{0..2}_36 name;
+ * R600_GS_SAMPLER_SIZE (3) is the number of indices listed.
+ * NOTE(review): meaning of R600_GS_SAMPLER_PM4 (128) to be confirmed.
+ */
+#define R600_GS_SAMPLER__SQ_TEX_SAMPLER_WORD0_36			0
+#define R600_GS_SAMPLER__SQ_TEX_SAMPLER_WORD1_36			1
+#define R600_GS_SAMPLER__SQ_TEX_SAMPLER_WORD2_36			2
+#define R600_GS_SAMPLER_SIZE				3
+#define R600_GS_SAMPLER_PM4				128
+/*
+ * R600_PS_SAMPLER_BORDER
+ *
+ * Consecutive indices 0..3 in RED, GREEN, BLUE, ALPHA order;
+ * R600_PS_SAMPLER_BORDER_SIZE (4) is the number of indices listed.
+ * NOTE(review): meaning of R600_PS_SAMPLER_BORDER_PM4 (128) to be confirmed.
+ */
+#define R600_PS_SAMPLER_BORDER__TD_PS_SAMPLER0_BORDER_RED			0
+#define R600_PS_SAMPLER_BORDER__TD_PS_SAMPLER0_BORDER_GREEN			1
+#define R600_PS_SAMPLER_BORDER__TD_PS_SAMPLER0_BORDER_BLUE			2
+#define R600_PS_SAMPLER_BORDER__TD_PS_SAMPLER0_BORDER_ALPHA			3
+#define R600_PS_SAMPLER_BORDER_SIZE				4
+#define R600_PS_SAMPLER_BORDER_PM4				128
+/*
+ * R600_VS_SAMPLER_BORDER
+ *
+ * Consecutive indices 0..3 in RED, GREEN, BLUE, ALPHA order;
+ * R600_VS_SAMPLER_BORDER_SIZE (4) is the number of indices listed.
+ * NOTE(review): meaning of R600_VS_SAMPLER_BORDER_PM4 (128) to be confirmed.
+ */
+#define R600_VS_SAMPLER_BORDER__TD_VS_SAMPLER0_BORDER_RED			0
+#define R600_VS_SAMPLER_BORDER__TD_VS_SAMPLER0_BORDER_GREEN			1
+#define R600_VS_SAMPLER_BORDER__TD_VS_SAMPLER0_BORDER_BLUE			2
+#define R600_VS_SAMPLER_BORDER__TD_VS_SAMPLER0_BORDER_ALPHA			3
+#define R600_VS_SAMPLER_BORDER_SIZE				4
+#define R600_VS_SAMPLER_BORDER_PM4				128
+/*
+ * R600_GS_SAMPLER_BORDER
+ *
+ * Consecutive indices 0..3 in RED, GREEN, BLUE, ALPHA order;
+ * R600_GS_SAMPLER_BORDER_SIZE (4) is the number of indices listed.
+ * NOTE(review): meaning of R600_GS_SAMPLER_BORDER_PM4 (128) to be confirmed.
+ */
+#define R600_GS_SAMPLER_BORDER__TD_GS_SAMPLER0_BORDER_RED			0
+#define R600_GS_SAMPLER_BORDER__TD_GS_SAMPLER0_BORDER_GREEN			1
+#define R600_GS_SAMPLER_BORDER__TD_GS_SAMPLER0_BORDER_BLUE			2
+#define R600_GS_SAMPLER_BORDER__TD_GS_SAMPLER0_BORDER_ALPHA			3
+#define R600_GS_SAMPLER_BORDER_SIZE				4
+#define R600_GS_SAMPLER_BORDER_PM4				128
+/*
+ * R600_CB0
+ *
+ * Consecutive indices 0..6, one per CB_COLOR0_* name; R600_CB0_SIZE (7) is
+ * the number of indices listed.
+ * NOTE(review): presumably the register slots describing colour buffer 0;
+ * meaning of R600_CB0_PM4 (128) to be confirmed.
+ */
+#define R600_CB0__CB_COLOR0_BASE			0
+#define R600_CB0__CB_COLOR0_INFO			1
+#define R600_CB0__CB_COLOR0_SIZE			2
+#define R600_CB0__CB_COLOR0_VIEW			3
+#define R600_CB0__CB_COLOR0_FRAG			4
+#define R600_CB0__CB_COLOR0_TILE			5
+#define R600_CB0__CB_COLOR0_MASK			6
+#define R600_CB0_SIZE				7
+#define R600_CB0_PM4				128
+/*
+ * R600_DB
+ *
+ * Consecutive indices 0..5, one per DB_* name; R600_DB_SIZE (6) is the
+ * number of indices listed.
+ * NOTE(review): presumably the register slots describing the depth buffer;
+ * meaning of R600_DB_PM4 (128) to be confirmed.
+ */
+#define R600_DB__DB_DEPTH_BASE			0
+#define R600_DB__DB_DEPTH_SIZE			1
+#define R600_DB__DB_DEPTH_VIEW			2
+#define R600_DB__DB_DEPTH_INFO			3
+#define R600_DB__DB_HTILE_SURFACE			4
+#define R600_DB__DB_PREFETCH_LIMIT			5
+#define R600_DB_SIZE				6
+#define R600_DB_PM4				128
+/*
+ * R600_VGT
+ *
+ * Consecutive indices 0..10, one per VGT_* name; R600_VGT_SIZE (11) is the
+ * number of indices listed.
+ * NOTE(review): presumably the register slots of the vertex grouper state;
+ * meaning of R600_VGT_PM4 (128) to be confirmed.
+ */
+#define R600_VGT__VGT_PRIMITIVE_TYPE			0
+#define R600_VGT__VGT_MAX_VTX_INDX			1
+#define R600_VGT__VGT_MIN_VTX_INDX			2
+#define R600_VGT__VGT_INDX_OFFSET			3
+#define R600_VGT__VGT_MULTI_PRIM_IB_RESET_INDX			4
+#define R600_VGT__VGT_DMA_INDEX_TYPE			5
+#define R600_VGT__VGT_PRIMITIVEID_EN			6
+#define R600_VGT__VGT_DMA_NUM_INSTANCES			7
+#define R600_VGT__VGT_MULTI_PRIM_IB_RESET_EN			8
+#define R600_VGT__VGT_INSTANCE_STEP_RATE_0			9
+#define R600_VGT__VGT_INSTANCE_STEP_RATE_1			10
+#define R600_VGT_SIZE				11
+#define R600_VGT_PM4				128
+/*
+ * R600_DRAW
+ *
+ * Consecutive indices 0..3, one per VGT_* draw name; R600_DRAW_SIZE (4) is
+ * the number of indices listed.
+ * NOTE(review): presumably the register slots written to issue a draw;
+ * meaning of R600_DRAW_PM4 (128) to be confirmed.
+ */
+#define R600_DRAW__VGT_NUM_INDICES			0
+#define R600_DRAW__VGT_DMA_BASE_HI			1
+#define R600_DRAW__VGT_DMA_BASE			2
+#define R600_DRAW__VGT_DRAW_INITIATOR			3
+#define R600_DRAW_SIZE				4
+#define R600_DRAW_PM4				128
+
+#endif
diff --git a/src/gallium/drivers/rbug/Makefile b/src/gallium/drivers/rbug/Makefile
new file mode 100644
index 0000000000..64e172fe5c
--- /dev/null
+++ b/src/gallium/drivers/rbug/Makefile
@@ -0,0 +1,12 @@
+# Makefile for the rbug pipe driver (src/gallium/drivers/rbug).
+
+# Relative path from this directory up to the top of the source tree.
+TOP = ../../../..
+# Pull in the currently selected build configuration.
+include $(TOP)/configs/current
+
+# NOTE(review): LIBNAME and C_SOURCES are presumably consumed by the
+# Makefile.template included at the bottom -- confirm there.
+LIBNAME = rbug
+
+C_SOURCES = \
+	rbug_core.c \
+	rbug_context.c \
+	rbug_objects.c \
+	rbug_screen.c
+
+# Shared rules live two directories up (src/gallium/Makefile.template).
+include ../../Makefile.template
diff --git a/src/gallium/drivers/rbug/README b/src/gallium/drivers/rbug/README
new file mode 100644
index 0000000000..b6d3a5cf35
--- /dev/null
+++ b/src/gallium/drivers/rbug/README
@@ -0,0 +1,58 @@
+                              RBUG PIPE DRIVER
+
+
+= About =
+
+This directory contains a Gallium3D remote debugger pipe driver.
+It provides remote debugging functionality.
+
+
+= Build Instructions =
+
+To build, invoke scons from the top-level directory as
+ 
+ scons dri=no statetrackers=mesa winsys=xlib
+
+
+= Usage =
+
+To use it, do
+
+   export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib
+
+Ensure the right libGL.so is being picked up by running
+
+   ldd progs/trivial/tri 
+
+   export XMESA_TRACE=y
+   GALLIUM_RBUG=true progs/trivial/tri
+
+which should open a Gallium remote debugging session. While the program is
+running you can launch the small remote debugging application from progs/rbug.
+More information is in that directory. For a GUI, see:
+
+   http://cgit.freedesktop.org/mesa/rbug-gui
+
+
+= Integrating =
+
+You can integrate the rbug pipe driver either inside the state tracker or the
+target. The procedure is the same in both cases. Let's assume you have a
+pipe_screen obtained by the usual means (variable and function names are just
+for illustration purposes):
+
+  real_screen = real_screen_create(...);
+
+The rbug screen is then created by doing
+
+  rbug_screen = rbug_screen_create(real_screen);
+
+You can then simply use rbug_screen instead of real_screen.
+
+You can create as many contexts as you wish with rbug_screen::context_create;
+they are automatically wrapped by rbug_screen.
+
+
+--
+Jose Fonseca <jrfonseca@tungstengraphics.com>
+Jakob Bornecrantz <jakob@vmware.com>
diff --git a/src/gallium/drivers/rbug/SConscript b/src/gallium/drivers/rbug/SConscript
new file mode 100644
index 0000000000..3da6ac104a
--- /dev/null
+++ b/src/gallium/drivers/rbug/SConscript
@@ -0,0 +1,14 @@
+# SCons build description for the rbug pipe driver.
+
+# Bring every variable exported by the calling SConscript into scope.
+Import('*')
+
+# Work on a private copy of the build environment.
+env = env.Clone()
+
+# Build the four rbug sources into a library target named 'rbug'.
+rbug = env.ConvenienceLibrary(
+	target = 'rbug',
+	source = [
+		'rbug_context.c',
+		'rbug_core.c',
+		'rbug_objects.c',
+		'rbug_screen.c',
+	])
+
+# Make the resulting library node available to other SConscripts.
+Export('rbug')
diff --git a/src/gallium/drivers/rbug/rbug_context.c b/src/gallium/drivers/rbug/rbug_context.c
new file mode 100644
index 0000000000..00b167e256
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_context.c
@@ -0,0 +1,1116 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_simple_list.h"
+
+#include "rbug/rbug_context.h"
+
+#include "rbug_context.h"
+#include "rbug_objects.h"
+
+
+/**
+ * Destroy the wrapped driver context, then free the rbug wrapper itself.
+ */
+static void
+rbug_destroy(struct pipe_context *_pipe)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->destroy(driver);
+   FREE(rb_ctx);
+}
+
+/**
+ * Decide whether the current draw has to stall at the given block point
+ * and, if so, wait until the blocked bit is cleared again.
+ *
+ * The draw entry points in this file call this with rb_pipe->draw_mutex
+ * held; the mutex is handed to the condvar wait, or dropped and re-taken
+ * while polling when no condvar support is available.
+ *
+ * \param rb_pipe  rbug context whose draw is being checked
+ * \param flag     block point bit (callers pass RBUG_BLOCK_BEFORE or
+ *                 RBUG_BLOCK_AFTER)
+ */
+static void
+rbug_draw_block_locked(struct rbug_context *rb_pipe, int flag)
+{
+   if (rb_pipe->draw_blocker & flag) {
+      /* blocking at this point was requested unconditionally */
+      rb_pipe->draw_blocked |= flag;
+   } else if ((rb_pipe->draw_rule.blocker & flag) &&
+              (rb_pipe->draw_blocker & RBUG_BLOCK_RULE)) {
+      /* rule based blocking: stall only when bound state matches the rule */
+      boolean matched = FALSE;
+      int i;
+
+      debug_printf("%s (%p %p) (%p %p) (%p %u) (%p %u)\n", __FUNCTION__,
+                   (void *) rb_pipe->draw_rule.fs, (void *) rb_pipe->curr.fs,
+                   (void *) rb_pipe->draw_rule.vs, (void *) rb_pipe->curr.vs,
+                   (void *) rb_pipe->draw_rule.surf, 0,
+                   (void *) rb_pipe->draw_rule.texture, 0);
+
+      /* shaders */
+      if ((rb_pipe->draw_rule.fs &&
+           rb_pipe->draw_rule.fs == rb_pipe->curr.fs) ||
+          (rb_pipe->draw_rule.vs &&
+           rb_pipe->draw_rule.vs == rb_pipe->curr.vs)) {
+         matched = TRUE;
+      }
+
+      /* render targets: depth/stencil first, then every colour buffer */
+      if (rb_pipe->draw_rule.surf) {
+         if (rb_pipe->draw_rule.surf == rb_pipe->curr.zsbuf)
+            matched = TRUE;
+         for (i = 0; i < rb_pipe->curr.nr_cbufs; i++) {
+            if (rb_pipe->draw_rule.surf == rb_pipe->curr.cbufs[i])
+               matched = TRUE;
+         }
+      }
+
+      /* textures bound to the fragment and vertex stages */
+      if (rb_pipe->draw_rule.texture) {
+         for (i = 0; i < rb_pipe->curr.num_fs_views; i++) {
+            if (rb_pipe->draw_rule.texture == rb_pipe->curr.fs_texs[i])
+               matched = TRUE;
+         }
+         for (i = 0; i < rb_pipe->curr.num_vs_views; i++) {
+            if (rb_pipe->draw_rule.texture == rb_pipe->curr.vs_texs[i])
+               matched = TRUE;
+         }
+      }
+
+      if (matched)
+         rb_pipe->draw_blocked |= (flag | RBUG_BLOCK_RULE);
+   }
+
+   if (rb_pipe->draw_blocked)
+      rbug_notify_draw_blocked(rb_pipe);
+
+   /* wait for rbug to clear the blocked flag */
+   while (rb_pipe->draw_blocked & flag) {
+      rb_pipe->draw_blocked |= flag;
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+      pipe_condvar_wait(rb_pipe->draw_cond, rb_pipe->draw_mutex);
+#else
+      /* no condvar: release the mutex briefly so the flag can be cleared */
+      pipe_mutex_unlock(rb_pipe->draw_mutex);
+#ifdef PIPE_SUBSYSTEM_WINDOWS_USER
+      Sleep(1);
+#endif
+      pipe_mutex_lock(rb_pipe->draw_mutex);
+#endif
+   }
+}
+
+/**
+ * Forward draw_arrays to the wrapped driver context.
+ *
+ * draw_mutex is held for the whole call, and rbug_draw_block_locked() is
+ * run both before and after the driver draw.
+ */
+static void
+rbug_draw_arrays(struct pipe_context *_pipe,
+                 unsigned prim,
+                 unsigned start,
+                 unsigned count)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   pipe_mutex_lock(rb_ctx->draw_mutex);
+
+   rbug_draw_block_locked(rb_ctx, RBUG_BLOCK_BEFORE);
+   driver->draw_arrays(driver, prim, start, count);
+   rbug_draw_block_locked(rb_ctx, RBUG_BLOCK_AFTER);
+
+   pipe_mutex_unlock(rb_ctx->draw_mutex);
+}
+
+/**
+ * Forward draw_elements to the wrapped driver context, passing the driver
+ * resource behind the wrapped index buffer.
+ *
+ * draw_mutex is held for the whole call, and rbug_draw_block_locked() is
+ * run both before and after the driver draw.
+ */
+static void
+rbug_draw_elements(struct pipe_context *_pipe,
+                   struct pipe_resource *_indexResource,
+                   unsigned indexSize,
+                   int indexBias,
+                   unsigned prim,
+                   unsigned start,
+                   unsigned count)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct rbug_resource *rb_index = rbug_resource(_indexResource);
+   struct pipe_context *driver = rb_ctx->pipe;
+   struct pipe_resource *driver_index = rb_index->resource;
+
+   pipe_mutex_lock(rb_ctx->draw_mutex);
+
+   rbug_draw_block_locked(rb_ctx, RBUG_BLOCK_BEFORE);
+   driver->draw_elements(driver, driver_index, indexSize, indexBias,
+                         prim, start, count);
+   rbug_draw_block_locked(rb_ctx, RBUG_BLOCK_AFTER);
+
+   pipe_mutex_unlock(rb_ctx->draw_mutex);
+}
+
+/**
+ * Forward draw_range_elements to the wrapped driver context, passing the
+ * driver resource behind the wrapped index buffer.
+ *
+ * draw_mutex is held for the whole call, and rbug_draw_block_locked() is
+ * run both before and after the driver draw.
+ */
+static void
+rbug_draw_range_elements(struct pipe_context *_pipe,
+                         struct pipe_resource *_indexResource,
+                         unsigned indexSize,
+                         int indexBias,
+                         unsigned minIndex,
+                         unsigned maxIndex,
+                         unsigned mode,
+                         unsigned start,
+                         unsigned count)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct rbug_resource *rb_index = rbug_resource(_indexResource);
+   struct pipe_context *driver = rb_ctx->pipe;
+   struct pipe_resource *driver_index = rb_index->resource;
+
+   pipe_mutex_lock(rb_ctx->draw_mutex);
+
+   rbug_draw_block_locked(rb_ctx, RBUG_BLOCK_BEFORE);
+   driver->draw_range_elements(driver, driver_index, indexSize, indexBias,
+                               minIndex, maxIndex, mode, start, count);
+   rbug_draw_block_locked(rb_ctx, RBUG_BLOCK_AFTER);
+
+   pipe_mutex_unlock(rb_ctx->draw_mutex);
+}
+
+/**
+ * Forward create_query to the wrapped driver context and return its query
+ * object unchanged.
+ */
+static struct pipe_query *
+rbug_create_query(struct pipe_context *_pipe,
+                  unsigned query_type)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->create_query(driver, query_type);
+}
+
+/**
+ * Forward destroy_query to the wrapped driver context.
+ */
+static void
+rbug_destroy_query(struct pipe_context *_pipe,
+                   struct pipe_query *query)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->destroy_query(driver, query);
+}
+
+/**
+ * Forward begin_query to the wrapped driver context.
+ */
+static void
+rbug_begin_query(struct pipe_context *_pipe,
+                 struct pipe_query *query)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->begin_query(driver, query);
+}
+
+/**
+ * Forward end_query to the wrapped driver context.
+ */
+static void
+rbug_end_query(struct pipe_context *_pipe,
+               struct pipe_query *query)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->end_query(driver, query);
+}
+
+/**
+ * Forward get_query_result to the wrapped driver context and return the
+ * driver's answer unchanged.
+ */
+static boolean
+rbug_get_query_result(struct pipe_context *_pipe,
+                      struct pipe_query *query,
+                      boolean wait,
+                      void *result)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->get_query_result(driver, query, wait, result);
+}
+
+/**
+ * Forward create_blend_state to the wrapped driver context; the driver's
+ * state object is returned as is.
+ */
+static void *
+rbug_create_blend_state(struct pipe_context *_pipe,
+                        const struct pipe_blend_state *blend)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->create_blend_state(driver, blend);
+}
+
+/**
+ * Forward bind_blend_state to the wrapped driver context.
+ */
+static void
+rbug_bind_blend_state(struct pipe_context *_pipe,
+                      void *blend)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->bind_blend_state(driver, blend);
+}
+
+/**
+ * Forward delete_blend_state to the wrapped driver context.
+ */
+static void
+rbug_delete_blend_state(struct pipe_context *_pipe,
+                        void *blend)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->delete_blend_state(driver, blend);
+}
+
+/**
+ * Forward create_sampler_state to the wrapped driver context; the driver's
+ * state object is returned as is.
+ */
+static void *
+rbug_create_sampler_state(struct pipe_context *_pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->create_sampler_state(driver, sampler);
+}
+
+/**
+ * Forward bind_fragment_sampler_states to the wrapped driver context; the
+ * sampler array is passed through untouched.
+ */
+static void
+rbug_bind_fragment_sampler_states(struct pipe_context *_pipe,
+                                  unsigned num_samplers,
+                                  void **samplers)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->bind_fragment_sampler_states(driver, num_samplers, samplers);
+}
+
+/**
+ * Forward bind_vertex_sampler_states to the wrapped driver context; the
+ * sampler array is passed through untouched.
+ */
+static void
+rbug_bind_vertex_sampler_states(struct pipe_context *_pipe,
+                                unsigned num_samplers,
+                                void **samplers)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->bind_vertex_sampler_states(driver, num_samplers, samplers);
+}
+
+/**
+ * Forward delete_sampler_state to the wrapped driver context.
+ */
+static void
+rbug_delete_sampler_state(struct pipe_context *_pipe,
+                          void *sampler)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->delete_sampler_state(driver, sampler);
+}
+
+/**
+ * Forward create_rasterizer_state to the wrapped driver context; the
+ * driver's state object is returned as is.
+ */
+static void *
+rbug_create_rasterizer_state(struct pipe_context *_pipe,
+                             const struct pipe_rasterizer_state *rasterizer)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->create_rasterizer_state(driver, rasterizer);
+}
+
+/**
+ * Forward bind_rasterizer_state to the wrapped driver context.
+ */
+static void
+rbug_bind_rasterizer_state(struct pipe_context *_pipe,
+                           void *rasterizer)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->bind_rasterizer_state(driver, rasterizer);
+}
+
+/**
+ * Forward delete_rasterizer_state to the wrapped driver context.
+ */
+static void
+rbug_delete_rasterizer_state(struct pipe_context *_pipe,
+                             void *rasterizer)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->delete_rasterizer_state(driver, rasterizer);
+}
+
+/**
+ * Forward create_depth_stencil_alpha_state to the wrapped driver context;
+ * the driver's state object is returned as is.
+ */
+static void *
+rbug_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                      const struct pipe_depth_stencil_alpha_state *depth_stencil_alpha)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->create_depth_stencil_alpha_state(driver, depth_stencil_alpha);
+}
+
+/**
+ * Forward bind_depth_stencil_alpha_state to the wrapped driver context.
+ */
+static void
+rbug_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                    void *depth_stencil_alpha)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->bind_depth_stencil_alpha_state(driver, depth_stencil_alpha);
+}
+
+/**
+ * Forward delete_depth_stencil_alpha_state to the wrapped driver context.
+ */
+static void
+rbug_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                      void *depth_stencil_alpha)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->delete_depth_stencil_alpha_state(driver, depth_stencil_alpha);
+}
+
+/**
+ * Create a fragment shader on the wrapped driver context and wrap the
+ * driver object with rbug_shader_create().
+ *
+ * \return the rbug shader wrapper, or NULL when the driver returned NULL
+ */
+static void *
+rbug_create_fs_state(struct pipe_context *_pipe,
+                     const struct pipe_shader_state *state)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   void *driver_shader = driver->create_fs_state(driver, state);
+
+   if (driver_shader)
+      return rbug_shader_create(rb_ctx, state, driver_shader,
+                                RBUG_SHADER_FRAGMENT);
+
+   return NULL;
+}
+
+/**
+ * Record the wrapped fragment shader as the current one in
+ * rb_pipe->curr.fs and bind the unwrapped shader on the driver context.
+ */
+static void
+rbug_bind_fs_state(struct pipe_context *_pipe,
+                   void *_fs)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   void *driver_fs = rbug_shader_unwrap(_fs);
+
+   rb_ctx->curr.fs = rbug_shader(_fs);
+
+   driver->bind_fs_state(driver, driver_fs);
+}
+
+/**
+ * Delete a wrapped fragment shader by handing it to rbug_shader_destroy().
+ * NOTE(review): the driver's delete hook is not called here; presumably
+ * rbug_shader_destroy() takes care of the driver object -- confirm.
+ */
+static void
+rbug_delete_fs_state(struct pipe_context *_pipe,
+                     void *_fs)
+{
+   rbug_shader_destroy(rbug_context(_pipe), rbug_shader(_fs));
+}
+
+/**
+ * Create a vertex shader on the wrapped driver context and wrap the driver
+ * object with rbug_shader_create().
+ *
+ * \return the rbug shader wrapper, or NULL when the driver returned NULL
+ */
+static void *
+rbug_create_vs_state(struct pipe_context *_pipe,
+                     const struct pipe_shader_state *state)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   void *driver_shader = driver->create_vs_state(driver, state);
+
+   if (driver_shader)
+      return rbug_shader_create(rb_ctx, state, driver_shader,
+                                RBUG_SHADER_VERTEX);
+
+   return NULL;
+}
+
+/**
+ * Record the wrapped vertex shader as the current one in rb_pipe->curr.vs
+ * and bind the unwrapped shader on the driver context.
+ */
+static void
+rbug_bind_vs_state(struct pipe_context *_pipe,
+                   void *_vs)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   void *driver_vs = rbug_shader_unwrap(_vs);
+
+   rb_ctx->curr.vs = rbug_shader(_vs);
+
+   driver->bind_vs_state(driver, driver_vs);
+}
+
+/**
+ * Delete a wrapped vertex shader by handing it to rbug_shader_destroy().
+ * NOTE(review): the driver's delete hook is not called here; presumably
+ * rbug_shader_destroy() takes care of the driver object -- confirm.
+ */
+static void
+rbug_delete_vs_state(struct pipe_context *_pipe,
+                     void *_vs)
+{
+   rbug_shader_destroy(rbug_context(_pipe), rbug_shader(_vs));
+}
+
+/**
+ * Create a geometry shader on the wrapped driver context and wrap the
+ * driver object with rbug_shader_create().
+ *
+ * \return the rbug shader wrapper, or NULL when the driver returned NULL
+ */
+static void *
+rbug_create_gs_state(struct pipe_context *_pipe,
+                     const struct pipe_shader_state *state)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   void *driver_shader = driver->create_gs_state(driver, state);
+
+   if (driver_shader)
+      return rbug_shader_create(rb_ctx, state, driver_shader,
+                                RBUG_SHADER_GEOM);
+
+   return NULL;
+}
+
+/**
+ * Record the wrapped geometry shader as the current one in
+ * rb_pipe->curr.gs and bind the unwrapped shader on the driver context.
+ */
+static void
+rbug_bind_gs_state(struct pipe_context *_pipe,
+                   void *_gs)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   void *driver_gs = rbug_shader_unwrap(_gs);
+
+   rb_ctx->curr.gs = rbug_shader(_gs);
+
+   driver->bind_gs_state(driver, driver_gs);
+}
+
+/**
+ * Delete a wrapped geometry shader by handing it to rbug_shader_destroy().
+ * NOTE(review): the driver's delete hook is not called here; presumably
+ * rbug_shader_destroy() takes care of the driver object -- confirm.
+ */
+static void
+rbug_delete_gs_state(struct pipe_context *_pipe,
+                     void *_gs)
+{
+   rbug_shader_destroy(rbug_context(_pipe), rbug_shader(_gs));
+}
+
+/**
+ * Forward create_vertex_elements_state to the wrapped driver context; the
+ * driver's state object is returned as is.
+ */
+static void *
+rbug_create_vertex_elements_state(struct pipe_context *_pipe,
+                                  unsigned num_elements,
+                                  const struct pipe_vertex_element *vertex_elements)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   return driver->create_vertex_elements_state(driver, num_elements,
+                                               vertex_elements);
+}
+
+/**
+ * Forward bind_vertex_elements_state to the wrapped driver context.
+ */
+static void
+rbug_bind_vertex_elements_state(struct pipe_context *_pipe,
+                                void *velems)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->bind_vertex_elements_state(driver, velems);
+}
+
+/**
+ * Forward delete_vertex_elements_state to the wrapped driver context.
+ */
+static void
+rbug_delete_vertex_elements_state(struct pipe_context *_pipe,
+                                  void *velems)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->delete_vertex_elements_state(driver, velems);
+}
+
+/**
+ * Forward set_blend_color to the wrapped driver context.
+ */
+static void
+rbug_set_blend_color(struct pipe_context *_pipe,
+                     const struct pipe_blend_color *blend_color)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->set_blend_color(driver, blend_color);
+}
+
+/**
+ * Forward set_stencil_ref to the wrapped driver context.
+ */
+static void
+rbug_set_stencil_ref(struct pipe_context *_pipe,
+                     const struct pipe_stencil_ref *stencil_ref)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->set_stencil_ref(driver, stencil_ref);
+}
+
+/**
+ * Forward set_clip_state to the wrapped driver context.
+ */
+static void
+rbug_set_clip_state(struct pipe_context *_pipe,
+                    const struct pipe_clip_state *clip)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->set_clip_state(driver, clip);
+}
+
+/**
+ * Forward set_constant_buffer to the wrapped driver context.
+ *
+ * A non-NULL buffer is unwrapped to the driver resource first; a NULL
+ * buffer is passed on as NULL.
+ */
+static void
+rbug_set_constant_buffer(struct pipe_context *_pipe,
+                         uint shader,
+                         uint index,
+                         struct pipe_resource *_resource)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+   struct pipe_resource *driver_resource;
+
+   /* XXX hmm? unwrap the input state */
+   if (_resource)
+      driver_resource = rbug_resource_unwrap(_resource);
+   else
+      driver_resource = NULL;
+
+   driver->set_constant_buffer(driver, shader, index, driver_resource);
+}
+
+/**
+ * Set the framebuffer state on the wrapped driver context.
+ *
+ * The incoming state refers to rbug-wrapped surfaces, so a local copy is
+ * made in which the colour buffers and the depth/stencil buffer are
+ * replaced by the unwrapped driver surfaces before forwarding.
+ *
+ * The resources behind the bound colour buffers and the depth/stencil
+ * buffer are also recorded in rb_pipe->curr, because
+ * rbug_draw_block_locked() compares a draw rule's surface against
+ * curr.cbufs[] and curr.zsbuf.
+ *
+ * \param _pipe   rbug context
+ * \param _state  framebuffer state holding wrapped surfaces; when NULL the
+ *                tracked state is cleared and NULL is forwarded
+ */
+static void
+rbug_set_framebuffer_state(struct pipe_context *_pipe,
+                           const struct pipe_framebuffer_state *_state)
+{
+   struct rbug_context *rb_pipe = rbug_context(_pipe);
+   struct pipe_context *pipe = rb_pipe->pipe;
+   struct pipe_framebuffer_state unwrapped_state;
+   struct pipe_framebuffer_state *state = NULL;
+   unsigned i;
+
+   /* forget everything tracked for the previous framebuffer */
+   rb_pipe->curr.nr_cbufs = 0;
+   memset(rb_pipe->curr.cbufs, 0, sizeof(rb_pipe->curr.cbufs));
+   /* the depth/stencil resource must be reset as well, otherwise a stale
+    * pointer would keep matching the draw rule */
+   rb_pipe->curr.zsbuf = NULL;
+
+   /* unwrap the input state */
+   if (_state) {
+      memcpy(&unwrapped_state, _state, sizeof(unwrapped_state));
+
+      rb_pipe->curr.nr_cbufs = _state->nr_cbufs;
+      for(i = 0; i < _state->nr_cbufs; i++) {
+         unwrapped_state.cbufs[i] = rbug_surface_unwrap(_state->cbufs[i]);
+         if (_state->cbufs[i])
+            rb_pipe->curr.cbufs[i] = rbug_resource(_state->cbufs[i]->texture);
+      }
+
+      unwrapped_state.zsbuf = rbug_surface_unwrap(_state->zsbuf);
+      /* track the depth/stencil resource the same way as colour buffers */
+      if (_state->zsbuf)
+         rb_pipe->curr.zsbuf = rbug_resource(_state->zsbuf->texture);
+
+      state = &unwrapped_state;
+   }
+
+   pipe->set_framebuffer_state(pipe,
+                               state);
+}
+
+/**
+ * Forward set_polygon_stipple to the wrapped driver context.
+ */
+static void
+rbug_set_polygon_stipple(struct pipe_context *_pipe,
+                         const struct pipe_poly_stipple *poly_stipple)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->set_polygon_stipple(driver, poly_stipple);
+}
+
+/**
+ * Forward set_scissor_state to the wrapped driver context.
+ */
+static void
+rbug_set_scissor_state(struct pipe_context *_pipe,
+                       const struct pipe_scissor_state *scissor)
+{
+   struct rbug_context *rb_ctx = rbug_context(_pipe);
+   struct pipe_context *driver = rb_ctx->pipe;
+
+   driver->set_scissor_state(driver, scissor);
+}
+
+/**
+ * Pass the viewport state straight through to the wrapped context.
+ */
+static void
+rbug_set_viewport_state(struct pipe_context *_pipe,
+                        const struct pipe_viewport_state *viewport)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+
+   driver->set_viewport_state(driver, viewport);
+}
+
+/**
+ * Wrapper for pipe_context::set_fragment_sampler_views.
+ *
+ * Remembers the bound rbug views and their textures in rb_pipe->curr,
+ * then hands the unwrapped views to the wrapped context.  When _views is
+ * NULL the tracked state is left cleared and NULL is forwarded.
+ */
+static void
+rbug_set_fragment_sampler_views(struct pipe_context *_pipe,
+                                unsigned num,
+                                struct pipe_sampler_view **_views)
+{
+   struct rbug_context *rb_pipe = rbug_context(_pipe);
+   struct pipe_context *driver = rb_pipe->pipe;
+   struct pipe_sampler_view *unwrapped[PIPE_MAX_SAMPLERS];
+   unsigned slot;
+
+   /* start from a clean slate */
+   rb_pipe->curr.num_fs_views = 0;
+   memset(rb_pipe->curr.fs_views, 0, sizeof(rb_pipe->curr.fs_views));
+   memset(rb_pipe->curr.fs_texs, 0, sizeof(rb_pipe->curr.fs_texs));
+   memset(unwrapped, 0, sizeof(unwrapped));
+
+   if (!_views) {
+      driver->set_fragment_sampler_views(driver, num, NULL);
+      return;
+   }
+
+   rb_pipe->curr.num_fs_views = num;
+   for (slot = 0; slot < num; slot++) {
+      struct pipe_sampler_view *view = _views[slot];
+      struct pipe_resource *texture = NULL;
+
+      /* an empty slot has no texture to track */
+      if (view)
+         texture = view->texture;
+
+      rb_pipe->curr.fs_views[slot] = rbug_sampler_view(view);
+      rb_pipe->curr.fs_texs[slot] = rbug_resource(texture);
+      unwrapped[slot] = rbug_sampler_view_unwrap(view);
+   }
+
+   driver->set_fragment_sampler_views(driver, num, unwrapped);
+}
+
+/**
+ * Wrapper for pipe_context::set_vertex_sampler_views.
+ *
+ * Remembers the bound rbug views and their textures in rb_pipe->curr,
+ * then hands the unwrapped views to the wrapped context.
+ *
+ * \param _pipe   the rbug context
+ * \param num     number of entries in _views
+ * \param _views  array of rbug sampler views; the array itself may be
+ *                NULL (NULL is then forwarded) and individual entries
+ *                may be NULL, matching rbug_set_fragment_sampler_views
+ */
+static void
+rbug_set_vertex_sampler_views(struct pipe_context *_pipe,
+                              unsigned num,
+                              struct pipe_sampler_view **_views)
+{
+   struct rbug_context *rb_pipe = rbug_context(_pipe);
+   struct pipe_context *pipe = rb_pipe->pipe;
+   struct pipe_sampler_view *unwrapped_views[PIPE_MAX_VERTEX_SAMPLERS];
+   struct pipe_sampler_view **views = NULL;
+   unsigned i;
+
+   rb_pipe->curr.num_vs_views = 0;
+   memset(rb_pipe->curr.vs_views, 0, sizeof(rb_pipe->curr.vs_views));
+   memset(rb_pipe->curr.vs_texs, 0, sizeof(rb_pipe->curr.vs_texs));
+   memset(unwrapped_views, 0, sizeof(unwrapped_views));
+
+   if (_views) {
+      rb_pipe->curr.num_vs_views = num;
+      for (i = 0; i < num; i++) {
+         rb_pipe->curr.vs_views[i] = rbug_sampler_view(_views[i]);
+         /* an unbound slot is a NULL view: do not dereference it */
+         rb_pipe->curr.vs_texs[i] = rbug_resource(_views[i] ? _views[i]->texture : NULL);
+         unwrapped_views[i] = rbug_sampler_view_unwrap(_views[i]);
+      }
+      views = unwrapped_views;
+   }
+
+   pipe->set_vertex_sampler_views(pipe, num, views);
+}
+
+/**
+ * Wrapper for pipe_context::set_vertex_buffers.
+ *
+ * Copies the caller's buffer descriptions, swaps each wrapped resource
+ * for the underlying one, and forwards the copy.  With num_buffers == 0
+ * a NULL array is forwarded.
+ */
+static void
+rbug_set_vertex_buffers(struct pipe_context *_pipe,
+                        unsigned num_buffers,
+                        const struct pipe_vertex_buffer *_buffers)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+   struct pipe_vertex_buffer unwrapped[PIPE_MAX_SHADER_INPUTS];
+   unsigned slot;
+
+   if (!num_buffers) {
+      driver->set_vertex_buffers(driver, num_buffers, NULL);
+      return;
+   }
+
+   /* bulk copy first, then patch only the resource pointers */
+   memcpy(unwrapped, _buffers, num_buffers * sizeof(*_buffers));
+   for (slot = 0; slot < num_buffers; slot++)
+      unwrapped[slot].buffer = rbug_resource_unwrap(_buffers[slot].buffer);
+
+   driver->set_vertex_buffers(driver, num_buffers, unwrapped);
+}
+
+/**
+ * Pass the sample mask straight through to the wrapped context.
+ */
+static void
+rbug_set_sample_mask(struct pipe_context *_pipe,
+                     unsigned sample_mask)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+
+   driver->set_sample_mask(driver, sample_mask);
+}
+
+/**
+ * Wrapper for pipe_context::resource_copy_region.
+ *
+ * Replaces the wrapped source and destination resources with the
+ * underlying ones and forwards every other argument unchanged.
+ */
+static void
+rbug_resource_copy_region(struct pipe_context *_pipe,
+                          struct pipe_resource *_dst,
+                          struct pipe_subresource subdst,
+                          unsigned dstx,
+                          unsigned dsty,
+                          unsigned dstz,
+                          struct pipe_resource *_src,
+                          struct pipe_subresource subsrc,
+                          unsigned srcx,
+                          unsigned srcy,
+                          unsigned srcz,
+                          unsigned width,
+                          unsigned height)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+   struct pipe_resource *real_dst = rbug_resource(_dst)->resource;
+   struct pipe_resource *real_src = rbug_resource(_src)->resource;
+
+   driver->resource_copy_region(driver,
+                                real_dst, subdst, dstx, dsty, dstz,
+                                real_src, subsrc, srcx, srcy, srcz,
+                                width, height);
+}
+
+/**
+ * Pass a clear request straight through to the wrapped context.
+ */
+static void
+rbug_clear(struct pipe_context *_pipe,
+           unsigned buffers,
+           const float *rgba,
+           double depth,
+           unsigned stencil)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+
+   driver->clear(driver, buffers, rgba, depth, stencil);
+}
+
+/**
+ * Wrapper for pipe_context::clear_render_target: unwraps the destination
+ * surface and forwards the remaining arguments unchanged.
+ */
+static void
+rbug_clear_render_target(struct pipe_context *_pipe,
+                         struct pipe_surface *_dst,
+                         const float *rgba,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+   struct pipe_surface *real_dst = rbug_surface(_dst)->surface;
+
+   driver->clear_render_target(driver, real_dst, rgba,
+                               dstx, dsty, width, height);
+}
+
+/**
+ * Wrapper for pipe_context::clear_depth_stencil: unwraps the destination
+ * surface and forwards the remaining arguments unchanged.
+ */
+static void
+rbug_clear_depth_stencil(struct pipe_context *_pipe,
+                         struct pipe_surface *_dst,
+                         unsigned clear_flags,
+                         double depth,
+                         unsigned stencil,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+   struct pipe_surface *real_dst = rbug_surface(_dst)->surface;
+
+   driver->clear_depth_stencil(driver, real_dst, clear_flags,
+                               depth, stencil,
+                               dstx, dsty, width, height);
+}
+
+/**
+ * Pass a flush request straight through to the wrapped context.
+ */
+static void
+rbug_flush(struct pipe_context *_pipe,
+           unsigned flags,
+           struct pipe_fence_handle **fence)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+
+   driver->flush(driver, flags, fence);
+}
+
+/**
+ * Wrapper for pipe_context::is_resource_referenced: asks the wrapped
+ * context about the underlying resource and returns its answer.
+ */
+static unsigned int
+rbug_is_resource_referenced(struct pipe_context *_pipe,
+                            struct pipe_resource *_resource,
+                            unsigned face,
+                            unsigned level)
+{
+   struct pipe_context *driver = rbug_context(_pipe)->pipe;
+   struct pipe_resource *real_resource = rbug_resource(_resource)->resource;
+
+   return driver->is_resource_referenced(driver, real_resource, face, level);
+}
+
+/**
+ * Wrapper for pipe_context::create_sampler_view.
+ *
+ * Creates the view on the wrapped context using the underlying resource
+ * and, on success, wraps the result via rbug_sampler_view_create().
+ * Returns NULL when the wrapped context fails to create the view.
+ */
+static struct pipe_sampler_view *
+rbug_context_create_sampler_view(struct pipe_context *_pipe,
+                                 struct pipe_resource *_resource,
+                                 const struct pipe_sampler_view *templ)
+{
+   struct rbug_context *rb_pipe = rbug_context(_pipe);
+   struct rbug_resource *rb_resource = rbug_resource(_resource);
+   struct pipe_context *driver = rb_pipe->pipe;
+   struct pipe_sampler_view *real_view;
+
+   real_view = driver->create_sampler_view(driver, rb_resource->resource, templ);
+   if (!real_view)
+      return NULL;
+
+   return rbug_sampler_view_create(rb_pipe, rb_resource, real_view);
+}
+
+/**
+ * Wrapper for pipe_context::sampler_view_destroy: hands the rbug view to
+ * rbug_sampler_view_destroy().
+ */
+static void
+rbug_context_sampler_view_destroy(struct pipe_context *_pipe,
+                                  struct pipe_sampler_view *_view)
+{
+   struct rbug_context *rb_pipe = rbug_context(_pipe);
+   struct rbug_sampler_view *rb_view = rbug_sampler_view(_view);
+
+   rbug_sampler_view_destroy(rb_pipe, rb_view);
+}
+
+/**
+ * Wrapper for pipe_context::get_transfer.
+ *
+ * Requests the transfer from the wrapped context using the underlying
+ * resource and, on success, wraps it via rbug_transfer_create().
+ * Returns NULL when the wrapped context returns no transfer.
+ */
+static struct pipe_transfer *
+rbug_context_get_transfer(struct pipe_context *_context,
+                          struct pipe_resource *_resource,
+                          struct pipe_subresource sr,
+                          unsigned usage,
+                          const struct pipe_box *box)
+{
+   struct rbug_context *rb_pipe = rbug_context(_context);
+   struct rbug_resource *rb_resource = rbug_resource(_resource);
+   struct pipe_context *driver = rb_pipe->pipe;
+   struct pipe_transfer *real_transfer;
+
+   real_transfer = driver->get_transfer(driver, rb_resource->resource,
+                                        sr, usage, box);
+   if (!real_transfer)
+      return NULL;
+
+   return rbug_transfer_create(rb_pipe, rb_resource, real_transfer);
+}
+
+/**
+ * Wrapper for pipe_context::transfer_destroy: hands the rbug transfer to
+ * rbug_transfer_destroy().
+ */
+static void
+rbug_context_transfer_destroy(struct pipe_context *_pipe,
+                              struct pipe_transfer *_transfer)
+{
+   struct rbug_context *rb_pipe = rbug_context(_pipe);
+   struct rbug_transfer *rb_transfer = rbug_transfer(_transfer);
+
+   rbug_transfer_destroy(rb_pipe, rb_transfer);
+}
+
+/**
+ * Wrapper for pipe_context::transfer_map: maps the underlying transfer on
+ * the wrapped context and returns whatever pointer it yields.
+ */
+static void *
+rbug_context_transfer_map(struct pipe_context *_context,
+                          struct pipe_transfer *_transfer)
+{
+   struct pipe_context *driver = rbug_context(_context)->pipe;
+   struct pipe_transfer *real_transfer = rbug_transfer(_transfer)->transfer;
+
+   return driver->transfer_map(driver, real_transfer);
+}
+
+
+
+/**
+ * Wrapper for pipe_context::transfer_flush_region: forwards the box for
+ * the underlying transfer to the wrapped context.
+ */
+static void
+rbug_context_transfer_flush_region(struct pipe_context *_context,
+                                   struct pipe_transfer *_transfer,
+                                   const struct pipe_box *box)
+{
+   struct pipe_context *driver = rbug_context(_context)->pipe;
+   struct pipe_transfer *real_transfer = rbug_transfer(_transfer)->transfer;
+
+   driver->transfer_flush_region(driver, real_transfer, box);
+}
+
+
+/**
+ * Wrapper for pipe_context::transfer_unmap: unmaps the underlying transfer
+ * on the wrapped context.
+ */
+static void
+rbug_context_transfer_unmap(struct pipe_context *_context,
+                            struct pipe_transfer *_transfer)
+{
+   struct pipe_context *driver = rbug_context(_context)->pipe;
+   struct pipe_transfer *real_transfer = rbug_transfer(_transfer)->transfer;
+
+   driver->transfer_unmap(driver, real_transfer);
+}
+
+
+/**
+ * Wrapper for pipe_context::transfer_inline_write: substitutes the
+ * underlying resource and forwards every other argument unchanged.
+ */
+static void
+rbug_context_transfer_inline_write(struct pipe_context *_context,
+                                   struct pipe_resource *_resource,
+                                   struct pipe_subresource sr,
+                                   unsigned usage,
+                                   const struct pipe_box *box,
+                                   const void *data,
+                                   unsigned stride,
+                                   unsigned slice_stride)
+{
+   struct pipe_context *driver = rbug_context(_context)->pipe;
+   struct pipe_resource *real_resource = rbug_resource(_resource)->resource;
+
+   driver->transfer_inline_write(driver, real_resource, sr, usage,
+                                 box, data, stride, slice_stride);
+}
+
+
+/**
+ * Create an rbug context that wraps an existing driver context.
+ *
+ * Allocates a struct rbug_context, initialises its mutexes, condition
+ * variable and shader list, fills the pipe_context vtable with the rbug_*
+ * wrapper functions, and registers the new context in the screen's
+ * context list.
+ *
+ * \param _screen  the rbug screen the context belongs to
+ * \param pipe     the driver context to wrap
+ * \return the new context's base pipe_context, or NULL if rbug_screen()
+ *         yields NULL for _screen or the allocation fails
+ */
+struct pipe_context *
+rbug_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
+{
+   struct rbug_context *rb_pipe;
+   struct rbug_screen *rb_screen = rbug_screen(_screen);
+
+   if (!rb_screen)
+      return NULL;
+
+   rb_pipe = CALLOC_STRUCT(rbug_context);
+   if (!rb_pipe)
+      return NULL;
+
+   /* synchronisation primitives and the per-context shader list */
+   pipe_mutex_init(rb_pipe->draw_mutex);
+   pipe_condvar_init(rb_pipe->draw_cond);
+   pipe_mutex_init(rb_pipe->call_mutex);
+   pipe_mutex_init(rb_pipe->list_mutex);
+   make_empty_list(&rb_pipe->shaders);
+
+   rb_pipe->base.winsys = NULL;
+   rb_pipe->base.screen = _screen;
+   rb_pipe->base.priv = pipe->priv; /* expose wrapped data */
+   rb_pipe->base.draw = NULL;
+
+   /* route every pipe_context entry point through its rbug wrapper */
+   rb_pipe->base.destroy = rbug_destroy;
+   rb_pipe->base.draw_arrays = rbug_draw_arrays;
+   rb_pipe->base.draw_elements = rbug_draw_elements;
+   rb_pipe->base.draw_range_elements = rbug_draw_range_elements;
+   rb_pipe->base.create_query = rbug_create_query;
+   rb_pipe->base.destroy_query = rbug_destroy_query;
+   rb_pipe->base.begin_query = rbug_begin_query;
+   rb_pipe->base.end_query = rbug_end_query;
+   rb_pipe->base.get_query_result = rbug_get_query_result;
+   rb_pipe->base.create_blend_state = rbug_create_blend_state;
+   rb_pipe->base.bind_blend_state = rbug_bind_blend_state;
+   rb_pipe->base.delete_blend_state = rbug_delete_blend_state;
+   rb_pipe->base.create_sampler_state = rbug_create_sampler_state;
+   rb_pipe->base.bind_fragment_sampler_states = rbug_bind_fragment_sampler_states;
+   rb_pipe->base.bind_vertex_sampler_states = rbug_bind_vertex_sampler_states;
+   rb_pipe->base.delete_sampler_state = rbug_delete_sampler_state;
+   rb_pipe->base.create_rasterizer_state = rbug_create_rasterizer_state;
+   rb_pipe->base.bind_rasterizer_state = rbug_bind_rasterizer_state;
+   rb_pipe->base.delete_rasterizer_state = rbug_delete_rasterizer_state;
+   rb_pipe->base.create_depth_stencil_alpha_state = rbug_create_depth_stencil_alpha_state;
+   rb_pipe->base.bind_depth_stencil_alpha_state = rbug_bind_depth_stencil_alpha_state;
+   rb_pipe->base.delete_depth_stencil_alpha_state = rbug_delete_depth_stencil_alpha_state;
+   rb_pipe->base.create_fs_state = rbug_create_fs_state;
+   rb_pipe->base.bind_fs_state = rbug_bind_fs_state;
+   rb_pipe->base.delete_fs_state = rbug_delete_fs_state;
+   rb_pipe->base.create_vs_state = rbug_create_vs_state;
+   rb_pipe->base.bind_vs_state = rbug_bind_vs_state;
+   rb_pipe->base.delete_vs_state = rbug_delete_vs_state;
+   rb_pipe->base.create_gs_state = rbug_create_gs_state;
+   rb_pipe->base.bind_gs_state = rbug_bind_gs_state;
+   rb_pipe->base.delete_gs_state = rbug_delete_gs_state;
+   rb_pipe->base.create_vertex_elements_state = rbug_create_vertex_elements_state;
+   rb_pipe->base.bind_vertex_elements_state = rbug_bind_vertex_elements_state;
+   rb_pipe->base.delete_vertex_elements_state = rbug_delete_vertex_elements_state;
+   rb_pipe->base.set_blend_color = rbug_set_blend_color;
+   rb_pipe->base.set_stencil_ref = rbug_set_stencil_ref;
+   rb_pipe->base.set_clip_state = rbug_set_clip_state;
+   rb_pipe->base.set_constant_buffer = rbug_set_constant_buffer;
+   rb_pipe->base.set_framebuffer_state = rbug_set_framebuffer_state;
+   rb_pipe->base.set_polygon_stipple = rbug_set_polygon_stipple;
+   rb_pipe->base.set_scissor_state = rbug_set_scissor_state;
+   rb_pipe->base.set_viewport_state = rbug_set_viewport_state;
+   rb_pipe->base.set_fragment_sampler_views = rbug_set_fragment_sampler_views;
+   rb_pipe->base.set_vertex_sampler_views = rbug_set_vertex_sampler_views;
+   rb_pipe->base.set_vertex_buffers = rbug_set_vertex_buffers;
+   rb_pipe->base.set_sample_mask = rbug_set_sample_mask;
+   rb_pipe->base.resource_copy_region = rbug_resource_copy_region;
+   rb_pipe->base.clear = rbug_clear;
+   rb_pipe->base.clear_render_target = rbug_clear_render_target;
+   rb_pipe->base.clear_depth_stencil = rbug_clear_depth_stencil;
+   rb_pipe->base.flush = rbug_flush;
+   rb_pipe->base.is_resource_referenced = rbug_is_resource_referenced;
+   rb_pipe->base.create_sampler_view = rbug_context_create_sampler_view;
+   rb_pipe->base.sampler_view_destroy = rbug_context_sampler_view_destroy;
+   rb_pipe->base.get_transfer = rbug_context_get_transfer;
+   rb_pipe->base.transfer_destroy = rbug_context_transfer_destroy;
+   rb_pipe->base.transfer_map = rbug_context_transfer_map;
+   rb_pipe->base.transfer_unmap = rbug_context_transfer_unmap;
+   rb_pipe->base.transfer_flush_region = rbug_context_transfer_flush_region;
+   rb_pipe->base.transfer_inline_write = rbug_context_transfer_inline_write;
+
+   /* the wrapped driver context every wrapper forwards to */
+   rb_pipe->pipe = pipe;
+
+   /* make the context discoverable through the screen's contexts list */
+   rbug_screen_add_to_list(rb_screen, contexts, rb_pipe);
+
+   return &rb_pipe->base;
+}
diff --git a/src/gallium/drivers/rbug/rbug_context.h b/src/gallium/drivers/rbug/rbug_context.h
new file mode 100644
index 0000000000..80c803da83
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_context.h
@@ -0,0 +1,111 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef RBUG_CONTEXT_H
+#define RBUG_CONTEXT_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+
+#include "rbug_screen.h"
+
+
+/**
+ * A pipe_context that wraps another pipe_context so the remote debugger
+ * can observe and control it.
+ *
+ * base must stay the first member: rbug_context() converts a
+ * pipe_context pointer to this type with a plain cast.
+ */
+struct rbug_context {
+   struct pipe_context base;  /**< base class */
+
+   struct pipe_context *pipe; /**< the wrapped driver context */
+
+   struct rbug_list list;     /**< link in the screen's contexts list */
+
+   /* call locking */
+   pipe_mutex call_mutex;
+
+   /* current state */
+   struct {
+      struct rbug_shader *fs;
+      struct rbug_shader *vs;
+      struct rbug_shader *gs;
+
+      /* fragment sampler views and the textures behind them */
+      struct rbug_sampler_view *fs_views[PIPE_MAX_SAMPLERS];
+      struct rbug_resource *fs_texs[PIPE_MAX_SAMPLERS];
+      unsigned num_fs_views;
+
+      /* vertex sampler views and the textures behind them */
+      struct rbug_sampler_view *vs_views[PIPE_MAX_VERTEX_SAMPLERS];
+      struct rbug_resource *vs_texs[PIPE_MAX_VERTEX_SAMPLERS];
+      unsigned num_vs_views;
+
+      /* resources behind the bound framebuffer surfaces */
+      unsigned nr_cbufs;
+      struct rbug_resource *cbufs[PIPE_MAX_COLOR_BUFS];
+      struct rbug_resource *zsbuf;
+   } curr;
+
+   /* draw locking */
+   pipe_mutex draw_mutex;
+   pipe_condvar draw_cond;
+   unsigned draw_num_rules;
+   int draw_blocker;   /**< block flags requested by the debugger */
+   int draw_blocked;   /**< block flags currently in effect; cleared by draw-step requests */
+
+   /* NOTE(review): presumably the state a draw must match to trigger a
+    * rule-based block -- confirm against the draw path */
+   struct {
+      struct rbug_shader *fs;
+      struct rbug_shader *vs;
+
+      struct rbug_resource *texture;
+      struct rbug_resource *surf;
+
+      int blocker;
+   } draw_rule;
+
+   /* list of state objects */
+   pipe_mutex list_mutex;
+   unsigned num_shaders;
+   struct rbug_list shaders;
+};
+
+/**
+ * Downcast a pipe_context pointer to the rbug_context that contains it.
+ * This is a plain cast; it performs no checking of its argument.
+ */
+static INLINE struct rbug_context *
+rbug_context(struct pipe_context *pipe)
+{
+   struct rbug_context *rb_pipe = (struct rbug_context *)pipe;
+
+   return rb_pipe;
+}
+
+
+/**********************************************************
+ * rbug_context.c
+ */
+
+/**
+ * Wrap the driver context pipe in a new rbug context belonging to
+ * screen.  Returns the wrapper's pipe_context, or NULL on failure.
+ */
+struct pipe_context *
+rbug_context_create(struct pipe_screen *screen, struct pipe_context *pipe);
+
+
+/**********************************************************
+ * rbug_core.c
+ */
+
+/* NOTE(review): presumably tells the connected debugger that draws on
+ * rb_context are blocked -- confirm against the definition in rbug_core.c */
+void rbug_notify_draw_blocked(struct rbug_context *rb_context);
+
+
+#endif /* RBUG_CONTEXT_H */
diff --git a/src/gallium/drivers/rbug/rbug_core.c b/src/gallium/drivers/rbug/rbug_core.c
new file mode 100644
index 0000000000..f1aab3869b
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_core.c
@@ -0,0 +1,890 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "os/os_thread.h"
+#include "util/u_format.h"
+#include "util/u_string.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_network.h"
+#include "os/os_time.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "rbug_context.h"
+#include "rbug_objects.h"
+
+#include "rbug/rbug.h"
+
+#include <errno.h>
+
+/*
+ * Convert between pointers and the 64-bit handles (rbug_texture_t,
+ * rbug_context_t, ...) exchanged with the debugger.  Going through
+ * uintptr_t keeps every pointer bit even where unsigned long is
+ * narrower than a pointer.
+ */
+#define U642VOID(x) ((void *)(uintptr_t)(x))
+#define VOID2U64(x) ((uint64_t)(uintptr_t)(x))
+
+/*
+ * Recover a pointer to the enclosing struct from a pointer to one of its
+ * fields.  The argument and the whole expansion are parenthesised so the
+ * macro composes safely with surrounding operators such as "->".
+ */
+#define container_of(ptr, type, field) \
+   ((type *)((char *)(ptr) - offsetof(type, field)))
+
+/**
+ * State of one remote-debugger session attached to a screen.
+ */
+struct rbug_rbug
+{
+   struct rbug_screen *rb_screen;   /**< screen whose objects the request handlers inspect */
+   struct rbug_connection *con;     /**< connection the replies are sent on */
+   pipe_thread thread;              /* NOTE(review): presumably the thread running rbug_thread -- confirm */
+   boolean running;                 /* NOTE(review): presumably the thread's keep-running flag -- confirm */
+};
+
+PIPE_THREAD_ROUTINE(rbug_thread, void_rbug);
+
+
+/**********************************************************
+ * Helper functions
+ */
+
+
+/**
+ * Look up the context whose address, taken as a 64-bit handle, equals ctx.
+ *
+ * Walks rb_screen->contexts; the "_locked" suffix suggests the caller
+ * must already hold the screen's list_mutex, as every visible caller does.
+ * Returns NULL when no context matches.
+ */
+static struct rbug_context *
+rbug_get_context_locked(struct rbug_screen *rb_screen, rbug_context_t ctx)
+{
+   struct rbug_list *node;
+
+   foreach(node, &rb_screen->contexts) {
+      struct rbug_context *candidate =
+         container_of(node, struct rbug_context, list);
+
+      if (VOID2U64(candidate) == ctx)
+         return candidate;
+   }
+
+   return NULL;
+}
+
+/**
+ * Look up the shader whose address, taken as a 64-bit handle, equals shdr.
+ *
+ * Walks rb_context->shaders and returns NULL when no shader matches.
+ */
+static struct rbug_shader *
+rbug_get_shader_locked(struct rbug_context *rb_context, rbug_shader_t shdr)
+{
+   struct rbug_list *node;
+
+   foreach(node, &rb_context->shaders) {
+      struct rbug_shader *candidate =
+         container_of(node, struct rbug_shader, list);
+
+      if (VOID2U64(candidate) == shdr)
+         return candidate;
+   }
+
+   return NULL;
+}
+
+/**
+ * Create a driver shader state from tokens, choosing the create hook that
+ * matches rb_shader->type.
+ *
+ * Returns the state produced by the driver; an unknown type trips
+ * assert(0) and yields NULL.
+ */
+static void *
+rbug_shader_create_locked(struct pipe_context *pipe,
+                          struct rbug_shader *rb_shader,
+                          struct tgsi_token *tokens)
+{
+   struct pipe_shader_state pss = { 0 };
+   void *created = NULL;
+
+   pss.tokens = tokens;
+
+   if (rb_shader->type == RBUG_SHADER_FRAGMENT)
+      created = pipe->create_fs_state(pipe, &pss);
+   else if (rb_shader->type == RBUG_SHADER_VERTEX)
+      created = pipe->create_vs_state(pipe, &pss);
+   else if (rb_shader->type == RBUG_SHADER_GEOM)
+      created = pipe->create_gs_state(pipe, &pss);
+   else
+      assert(0);
+
+   return created;
+}
+
+/**
+ * Bind state on pipe through the bind hook that matches rb_shader->type.
+ * An unknown type trips assert(0) and binds nothing.
+ */
+static void
+rbug_shader_bind_locked(struct pipe_context *pipe,
+                        struct rbug_shader *rb_shader,
+                        void *state)
+{
+   if (rb_shader->type == RBUG_SHADER_FRAGMENT)
+      pipe->bind_fs_state(pipe, state);
+   else if (rb_shader->type == RBUG_SHADER_VERTEX)
+      pipe->bind_vs_state(pipe, state);
+   else if (rb_shader->type == RBUG_SHADER_GEOM)
+      pipe->bind_gs_state(pipe, state);
+   else
+      assert(0);
+}
+
+/**
+ * Delete state on pipe through the delete hook that matches
+ * rb_shader->type.  An unknown type trips assert(0) and deletes nothing.
+ */
+static void
+rbug_shader_delete_locked(struct pipe_context *pipe,
+                          struct rbug_shader *rb_shader,
+                          void *state)
+{
+   if (rb_shader->type == RBUG_SHADER_FRAGMENT)
+      pipe->delete_fs_state(pipe, state);
+   else if (rb_shader->type == RBUG_SHADER_VERTEX)
+      pipe->delete_vs_state(pipe, state);
+   else if (rb_shader->type == RBUG_SHADER_GEOM)
+      pipe->delete_gs_state(pipe, state);
+   else
+      assert(0);
+}
+
+/************************************************
+ * Request handler functions
+ */
+
+
+/**
+ * Handle a texture-list request: reply with one handle per resource
+ * currently registered on the screen.
+ *
+ * \param tr_rbug  debugger session (screen and connection)
+ * \param header   request header; unused by this handler
+ * \param serial   serial number echoed in the reply
+ * \return 0 on success, -ENOMEM if the handle array cannot be allocated
+ */
+static int
+rbug_texture_list(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_resource *tr_tex = NULL;
+   struct rbug_list *ptr;
+   rbug_texture_t *texs;
+   int i = 0;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   texs = MALLOC(rb_screen->num_resources * sizeof(rbug_texture_t));
+   /* a NULL result is only an error when there was something to allocate */
+   if (!texs && rb_screen->num_resources) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ENOMEM;
+   }
+   foreach(ptr, &rb_screen->resources) {
+      tr_tex = container_of(ptr, struct rbug_resource, list);
+      texs[i++] = VOID2U64(tr_tex);
+   }
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   /* the handles were snapshotted under the lock; reply outside it */
+   rbug_send_texture_list_reply(tr_rbug->con, serial, texs, i, NULL);
+   FREE(texs);
+
+   return 0;
+}
+
+/**
+ * Handle a texture-info request: find the resource named by the request
+ * handle and reply with its target, format, dimensions, block layout,
+ * last mip level, sample count and bind flags.
+ *
+ * Returns 0 on success or -ESRCH when the handle matches no resource on
+ * the screen.  The screen's list_mutex is held until the reply is sent.
+ */
+static int
+rbug_texture_info(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_texture_info *gpti = (struct rbug_proto_texture_info *)header;
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_resource *found = NULL;
+   struct rbug_list *node;
+   struct pipe_resource *t;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+
+   foreach(node, &rb_screen->resources) {
+      struct rbug_resource *candidate =
+         container_of(node, struct rbug_resource, list);
+
+      if (gpti->texture == VOID2U64(candidate)) {
+         found = candidate;
+         break;
+      }
+   }
+
+   if (!found) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   t = found->resource;
+   rbug_send_texture_info_reply(tr_rbug->con, serial,
+                               t->target, t->format,
+                               &t->width0, 1,
+                               &t->height0, 1,
+                               &t->depth0, 1,
+                               util_format_get_blockwidth(t->format),
+                               util_format_get_blockheight(t->format),
+                               util_format_get_blocksize(t->format),
+                               t->last_level,
+                               t->nr_samples,
+                               t->bind,
+                               NULL);
+
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a texture-read request: map the requested region of a resource
+ * through the screen's private context and send its contents back.
+ *
+ * \param tr_rbug  debugger session (screen and connection)
+ * \param header   request, interpreted as struct rbug_proto_texture_read
+ * \param serial   serial number echoed in the reply
+ * \return 0 on success, -ESRCH if the handle matches no resource,
+ *         -ENOMEM if no transfer could be created, -EIO if the transfer
+ *         could not be mapped
+ */
+static int
+rbug_texture_read(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_texture_read *gptr = (struct rbug_proto_texture_read *)header;
+
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_resource *tr_tex = NULL;
+   struct rbug_list *ptr;
+
+   struct pipe_context *context = rb_screen->private_context;
+   struct pipe_resource *tex;
+   struct pipe_transfer *t;
+
+   void *map;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   foreach(ptr, &rb_screen->resources) {
+      tr_tex = container_of(ptr, struct rbug_resource, list);
+      if (gptr->texture == VOID2U64(tr_tex))
+         break;
+      tr_tex = NULL;
+   }
+
+   if (!tr_tex) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   tex = tr_tex->resource;
+   t = pipe_get_transfer(context, tex,
+                         gptr->face, gptr->level, gptr->zslice,
+                         PIPE_TRANSFER_READ,
+                         gptr->x, gptr->y, gptr->w, gptr->h);
+   /* the driver may refuse the transfer; do not dereference NULL below */
+   if (!t) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ENOMEM;
+   }
+
+   map = context->transfer_map(context, t);
+   if (!map) {
+      /* nothing to send, but the transfer must still be released */
+      context->transfer_destroy(context, t);
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -EIO;
+   }
+
+   rbug_send_texture_read_reply(tr_rbug->con, serial,
+                                t->resource->format,
+                                util_format_get_blockwidth(t->resource->format),
+                                util_format_get_blockheight(t->resource->format),
+                                util_format_get_blocksize(t->resource->format),
+                                (uint8_t*)map,
+                                t->stride * util_format_get_nblocksy(t->resource->format,
+                                                                     t->box.height),
+                                t->stride,
+                                NULL);
+
+   context->transfer_unmap(context, t);
+   context->transfer_destroy(context, t);
+
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a context-list request: reply with one handle per context
+ * currently registered on the screen.
+ *
+ * \param tr_rbug  debugger session (screen and connection)
+ * \param header   request header; unused by this handler
+ * \param serial   serial number echoed in the reply
+ * \return 0 on success, -ENOMEM if the handle array cannot be allocated
+ */
+static int
+rbug_context_list(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_list *ptr;
+   struct rbug_context *rb_context = NULL;
+   rbug_context_t *ctxs;
+   int i = 0;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   ctxs = MALLOC(rb_screen->num_contexts * sizeof(rbug_context_t));
+   /* a NULL result is only an error when there was something to allocate */
+   if (!ctxs && rb_screen->num_contexts) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ENOMEM;
+   }
+   foreach(ptr, &rb_screen->contexts) {
+      rb_context = container_of(ptr, struct rbug_context, list);
+      ctxs[i++] = VOID2U64(rb_context);
+   }
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   /* the handles were snapshotted under the lock; reply outside it */
+   rbug_send_context_list_reply(tr_rbug->con, serial, ctxs, i, NULL);
+   FREE(ctxs);
+
+   return 0;
+}
+
+/**
+ * Handle a context-info request: reply with the context's current vertex
+ * and fragment shaders, fragment textures, colour buffers, depth/stencil
+ * buffer and draw-block flags.
+ *
+ * Returns 0 on success or -ESRCH when the handle matches no context.
+ * Locks are taken in the order list_mutex, draw_mutex, call_mutex and
+ * released in reverse.
+ */
+static int
+rbug_context_info(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_info *info = (struct rbug_proto_context_info *)header;
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_context *rb_context;
+   rbug_texture_t cbufs[PIPE_MAX_COLOR_BUFS];
+   rbug_texture_t texs[PIPE_MAX_SAMPLERS];
+   unsigned slot;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+
+   rb_context = rbug_get_context_locked(rb_screen, info->context);
+   if (rb_context == NULL) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   /* protect the pipe context */
+   pipe_mutex_lock(rb_context->draw_mutex);
+   pipe_mutex_lock(rb_context->call_mutex);
+
+   /* snapshot the tracked bindings as 64-bit handles */
+   for (slot = 0; slot < rb_context->curr.nr_cbufs; slot++)
+      cbufs[slot] = VOID2U64(rb_context->curr.cbufs[slot]);
+
+   for (slot = 0; slot < rb_context->curr.num_fs_views; slot++)
+      texs[slot] = VOID2U64(rb_context->curr.fs_texs[slot]);
+
+   rbug_send_context_info_reply(tr_rbug->con, serial,
+                                VOID2U64(rb_context->curr.vs),
+                                VOID2U64(rb_context->curr.fs),
+                                texs, rb_context->curr.num_fs_views,
+                                cbufs, rb_context->curr.nr_cbufs,
+                                VOID2U64(rb_context->curr.zsbuf),
+                                rb_context->draw_blocker,
+                                rb_context->draw_blocked,
+                                NULL);
+
+   pipe_mutex_unlock(rb_context->call_mutex);
+   pipe_mutex_unlock(rb_context->draw_mutex);
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a draw-block request: OR the requested bits into the
+ * context's draw_blocker mask.
+ *
+ * No reply is sent on success, so `serial` is unused here.
+ * Returns 0 on success or -ESRCH when the context handle is unknown.
+ */
+static int
+rbug_context_draw_block(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_block *request = (struct rbug_proto_context_draw_block *)header;
+   struct rbug_screen *screen = tr_rbug->rb_screen;
+   struct rbug_context *ctx;
+   int status = -ESRCH;
+
+   pipe_mutex_lock(screen->list_mutex);
+
+   ctx = rbug_get_context_locked(screen, request->context);
+   if (ctx != NULL) {
+      pipe_mutex_lock(ctx->draw_mutex);
+      ctx->draw_blocker |= request->block;
+      pipe_mutex_unlock(ctx->draw_mutex);
+      status = 0;
+   }
+
+   pipe_mutex_unlock(screen->list_mutex);
+
+   return status;
+}
+
+/**
+ * Handle a draw-step request: clear bits in the context's draw_blocked
+ * mask, then (when condition variables are available) wake any waiters
+ * on draw_cond.
+ *
+ * draw_blocker is left untouched by this handler.
+ * No reply is sent on success, so `serial` is unused here.
+ * Returns 0 on success or -ESRCH when the context handle is unknown.
+ */
+static int
+rbug_context_draw_step(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_step *step = (struct rbug_proto_context_draw_step *)header;
+
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_context *rb_context = NULL;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   rb_context = rbug_get_context_locked(rb_screen, step->context);
+
+   if (!rb_context) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(rb_context->draw_mutex);
+   if (rb_context->draw_blocked & RBUG_BLOCK_RULE) {
+      /* currently blocked by a rule: only a step that names
+       * RBUG_BLOCK_RULE releases it, and it clears the whole mask */
+      if (step->step & RBUG_BLOCK_RULE)
+         rb_context->draw_blocked &= ~RBUG_BLOCK_MASK;
+   } else {
+      /* otherwise clear exactly the bits requested */
+      rb_context->draw_blocked &= ~step->step;
+   }
+   pipe_mutex_unlock(rb_context->draw_mutex);
+
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+   /* signalled after draw_mutex is dropped, still under the screen list lock */
+   pipe_condvar_broadcast(rb_context->draw_cond);
+#endif
+
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a draw-unblock request: clear bits in draw_blocked exactly as
+ * a draw-step does, and additionally clear the requested bits from
+ * draw_blocker. Waiters on draw_cond are then woken when condition
+ * variables are available.
+ *
+ * No reply is sent on success, so `serial` is unused here.
+ * Returns 0 on success or -ESRCH when the context handle is unknown.
+ */
+static int
+rbug_context_draw_unblock(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_unblock *unblock = (struct rbug_proto_context_draw_unblock *)header;
+
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_context *rb_context = NULL;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   rb_context = rbug_get_context_locked(rb_screen, unblock->context);
+
+   if (!rb_context) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(rb_context->draw_mutex);
+   if (rb_context->draw_blocked & RBUG_BLOCK_RULE) {
+      /* blocked by a rule: released only when the request names
+       * RBUG_BLOCK_RULE, in which case the whole mask is cleared */
+      if (unblock->unblock & RBUG_BLOCK_RULE)
+         rb_context->draw_blocked &= ~RBUG_BLOCK_MASK;
+   } else {
+      rb_context->draw_blocked &= ~unblock->unblock;
+   }
+   /* unlike a step, also remove the bits from the blocker mask itself */
+   rb_context->draw_blocker &= ~unblock->unblock;
+   pipe_mutex_unlock(rb_context->draw_mutex);
+
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+   pipe_condvar_broadcast(rb_context->draw_cond);
+#endif
+
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a draw-rule request: store the rule (vertex shader, fragment
+ * shader, texture, surface and blocker value) on the context and set
+ * RBUG_BLOCK_RULE in its draw_blocker mask.
+ *
+ * No reply is sent on success, so `serial` is unused here.
+ * Returns 0 on success or -ESRCH when the context handle is unknown.
+ */
+static int
+rbug_context_draw_rule(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_draw_rule *request = (struct rbug_proto_context_draw_rule *)header;
+   struct rbug_screen *screen = tr_rbug->rb_screen;
+   struct rbug_context *ctx;
+
+   pipe_mutex_lock(screen->list_mutex);
+
+   ctx = rbug_get_context_locked(screen, request->context);
+   if (ctx == NULL) {
+      pipe_mutex_unlock(screen->list_mutex);
+      return -ESRCH;
+   }
+
+   /* install the rule atomically with respect to drawing */
+   pipe_mutex_lock(ctx->draw_mutex);
+   ctx->draw_rule.vs = U642VOID(request->vertex);
+   ctx->draw_rule.fs = U642VOID(request->fragment);
+   ctx->draw_rule.texture = U642VOID(request->texture);
+   ctx->draw_rule.surf = U642VOID(request->surface);
+   ctx->draw_rule.blocker = request->block;
+   ctx->draw_blocker |= RBUG_BLOCK_RULE;
+   pipe_mutex_unlock(ctx->draw_mutex);
+
+#ifdef PIPE_THREAD_HAVE_CONDVAR
+   pipe_condvar_broadcast(ctx->draw_cond);
+#endif
+
+   pipe_mutex_unlock(screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a flush request: call the wrapped pipe context's flush() with
+ * the flags supplied by the client and no fence output.
+ *
+ * No reply is sent on success, so `serial` is unused here.
+ * Returns 0 on success or -ESRCH when the context handle is unknown.
+ */
+static int
+rbug_context_flush(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_context_flush *request = (struct rbug_proto_context_flush *)header;
+   struct rbug_screen *screen = tr_rbug->rb_screen;
+   struct rbug_context *ctx;
+   struct pipe_context *pipe;
+
+   pipe_mutex_lock(screen->list_mutex);
+
+   ctx = rbug_get_context_locked(screen, request->context);
+   if (ctx == NULL) {
+      pipe_mutex_unlock(screen->list_mutex);
+      return -ESRCH;
+   }
+
+   /* call_mutex guards calls into the wrapped pipe context */
+   pipe_mutex_lock(ctx->call_mutex);
+   pipe = ctx->pipe;
+   pipe->flush(pipe, request->flags, NULL);
+   pipe_mutex_unlock(ctx->call_mutex);
+
+   pipe_mutex_unlock(screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a shader-list request: reply with the handles of every shader
+ * on the requested context's shader list.
+ *
+ * The handle array is built while both the screen and the context list
+ * mutexes are held; the reply itself is sent after they are released.
+ *
+ * Returns 0 on success, -ESRCH when the context handle is unknown, or
+ * -ENOMEM when the temporary handle array cannot be allocated.
+ */
+static int
+rbug_shader_list(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_shader_list *list = (struct rbug_proto_shader_list *)header;
+
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_context *rb_context = NULL;
+   struct rbug_shader *tr_shdr = NULL;
+   struct rbug_list *ptr;
+   rbug_shader_t *shdrs;
+   int i = 0;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   rb_context = rbug_get_context_locked(rb_screen, list->context);
+
+   if (!rb_context) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(rb_context->list_mutex);
+   shdrs = MALLOC(rb_context->num_shaders * sizeof(rbug_shader_t));
+
+   /* A NULL result is only an error when there is something to store;
+    * an empty list may legitimately yield NULL from a zero-size request. */
+   if (!shdrs && rb_context->num_shaders) {
+      pipe_mutex_unlock(rb_context->list_mutex);
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ENOMEM;
+   }
+
+   foreach(ptr, &rb_context->shaders) {
+      tr_shdr = container_of(ptr, struct rbug_shader, list);
+      shdrs[i++] = VOID2U64(tr_shdr);
+   }
+
+   pipe_mutex_unlock(rb_context->list_mutex);
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   rbug_send_shader_list_reply(tr_rbug->con, serial, shdrs, i, NULL);
+   FREE(shdrs);
+
+   return 0;
+}
+
+/**
+ * Handle a shader-info request: reply with the shader's original token
+ * stream, its replacement token stream (length 0 when there is none)
+ * and its disabled flag.
+ *
+ * Returns 0 on success or -ESRCH when either the context or the shader
+ * handle is unknown.
+ */
+static int
+rbug_shader_info(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   struct rbug_proto_shader_info *request = (struct rbug_proto_shader_info *)header;
+   struct rbug_screen *screen = tr_rbug->rb_screen;
+   struct rbug_context *ctx;
+   struct rbug_shader *shader;
+   unsigned original_len;
+   unsigned replaced_len;
+
+   pipe_mutex_lock(screen->list_mutex);
+
+   ctx = rbug_get_context_locked(screen, request->context);
+   if (ctx == NULL) {
+      pipe_mutex_unlock(screen->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(ctx->list_mutex);
+
+   shader = rbug_get_shader_locked(ctx, request->shader);
+   if (shader == NULL) {
+      pipe_mutex_unlock(ctx->list_mutex);
+      pipe_mutex_unlock(screen->list_mutex);
+      return -ESRCH;
+   }
+
+   /* tokens are sent as uint32_t words, so the sizes must agree */
+   assert(sizeof(struct tgsi_token) == 4);
+
+   original_len = tgsi_num_tokens(shader->tokens);
+   replaced_len = shader->replaced_tokens ?
+                  tgsi_num_tokens(shader->replaced_tokens) : 0;
+
+   rbug_send_shader_info_reply(tr_rbug->con, serial,
+                               (uint32_t*)shader->tokens, original_len,
+                               (uint32_t*)shader->replaced_tokens, replaced_len,
+                               shader->disabled,
+                               NULL);
+
+   pipe_mutex_unlock(ctx->list_mutex);
+   pipe_mutex_unlock(screen->list_mutex);
+
+   return 0;
+}
+
+/**
+ * Handle a shader-disable request: copy the client's `disable` value
+ * into the shader's `disabled` flag.
+ *
+ * Returns 0 on success or -ESRCH when either the context or the shader
+ * handle is unknown.
+ */
+static int
+rbug_shader_disable(struct rbug_rbug *tr_rbug, struct rbug_header *header)
+{
+   struct rbug_proto_shader_disable *request = (struct rbug_proto_shader_disable *)header;
+   struct rbug_screen *screen = tr_rbug->rb_screen;
+   struct rbug_context *ctx;
+   struct rbug_shader *shader;
+   int status = -ESRCH;
+
+   pipe_mutex_lock(screen->list_mutex);
+
+   ctx = rbug_get_context_locked(screen, request->context);
+   if (ctx == NULL) {
+      pipe_mutex_unlock(screen->list_mutex);
+      return status;
+   }
+
+   pipe_mutex_lock(ctx->list_mutex);
+
+   shader = rbug_get_shader_locked(ctx, request->shader);
+   if (shader != NULL) {
+      shader->disabled = request->disable;
+      status = 0;
+   }
+
+   pipe_mutex_unlock(ctx->list_mutex);
+   pipe_mutex_unlock(screen->list_mutex);
+
+   return status;
+}
+
+/**
+ * Handle a shader-replace request.
+ *
+ * Any existing replacement is removed first (rebinding the original
+ * shader if this shader is the currently bound vertex or fragment
+ * shader). If the request carries no tokens the handler stops there,
+ * which restores the original shader. Otherwise the supplied tokens are
+ * duplicated, a new driver shader is created from them, bound if the
+ * shader is currently in use, and remembered as the replacement.
+ *
+ * Returns 0 on success, -ESRCH when either the context or the shader
+ * handle is unknown, or -EINVAL when the token stream is inconsistent
+ * with its declared length, cannot be duplicated, or the driver fails
+ * to create a shader from it.
+ */
+static int
+rbug_shader_replace(struct rbug_rbug *tr_rbug, struct rbug_header *header)
+{
+   struct rbug_proto_shader_replace *rep = (struct rbug_proto_shader_replace *)header;
+
+   struct rbug_screen *rb_screen = tr_rbug->rb_screen;
+   struct rbug_context *rb_context = NULL;
+   struct rbug_shader *tr_shdr = NULL;
+   struct pipe_context *pipe = NULL;
+   void *state;
+
+   pipe_mutex_lock(rb_screen->list_mutex);
+   rb_context = rbug_get_context_locked(rb_screen, rep->context);
+
+   if (!rb_context) {
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   pipe_mutex_lock(rb_context->list_mutex);
+
+   tr_shdr = rbug_get_shader_locked(rb_context, rep->shader);
+
+   if (!tr_shdr) {
+      pipe_mutex_unlock(rb_context->list_mutex);
+      pipe_mutex_unlock(rb_screen->list_mutex);
+      return -ESRCH;
+   }
+
+   /* protect the pipe context */
+   pipe_mutex_lock(rb_context->call_mutex);
+
+   pipe = rb_context->pipe;
+
+   /* remove old replaced shader */
+   if (tr_shdr->replaced_shader) {
+      /* if this shader is bound rebind the original shader */
+      if (rb_context->curr.fs == tr_shdr || rb_context->curr.vs == tr_shdr)
+         rbug_shader_bind_locked(pipe, tr_shdr, tr_shdr->shader);
+
+      FREE(tr_shdr->replaced_tokens);
+      rbug_shader_delete_locked(pipe, tr_shdr, tr_shdr->replaced_shader);
+      tr_shdr->replaced_shader = NULL;
+      tr_shdr->replaced_tokens = NULL;
+   }
+
+   /* empty inputs means restore old which we did above */
+   if (rep->tokens_len == 0)
+      goto out;
+
+   /* The tokens come from the remote debugger and tgsi_dup_tokens() is
+    * given no length, so it can only size its copy from the stream's own
+    * header. Refuse a stream that claims more tokens than the message
+    * carries.
+    * NOTE(review): assumes tokens_len counts 32-bit tokens, as the
+    * lengths sent by rbug_shader_info do - confirm against the protocol. */
+   if (tgsi_num_tokens((struct tgsi_token *)rep->tokens) > rep->tokens_len)
+      goto err;
+
+   tr_shdr->replaced_tokens = tgsi_dup_tokens((struct tgsi_token *)rep->tokens);
+   if (!tr_shdr->replaced_tokens)
+      goto err;
+
+   state = rbug_shader_create_locked(pipe, tr_shdr, tr_shdr->replaced_tokens);
+   if (!state)
+      goto err;
+
+   /* bind the new shader if this shader is currently bound */
+   if (rb_context->curr.fs == tr_shdr || rb_context->curr.vs == tr_shdr)
+      rbug_shader_bind_locked(pipe, tr_shdr, state);
+
+   /* save state */
+   tr_shdr->replaced_shader = state;
+
+out:
+   pipe_mutex_unlock(rb_context->call_mutex);
+   pipe_mutex_unlock(rb_context->list_mutex);
+   pipe_mutex_unlock(rb_screen->list_mutex);
+
+   return 0;
+
+err:
+   /* leave the shader with no replacement at all */
+   FREE(tr_shdr->replaced_tokens);
+   tr_shdr->replaced_shader = NULL;
+   tr_shdr->replaced_tokens = NULL;
+
+   pipe_mutex_unlock(rb_context->call_mutex);
+   pipe_mutex_unlock(rb_context->list_mutex);
+   pipe_mutex_unlock(rb_screen->list_mutex);
+   return -EINVAL;
+}
+
+/**
+ * Dispatch one received message to the handler for its opcode.
+ *
+ * The header is freed after the handler returns. When a handler
+ * reports a non-zero status (or the opcode is not supported, which
+ * yields -ENOSYS) an error reply carrying that status is sent.
+ *
+ * Always returns TRUE.
+ */
+static boolean
+rbug_header(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
+{
+   int status;
+
+   switch(header->opcode) {
+   case RBUG_OP_PING:
+      rbug_send_ping_reply(tr_rbug->con, serial, NULL);
+      status = 0;
+      break;
+
+   /* textures */
+   case RBUG_OP_TEXTURE_LIST:
+      status = rbug_texture_list(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_TEXTURE_INFO:
+      status = rbug_texture_info(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_TEXTURE_READ:
+      status = rbug_texture_read(tr_rbug, header, serial);
+      break;
+
+   /* contexts */
+   case RBUG_OP_CONTEXT_LIST:
+      status = rbug_context_list(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_CONTEXT_INFO:
+      status = rbug_context_info(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_CONTEXT_DRAW_BLOCK:
+      status = rbug_context_draw_block(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_CONTEXT_DRAW_STEP:
+      status = rbug_context_draw_step(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_CONTEXT_DRAW_UNBLOCK:
+      status = rbug_context_draw_unblock(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_CONTEXT_DRAW_RULE:
+      status = rbug_context_draw_rule(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_CONTEXT_FLUSH:
+      status = rbug_context_flush(tr_rbug, header, serial);
+      break;
+
+   /* shaders */
+   case RBUG_OP_SHADER_LIST:
+      status = rbug_shader_list(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_SHADER_INFO:
+      status = rbug_shader_info(tr_rbug, header, serial);
+      break;
+   case RBUG_OP_SHADER_DISABLE:
+      status = rbug_shader_disable(tr_rbug, header);
+      break;
+   case RBUG_OP_SHADER_REPLACE:
+      status = rbug_shader_replace(tr_rbug, header);
+      break;
+
+   default:
+      debug_printf("%s - unsupported opcode %u\n", __FUNCTION__, header->opcode);
+      status = -ENOSYS;
+      break;
+   }
+
+   rbug_free_header(header);
+
+   if (status != 0)
+      rbug_send_error_reply(tr_rbug->con, serial, status, NULL);
+
+   return TRUE;
+}
+
+/**
+ * Serve one debugger connection.
+ *
+ * Messages are read from tr_rbug->con and dispatched until the running
+ * flag is cleared, no message can be obtained, or the dispatcher asks
+ * to stop. The connection is then disconnected and tr_rbug->con reset
+ * to NULL.
+ */
+static void
+rbug_con(struct rbug_rbug *tr_rbug)
+{
+   struct rbug_header *msg;
+   uint32_t serial;
+
+   debug_printf("%s - connection received\n", __FUNCTION__);
+
+   for (;;) {
+      if (!tr_rbug->running)
+         break;
+
+      msg = rbug_get_message(tr_rbug->con, &serial);
+      if (msg == NULL || !rbug_header(tr_rbug, msg, serial))
+         break;
+   }
+
+   debug_printf("%s - connection closed\n", __FUNCTION__);
+
+   rbug_disconnect(tr_rbug->con);
+   tr_rbug->con = NULL;
+}
+
+/**
+ * Remote-debugging server thread.
+ *
+ * Tries to listen on ports 13370 through 13379 in turn and keeps the
+ * first one that succeeds. The listening socket is made non-blocking
+ * and polled for new connections while the running flag is set; each
+ * accepted connection is switched to blocking mode and served to
+ * completion by rbug_con() before the next one is accepted.
+ *
+ * Always returns NULL.
+ */
+PIPE_THREAD_ROUTINE(rbug_thread, void_tr_rbug)
+{
+   struct rbug_rbug *tr_rbug = void_tr_rbug;
+   uint16_t port = 13370;
+   int s = -1;
+   int c;
+
+   u_socket_init();
+
+   for (;port <= 13379 && s < 0; port++)
+      s = u_socket_listen_on_port(port);
+
+   if (s < 0) {
+      debug_printf("rbug_rbug - failed to listen\n");
+      /* balance u_socket_init() on this early exit as well */
+      u_socket_stop();
+      return NULL;
+   }
+
+   u_socket_block(s, false);
+
+   /* the loop above has already stepped one past the port that worked;
+    * compute it without a side effect, since debug_printf arguments may
+    * not be evaluated in every build */
+   debug_printf("rbug_rbug - remote debugging listening on port %u\n", port - 1);
+
+   while(tr_rbug->running) {
+      os_time_sleep(1);
+
+      c = u_socket_accept(s);
+      if (c < 0)
+         continue;
+
+      u_socket_block(c, true);
+      tr_rbug->con = rbug_from_socket(c);
+
+      /* without a connection object there is nothing rbug_con() can
+       * serve; drop the socket and wait for the next client */
+      if (!tr_rbug->con) {
+         u_socket_close(c);
+         continue;
+      }
+
+      rbug_con(tr_rbug);
+
+      u_socket_close(c);
+   }
+
+   u_socket_close(s);
+
+   u_socket_stop();
+
+   return NULL;
+}
+
+/**********************************************************
+ *
+ */
+
+/**
+ * Allocate the remote-debugger state for a screen, mark it running and
+ * start the server thread.
+ *
+ * Returns the new state, or NULL when it cannot be allocated.
+ */
+struct rbug_rbug *
+rbug_start(struct rbug_screen *rb_screen)
+{
+   struct rbug_rbug *dbg = CALLOC_STRUCT(rbug_rbug);
+
+   if (dbg != NULL) {
+      dbg->rb_screen = rb_screen;
+      /* must be set before the thread starts polling it */
+      dbg->running = TRUE;
+      dbg->thread = pipe_thread_create(rbug_thread, dbg);
+   }
+
+   return dbg;
+}
+
+/**
+ * Stop the remote debugger: clear the running flag, wait for the
+ * server thread to finish and free the state. A NULL argument is a
+ * no-op.
+ */
+void
+rbug_stop(struct rbug_rbug *tr_rbug)
+{
+   if (tr_rbug) {
+      /* the server thread polls this flag in its loops */
+      tr_rbug->running = false;
+      pipe_thread_wait(tr_rbug->thread);
+
+      FREE(tr_rbug);
+   }
+}
+
+/**
+ * Tell the connected debugger, if any, that a context is blocked,
+ * passing the context handle and its current draw_blocked mask.
+ * Nothing is sent when the screen has no debugger state or no client
+ * is connected.
+ */
+void
+rbug_notify_draw_blocked(struct rbug_context *rb_context)
+{
+   struct rbug_rbug *dbg = rbug_screen(rb_context->base.screen)->rbug;
+
+   if (!dbg || !dbg->con)
+      return;
+
+   rbug_send_context_draw_blocked(dbg->con,
+                                  VOID2U64(rb_context), rb_context->draw_blocked, NULL);
+}
diff --git a/src/gallium/drivers/rbug/rbug_objects.c b/src/gallium/drivers/rbug/rbug_objects.c
new file mode 100644
index 0000000000..0979fcff95
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_objects.c
@@ -0,0 +1,247 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "rbug_screen.h"
+#include "rbug_objects.h"
+#include "rbug_context.h"
+
+
+
+/**
+ * Wrap a driver resource.
+ *
+ * The wrapper's base is a copy of the driver resource with its own
+ * reference count of 1 and its screen pointing at the rbug screen. The
+ * wrapper is added to the screen's resource list.
+ *
+ * Returns the wrapper's base, or NULL when `resource` is NULL or the
+ * wrapper cannot be allocated; in both failure cases the reference to
+ * `resource` is dropped.
+ */
+struct pipe_resource *
+rbug_resource_create(struct rbug_screen *rb_screen,
+                     struct pipe_resource *resource)
+{
+   struct rbug_resource *wrapper = NULL;
+
+   if (resource) {
+      assert(resource->screen == rb_screen->screen);
+      wrapper = CALLOC_STRUCT(rbug_resource);
+   }
+
+   if (!wrapper) {
+      /* nothing to wrap, or no memory: give the reference back */
+      pipe_resource_reference(&resource, NULL);
+      return NULL;
+   }
+
+   memcpy(&wrapper->base, resource, sizeof(struct pipe_resource));
+
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.screen = &rb_screen->base;
+   wrapper->resource = resource;
+
+   rbug_screen_add_to_list(rb_screen, resources, wrapper);
+
+   return &wrapper->base;
+}
+
+/**
+ * Destroy a resource wrapper: unlink it from its screen's resource
+ * list, drop the reference to the wrapped resource and free the
+ * wrapper.
+ */
+void
+rbug_resource_destroy(struct rbug_resource *rb_resource)
+{
+   struct rbug_screen *owner;
+
+   owner = rbug_screen(rb_resource->base.screen);
+   rbug_screen_remove_from_list(owner, resources, rb_resource);
+
+   pipe_resource_reference(&rb_resource->resource, NULL);
+
+   FREE(rb_resource);
+}
+
+
+/**
+ * Wrap a driver surface.
+ *
+ * The wrapper's base is a copy of the driver surface with its own
+ * reference count of 1 and its texture pointing at (and referencing)
+ * the wrapped resource's rbug wrapper.
+ *
+ * Returns the wrapper's base, or NULL when `surface` is NULL or the
+ * wrapper cannot be allocated; in both failure cases the reference to
+ * `surface` is dropped.
+ */
+struct pipe_surface *
+rbug_surface_create(struct rbug_resource *rb_resource,
+                    struct pipe_surface *surface)
+{
+   struct rbug_surface *wrapper = NULL;
+
+   if (surface) {
+      assert(surface->texture == rb_resource->resource);
+      wrapper = CALLOC_STRUCT(rbug_surface);
+   }
+
+   if (!wrapper) {
+      pipe_surface_reference(&surface, NULL);
+      return NULL;
+   }
+
+   memcpy(&wrapper->base, surface, sizeof(struct pipe_surface));
+
+   pipe_reference_init(&wrapper->base.reference, 1);
+
+   /* clear the copied pointer first so the reference helper does not
+    * release the driver's texture */
+   wrapper->base.texture = NULL;
+   pipe_resource_reference(&wrapper->base.texture, &rb_resource->base);
+
+   wrapper->surface = surface;
+
+   return &wrapper->base;
+}
+
+/**
+ * Destroy a surface wrapper: drop its reference on the wrapped
+ * texture, then its reference on the driver surface, then free the
+ * wrapper.
+ */
+void
+rbug_surface_destroy(struct rbug_surface *rb_surface)
+{
+   struct pipe_resource **texture_ref = &rb_surface->base.texture;
+   struct pipe_surface **surface_ref = &rb_surface->surface;
+
+   pipe_resource_reference(texture_ref, NULL);
+   pipe_surface_reference(surface_ref, NULL);
+
+   FREE(rb_surface);
+}
+
+
+/**
+ * Wrap a driver sampler view.
+ *
+ * The wrapper's base is a copy of the driver view with a reference
+ * count of 1, its texture pointing at (and referencing) the resource
+ * wrapper, and its context set to rb_context->pipe.
+ *
+ * Returns the wrapper's base, or NULL when `view` is NULL or the
+ * wrapper cannot be allocated. On allocation failure the driver view
+ * is destroyed, mirroring how the other rbug_*_create() functions
+ * release the object they were asked to wrap.
+ */
+struct pipe_sampler_view *
+rbug_sampler_view_create(struct rbug_context *rb_context,
+                         struct rbug_resource *rb_resource,
+                         struct pipe_sampler_view *view)
+{
+   struct rbug_sampler_view *rb_view;
+
+   if (!view)
+      goto error;
+
+   assert(view->texture == rb_resource->resource);
+
+   rb_view = MALLOC(sizeof(struct rbug_sampler_view));
+   if (!rb_view)
+      goto error;
+
+   rb_view->base = *view;
+   rb_view->base.reference.count = 1;
+   /* clear the copied pointer so the reference helper does not release
+    * the driver's texture */
+   rb_view->base.texture = NULL;
+   pipe_resource_reference(&rb_view->base.texture, &rb_resource->base);
+   rb_view->base.context = rb_context->pipe;
+   rb_view->sampler_view = view;
+
+   return &rb_view->base;
+error:
+   /* do not leak the driver view when it cannot be wrapped */
+   if (view)
+      rb_context->pipe->sampler_view_destroy(rb_context->pipe, view);
+   return NULL;
+}
+
+/**
+ * Destroy a sampler-view wrapper: drop its texture reference, destroy
+ * the wrapped driver view through the wrapped pipe context and free
+ * the wrapper.
+ */
+void
+rbug_sampler_view_destroy(struct rbug_context *rb_context,
+                          struct rbug_sampler_view *rb_view)
+{
+   struct pipe_context *pipe;
+
+   pipe_resource_reference(&rb_view->base.texture, NULL);
+
+   pipe = rb_context->pipe;
+   pipe->sampler_view_destroy(pipe, rb_view->sampler_view);
+
+   FREE(rb_view);
+}
+
+
+/**
+ * Wrap a driver transfer.
+ *
+ * The wrapper's base is a copy of the driver transfer whose resource
+ * points at (and references) the resource wrapper; the wrapped pipe
+ * context is remembered for later destruction.
+ *
+ * Returns the wrapper's base, or NULL when `transfer` is NULL or the
+ * wrapper cannot be allocated. On allocation failure the driver
+ * transfer is destroyed.
+ */
+struct pipe_transfer *
+rbug_transfer_create(struct rbug_context *rb_context,
+                     struct rbug_resource *rb_resource,
+                     struct pipe_transfer *transfer)
+{
+   struct rbug_transfer *rb_transfer;
+
+   /* nothing to wrap and nothing to destroy: never hand a NULL
+    * transfer to the driver's transfer_destroy() */
+   if(!transfer)
+      return NULL;
+
+   assert(transfer->resource == rb_resource->resource);
+
+   rb_transfer = CALLOC_STRUCT(rbug_transfer);
+   if(!rb_transfer)
+      goto error;
+
+   memcpy(&rb_transfer->base, transfer, sizeof(struct pipe_transfer));
+
+   /* clear the copied pointer so the reference helper does not release
+    * the driver's resource */
+   rb_transfer->base.resource = NULL;
+   rb_transfer->transfer = transfer;
+   rb_transfer->pipe = rb_context->pipe;
+
+   pipe_resource_reference(&rb_transfer->base.resource, &rb_resource->base);
+   assert(rb_transfer->base.resource == &rb_resource->base);
+
+   return &rb_transfer->base;
+
+error:
+   rb_context->pipe->transfer_destroy(rb_context->pipe, transfer);
+   return NULL;
+}
+
+/**
+ * Destroy a transfer wrapper: drop its resource reference, destroy the
+ * wrapped driver transfer and free the wrapper.
+ *
+ * The destroy hook is taken from the pipe context stored in the
+ * wrapper and is called with rb_context->pipe as its context argument.
+ */
+void
+rbug_transfer_destroy(struct rbug_context *rb_context,
+                      struct rbug_transfer *rb_transfer)
+{
+   struct pipe_transfer *wrapped;
+
+   pipe_resource_reference(&rb_transfer->base.resource, NULL);
+
+   wrapped = rb_transfer->transfer;
+   rb_transfer->pipe->transfer_destroy(rb_context->pipe, wrapped);
+
+   FREE(rb_transfer);
+}
+
+/**
+ * Create the rbug bookkeeping object for a driver shader.
+ *
+ * Records the shader type, the driver's shader state (`result`) and a
+ * private copy of the TGSI tokens, then adds the object to the
+ * context's shader list.
+ *
+ * Returns the new object, or NULL when it cannot be allocated. In that
+ * case `result` is left untouched.
+ * NOTE(review): the callers are not visible here - confirm they
+ * release `result` when NULL is returned.
+ */
+void *
+rbug_shader_create(struct rbug_context *rb_context,
+                   const struct pipe_shader_state *state,
+                   void *result, enum rbug_shader_type type)
+{
+   struct rbug_shader *rb_shader = CALLOC_STRUCT(rbug_shader);
+
+   /* fail cleanly instead of writing through a NULL pointer */
+   if (!rb_shader)
+      return NULL;
+
+   rb_shader->type = type;
+   rb_shader->shader = result;
+   /* NOTE(review): the duplicate is not checked; rbug_shader_info later
+    * passes it to tgsi_num_tokens() - confirm a NULL copy is tolerated */
+   rb_shader->tokens = tgsi_dup_tokens(state->tokens);
+
+   /* works on a context as well since it's just a macro */
+   rbug_screen_add_to_list(rb_context, shaders, rb_shader);
+
+   return rb_shader;
+}
+
+/**
+ * Destroy a shader bookkeeping object.
+ *
+ * Unlinks it from the context's shader list, deletes the replacement
+ * driver shader (if any) and the original driver shader using the
+ * delete hook that matches the shader type, then frees both token
+ * copies and the object itself.
+ */
+void
+rbug_shader_destroy(struct rbug_context *rb_context,
+                    struct rbug_shader *rb_shader)
+{
+   struct pipe_context *pipe = rb_context->pipe;
+   void *replacement;
+
+   /* the list macro works on a context as well as on a screen */
+   rbug_screen_remove_from_list(rb_context, shaders, rb_shader);
+
+   replacement = rb_shader->replaced_shader;
+
+   if (rb_shader->type == RBUG_SHADER_FRAGMENT) {
+      if (replacement)
+         pipe->delete_fs_state(pipe, replacement);
+      pipe->delete_fs_state(pipe, rb_shader->shader);
+   } else if (rb_shader->type == RBUG_SHADER_VERTEX) {
+      if (replacement)
+         pipe->delete_vs_state(pipe, replacement);
+      pipe->delete_vs_state(pipe, rb_shader->shader);
+   } else if (rb_shader->type == RBUG_SHADER_GEOM) {
+      if (replacement)
+         pipe->delete_gs_state(pipe, replacement);
+      pipe->delete_gs_state(pipe, rb_shader->shader);
+   } else {
+      /* unknown shader type */
+      assert(0);
+   }
+
+   FREE(rb_shader->replaced_tokens);
+   FREE(rb_shader->tokens);
+   FREE(rb_shader);
+}
diff --git a/src/gallium/drivers/rbug/rbug_objects.h b/src/gallium/drivers/rbug/rbug_objects.h
new file mode 100644
index 0000000000..49c128d3d1
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_objects.h
@@ -0,0 +1,226 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef RBUG_OBJECTS_H
+#define RBUG_OBJECTS_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "rbug_screen.h"
+
+struct rbug_context;
+
+
+/**
+ * Wrapper around a driver resource.
+ */
+struct rbug_resource
+{
+   /* copy of the driver resource with its own refcount and the rbug
+    * screen as owner; kept first so the struct can be cast to and from
+    * struct pipe_resource */
+   struct pipe_resource base;
+
+   /* the wrapped driver resource; the wrapper holds a reference */
+   struct pipe_resource *resource;
+
+   /* link in the rbug screen's `resources` list */
+   struct rbug_list list;
+};
+
+
+/**
+ * Shader stage of a struct rbug_shader; rbug_shader_destroy() uses it
+ * to pick the matching delete_*_state hook.
+ */
+enum rbug_shader_type
+{
+   RBUG_SHADER_GEOM,
+   RBUG_SHADER_VERTEX,
+   RBUG_SHADER_FRAGMENT,
+};
+
+/**
+ * Bookkeeping object for one driver shader, optionally carrying a
+ * replacement installed by the remote debugger.
+ */
+struct rbug_shader
+{
+   /* link in the owning context's `shaders` list */
+   struct rbug_list list;
+
+   /* driver shader state created by the application */
+   void *shader;
+   /* private copy of the original TGSI tokens */
+   void *tokens;
+   /* driver state of the debugger's replacement, or NULL */
+   void *replaced_shader;
+   /* private copy of the replacement's TGSI tokens, or NULL */
+   void *replaced_tokens;
+
+   enum rbug_shader_type type;
+   /* value last set through a shader-disable request */
+   boolean disabled;
+};
+
+
+/**
+ * Wrapper around a driver sampler view.
+ */
+struct rbug_sampler_view
+{
+   /* copy of the driver view whose texture is the resource wrapper;
+    * kept first so the struct can be cast to and from
+    * struct pipe_sampler_view */
+   struct pipe_sampler_view base;
+
+   /* the wrapped driver sampler view */
+   struct pipe_sampler_view *sampler_view;
+};
+
+
+/**
+ * Wrapper around a driver surface.
+ */
+struct rbug_surface
+{
+   /* copy of the driver surface whose texture is the resource wrapper;
+    * kept first so the struct can be cast to and from
+    * struct pipe_surface */
+   struct pipe_surface base;
+
+   /* the wrapped driver surface; the wrapper holds a reference */
+   struct pipe_surface *surface;
+};
+
+
+/**
+ * Wrapper around a driver transfer.
+ */
+struct rbug_transfer
+{
+   /* copy of the driver transfer whose resource is the resource
+    * wrapper; kept first so the struct can be cast to and from
+    * struct pipe_transfer */
+   struct pipe_transfer base;
+
+   /* wrapped pipe context the transfer was created on */
+   struct pipe_context *pipe;
+   /* the wrapped driver transfer */
+   struct pipe_transfer *transfer;
+};
+
+
+/**
+ * Cast a pipe_resource to its rbug wrapper; NULL stays NULL.
+ * For a non-NULL resource its screen is passed through rbug_screen()
+ * and the result discarded.
+ */
+static INLINE struct rbug_resource *
+rbug_resource(struct pipe_resource *_resource)
+{
+   if (_resource)
+      (void)rbug_screen(_resource->screen);
+
+   return (struct rbug_resource *)_resource;
+}
+
+/**
+ * Cast a pipe_sampler_view to its rbug wrapper; NULL stays NULL.
+ * For a non-NULL view its texture is passed through rbug_resource()
+ * and the result discarded.
+ */
+static INLINE struct rbug_sampler_view *
+rbug_sampler_view(struct pipe_sampler_view *_sampler_view)
+{
+   if (_sampler_view)
+      (void)rbug_resource(_sampler_view->texture);
+
+   return (struct rbug_sampler_view *)_sampler_view;
+}
+
+/**
+ * Cast a pipe_surface to its rbug wrapper; NULL stays NULL.
+ * For a non-NULL surface its texture is passed through rbug_resource()
+ * and the result discarded.
+ */
+static INLINE struct rbug_surface *
+rbug_surface(struct pipe_surface *_surface)
+{
+   if (_surface)
+      (void)rbug_resource(_surface->texture);
+
+   return (struct rbug_surface *)_surface;
+}
+
+/**
+ * Cast a pipe_transfer to its rbug wrapper; NULL stays NULL.
+ * For a non-NULL transfer its resource is passed through
+ * rbug_resource() and the result discarded.
+ */
+static INLINE struct rbug_transfer *
+rbug_transfer(struct pipe_transfer *_transfer)
+{
+   if (_transfer)
+      (void)rbug_resource(_transfer->resource);
+
+   return (struct rbug_transfer *)_transfer;
+}
+
+/**
+ * Cast an opaque shader state pointer to struct rbug_shader;
+ * NULL stays NULL.
+ */
+static INLINE struct rbug_shader *
+rbug_shader(void *_state)
+{
+   struct rbug_shader *wrapper = (struct rbug_shader *)_state;
+
+   return wrapper;
+}
+
+/**
+ * Return the driver resource behind a wrapped resource, or NULL for a
+ * NULL argument.
+ */
+static INLINE struct pipe_resource *
+rbug_resource_unwrap(struct pipe_resource *_resource)
+{
+   return _resource ? rbug_resource(_resource)->resource : NULL;
+}
+
+/**
+ * Return the driver sampler view behind a wrapped view, or NULL for a
+ * NULL argument.
+ */
+static INLINE struct pipe_sampler_view *
+rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
+{
+   return _sampler_view ? rbug_sampler_view(_sampler_view)->sampler_view : NULL;
+}
+
+/**
+ * Return the driver surface behind a wrapped surface, or NULL for a
+ * NULL argument.
+ */
+static INLINE struct pipe_surface *
+rbug_surface_unwrap(struct pipe_surface *_surface)
+{
+   return _surface ? rbug_surface(_surface)->surface : NULL;
+}
+
+/**
+ * Return the driver transfer behind a wrapped transfer, or NULL for a
+ * NULL argument.
+ */
+static INLINE struct pipe_transfer *
+rbug_transfer_unwrap(struct pipe_transfer *_transfer)
+{
+   return _transfer ? rbug_transfer(_transfer)->transfer : NULL;
+}
+
+/**
+ * Return the driver shader state to use for a wrapped shader: the
+ * replacement when one is installed, otherwise the original. A NULL
+ * argument yields NULL.
+ */
+static INLINE void *
+rbug_shader_unwrap(void *_state)
+{
+   struct rbug_shader *wrapper = rbug_shader(_state);
+
+   if (wrapper == NULL)
+      return NULL;
+
+   if (wrapper->replaced_shader)
+      return wrapper->replaced_shader;
+
+   return wrapper->shader;
+}
+
+
+/* Wrap a driver resource and add it to the screen's resource list.
+ * Returns NULL, dropping the reference to `resource`, on failure. */
+struct pipe_resource *
+rbug_resource_create(struct rbug_screen *rb_screen,
+                     struct pipe_resource *resource);
+
+/* Unlink a resource wrapper, unreference the wrapped resource and
+ * free the wrapper. */
+void
+rbug_resource_destroy(struct rbug_resource *rb_resource);
+
+/* Wrap a driver surface belonging to `rb_resource`.
+ * Returns NULL, dropping the reference to `surface`, on failure. */
+struct pipe_surface *
+rbug_surface_create(struct rbug_resource *rb_resource,
+                    struct pipe_surface *surface);
+
+/* Unreference the wrapped texture and surface and free the wrapper. */
+void
+rbug_surface_destroy(struct rbug_surface *rb_surface);
+
+/* Wrap a driver sampler view of `rb_resource`.
+ * Returns NULL when `view` is NULL. */
+struct pipe_sampler_view *
+rbug_sampler_view_create(struct rbug_context *rb_context,
+                         struct rbug_resource *rb_resource,
+                         struct pipe_sampler_view *view);
+
+/* Destroy the wrapped driver view and free the wrapper. */
+void
+rbug_sampler_view_destroy(struct rbug_context *rb_context,
+                          struct rbug_sampler_view *rb_sampler_view);
+
+/* Wrap a driver transfer of `rb_resource`. Returns NULL on failure. */
+struct pipe_transfer *
+rbug_transfer_create(struct rbug_context *rb_context,
+                     struct rbug_resource *rb_resource,
+                     struct pipe_transfer *transfer);
+
+/* Destroy the wrapped driver transfer and free the wrapper. */
+void
+rbug_transfer_destroy(struct rbug_context *rb_context,
+                      struct rbug_transfer *rb_transfer);
+
+/* Create the bookkeeping object for the driver shader `result`,
+ * copying the tokens from `state`, and add it to the context's shader
+ * list. */
+void *
+rbug_shader_create(struct rbug_context *rb_context,
+                   const struct pipe_shader_state *state,
+                   void *result, enum rbug_shader_type type);
+
+/* Unlink a shader object, delete its driver shader(s) and free it. */
+void
+rbug_shader_destroy(struct rbug_context *rb_context,
+                    struct rbug_shader *rb_shader);
+
+
+#endif /* RBUG_OBJECTS_H */
diff --git a/src/gallium/drivers/rbug/rbug_public.h b/src/gallium/drivers/rbug/rbug_public.h
new file mode 100644
index 0000000000..b66740b49c
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_public.h
@@ -0,0 +1,40 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef RBUG_PUBLIC_H
+#define RBUG_PUBLIC_H
+
+struct pipe_screen;
+struct pipe_context;
+
+struct pipe_screen *
+rbug_screen_create(struct pipe_screen *screen);
+
+boolean
+rbug_enabled(void);
+
+#endif /* RBUG_PUBLIC_H */
diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c
new file mode 100644
index 0000000000..b9f32ee6a9
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_screen.c
@@ -0,0 +1,353 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+#include "util/u_debug.h"
+#include "util/u_simple_list.h"
+
+#include "rbug_public.h"
+#include "rbug_screen.h"
+#include "rbug_context.h"
+#include "rbug_objects.h"
+
+/*
+ * Provides debug_get_option_rbug(), used below to query the GALLIUM_RBUG
+ * option (default FALSE).
+ * NOTE(review): presumably the value is looked up once and then cached --
+ * confirm against the macro definition in util/u_debug.h.
+ */
+DEBUG_GET_ONCE_BOOL_OPTION(rbug, "GALLIUM_RBUG", FALSE)
+
+/**
+ * pipe_screen::destroy hook: destroy the wrapped screen, then free the
+ * rbug wrapper itself.
+ *
+ * NOTE(review): private_context, the rbug handle and list_mutex set up in
+ * rbug_screen_create() are not released here -- confirm whether intended.
+ */
+static void
+rbug_screen_destroy(struct pipe_screen *_screen)
+{
+   struct rbug_screen *wrapper = rbug_screen(_screen);
+   struct pipe_screen *wrapped = wrapper->screen;
+
+   wrapped->destroy(wrapped);
+   FREE(wrapper);
+}
+
+/** pipe_screen::get_name hook: forward the query to the wrapped screen. */
+static const char *
+rbug_screen_get_name(struct pipe_screen *_screen)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->get_name(wrapped);
+}
+
+/** pipe_screen::get_vendor hook: forward the query to the wrapped screen. */
+static const char *
+rbug_screen_get_vendor(struct pipe_screen *_screen)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->get_vendor(wrapped);
+}
+
+/** pipe_screen::get_param hook: forward the integer cap query unchanged. */
+static int
+rbug_screen_get_param(struct pipe_screen *_screen,
+                      enum pipe_cap param)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->get_param(wrapped, param);
+}
+
+/** pipe_screen::get_paramf hook: forward the float cap query unchanged. */
+static float
+rbug_screen_get_paramf(struct pipe_screen *_screen,
+                       enum pipe_cap param)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->get_paramf(wrapped, param);
+}
+
+/**
+ * pipe_screen::is_format_supported hook: pass every argument through to the
+ * wrapped screen and return its answer.
+ */
+static boolean
+rbug_screen_is_format_supported(struct pipe_screen *_screen,
+                                enum pipe_format format,
+                                enum pipe_texture_target target,
+                                unsigned sample_count,
+                                unsigned tex_usage,
+                                unsigned geom_flags)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->is_format_supported(wrapped, format, target,
+                                       sample_count, tex_usage, geom_flags);
+}
+
+/**
+ * pipe_screen::context_create hook: create a context on the wrapped screen
+ * and, on success, wrap it with rbug_context_create().
+ *
+ * \return the rbug context wrapper, or NULL if the wrapped screen failed.
+ */
+static struct pipe_context *
+rbug_screen_context_create(struct pipe_screen *_screen,
+                           void *priv)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+   struct pipe_context *ctx = wrapped->context_create(wrapped, priv);
+
+   if (!ctx)
+      return NULL;
+
+   return rbug_context_create(_screen, ctx);
+}
+
+/**
+ * pipe_screen::resource_create hook: create the resource on the wrapped
+ * screen and, on success, wrap it with rbug_resource_create().
+ *
+ * \return the rbug resource wrapper, or NULL if the wrapped screen failed.
+ */
+static struct pipe_resource *
+rbug_screen_resource_create(struct pipe_screen *_screen,
+                            const struct pipe_resource *templat)
+{
+   struct rbug_screen *wrapper = rbug_screen(_screen);
+   struct pipe_screen *wrapped = wrapper->screen;
+   struct pipe_resource *res = wrapped->resource_create(wrapped, templat);
+
+   if (!res)
+      return NULL;
+
+   return rbug_resource_create(wrapper, res);
+}
+
+/**
+ * pipe_screen::resource_from_handle hook: import the resource through the
+ * wrapped screen and, on success, wrap it with rbug_resource_create().
+ *
+ * \param _screen  the rbug screen wrapper
+ * \param templ    resource template, forwarded unchanged
+ * \param handle   winsys handle, forwarded unchanged
+ * \return the rbug resource wrapper, or NULL if the wrapped screen failed.
+ */
+static struct pipe_resource *
+rbug_screen_resource_from_handle(struct pipe_screen *_screen,
+                                 const struct pipe_resource *templ,
+                                 struct winsys_handle *handle)
+{
+   struct rbug_screen *rb_screen = rbug_screen(_screen);
+   struct pipe_screen *screen = rb_screen->screen;
+   struct pipe_resource *result;
+
+   result = screen->resource_from_handle(screen, templ, handle);
+
+   /* Same failure handling as the other resource constructors in this
+    * file: never hand a NULL resource to the wrapping helper.
+    */
+   if (result)
+      return rbug_resource_create(rb_screen, result);
+   return NULL;
+}
+
+/**
+ * pipe_screen::resource_get_handle hook: unwrap both the screen and the
+ * resource, then forward the request to the wrapped screen.
+ */
+static boolean
+rbug_screen_resource_get_handle(struct pipe_screen *_screen,
+                                struct pipe_resource *_resource,
+                                struct winsys_handle *handle)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+   struct pipe_resource *real_res = rbug_resource(_resource)->resource;
+
+   return wrapped->resource_get_handle(wrapped, real_res, handle);
+}
+
+
+
+/**
+ * pipe_screen::resource_destroy hook: hand the rbug resource wrapper to
+ * rbug_resource_destroy().  The screen argument is not used.
+ */
+static void
+rbug_screen_resource_destroy(struct pipe_screen *screen,
+                             struct pipe_resource *_resource)
+{
+   struct rbug_resource *wrapper = rbug_resource(_resource);
+
+   rbug_resource_destroy(wrapper);
+}
+
+/**
+ * pipe_screen::get_tex_surface hook: request the surface from the wrapped
+ * screen using the unwrapped resource and, on success, wrap the result with
+ * rbug_surface_create().
+ *
+ * \return the rbug surface wrapper, or NULL if the wrapped screen failed.
+ */
+static struct pipe_surface *
+rbug_screen_get_tex_surface(struct pipe_screen *_screen,
+                            struct pipe_resource *_resource,
+                            unsigned face,
+                            unsigned level,
+                            unsigned zslice,
+                            unsigned usage)
+{
+   struct rbug_resource *res_wrapper = rbug_resource(_resource);
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+   struct pipe_surface *surf;
+
+   surf = wrapped->get_tex_surface(wrapped, res_wrapper->resource,
+                                   face, level, zslice, usage);
+   if (!surf)
+      return NULL;
+
+   return rbug_surface_create(res_wrapper, surf);
+}
+
+/**
+ * pipe_screen::tex_surface_destroy hook: hand the rbug surface wrapper to
+ * rbug_surface_destroy().
+ */
+static void
+rbug_screen_tex_surface_destroy(struct pipe_surface *_surface)
+{
+   struct rbug_surface *wrapper = rbug_surface(_surface);
+
+   rbug_surface_destroy(wrapper);
+}
+
+
+
+/**
+ * pipe_screen::user_buffer_create hook: create the user buffer on the
+ * wrapped screen and, on success, wrap it with rbug_resource_create().
+ *
+ * \return the rbug resource wrapper, or NULL if the wrapped screen failed.
+ */
+static struct pipe_resource *
+rbug_screen_user_buffer_create(struct pipe_screen *_screen,
+                               void *ptr,
+                               unsigned bytes,
+                               unsigned usage)
+{
+   struct rbug_screen *wrapper = rbug_screen(_screen);
+   struct pipe_screen *wrapped = wrapper->screen;
+   struct pipe_resource *buf;
+
+   buf = wrapped->user_buffer_create(wrapped, ptr, bytes, usage);
+   if (!buf)
+      return NULL;
+
+   return rbug_resource_create(wrapper, buf);
+}
+
+
+
+/**
+ * pipe_screen::flush_frontbuffer hook: unwrap the screen and the surface and
+ * forward the call, passing context_private through untouched.
+ */
+static void
+rbug_screen_flush_frontbuffer(struct pipe_screen *_screen,
+                              struct pipe_surface *_surface,
+                              void *context_private)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+   struct pipe_surface *real_surf = rbug_surface(_surface)->surface;
+
+   wrapped->flush_frontbuffer(wrapped, real_surf, context_private);
+}
+
+/**
+ * pipe_screen::fence_reference hook: forward to the wrapped screen; fence
+ * handles are passed through without any wrapping.
+ */
+static void
+rbug_screen_fence_reference(struct pipe_screen *_screen,
+                            struct pipe_fence_handle **ptr,
+                            struct pipe_fence_handle *fence)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   wrapped->fence_reference(wrapped, ptr, fence);
+}
+
+/**
+ * pipe_screen::fence_signalled hook: forward to the wrapped screen and
+ * return its result.
+ */
+static int
+rbug_screen_fence_signalled(struct pipe_screen *_screen,
+                            struct pipe_fence_handle *fence,
+                            unsigned flags)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->fence_signalled(wrapped, fence, flags);
+}
+
+/**
+ * pipe_screen::fence_finish hook: forward to the wrapped screen and return
+ * its result.
+ */
+static int
+rbug_screen_fence_finish(struct pipe_screen *_screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flags)
+{
+   struct pipe_screen *wrapped = rbug_screen(_screen)->screen;
+
+   return wrapped->fence_finish(wrapped, fence, flags);
+}
+
+/**
+ * Report the current value of the GALLIUM_RBUG debug option.
+ *
+ * Declared with an explicit (void) parameter list so the definition is a
+ * real prototype and matches the declaration in rbug_public.h.
+ */
+boolean
+rbug_enabled(void)
+{
+   return debug_get_option_rbug();
+}
+
+/**
+ * Wrap \p screen with the rbug layer.
+ *
+ * The wrapper is only created when the GALLIUM_RBUG option is set.  It owns
+ * a private context created on the wrapped screen and the rbug handle
+ * returned by rbug_start().
+ *
+ * \param screen  the screen to wrap
+ * \return the wrapper's pipe_screen on success; \p screen itself when rbug
+ *         is disabled or any setup step fails.
+ */
+struct pipe_screen *
+rbug_screen_create(struct pipe_screen *screen)
+{
+   struct rbug_screen *rb_screen;
+
+   if (!debug_get_option_rbug())
+      return screen;
+
+   rb_screen = CALLOC_STRUCT(rbug_screen);
+   if (!rb_screen)
+      return screen;
+
+   pipe_mutex_init(rb_screen->list_mutex);
+   make_empty_list(&rb_screen->contexts);
+   make_empty_list(&rb_screen->resources);
+   make_empty_list(&rb_screen->surfaces);
+   make_empty_list(&rb_screen->transfers);
+
+   rb_screen->base.winsys = NULL;
+
+   /* Route every pipe_screen entry point through the wrappers above. */
+   rb_screen->base.destroy = rbug_screen_destroy;
+   rb_screen->base.get_name = rbug_screen_get_name;
+   rb_screen->base.get_vendor = rbug_screen_get_vendor;
+   rb_screen->base.get_param = rbug_screen_get_param;
+   rb_screen->base.get_paramf = rbug_screen_get_paramf;
+   rb_screen->base.is_format_supported = rbug_screen_is_format_supported;
+   rb_screen->base.context_create = rbug_screen_context_create;
+   rb_screen->base.resource_create = rbug_screen_resource_create;
+   rb_screen->base.resource_from_handle = rbug_screen_resource_from_handle;
+   rb_screen->base.resource_get_handle = rbug_screen_resource_get_handle;
+   rb_screen->base.resource_destroy = rbug_screen_resource_destroy;
+   rb_screen->base.get_tex_surface = rbug_screen_get_tex_surface;
+   rb_screen->base.tex_surface_destroy = rbug_screen_tex_surface_destroy;
+   rb_screen->base.user_buffer_create = rbug_screen_user_buffer_create;
+   rb_screen->base.flush_frontbuffer = rbug_screen_flush_frontbuffer;
+   rb_screen->base.fence_reference = rbug_screen_fence_reference;
+   rb_screen->base.fence_signalled = rbug_screen_fence_signalled;
+   rb_screen->base.fence_finish = rbug_screen_fence_finish;
+
+   rb_screen->screen = screen;
+
+   /* The private context is created directly on the wrapped screen, so it
+    * is not itself wrapped by rbug.
+    */
+   rb_screen->private_context = screen->context_create(screen, NULL);
+   if (!rb_screen->private_context)
+      goto err_free;
+
+   rb_screen->rbug = rbug_start(rb_screen);
+
+   if (!rb_screen->rbug)
+      goto err_context;
+
+   return &rb_screen->base;
+
+err_context:
+   rb_screen->private_context->destroy(rb_screen->private_context);
+err_free:
+   /* Undo pipe_mutex_init() before the wrapper's memory is released. */
+   pipe_mutex_destroy(rb_screen->list_mutex);
+   FREE(rb_screen);
+   return screen;
+}
diff --git a/src/gallium/drivers/rbug/rbug_screen.h b/src/gallium/drivers/rbug/rbug_screen.h
new file mode 100644
index 0000000000..a53afac05e
--- /dev/null
+++ b/src/gallium/drivers/rbug/rbug_screen.h
@@ -0,0 +1,100 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef RBUG_SCREEN_H
+#define RBUG_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+
+#include "os/os_thread.h"
+
+/**
+ * Doubly-linked list node.  struct rbug_screen embeds four of these as the
+ * heads of its object lists.
+ */
+struct rbug_list {
+   struct rbug_list *next;
+   struct rbug_list *prev;
+};
+
+
+/**
+ * rbug wrapper around a pipe_screen.
+ */
+struct rbug_screen
+{
+   /* Must stay the first member: rbug_screen() converts a pipe_screen
+    * pointer to a rbug_screen pointer with a plain cast.
+    */
+   struct pipe_screen base;
+
+   /* The wrapped screen that every hook forwards to. */
+   struct pipe_screen *screen;
+   /* Context created on the wrapped screen by rbug_screen_create(). */
+   struct pipe_context *private_context;
+
+   /* remote debugger */
+   struct rbug_rbug *rbug;
+
+   /* Taken by rbug_screen_add_to_list / rbug_screen_remove_from_list around
+    * every update of the lists and counters below.
+    */
+   pipe_mutex list_mutex;
+   int num_contexts;
+   int num_resources;
+   int num_surfaces;
+   int num_transfers;
+   struct rbug_list contexts;
+   struct rbug_list resources;
+   struct rbug_list surfaces;
+   struct rbug_list transfers;
+};
+
+/**
+ * Convert a pipe_screen pointer into the rbug_screen that embeds it as its
+ * first member.  No checking is done on the pointer.
+ */
+static INLINE struct rbug_screen *
+rbug_screen(struct pipe_screen *screen)
+{
+   struct rbug_screen *rb_screen = (struct rbug_screen *)screen;
+
+   return rb_screen;
+}
+
+/**
+ * Insert obj (through its 'list' member) at the head of scr's 'name' list
+ * and increment the matching num_<name> counter, all under scr->list_mutex.
+ *
+ * scr and obj are parenthesized so that arbitrary pointer expressions can be
+ * passed.  Both are still evaluated more than once, so they must not have
+ * side effects.
+ */
+#define rbug_screen_add_to_list(scr, name, obj)    \
+   do {                                            \
+      pipe_mutex_lock((scr)->list_mutex);          \
+      insert_at_head(&(scr)->name, &(obj)->list);  \
+      (scr)->num_##name++;                         \
+      pipe_mutex_unlock((scr)->list_mutex);        \
+   } while (0)
+
+/**
+ * Unlink obj (through its 'list' member) from whatever list it is on and
+ * decrement scr's num_<name> counter, all under scr->list_mutex.
+ *
+ * scr and obj are parenthesized so that arbitrary pointer expressions can be
+ * passed.  Both are still evaluated more than once, so they must not have
+ * side effects.
+ */
+#define rbug_screen_remove_from_list(scr, name, obj) \
+   do {                                               \
+      pipe_mutex_lock((scr)->list_mutex);             \
+      remove_from_list(&(obj)->list);                 \
+      (scr)->num_##name--;                            \
+      pipe_mutex_unlock((scr)->list_mutex);           \
+   } while (0)
+
+
+
+/**********************************************************
+ * rbug_core.c
+ */
+
+/* Opaque handle for the remote debugger state. */
+struct rbug_rbug;
+
+/*
+ * Start the remote debugger for rb_screen.  rbug_screen_create() treats a
+ * NULL return as failure.
+ */
+struct rbug_rbug *
+rbug_start(struct rbug_screen *rb_screen);
+
+/*
+ * NOTE(review): presumably shuts down what rbug_start() set up; no caller is
+ * visible in rbug_screen.c -- confirm against rbug_core.c.
+ */
+void
+rbug_stop(struct rbug_rbug *rbug);
+
+
+#endif /* RBUG_SCREEN_H */
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
new file mode 100644
index 0000000000..35d426aa3e
--- /dev/null
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -0,0 +1,38 @@
+# Makefile for the softpipe Gallium driver.
+
+# Relative path from this directory to the top of the source tree.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+# Name of the library built from C_SOURCES.
+# NOTE(review): presumably consumed by the rules in ../../Makefile.template.
+LIBNAME = softpipe
+
+# C sources that make up the driver.
+C_SOURCES = \
+	sp_fs_exec.c \
+	sp_fs_sse.c \
+	sp_clear.c \
+	sp_fence.c \
+	sp_flush.c \
+	sp_query.c \
+	sp_context.c \
+	sp_draw_arrays.c \
+	sp_prim_vbuf.c \
+	sp_quad_pipe.c \
+	sp_quad_stipple.c \
+	sp_quad_depth_test.c \
+	sp_quad_fs.c \
+	sp_quad_blend.c \
+	sp_screen.c \
+        sp_setup.c \
+	sp_state_blend.c \
+	sp_state_clip.c \
+	sp_state_derived.c \
+	sp_state_fs.c \
+	sp_state_sampler.c \
+	sp_state_so.c \
+	sp_state_rasterizer.c \
+	sp_state_surface.c \
+	sp_state_vertex.c \
+	sp_texture.c \
+	sp_tex_sample.c \
+	sp_tex_tile_cache.c \
+	sp_tile_cache.c \
+	sp_surface.c
+
+# Shared Gallium makefile fragment; included last, after the variables above.
+include ../../Makefile.template
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
new file mode 100644
index 0000000000..be5917a688
--- /dev/null
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -0,0 +1,40 @@
+Import('*')
+
+env = env.Clone()
+
+softpipe = env.ConvenienceLibrary(
+	target = 'softpipe',
+	source = [
+		'sp_fs_exec.c',
+		'sp_fs_sse.c',
+		'sp_clear.c',
+		'sp_context.c',
+		'sp_draw_arrays.c',
+		'sp_fence.c',
+		'sp_flush.c',
+		'sp_prim_vbuf.c',
+		'sp_setup.c',
+		'sp_quad_blend.c',
+		'sp_quad_pipe.c',
+		'sp_quad_depth_test.c',
+		'sp_quad_fs.c',
+		'sp_quad_stipple.c',
+		'sp_query.c',
+		'sp_screen.c',
+		'sp_state_blend.c',
+		'sp_state_clip.c',
+		'sp_state_derived.c',
+		'sp_state_fs.c',
+		'sp_state_rasterizer.c',
+		'sp_state_sampler.c',
+		'sp_state_so.c',
+		'sp_state_surface.c',
+		'sp_state_vertex.c',
+		'sp_surface.c',
+		'sp_tex_sample.c',
+		'sp_tex_tile_cache.c',
+		'sp_texture.c',
+		'sp_tile_cache.c',
+	])
+
+Export('softpipe')
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
new file mode 100644
index 0000000000..ae3f00f338
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -0,0 +1,84 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2009 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Michel Dänzer
+ */
+
+
+#include "pipe/p_defines.h"
+#include "util/u_pack_color.h"
+#include "sp_clear.h"
+#include "sp_context.h"
+#include "sp_query.h"
+#include "sp_tile_cache.h"
+
+
+/**
+ * Clear the given buffers to the specified values.
+ * No masking, no scissor (clear entire buffer).
+ *
+ * Nothing is cleared when rasterization is disabled (no_rast) or when
+ * softpipe_check_render_cond() rejects rendering.
+ *
+ * \param pipe     the softpipe context
+ * \param buffers  mask tested against PIPE_CLEAR_COLOR and
+ *                 PIPE_CLEAR_DEPTHSTENCIL
+ * \param rgba     clear color, packed per color buffer format
+ * \param depth    depth clear value
+ * \param stencil  stencil clear value
+ */
+void
+softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+               double depth, unsigned stencil)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   union util_color uc;
+   unsigned cv;
+   uint i;
+
+   if (softpipe->no_rast)
+      return;
+
+   if (!softpipe_check_render_cond(softpipe))
+      return;
+
+#if 0
+   softpipe_update_derived(softpipe); /* not needed?? */
+#endif
+
+   if (buffers & PIPE_CLEAR_COLOR) {
+      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
+         struct pipe_surface *ps = softpipe->framebuffer.cbufs[i];
+
+         util_pack_color(rgba, ps->format, &uc);
+         sp_tile_cache_clear(softpipe->cbuf_cache[i], rgba, uc.ui);
+      }
+   }
+
+   if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+      static const float zero[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
+      struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
+
+      /* A depth/stencil clear can be requested while no depth/stencil
+       * surface is bound; there is nothing to clear then, and ps->format
+       * must not be read.
+       */
+      if (ps) {
+         cv = util_pack_z_stencil(ps->format, depth, stencil);
+         sp_tile_cache_clear(softpipe->zsbuf_cache, zero, cv);
+      }
+   }
+
+   softpipe->dirty_render_cache = TRUE;
+}
diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h
new file mode 100644
index 0000000000..9be3b86fe9
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_clear.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ */
+
+#ifndef SP_CLEAR_H
+#define SP_CLEAR_H
+
+/* Opaque forward declaration; only a pointer is needed here. */
+struct pipe_context;
+
+/*
+ * Clear the buffers selected by 'buffers' to the given color, depth and
+ * stencil values.  Implemented in sp_clear.c.
+ */
+extern void
+softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+               double depth, unsigned stencil);
+
+
+#endif /* SP_CLEAR_H */
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
new file mode 100644
index 0000000000..12ef98aac7
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -0,0 +1,381 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008 VMware, Inc.  All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_exec.h"
+#include "sp_clear.h"
+#include "sp_context.h"
+#include "sp_flush.h"
+#include "sp_prim_vbuf.h"
+#include "sp_state.h"
+#include "sp_surface.h"
+#include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
+#include "sp_texture.h"
+#include "sp_query.h"
+
+
+
+/**
+ * Map the transfers of the tile cache of each of the first nr_cbufs color
+ * buffers, then of the depth/stencil tile cache.
+ */
+void
+softpipe_map_transfers(struct softpipe_context *sp)
+{
+   unsigned buf = 0;
+
+   while (buf < sp->framebuffer.nr_cbufs) {
+      sp_tile_cache_map_transfers(sp->cbuf_cache[buf]);
+      buf++;
+   }
+
+   sp_tile_cache_map_transfers(sp->zsbuf_cache);
+}
+
+
+/**
+ * Unmap the transfers of the tile cache of each of the first nr_cbufs color
+ * buffers, then of the depth/stencil tile cache.
+ */
+void
+softpipe_unmap_transfers(struct softpipe_context *sp)
+{
+   uint buf = 0;
+
+   while (buf < sp->framebuffer.nr_cbufs) {
+      sp_tile_cache_unmap_transfers(sp->cbuf_cache[buf]);
+      buf++;
+   }
+
+   sp_tile_cache_unmap_transfers(sp->zsbuf_cache);
+}
+
+
+/**
+ * pipe_context::destroy hook: release everything owned by the softpipe
+ * context and free the context itself.
+ *
+ * The quad stages are checked for NULL before use so that a context whose
+ * quad pipeline was never (fully) created can still be torn down here.
+ */
+static void
+softpipe_destroy( struct pipe_context *pipe )
+{
+   struct softpipe_context *softpipe = softpipe_context( pipe );
+   uint i;
+
+   if (softpipe->draw)
+      draw_destroy( softpipe->draw );
+
+   if (softpipe->quad.shade)
+      softpipe->quad.shade->destroy( softpipe->quad.shade );
+   if (softpipe->quad.depth_test)
+      softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+   if (softpipe->quad.blend)
+      softpipe->quad.blend->destroy( softpipe->quad.blend );
+   if (softpipe->quad.pstipple)
+      softpipe->quad.pstipple->destroy( softpipe->quad.pstipple );
+
+   /* Drop each tile cache together with the reference it was serving. */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
+      pipe_surface_reference(&softpipe->framebuffer.cbufs[i], NULL);
+   }
+
+   sp_destroy_tile_cache(softpipe->zsbuf_cache);
+   pipe_surface_reference(&softpipe->framebuffer.zsbuf, NULL);
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      sp_destroy_tex_tile_cache(softpipe->tex_cache[i]);
+      pipe_sampler_view_reference(&softpipe->sampler_views[i], NULL);
+   }
+
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      sp_destroy_tex_tile_cache(softpipe->vertex_tex_cache[i]);
+      pipe_sampler_view_reference(&softpipe->vertex_sampler_views[i], NULL);
+   }
+
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      sp_destroy_tex_tile_cache(softpipe->geometry_tex_cache[i]);
+      pipe_sampler_view_reference(&softpipe->geometry_sampler_views[i], NULL);
+   }
+
+   /* Release the constant buffers bound for every shader type. */
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      uint j;
+
+      for (j = 0; j < PIPE_MAX_CONSTANT_BUFFERS; j++) {
+         if (softpipe->constants[i][j]) {
+            pipe_resource_reference(&softpipe->constants[i][j], NULL);
+         }
+      }
+   }
+
+   tgsi_exec_machine_destroy(softpipe->fs_machine);
+
+   FREE( softpipe );
+}
+
+
+/**
+ * Report how the context currently references a resource:
+ *  - PIPE_BUFFER resources are always reported as PIPE_UNREFERENCED;
+ *  - a texture backing a bound color or depth/stencil surface is reported
+ *    as PIPE_REFERENCED_FOR_WRITE, but only while the render cache is dirty;
+ *  - a texture held by a fragment, vertex or geometry texture cache is
+ *    reported as PIPE_REFERENCED_FOR_READ;
+ *  - anything else is PIPE_UNREFERENCED.
+ *
+ * The face and level arguments are not consulted.
+ */
+static unsigned int
+softpipe_is_resource_referenced( struct pipe_context *pipe,
+				struct pipe_resource *texture,
+				unsigned face, unsigned level)
+{
+   struct softpipe_context *sp = softpipe_context( pipe );
+   unsigned slot;
+
+   if (texture->target == PIPE_BUFFER)
+      return PIPE_UNREFERENCED;
+
+   /* render targets only count while there is unflushed rendering */
+   if (sp->dirty_render_cache) {
+      struct pipe_surface *zs = sp->framebuffer.zsbuf;
+
+      for (slot = 0; slot < sp->framebuffer.nr_cbufs; slot++) {
+         struct pipe_surface *cbuf = sp->framebuffer.cbufs[slot];
+
+         if (cbuf && cbuf->texture == texture)
+            return PIPE_REFERENCED_FOR_WRITE;
+      }
+
+      if (zs && zs->texture == texture)
+         return PIPE_REFERENCED_FOR_WRITE;
+   }
+
+   /* fragment sampler texture caches */
+   for (slot = 0; slot < PIPE_MAX_SAMPLERS; slot++) {
+      if (sp->tex_cache[slot] &&
+          sp->tex_cache[slot]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
+
+   /* vertex sampler texture caches */
+   for (slot = 0; slot < PIPE_MAX_VERTEX_SAMPLERS; slot++) {
+      if (sp->vertex_tex_cache[slot] &&
+          sp->vertex_tex_cache[slot]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
+
+   /* geometry sampler texture caches */
+   for (slot = 0; slot < PIPE_MAX_GEOMETRY_SAMPLERS; slot++) {
+      if (sp->geometry_tex_cache[slot] &&
+          sp->geometry_tex_cache[slot]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
+
+   return PIPE_UNREFERENCED;
+}
+
+
+
+
+/**
+ * pipe_context::render_condition hook: record the query and mode in the
+ * context; nothing else is done here.
+ */
+static void
+softpipe_render_condition( struct pipe_context *pipe,
+                           struct pipe_query *query,
+                           uint mode )
+{
+   struct softpipe_context *sp = softpipe_context( pipe );
+
+   sp->render_cond_mode = mode;
+   sp->render_cond_query = query;
+}
+
+
+
+struct pipe_context *
+softpipe_create_context( struct pipe_screen *screen,
+			 void *priv )
+{
+   struct softpipe_context *softpipe = CALLOC_STRUCT(softpipe_context);
+   uint i;
+
+   util_init_math();
+
+#ifdef PIPE_ARCH_X86
+   softpipe->use_sse = !debug_get_bool_option( "GALLIUM_NOSSE", FALSE );
+#else
+   softpipe->use_sse = FALSE;
+#endif
+
+   softpipe->dump_fs = debug_get_bool_option( "GALLIUM_DUMP_FS", FALSE );
+   softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
+
+   softpipe->pipe.winsys = NULL;
+   softpipe->pipe.screen = screen;
+   softpipe->pipe.destroy = softpipe_destroy;
+   softpipe->pipe.priv = priv;
+
+   /* state setters */
+   softpipe->pipe.create_blend_state = softpipe_create_blend_state;
+   softpipe->pipe.bind_blend_state   = softpipe_bind_blend_state;
+   softpipe->pipe.delete_blend_state = softpipe_delete_blend_state;
+
+   softpipe->pipe.create_sampler_state = softpipe_create_sampler_state;
+   softpipe->pipe.bind_fragment_sampler_states  = softpipe_bind_sampler_states;
+   softpipe->pipe.bind_vertex_sampler_states = softpipe_bind_vertex_sampler_states;
+   softpipe->pipe.bind_geometry_sampler_states = softpipe_bind_geometry_sampler_states;
+   softpipe->pipe.delete_sampler_state = softpipe_delete_sampler_state;
+
+   softpipe->pipe.create_depth_stencil_alpha_state = softpipe_create_depth_stencil_state;
+   softpipe->pipe.bind_depth_stencil_alpha_state   = softpipe_bind_depth_stencil_state;
+   softpipe->pipe.delete_depth_stencil_alpha_state = softpipe_delete_depth_stencil_state;
+
+   softpipe->pipe.create_rasterizer_state = softpipe_create_rasterizer_state;
+   softpipe->pipe.bind_rasterizer_state   = softpipe_bind_rasterizer_state;
+   softpipe->pipe.delete_rasterizer_state = softpipe_delete_rasterizer_state;
+
+   softpipe->pipe.create_fs_state = softpipe_create_fs_state;
+   softpipe->pipe.bind_fs_state   = softpipe_bind_fs_state;
+   softpipe->pipe.delete_fs_state = softpipe_delete_fs_state;
+
+   softpipe->pipe.create_vs_state = softpipe_create_vs_state;
+   softpipe->pipe.bind_vs_state   = softpipe_bind_vs_state;
+   softpipe->pipe.delete_vs_state = softpipe_delete_vs_state;
+
+   softpipe->pipe.create_gs_state = softpipe_create_gs_state;
+   softpipe->pipe.bind_gs_state   = softpipe_bind_gs_state;
+   softpipe->pipe.delete_gs_state = softpipe_delete_gs_state;
+
+   softpipe->pipe.create_vertex_elements_state = softpipe_create_vertex_elements_state;
+   softpipe->pipe.bind_vertex_elements_state = softpipe_bind_vertex_elements_state;
+   softpipe->pipe.delete_vertex_elements_state = softpipe_delete_vertex_elements_state;
+
+   softpipe->pipe.create_stream_output_state = softpipe_create_stream_output_state;
+   softpipe->pipe.bind_stream_output_state = softpipe_bind_stream_output_state;
+   softpipe->pipe.delete_stream_output_state = softpipe_delete_stream_output_state;
+
+   softpipe->pipe.set_blend_color = softpipe_set_blend_color;
+   softpipe->pipe.set_stencil_ref = softpipe_set_stencil_ref;
+   softpipe->pipe.set_clip_state = softpipe_set_clip_state;
+   softpipe->pipe.set_sample_mask = softpipe_set_sample_mask;
+   softpipe->pipe.set_constant_buffer = softpipe_set_constant_buffer;
+   softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state;
+   softpipe->pipe.set_polygon_stipple = softpipe_set_polygon_stipple;
+   softpipe->pipe.set_scissor_state = softpipe_set_scissor_state;
+   softpipe->pipe.set_fragment_sampler_views = softpipe_set_sampler_views;
+   softpipe->pipe.set_vertex_sampler_views = softpipe_set_vertex_sampler_views;
+   softpipe->pipe.set_geometry_sampler_views = softpipe_set_geometry_sampler_views;
+   softpipe->pipe.create_sampler_view = softpipe_create_sampler_view;
+   softpipe->pipe.sampler_view_destroy = softpipe_sampler_view_destroy;
+   softpipe->pipe.set_viewport_state = softpipe_set_viewport_state;
+   softpipe->pipe.set_stream_output_buffers = softpipe_set_stream_output_buffers;
+   softpipe->pipe.set_vertex_buffers = softpipe_set_vertex_buffers;
+
+   softpipe->pipe.draw_arrays = softpipe_draw_arrays;
+   softpipe->pipe.draw_elements = softpipe_draw_elements;
+   softpipe->pipe.draw_range_elements = softpipe_draw_range_elements;
+   softpipe->pipe.draw_arrays_instanced = softpipe_draw_arrays_instanced;
+   softpipe->pipe.draw_elements_instanced = softpipe_draw_elements_instanced;
+   softpipe->pipe.draw_stream_output = softpipe_draw_stream_output;
+
+   softpipe->pipe.clear = softpipe_clear;
+   softpipe->pipe.flush = softpipe_flush;
+
+   softpipe->pipe.is_resource_referenced = softpipe_is_resource_referenced;
+
+   softpipe_init_query_funcs( softpipe );
+   softpipe_init_texture_funcs( &softpipe->pipe );
+
+   softpipe->pipe.render_condition = softpipe_render_condition;
+
+   /*
+    * Alloc caches for accessing drawing surfaces and textures.
+    * Must be before quad stage setup!
+    */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+      softpipe->cbuf_cache[i] = sp_create_tile_cache( &softpipe->pipe );
+   softpipe->zsbuf_cache = sp_create_tile_cache( &softpipe->pipe );
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+      softpipe->tex_cache[i] = sp_create_tex_tile_cache( &softpipe->pipe );
+   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+      softpipe->vertex_tex_cache[i] = sp_create_tex_tile_cache( &softpipe->pipe );
+   }
+   for (i = 0; i < PIPE_MAX_GEOMETRY_SAMPLERS; i++) {
+      softpipe->geometry_tex_cache[i] = sp_create_tex_tile_cache( &softpipe->pipe );
+   }
+
+   softpipe->fs_machine = tgsi_exec_machine_create();
+
+   /* setup quad rendering stages */
+   softpipe->quad.shade = sp_quad_shade_stage(softpipe);
+   softpipe->quad.depth_test = sp_quad_depth_test_stage(softpipe);
+   softpipe->quad.blend = sp_quad_blend_stage(softpipe);
+   softpipe->quad.pstipple = sp_quad_polygon_stipple_stage(softpipe);
+
+
+   /*
+    * Create drawing context and plug our rendering stage into it.
+    */
+   softpipe->draw = draw_create(&softpipe->pipe);
+   if (!softpipe->draw) 
+      goto fail;
+
+   draw_texture_samplers(softpipe->draw,
+                         PIPE_SHADER_VERTEX,
+                         PIPE_MAX_VERTEX_SAMPLERS,
+                         (struct tgsi_sampler **)
+                            softpipe->tgsi.vert_samplers_list);
+
+   draw_texture_samplers(softpipe->draw,
+                         PIPE_SHADER_GEOMETRY,
+                         PIPE_MAX_GEOMETRY_SAMPLERS,
+                         (struct tgsi_sampler **)
+                            softpipe->tgsi.geom_samplers_list);
+
+   if (debug_get_bool_option( "SP_NO_RAST", FALSE ))
+      softpipe->no_rast = TRUE;
+
+   softpipe->vbuf_backend = sp_create_vbuf_backend(softpipe);
+   if (!softpipe->vbuf_backend)
+      goto fail;
+
+   softpipe->vbuf = draw_vbuf_stage(softpipe->draw, softpipe->vbuf_backend);
+   if (!softpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(softpipe->draw, softpipe->vbuf);
+   draw_set_render(softpipe->draw, softpipe->vbuf_backend);
+
+
+   /* plug in AA line/point stages */
+   draw_install_aaline_stage(softpipe->draw, &softpipe->pipe);
+   draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe);
+
+   /* Do polygon stipple w/ texture map + frag prog? */
+#if DO_PSTIPPLE_IN_DRAW_MODULE
+   draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe);
+#endif
+
+   draw_wide_point_sprites(softpipe->draw, TRUE);
+
+   sp_init_surface_functions(softpipe);
+
+   return &softpipe->pipe;
+
+ fail:
+   softpipe_destroy(&softpipe->pipe);
+   return NULL;
+}
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
new file mode 100644
index 0000000000..53115a827d
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -0,0 +1,199 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_CONTEXT_H
+#define SP_CONTEXT_H
+
+#include "pipe/p_context.h"
+
+#include "draw/draw_vertex.h"
+
+#include "sp_quad_pipe.h"
+
+
+/** Do polygon stipple in the driver here, or in the draw module? */
+#define DO_PSTIPPLE_IN_DRAW_MODULE 1
+
+
+struct softpipe_vbuf_render;
+struct draw_context;
+struct draw_stage;
+struct softpipe_tile_cache;
+struct softpipe_tex_tile_cache;
+struct sp_fragment_shader;
+struct sp_vertex_shader;
+struct sp_velems_state;
+struct sp_so_state;
+
+
+struct softpipe_context {
+   struct pipe_context pipe;  /**< base class; softpipe_context() casts a pipe_context* to this struct */
+
+   /** Constant state objects */
+   struct pipe_blend_state *blend;
+   struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
+   struct pipe_sampler_state *geometry_samplers[PIPE_MAX_GEOMETRY_SAMPLERS];
+   struct pipe_depth_stencil_alpha_state *depth_stencil;
+   struct pipe_rasterizer_state *rasterizer;
+   struct sp_fragment_shader *fs;
+   struct sp_vertex_shader *vs;
+   struct sp_geometry_shader *gs;
+   struct sp_velems_state *velems;
+   struct sp_so_state *so;
+
+   /** Other rendering state */
+   struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_clip_state clip;
+   struct pipe_resource *constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_view *vertex_sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
+   struct pipe_sampler_view *geometry_sampler_views[PIPE_MAX_GEOMETRY_SAMPLERS];
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   struct {
+      struct softpipe_resource *buffer[PIPE_MAX_SO_BUFFERS];
+      int offset[PIPE_MAX_SO_BUFFERS];    /**< applied in 32-bit units; negative is treated as 0 when drawing */
+      int so_count[PIPE_MAX_SO_BUFFERS];  /**< entry 0 is the vertex count for softpipe_draw_stream_output() */
+      int num_buffers;                    /**< softpipe_draw_stream_output() only draws when this is 1 */
+   } so_target;  /**< stream-output targets */
+   struct pipe_query_data_so_statistics so_stats;
+
+   unsigned num_samplers;
+   unsigned num_sampler_views;           /**< bounds the tex_cache[] flush loop */
+   unsigned num_vertex_samplers;
+   unsigned num_vertex_sampler_views;    /**< bounds the vertex_tex_cache[] flush loop */
+   unsigned num_geometry_samplers;
+   unsigned num_geometry_sampler_views;  /**< bounds the geometry_tex_cache[] flush loop */
+   unsigned num_vertex_buffers;          /**< number of vertex_buffer[] entries mapped per draw */
+
+   unsigned dirty; /**< Mask of SP_NEW_x flags; non-zero triggers softpipe_update_derived() at draw time */
+
+   /* Counter for occlusion queries.  Note this supports overlapping
+    * queries.
+    */
+   uint64_t occlusion_count;
+   unsigned active_query_count;
+
+   /** Mapped vertex buffers */
+   ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
+
+   /** Mapped constant buffers */
+   const void *mapped_constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
+
+   /** Vertex format */
+   struct vertex_info vertex_info;
+   struct vertex_info vertex_info_vbuf;
+
+   /** Which vertex shader output slot contains point size */
+   int psize_slot;
+
+   /** The reduced version of the primitive supplied by the state tracker */
+   unsigned reduced_api_prim;  /* set from u_reduced_prim(mode) at the start of each draw */
+
+   /** Derived information about which winding orders to cull */
+   unsigned cull_mode;
+
+   /**
+    * The reduced primitive after unfilled triangles, wide-line decomposition,
+    * etc, are taken into account.  This is the primitive type that's actually
+    * rasterized.
+    */
+   unsigned reduced_prim;
+
+   /** Derived from scissor and surface bounds: */
+   struct pipe_scissor_state cliprect;
+
+   unsigned line_stipple_counter;
+
+   /** Conditional query object and mode */
+   struct pipe_query *render_cond_query;
+   uint render_cond_mode;
+
+   /** Software quad rendering pipeline */
+   struct {
+      struct quad_stage *shade;
+      struct quad_stage *depth_test;
+      struct quad_stage *blend;
+      struct quad_stage *pstipple;
+      struct quad_stage *first; /**< points to one of the above stages */
+   } quad;
+
+   /** TGSI exec things */
+   struct {
+      struct sp_sampler_varient *geom_samplers_list[PIPE_MAX_GEOMETRY_SAMPLERS];  /**< given to the draw module for geometry shaders */
+      struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_VERTEX_SAMPLERS];    /**< given to the draw module for vertex shaders */
+      struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
+   } tgsi;
+
+   struct tgsi_exec_machine *fs_machine;  /**< created with tgsi_exec_machine_create() */
+
+   /** The primitive drawing context */
+   struct draw_context *draw;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
+   struct draw_stage *vbuf;
+
+   boolean dirty_render_cache;  /**< set after each draw, cleared by a PIPE_FLUSH_RENDER_CACHE flush */
+
+   struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS];  /**< one tile cache per color buffer */
+   struct softpipe_tile_cache *zsbuf_cache;                      /**< tile cache for the depth/stencil buffer */
+
+   unsigned tex_timestamp;
+   struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+   struct softpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
+   struct softpipe_tex_tile_cache *geometry_tex_cache[PIPE_MAX_GEOMETRY_SAMPLERS];
+
+   unsigned use_sse : 1;
+   unsigned dump_fs : 1;
+   unsigned dump_gs : 1;
+   unsigned no_rast : 1;  /**< set when the SP_NO_RAST debug option is enabled */
+};
+
+/** Cast wrapper: reinterpret a pipe_context pointer as a softpipe_context pointer. */
+static INLINE struct softpipe_context *
+softpipe_context( struct pipe_context *pipe )
+{
+   return (struct softpipe_context *)pipe;
+}
+
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe);
+
+struct pipe_context *
+softpipe_create_context( struct pipe_screen *, void *priv );
+
+
+#endif /* SP_CONTEXT_H */
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
new file mode 100644
index 0000000000..79daa68f3b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -0,0 +1,298 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+#include "util/u_prim.h"
+
+#include "sp_context.h"
+#include "sp_query.h"
+#include "sp_state.h"
+#include "sp_texture.h"
+
+#include "draw/draw_context.h"
+
+
+
+
+
+
+/**
+ * Draw vertex arrays, with optional indexing.
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off
+ * the drawing to the 'draw' module.
+ */
+static void
+softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
+                                       struct pipe_resource *indexBuffer,
+                                       unsigned indexSize,
+                                       int indexBias,
+                                       unsigned minIndex,
+                                       unsigned maxIndex,
+                                       unsigned mode,
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned startInstance,
+                                       unsigned instanceCount);
+
+
+/**
+ * Draw non-indexed, non-instanced primitives.
+ *
+ * Forwards to the common draw path with no index buffer, an unrestricted
+ * index range and a single instance starting at instance 0.
+ */
+void
+softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          NULL, 0, 0,     /* index buffer, size, bias */
+                                          0, 0xffffffff,  /* min/max index */
+                                          mode, start, count,
+                                          0, 1);          /* startInstance, instanceCount */
+}
+
+/**
+ * Draw the vertices held in stream-output buffer 0, using that buffer as
+ * vertex buffer 0 and so_target.so_count[0] as the vertex count.
+ */
+void
+softpipe_draw_stream_output(struct pipe_context *pipe, unsigned mode)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   struct draw_context *draw = sp->draw;
+   const unsigned start = 0;
+   unsigned count;
+   int offset;
+   void *buf;
+
+   /* Validate the binding before dereferencing so_target.buffer[0]. */
+   if (!softpipe_check_render_cond(sp) ||
+       sp->so_target.num_buffers != 1 ||
+       !sp->so_target.buffer[0])
+      return;
+
+   count = sp->so_target.so_count[0];
+   offset = sp->so_target.offset[0];
+   if (offset < 0) /* we were appending so start from beginning */
+      offset = 0;
+   /* the offset is applied in 32-bit units */
+   buf = (void*)((int32_t*)sp->so_target.buffer[0]->data + offset);
+
+   sp->reduced_api_prim = u_reduced_prim(mode);
+   if (sp->dirty)
+      softpipe_update_derived(sp);
+
+   softpipe_map_transfers(sp);
+
+   /* Map the so buffer as the only vertex buffer; no index buffer */
+   draw_set_mapped_vertex_buffer(draw, 0, buf);
+   draw_set_mapped_element_buffer_range(draw, 0, 0,
+                                        start, start + count - 1, NULL);
+
+   /* draw! */
+   draw_arrays_instanced(draw, mode, start, count, 0, 1);
+
+   /* unmap vertex buffer - will cause draw module to flush.
+    * TODO: Flush only when a user vertex/index buffer is present.
+    */
+   draw_set_mapped_vertex_buffer(draw, 0, NULL);
+   draw_flush(draw);
+
+   /* Note: leave drawing surfaces mapped */
+   sp->dirty_render_cache = TRUE;
+}
+
+
+/**
+ * Draw indexed primitives with a caller-supplied index range.
+ *
+ * Single-instance wrapper: min_index/max_index are forwarded unchanged to
+ * the common draw path, with startInstance 0 and instanceCount 1.
+ */
+void
+softpipe_draw_range_elements(struct pipe_context *pipe,
+                             struct pipe_resource *indexBuffer,
+                             unsigned indexSize,
+                             int indexBias,
+                             unsigned min_index,
+                             unsigned max_index,
+                             unsigned mode, unsigned start, unsigned count)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer, indexSize, indexBias,
+                                          min_index, max_index,
+                                          mode, start, count,
+                                          0, 1);
+}
+
+
+/**
+ * Draw indexed primitives when the caller gives no index range.
+ *
+ * Single-instance wrapper that forwards the widest possible index range
+ * (0 .. 0xffffffff) to the common draw path.
+ */
+void
+softpipe_draw_elements(struct pipe_context *pipe,
+                       struct pipe_resource *indexBuffer,
+                       unsigned indexSize, int indexBias,
+                       unsigned mode, unsigned start, unsigned count)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer, indexSize, indexBias,
+                                          0, 0xffffffff,
+                                          mode, start, count,
+                                          0, 1);
+}
+
+/**
+ * Draw non-indexed primitives for the given range of instances.
+ *
+ * Forwards to the common draw path with no index buffer and an
+ * unrestricted index range; the instance parameters pass through.
+ */
+void
+softpipe_draw_arrays_instanced(struct pipe_context *pipe,
+                               unsigned mode,
+                               unsigned start,
+                               unsigned count,
+                               unsigned startInstance,
+                               unsigned instanceCount)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          NULL, 0, 0,
+                                          0, 0xffffffff,
+                                          mode, start, count,
+                                          startInstance, instanceCount);
+}
+
+/**
+ * Draw indexed primitives for the given range of instances.
+ *
+ * No index range is supplied by the caller, so the widest possible one
+ * (0 .. 0xffffffff) is forwarded to the common draw path.
+ */
+void
+softpipe_draw_elements_instanced(struct pipe_context *pipe,
+                                 struct pipe_resource *indexBuffer,
+                                 unsigned indexSize,
+                                 int indexBias,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count,
+                                 unsigned startInstance,
+                                 unsigned instanceCount)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer, indexSize, indexBias,
+                                          0, 0xffffffff,
+                                          mode, start, count,
+                                          startInstance, instanceCount);
+}
+
+/**
+ * Common implementation behind the array and element draw entry points.
+ *
+ * Sequence: honour the render condition, refresh derived state, map
+ * transfers, hand vertex/index storage to the draw module, draw, then
+ * clear those mappings and flush the draw module.
+ *
+ * \param indexBuffer  index buffer, or NULL for a non-indexed draw
+ * \param minIndex     lower index bound; only forwarded for indexed draws
+ * \param maxIndex     upper index bound; only forwarded for indexed draws
+ */
+static void
+softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
+                                       struct pipe_resource *indexBuffer,
+                                       unsigned indexSize,
+                                       int indexBias,
+                                       unsigned minIndex,
+                                       unsigned maxIndex,
+                                       unsigned mode,
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned startInstance,
+                                       unsigned instanceCount)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   struct draw_context *draw = sp->draw;
+   unsigned vb;
+
+   if (!softpipe_check_render_cond(sp))
+      return;
+
+   sp->reduced_api_prim = u_reduced_prim(mode);
+
+   if (sp->dirty)
+      softpipe_update_derived(sp);
+
+   softpipe_map_transfers(sp);
+
+   /* Hand the storage of every bound vertex buffer to the draw module */
+   for (vb = 0; vb < sp->num_vertex_buffers; vb++)
+      draw_set_mapped_vertex_buffer(draw, vb,
+            softpipe_resource(sp->vertex_buffer[vb].buffer)->data);
+
+   if (indexBuffer != NULL) {
+      /* indexed: forward the caller's element layout and index range */
+      draw_set_mapped_element_buffer_range(draw, indexSize, indexBias,
+                                           minIndex, maxIndex,
+                                           softpipe_resource(indexBuffer)->data);
+   }
+   else {
+      /* no index/element buffer: the range is the vertices being drawn */
+      draw_set_mapped_element_buffer_range(draw, 0, 0,
+                                           start, start + count - 1, NULL);
+   }
+
+   /* draw! */
+   draw_arrays_instanced(draw, mode, start, count, startInstance, instanceCount);
+
+   /* unmap vertex/index buffers - will cause draw module to flush */
+   for (vb = 0; vb < sp->num_vertex_buffers; vb++)
+      draw_set_mapped_vertex_buffer(draw, vb, NULL);
+   if (indexBuffer != NULL)
+      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+
+   /* TODO: Flush only when a user vertex/index buffer is present
+    * (or even better, modify draw module to do this
+    * internally when this condition is seen?)
+    */
+   draw_flush(draw);
+
+   /* Note: leave drawing surfaces mapped */
+   sp->dirty_render_cache = TRUE;
+}
diff --git a/src/gallium/drivers/softpipe/sp_fence.c b/src/gallium/drivers/softpipe/sp_fence.c
new file mode 100644
index 0000000000..66c5214113
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_fence.c
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_screen.h"
+#include "util/u_debug.h"
+#include "sp_fence.h"
+
+
+static void
+softpipe_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+   assert(*ptr == NULL);   /* nothing is ever referenced or released here: */
+   assert(fence == NULL);  /* only NULL fence handles are accepted         */
+}
+
+
+static int
+softpipe_fence_signalled(struct pipe_screen *screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flags)
+{
+   assert(fence == NULL);  /* only NULL fence handles are accepted */
+   return 0;
+}
+
+
+static int
+softpipe_fence_finish(struct pipe_screen *screen,
+                      struct pipe_fence_handle *fence,
+                      unsigned flags)
+{
+   assert(fence == NULL);  /* only NULL fence handles are accepted; no wait happens */
+   return 0;
+}
+
+
+void
+softpipe_init_screen_fence_funcs(struct pipe_screen *screen)
+{
+   screen->fence_reference = softpipe_fence_reference;  /* asserts NULL handles */
+   screen->fence_signalled = softpipe_fence_signalled;  /* always returns 0 */
+   screen->fence_finish    = softpipe_fence_finish;     /* always returns 0 */
+}
diff --git a/src/gallium/drivers/softpipe/sp_fence.h b/src/gallium/drivers/softpipe/sp_fence.h
new file mode 100644
index 0000000000..39c33243bd
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_fence.h
@@ -0,0 +1,40 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef SP_FENCE_H_
+#define SP_FENCE_H_
+
+
+struct pipe_screen;
+
+
+void
+softpipe_init_screen_fence_funcs(struct pipe_screen *screen);
+
+
+#endif /* SP_FENCE_H_ */
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
new file mode 100644
index 0000000000..4a53ef048f
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -0,0 +1,177 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_defines.h"
+#include "draw/draw_context.h"
+#include "sp_flush.h"
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
+
+
+/**
+ * Flush the draw module, then the texture and/or render tile caches
+ * selected by \p flags.  If \p fence is non-NULL, *fence is set to NULL:
+ * softpipe never hands out a real fence.
+ */
+void
+softpipe_flush( struct pipe_context *pipe,
+                unsigned flags,
+                struct pipe_fence_handle **fence )
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   const unsigned swap = flags & PIPE_FLUSH_SWAPBUFFERS;
+   uint i;
+
+   draw_flush(softpipe->draw);
+
+   if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+      for (i = 0; i < softpipe->num_sampler_views; i++)
+         sp_flush_tex_tile_cache(softpipe->tex_cache[i]);
+      for (i = 0; i < softpipe->num_vertex_sampler_views; i++)
+         sp_flush_tex_tile_cache(softpipe->vertex_tex_cache[i]);
+      for (i = 0; i < softpipe->num_geometry_sampler_views; i++)
+         sp_flush_tex_tile_cache(softpipe->geometry_tex_cache[i]);
+   }
+
+   if (swap || (flags & PIPE_FLUSH_RENDER_CACHE)) {
+      /* Both kinds of flush write back the color buffer caches. */
+      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
+         if (softpipe->cbuf_cache[i])
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
+      }
+
+      if (swap) {
+         /* Swapbuffers: the zbuffer changes are not discarded, but held
+          * in the cache in the hope that a later clear will wipe them out.
+          *
+          * Need this call for hardware buffers before swapbuffers.
+          * There should probably be another/different flush-type function
+          * that's called before swapbuffers because we don't always want
+          * to unmap surfaces when flushing.
+          */
+         softpipe_unmap_transfers(softpipe);
+      }
+      else {
+         if (softpipe->zsbuf_cache)
+            sp_flush_tile_cache(softpipe->zsbuf_cache);
+
+         softpipe->dirty_render_cache = FALSE;
+      }
+   }
+
+   /* Enable to dump BMPs of the color/depth buffers each frame */
+#if 0
+   if(flags & PIPE_FLUSH_FRAME) {
+      static unsigned frame_no = 1;
+      static char filename[256];
+      util_snprintf(filename, sizeof(filename), "cbuf_%u.bmp", frame_no);
+      debug_dump_surface_bmp(softpipe, filename, softpipe->framebuffer.cbufs[0]);
+      util_snprintf(filename, sizeof(filename), "zsbuf_%u.bmp", frame_no);
+      debug_dump_surface_bmp(softpipe, filename, softpipe->framebuffer.zsbuf);
+      ++frame_no;
+   }
+#endif
+
+   if (fence)
+      *fence = NULL;
+}
+
+
+/**
+ * Flush context if necessary.
+ *
+ * A flush is issued when the resource is referenced for writing, or is
+ * referenced for reading and \p read_only is not set.
+ *
+ * \param cpu_access    if set, the flush is followed by a wait on the
+ *                      fence it returns (if any)
+ * \param do_not_block  if set, return FALSE instead of flushing and waiting
+ *
+ * Returns FALSE if it would have blocked, but do_not_block was set, TRUE
+ * otherwise.
+ *
+ * TODO: move this logic to an auxiliary library?
+ */
+boolean
+softpipe_flush_resource(struct pipe_context *pipe,
+                        struct pipe_resource *texture,
+                        unsigned face,
+                        unsigned level,
+                        unsigned flush_flags,
+                        boolean read_only,
+                        boolean cpu_access,
+                        boolean do_not_block)
+{
+   const unsigned referenced =
+      pipe->is_resource_referenced(pipe, texture, face, level);
+   const boolean for_write = (referenced & PIPE_REFERENCED_FOR_WRITE) != 0;
+   const boolean for_read = (referenced & PIPE_REFERENCED_FOR_READ) != 0;
+   struct pipe_fence_handle *fence = NULL;
+
+   /* Nothing outstanding that conflicts with the requested access. */
+   if (!for_write && (!for_read || read_only))
+      return TRUE;
+
+   /* TODO: The semantics of these flush flags are too obtuse. They should
+    * disappear and the pipe driver should just ensure that all visible
+    * side-effects happen when they need to happen.
+    */
+   if (for_write)
+      flush_flags |= PIPE_FLUSH_RENDER_CACHE;
+   if (for_read)
+      flush_flags |= PIPE_FLUSH_TEXTURE_CACHE;
+
+   if (!cpu_access) {
+      /* Just flush. */
+      pipe->flush(pipe, flush_flags, NULL);
+      return TRUE;
+   }
+
+   /* Flush and wait, unless the caller refuses to block. */
+   if (do_not_block)
+      return FALSE;
+
+   pipe->flush(pipe, flush_flags, &fence);
+
+   if (fence) {
+      /*
+       * This is for illustrative purposes only, as softpipe does not
+       * have fences.
+       */
+      pipe->screen->fence_finish(pipe->screen, fence, 0);
+      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+   }
+
+   return TRUE;
+}
diff --git a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h
new file mode 100644
index 0000000000..cb97482a71
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_flush.h
@@ -0,0 +1,50 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_FLUSH_H
+#define SP_FLUSH_H
+
+#include "pipe/p_compiler.h"
+
+/*
+ * Forward declarations so this header can be included on its own.
+ * Without one for pipe_resource, the struct tag in the prototype below
+ * would get prototype scope and would not match the real type.
+ */
+struct pipe_context;
+struct pipe_fence_handle;
+struct pipe_resource;
+
+/**
+ * Flush entry point of the softpipe context.
+ *
+ * \param pipe   the context to flush
+ * \param flags  flush flags (softpipe_flush_resource() passes a mask of
+ *               PIPE_FLUSH_x bits through the context's flush hook)
+ * \param fence  optional location for a fence handle, may be NULL
+ */
+void
+softpipe_flush(struct pipe_context *pipe, unsigned flags,
+               struct pipe_fence_handle **fence);
+
+/**
+ * Flush the context if it still references the given resource.
+ *
+ * \param pipe          the context
+ * \param texture       resource about to be accessed
+ * \param face          face, passed to the context's reference query
+ * \param level         mipmap level, passed to the context's reference query
+ * \param flush_flags   flags to flush with; render/texture cache bits are
+ *                      added according to how the resource is referenced
+ * \param read_only     TRUE if outstanding reads need not be flushed
+ * \param cpu_access    TRUE to flush and wait on the resulting fence
+ * \param do_not_block  TRUE to bail out rather than flush and wait
+ * \return FALSE if a flush-and-wait was needed but do_not_block was set,
+ *         TRUE otherwise
+ */
+boolean
+softpipe_flush_resource(struct pipe_context *pipe,
+                        struct pipe_resource *texture,
+                        unsigned face,
+                        unsigned level,
+                        unsigned flush_flags,
+                        boolean read_only,
+                        boolean cpu_access,
+                        boolean do_not_block);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs.h b/src/gallium/drivers/softpipe/sp_fs.h
new file mode 100644
index 0000000000..4792ace3a3
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_fs.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_FS_H
+#define SP_FS_H
+
+/*
+ * Forward declarations so this header does not depend on include order.
+ * A struct tag first seen inside a parameter list only has prototype
+ * scope, so each one is declared at file scope before use.
+ */
+struct softpipe_context;
+struct pipe_shader_state;
+struct sp_fragment_shader;
+struct tgsi_interp_coef;
+struct tgsi_exec_vector;
+
+/**
+ * Create a fragment shader variant that runs through the TGSI interpreter.
+ * Returns NULL on allocation failure.
+ */
+struct sp_fragment_shader *
+softpipe_create_fs_exec(struct softpipe_context *softpipe,
+                        const struct pipe_shader_state *templ);
+
+/**
+ * Create a fragment shader variant that runs generated SSE2 code.
+ * Returns NULL when the variant cannot be created (SSE disabled,
+ * non-x86 build, code generation or allocation failure).
+ */
+struct sp_fragment_shader *
+softpipe_create_fs_sse(struct softpipe_context *softpipe,
+                       const struct pipe_shader_state *templ);
+
+/* NOTE(review): no definition of the LLVM variant is visible here. */
+struct sp_fragment_shader *
+softpipe_create_fs_llvm(struct softpipe_context *softpipe,
+                        const struct pipe_shader_state *templ);
+
+/*
+ * NOTE(review): sp_fs_exec.c and sp_fs_sse.c each use a private static
+ * setup_pos_vector(); confirm that this exported variant is defined.
+ */
+void sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
+                         float x, float y,
+                         struct tgsi_exec_vector *quadpos);
+
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
new file mode 100644
index 0000000000..67e2c8f8bc
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -0,0 +1,203 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Execute fragment shader using the TGSI interpreter.
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_fs.h"
+#include "sp_quad.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/**
+ * Subclass of sp_fragment_shader for shaders run by the TGSI interpreter.
+ *
+ * 'base' has to stay the first member: softpipe_create_fs_exec() hands out
+ * &shader->base, and both the cast wrapper and exec_delete() treat that
+ * pointer as the address of the whole object.
+ */
+struct sp_exec_fragment_shader
+{
+   struct sp_fragment_shader base;
+   /* No other members for now */
+};
+
+
+/** Downcast from the generic fragment shader to the interpreter subclass. */
+static INLINE struct sp_exec_fragment_shader *
+sp_exec_fragment_shader(const struct sp_fragment_shader *base)
+{
+   struct sp_exec_fragment_shader *exec_shader =
+      (struct sp_exec_fragment_shader *) base;
+
+   return exec_shader;
+}
+
+
+/**
+ * Make the interpreter ready to run this shader.
+ *
+ * \param base      the shader whose tokens should be bound
+ * \param machine   the TGSI interpreter state
+ * \param samplers  sampler array handed to the interpreter when binding
+ */
+static void
+exec_prepare( const struct sp_fragment_shader *base,
+              struct tgsi_exec_machine *machine,
+              struct tgsi_sampler **samplers )
+{
+   /* These tokens are already bound to the machine: skip the rebind. */
+   if (machine->Tokens == base->shader.tokens)
+      return;
+
+   /* Bind tokens/shader to the interpreter's machine state. */
+   tgsi_exec_machine_bind_shader( machine,
+                                  base->shader.tokens,
+                                  PIPE_MAX_SAMPLERS,
+                                  samplers );
+}
+
+
+
+/**
+ * Fill in the position (X, Y, Z, W) of each of the four fragments of a quad.
+ *
+ * Fragment slots are laid out as
+ *    0: (x, y)     1: (x+1, y)
+ *    2: (x, y+1)   3: (x+1, y+1)
+ *
+ * This should really be part of the compiled shader.
+ *
+ * \param coef     interpolation coefficients; channels 2 and 3 give Z and W
+ * \param x        X of fragment 0
+ * \param y        Y of fragment 0
+ * \param quadpos  receives the four positions
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
+{
+   const float x_next = x + 1;
+   const float y_next = y + 1;
+   uint c;
+
+   /* X: left column then right column, for both rows */
+   quadpos->xyzw[0].f[0] = x;
+   quadpos->xyzw[0].f[1] = x_next;
+   quadpos->xyzw[0].f[2] = x;
+   quadpos->xyzw[0].f[3] = x_next;
+
+   /* Y: first row twice, then second row twice */
+   quadpos->xyzw[1].f[0] = y;
+   quadpos->xyzw[1].f[1] = y;
+   quadpos->xyzw[1].f[2] = y_next;
+   quadpos->xyzw[1].f[3] = y_next;
+
+   /* Z (channel 2) and W (channel 3): evaluate the plane equation at
+    * fragment 0, then step by the x/y derivatives for its neighbours.
+    */
+   for (c = 2; c <= 3; c++) {
+      const float ddx = coef->dadx[c];
+      const float ddy = coef->dady[c];
+      const float at_origin = coef->a0[c] + ddx * x + ddy * y;
+
+      quadpos->xyzw[c].f[0] = at_origin;
+      quadpos->xyzw[c].f[1] = at_origin + ddx;
+      quadpos->xyzw[c].f[2] = at_origin + ddy;
+      quadpos->xyzw[c].f[3] = at_origin + ddx + ddy;
+   }
+}
+
+
+/* TODO: hide the machine struct in here somewhere, remove from this
+ * interface:
+ */
+/**
+ * Run the shader on one quad through the TGSI interpreter.
+ *
+ * \param base     the shader (only its output semantics are read here)
+ * \param machine  interpreter state, already prepared with exec_prepare()
+ * \param quad     the quad; its mask is updated and, if any fragment is
+ *                 left, its color/depth outputs are filled in
+ * \return FALSE if no fragment of the quad is left, TRUE otherwise
+ */
+static unsigned
+exec_run( const struct sp_fragment_shader *base,
+          struct tgsi_exec_machine *machine,
+          struct quad_header *quad )
+{
+   uint num_outputs;
+   uint out;
+
+   /* Compute X, Y, Z, W vals for this quad */
+   setup_pos_vector(quad->posCoef,
+                    (float) quad->input.x0, (float) quad->input.y0,
+                    &machine->QuadPos);
+
+   /* facing 0 becomes 1.0, facing 1 becomes -1.0 */
+   machine->Face = (float) (quad->input.facing * -2 + 1);
+
+   /* AND the interpreter's result mask into the quad's mask */
+   quad->inout.mask &= tgsi_exec_machine_run( machine );
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   num_outputs = base->info.num_outputs;
+   for (out = 0; out < num_outputs; out++) {
+      const ubyte semantic = base->info.output_semantic_name[out];
+
+      if (semantic == TGSI_SEMANTIC_COLOR) {
+         /* the semantic index selects the destination color buffer */
+         const uint cbuf = base->info.output_semantic_index[out];
+
+         assert(sizeof(quad->output.color[cbuf]) ==
+                sizeof(machine->Outputs[out]));
+
+         /* copy float[4][4] result */
+         memcpy(quad->output.color[cbuf],
+                &machine->Outputs[out],
+                sizeof(quad->output.color[0]) );
+      }
+      else if (semantic == TGSI_SEMANTIC_POSITION) {
+         /* only the Z channel of the position output is used, as depth */
+         uint j;
+         for (j = 0; j < 4; j++) {
+            quad->output.depth[j] = machine->Outputs[out].xyzw[2].f[j];
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Destroy an interpreter fragment shader: releases the token copy held by
+ * the shader and then the shader object itself.
+ */
+static void
+exec_delete( struct sp_fragment_shader *base )
+{
+   /* cast only drops the const qualifier so the pointer can be freed */
+   void *tokens = (void *) base->shader.tokens;
+
+   FREE(tokens);
+   FREE(base);
+}
+
+
+/**
+ * Create a fragment shader that is executed by the TGSI interpreter.
+ *
+ * \param softpipe  the softpipe context (not used here)
+ * \param templ     shader template; its tokens are duplicated so the
+ *                  caller keeps ownership of templ->tokens
+ * \return the new shader, or NULL if either allocation fails
+ */
+struct sp_fragment_shader *
+softpipe_create_fs_exec(struct softpipe_context *softpipe,
+                        const struct pipe_shader_state *templ)
+{
+   struct sp_exec_fragment_shader *shader;
+
+   shader = CALLOC_STRUCT(sp_exec_fragment_shader);
+   if (!shader)
+      return NULL;
+
+   /* we need to keep a local copy of the tokens */
+   shader->base.shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!shader->base.shader.tokens) {
+      /* don't hand out a shader with nothing to bind to the interpreter */
+      FREE(shader);
+      return NULL;
+   }
+
+   shader->base.prepare = exec_prepare;
+   shader->base.run = exec_run;
+   shader->base.delete = exec_delete;
+
+   return &shader->base;
+}
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
new file mode 100644
index 0000000000..daa158df7c
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -0,0 +1,243 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Execute fragment shader using runtime SSE code generation.
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_fs.h"
+#include "sp_quad.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_sse2.h"
+
+
+#if defined(PIPE_ARCH_X86)
+
+#include "rtasm/rtasm_x86sse.h"
+
+
+
+/**
+ * Subclass of sp_fragment_shader for shaders run as generated SSE2 code.
+ *
+ * 'base' has to stay the first member: the create function hands out
+ * &shader->base and the cast wrapper converts that pointer back.
+ */
+struct sp_sse_fragment_shader
+{
+   struct sp_fragment_shader base;
+   /* code buffer: set up with x86_init_func(), filled by tgsi_emit_sse2(),
+    * released with x86_release_func() */
+   struct x86_function sse2_program;
+   /* entry point obtained from x86_get_func() on sse2_program */
+   tgsi_sse2_fs_function func;
+   /* filled by tgsi_emit_sse2() and passed to func on every run */
+   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];
+};
+
+
+/** Downcast from the generic fragment shader to the SSE subclass. */
+static INLINE struct sp_sse_fragment_shader *
+sp_sse_fragment_shader(const struct sp_fragment_shader *base)
+{
+   struct sp_sse_fragment_shader *sse_shader =
+      (struct sp_sse_fragment_shader *) base;
+
+   return sse_shader;
+}
+
+
+/**
+ * Make the machine ready to run this shader: only the sampler array
+ * needs to be installed.
+ *
+ * \param base      the shader (not used)
+ * \param machine   the TGSI machine state used by the generated code
+ * \param samplers  sampler array to install in the machine
+ */
+static void
+fs_sse_prepare( const struct sp_fragment_shader *base,
+                struct tgsi_exec_machine *machine,
+                struct tgsi_sampler **samplers )
+{
+   (void) base;
+
+   machine->Samplers = samplers;
+}
+
+
+
+/**
+ * Compute the X, Y, Z, W position of the four fragments in a quad.
+ *
+ * Slot 0 is fragment (x, y), slot 1 is (x+1, y), slot 2 is (x, y+1) and
+ * slot 3 is (x+1, y+1).  Z and W come from channels 2 and 3 of the
+ * interpolation coefficients.
+ *
+ * This should really be part of the compiled shader.
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
+{
+   const float right = x + 1;
+   const float below = y + 1;
+   uint chan = 2;
+
+   /* do X */
+   quadpos->xyzw[0].f[0] = x;
+   quadpos->xyzw[0].f[1] = right;
+   quadpos->xyzw[0].f[2] = x;
+   quadpos->xyzw[0].f[3] = right;
+
+   /* do Y */
+   quadpos->xyzw[1].f[0] = y;
+   quadpos->xyzw[1].f[1] = y;
+   quadpos->xyzw[1].f[2] = below;
+   quadpos->xyzw[1].f[3] = below;
+
+   /* do Z and W: value at fragment 0 plus per-fragment derivative steps */
+   while (chan < 4) {
+      const float step_x = coef->dadx[chan];
+      const float step_y = coef->dady[chan];
+      const float start = coef->a0[chan] + step_x * x + step_y * y;
+
+      quadpos->xyzw[chan].f[0] = start;
+      quadpos->xyzw[chan].f[1] = start + step_x;
+      quadpos->xyzw[chan].f[2] = start + step_y;
+      quadpos->xyzw[chan].f[3] = start + step_x + step_y;
+
+      chan++;
+   }
+}
+
+
+/**
+ * Run the generated SSE2 program on one quad.
+ *
+ * \param base     the shader (an sp_sse_fragment_shader)
+ * \param machine  TGSI machine state used as the generated code's workspace
+ * \param quad     the quad; its mask is updated and, if any fragment is
+ *                 left, its color/depth outputs are filled in
+ * \return FALSE if no fragment of the quad is left, TRUE otherwise
+ *
+ * TODO: codegenerate the whole run function, skip this wrapper.
+ * TODO: break dependency on tgsi_exec_machine struct
+ * TODO: push Position calculation into the generated shader
+ * TODO: process >1 quad at a time
+ */
+static unsigned
+fs_sse_run( const struct sp_fragment_shader *base,
+            struct tgsi_exec_machine *machine,
+            struct quad_header *quad )
+{
+   struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
+
+   /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
+   setup_pos_vector(quad->posCoef,
+                    (float)quad->input.x0, (float)quad->input.y0,
+                    machine->Temps);
+
+   /* init kill mask */
+   tgsi_set_kill_mask(machine, 0x0);
+   tgsi_set_exec_mask(machine, 1, 1, 1, 1);
+
+   shader->func( machine,
+                 (const float (*)[4])machine->Consts[0],
+                 (const float (*)[4])shader->immediates,
+                 machine->InterpCoefs
+                 /*, &machine->QuadPos*/
+      );
+
+   /* clear the mask bits of the fragments recorded in the kill mask temp */
+   quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+
+               assert(sizeof(quad->output.color[cbuf]) ==
+                      sizeof(machine->Outputs[i]));
+
+               /* copy float[4][4] result */
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               /* Depth is the Z channel of the output that carries the
+                * POSITION semantic, i.e. output i.  It is not necessarily
+                * output 0.
+                */
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Destroy an SSE fragment shader: releases the generated program and then
+ * the shader object itself.
+ */
+static void
+fs_sse_delete( struct sp_fragment_shader *base )
+{
+   struct sp_sse_fragment_shader *sse_shader =
+      (struct sp_sse_fragment_shader *) base;
+
+   x86_release_func( &sse_shader->sse2_program );
+   FREE(sse_shader);
+}
+
+
+/**
+ * Create a fragment shader that runs as generated SSE2 code.
+ *
+ * \param softpipe  the softpipe context; its use_sse flag gates this variant
+ * \param templ     shader template whose tokens are translated
+ * \return the new shader, or NULL if SSE is disabled, allocation fails,
+ *         the tokens cannot be translated or no entry point is available
+ */
+struct sp_fragment_shader *
+softpipe_create_fs_sse(struct softpipe_context *softpipe,
+                       const struct pipe_shader_state *templ)
+{
+   struct sp_sse_fragment_shader *shader;
+
+   if (!softpipe->use_sse)
+      return NULL;
+
+   shader = CALLOC_STRUCT(sp_sse_fragment_shader);
+   if (!shader)
+      return NULL;
+
+   x86_init_func( &shader->sse2_program );
+
+   if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program,
+                        shader->immediates, FALSE )) {
+      /* Pair the x86_init_func() above with a release, as on the other
+       * failure path, so a partially emitted program is not left behind.
+       */
+      x86_release_func( &shader->sse2_program );
+      FREE(shader);
+      return NULL;
+   }
+
+   shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program );
+   if (!shader->func) {
+      x86_release_func( &shader->sse2_program );
+      FREE(shader);
+      return NULL;
+   }
+
+   shader->base.shader.tokens = NULL; /* don't hold reference to templ->tokens */
+   shader->base.prepare = fs_sse_prepare;
+   shader->base.run = fs_sse_run;
+   shader->base.delete = fs_sse_delete;
+
+   return &shader->base;
+}
+
+
+#else
+
+/* Variant for builds without PIPE_ARCH_X86: there is no SSE code
+ * generator, so no shader is ever created.
+ * Maybe put this variant in the header file.
+ */
+struct sp_fragment_shader *
+softpipe_create_fs_sse(struct softpipe_context *softpipe,
+                       const struct pipe_shader_state *templ)
+{
+   (void) softpipe;
+   (void) templ;
+
+   return NULL;
+}
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
new file mode 100644
index 0000000000..c60249dbfb
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -0,0 +1,602 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Interface between 'draw' module's output and the softpipe rasterizer/setup
+ * code.  When the 'draw' module has finished filling a vertex buffer, the
+ * draw_arrays() functions below will be called.  Loop over the vertices and
+ * call the point/line/tri setup functions.
+ *
+ * Authors
+ *  Brian Paul
+ */
+
+
+#include "sp_context.h"
+#include "sp_setup.h"
+#include "sp_state.h"
+#include "sp_prim_vbuf.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
+#include "util/u_prim.h"
+
+
+#define SP_MAX_VBUF_INDEXES 1024
+#define SP_MAX_VBUF_SIZE    4096
+
+typedef const float (*cptrf4)[4];
+
+/**
+ * Subclass of vbuf_render.
+ *
+ * 'base' has to stay the first member: the cast wrapper converts a
+ * vbuf_render pointer straight into a pointer to this struct.
+ */
+struct softpipe_vbuf_render
+{
+   struct vbuf_render base;
+   struct softpipe_context *softpipe;  /* source of vertex layout and rasterizer state */
+   struct setup_context *setup;        /* passed to the sp_setup_* calls */
+
+   uint prim;                /* primitive type from sp_vbuf_set_primitive() */
+   uint vertex_size;         /* from the last sp_vbuf_allocate_vertices() call */
+   uint nr_vertices;         /* from the last sp_vbuf_allocate_vertices() call */
+   uint vertex_buffer_size;  /* size that vertex_buffer was allocated with */
+   void *vertex_buffer;      /* align_malloc()ed storage, reused between calls */
+};
+
+
+/** Downcast from the generic vbuf_render to the softpipe subclass. */
+static struct softpipe_vbuf_render *
+softpipe_vbuf_render(struct vbuf_render *vbr)
+{
+   struct softpipe_vbuf_render *render = (struct softpipe_vbuf_render *) vbr;
+
+   return render;
+}
+
+
+/**
+ * Report the vertex layout softpipe wants from the draw module, as
+ * provided by the owning softpipe context.
+ */
+static const struct vertex_info *
+sp_vbuf_get_vertex_info(struct vbuf_render *vbr)
+{
+   return softpipe_get_vbuf_vertex_info(softpipe_vbuf_render(vbr)->softpipe);
+}
+
+
+/**
+ * Make sure the vertex buffer can hold nr_vertices vertices of
+ * vertex_size each.  An existing allocation is reused when it is already
+ * large enough.
+ *
+ * \param vbr          the renderer
+ * \param vertex_size  size of one vertex
+ * \param nr_vertices  number of vertices wanted
+ * \return TRUE if a vertex buffer is available, FALSE otherwise
+ */
+static boolean
+sp_vbuf_allocate_vertices(struct vbuf_render *vbr,
+                          ushort vertex_size, ushort nr_vertices)
+{
+   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
+   /* Widen before multiplying: two ushorts would be promoted to signed
+    * int, and their product can exceed INT_MAX.
+    */
+   const unsigned size = (unsigned) vertex_size * (unsigned) nr_vertices;
+
+   if (cvbr->vertex_buffer_size < size) {
+      align_free(cvbr->vertex_buffer);
+      cvbr->vertex_buffer = align_malloc(size, 16);
+      /* After a failed allocation there is no buffer, so record size 0.
+       * Otherwise a later request of this size or smaller would skip the
+       * allocation and keep failing.
+       */
+      cvbr->vertex_buffer_size = cvbr->vertex_buffer ? size : 0;
+   }
+
+   cvbr->vertex_size = vertex_size;
+   cvbr->nr_vertices = nr_vertices;
+
+   return cvbr->vertex_buffer != NULL;
+}
+
+
+/**
+ * Counterpart of sp_vbuf_allocate_vertices().  Deliberately does nothing.
+ */
+static void
+sp_vbuf_release_vertices(struct vbuf_render *vbr)
+{
+   /* keep the old allocation for next time */
+   (void) vbr;
+}
+
+
+/**
+ * Return the address of the renderer's vertex buffer.
+ */
+static void *
+sp_vbuf_map_vertices(struct vbuf_render *vbr)
+{
+   return softpipe_vbuf_render(vbr)->vertex_buffer;
+}
+
+
+/**
+ * Counterpart of sp_vbuf_map_vertices().  Nothing has to be done; debug
+ * builds only check that the highest index fits in the buffer.
+ *
+ * \param vbr        the renderer
+ * \param min_index  lowest vertex index written (not used)
+ * \param max_index  highest vertex index written
+ */
+static void
+sp_vbuf_unmap_vertices(struct vbuf_render *vbr,
+                       ushort min_index,
+                       ushort max_index )
+{
+   struct softpipe_vbuf_render *render = softpipe_vbuf_render(vbr);
+
+   /* vertices 0..max_index must fit in the allocation */
+   assert( (max_index+1) * render->vertex_size <= render->vertex_buffer_size );
+
+   /* 'render' is otherwise unused when asserts are compiled out */
+   (void) render;
+}
+
+
+/**
+ * Select the primitive type for the following draw calls.
+ *
+ * Prepares the setup context, remembers the primitive type and stores its
+ * reduced form in the softpipe context.  Always returns TRUE.
+ */
+static boolean
+sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
+{
+   struct softpipe_vbuf_render *render = softpipe_vbuf_render(vbr);
+
+   sp_setup_prepare( render->setup );
+
+   render->prim = prim;
+   render->softpipe->reduced_prim = u_reduced_prim(prim);
+
+   return TRUE;
+}
+
+
+/**
+ * Return the address of vertex 'index' in a buffer whose vertices are
+ * 'stride' bytes apart, typed as a pointer to float[4] attributes.
+ */
+static INLINE cptrf4 get_vert( const void *vertex_buffer,
+                               int index,
+                               int stride )
+{
+   char *first_byte = (char *) vertex_buffer;
+
+   return (cptrf4) (first_byte + index * stride);
+}
+
+
+/**
+ * draw elements / indexed primitives
+ *
+ * Walks the index list according to the current primitive type (set by
+ * sp_vbuf_set_primitive()) and feeds each resulting point, line or
+ * triangle to the setup code.  Quads, quad strips and polygons are
+ * emitted as triangles.
+ *
+ * Each loop starts 'i' at the position of the last vertex of the first
+ * primitive and addresses the earlier vertices as i-1, i-2, ..., so an
+ * incomplete trailing primitive is never emitted.
+ *
+ * \param vbr      the renderer
+ * \param indices  vertex indices into the renderer's vertex buffer
+ * \param nr       number of entries in 'indices'
+ */
+static void
+sp_vbuf_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
+{
+   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
+   struct softpipe_context *softpipe = cvbr->softpipe;
+   /* byte distance between vertices; vertex_info_vbuf.size is scaled by
+    * sizeof(float) here */
+   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup = cvbr->setup;
+   /* selects which vertex order is used for strips, fans, quads, polygons */
+   const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
+   unsigned i;
+
+   switch (cvbr->prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < nr; i++) {
+         sp_setup_point( setup,
+                         get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      /* independent segments: one line per pair of indices */
+      for (i = 1; i < nr; i += 2) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, indices[i-1], stride),
+                        get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      /* connected segments: each index joins to the previous one */
+      for (i = 1; i < nr; i ++) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, indices[i-1], stride),
+                        get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      for (i = 1; i < nr; i ++) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, indices[i-1], stride),
+                        get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      /* close the loop: last vertex back to the first */
+      if (nr) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, indices[nr-1], stride),
+                        get_vert(vertex_buffer, indices[0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      /* independent triangles: one per triple of indices */
+      for (i = 2; i < nr; i += 3) {
+         sp_setup_tri( setup,
+                       get_vert(vertex_buffer, indices[i-2], stride),
+                       get_vert(vertex_buffer, indices[i-1], stride),
+                       get_vert(vertex_buffer, indices[i-0], stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      /* (i&1) swaps two of the three vertices on odd i -- presumably to
+       * keep the winding consistent along the strip.  Which two are
+       * swapped depends on flatshade_first.
+       */
+      if (flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            /* emit first triangle vertex as first triangle vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
+                          get_vert(vertex_buffer, indices[i-(i&1)], stride) );
+
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            /* emit last triangle vertex as last triangle vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
+                          get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      /* every triangle shares indices[0] */
+      if (flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            /* emit first non-spoke vertex as first vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[0], stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            /* emit last non-spoke vertex as last vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[0], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      /* each group of four indices becomes two triangles */
+      /* GL quads don't follow provoking vertex convention */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[i-3], stride),
+                          get_vert(vertex_buffer, indices[i-2], stride) );
+
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-3], stride),
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      /* advancing by two indices adds one quad, emitted as two triangles */
+      /* GL quad strips don't follow provoking vertex convention */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[i-3], stride),
+                          get_vert(vertex_buffer, indices[i-2], stride) );
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-3], stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-3], stride),
+                          get_vert(vertex_buffer, indices[i-2], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-3], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.
+       */
+      if (flatshade_first) { 
+         /* emit first polygon  vertex as first triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[0], stride),
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride) );
+         }
+      }
+      else {
+         /* emit first polygon  vertex as last triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, indices[i-1], stride),
+                          get_vert(vertex_buffer, indices[i-0], stride),
+                          get_vert(vertex_buffer, indices[0], stride) );
+         }
+      }
+      break;
+
+   default:
+      /* unknown primitive type */
+      assert(0);
+   }
+}
+
+
+/**
+ * This function is hit when the draw module is working in pass-through mode.
+ * It's up to us to convert the vertex array into point/line/tri prims.
+ *
+ * The primitive type is taken from cvbr->prim and every resulting point,
+ * line or triangle is handed to sp_setup_point/line/tri on cvbr->setup.
+ * Trailing vertices that do not complete a primitive are ignored by the
+ * loop bounds below.
+ *
+ * \param vbr    the vbuf_render base of a softpipe_vbuf_render
+ * \param start  index of the first vertex to draw in cvbr->vertex_buffer
+ * \param nr     number of consecutive vertices to draw
+ */
+static void
+sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
+{
+   struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
+   struct softpipe_context *softpipe = cvbr->softpipe;
+   struct setup_context *setup = cvbr->setup;
+   /* bytes per vertex; vertex_info_vbuf.size is presumably counted in
+    * floats since it is scaled by sizeof(float) -- TODO confirm
+    */
+   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   /* Rebase the buffer at 'start' so all indices below are relative to it.
+    * get_vert() is defined earlier in this file; presumably it returns the
+    * address of the vertex with the given index -- TODO confirm.
+    */
+   const void *vertex_buffer =
+      (void *) get_vert(cvbr->vertex_buffer, start, stride);
+   /* selects which vertex position carries the flat-shading attributes */
+   const boolean flatshade_first = softpipe->rasterizer->flatshade_first;
+   unsigned i;
+
+   switch (cvbr->prim) {
+   case PIPE_PRIM_POINTS:
+      /* one point per vertex */
+      for (i = 0; i < nr; i++) {
+         sp_setup_point( setup,
+                         get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      /* independent segments: vertices (0,1), (2,3), ... */
+      for (i = 1; i < nr; i += 2) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, i-1, stride),
+                        get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      /* connected segments: vertices (0,1), (1,2), ... */
+      for (i = 1; i < nr; i ++) {
+         sp_setup_line( setup,
+                     get_vert(vertex_buffer, i-1, stride),
+                     get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      /* same segments as a line strip ... */
+      for (i = 1; i < nr; i ++) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, i-1, stride),
+                        get_vert(vertex_buffer, i-0, stride) );
+      }
+      /* ... plus the closing segment from the last vertex back to the
+       * first.  Note this is emitted for any non-zero nr, so nr == 1
+       * yields a segment from vertex 0 to itself.
+       */
+      if (nr) {
+         sp_setup_line( setup,
+                        get_vert(vertex_buffer, nr-1, stride),
+                        get_vert(vertex_buffer, 0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      /* independent triangles: vertices (0,1,2), (3,4,5), ... */
+      for (i = 2; i < nr; i += 3) {
+         sp_setup_tri( setup,
+                       get_vert(vertex_buffer, i-2, stride),
+                       get_vert(vertex_buffer, i-1, stride),
+                       get_vert(vertex_buffer, i-0, stride) );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      /* One triangle per vertex from the third on.  For odd i the (i&1)
+       * terms swap two of the three indices:
+       *   flatshade_first:  even i -> (i-2, i-1, i)   odd i -> (i-2, i, i-1)
+       *   otherwise:        even i -> (i-2, i-1, i)   odd i -> (i-1, i-2, i)
+       */
+      if (flatshade_first) {
+         for (i = 2; i < nr; i++) {
+            /* emit first triangle vertex as first triangle vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-2, stride),
+                          get_vert(vertex_buffer, i+(i&1)-1, stride),
+                          get_vert(vertex_buffer, i-(i&1), stride) );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i++) {
+            /* emit last triangle vertex as last triangle vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i+(i&1)-2, stride),
+                          get_vert(vertex_buffer, i-(i&1)-1, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      /* vertex 0 is shared by every triangle; only its position in the
+       * emitted triple differs between the two branches
+       */
+      if (flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            /* emit first non-spoke vertex as first vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, 0, stride)  );
+         }
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            /* emit last non-spoke vertex as last vertex */
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, 0, stride),
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      /* GL quads don't follow provoking vertex convention */
+      /* each group of four vertices (i-3 .. i) becomes two triangles */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, i-3, stride),
+                          get_vert(vertex_buffer, i-2, stride) );
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, i-2, stride),
+                          get_vert(vertex_buffer, i-1, stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-3, stride),
+                          get_vert(vertex_buffer, i-2, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-2, stride),
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      /* GL quad strips don't follow provoking vertex convention */
+      /* The loop advances by two vertices, so consecutive quads share a
+       * vertex pair.  The second triangle uses a different vertex order
+       * than in the PIPE_PRIM_QUADS case above.
+       */
+      if (flatshade_first) { 
+         /* emit last quad vertex as first triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, i-3, stride),
+                          get_vert(vertex_buffer, i-2, stride) );
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-3, stride) );
+         }
+      }
+      else {
+         /* emit last quad vertex as last triangle vertex */
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-3, stride),
+                          get_vert(vertex_buffer, i-2, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-3, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      /* Almost same as tri fan but the _first_ vertex specifies the flat
+       * shading color.
+       */
+      /* Compared with the fan case, the position of vertex 0 within the
+       * emitted triple is reversed in both branches.
+       */
+      if (flatshade_first) { 
+         /* emit first polygon  vertex as first triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, 0, stride),
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride) );
+         }
+      }
+      else {
+         /* emit first polygon  vertex as last triangle vertex */
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup,
+                          get_vert(vertex_buffer, i-1, stride),
+                          get_vert(vertex_buffer, i-0, stride),
+                          get_vert(vertex_buffer, 0, stride) );
+         }
+      }
+      break;
+
+   default:
+      /* unhandled primitive type: traps in debug builds, draws nothing
+       * when asserts are compiled out
+       */
+      assert(0);
+   }
+}
+
+/**
+ * Stream-output bookkeeping callback; installed as the
+ * set_stream_output_info hook by sp_create_vbuf_backend().
+ *
+ * \param vbr         the vbuf_render base of a softpipe_vbuf_render
+ * \param primitives  stored as so_stats.num_primitives_written
+ * \param vertices    added to every buffer's so_count and used to size
+ *                    so_stats.primitives_storage_needed
+ */
+static void
+sp_vbuf_so_info(struct vbuf_render *vbr, uint primitives, uint vertices)
+{
+   struct softpipe_context *sp = softpipe_vbuf_render(vbr)->softpipe;
+   unsigned buf = 0;
+
+   /* every stream-output buffer advances by the same vertex count */
+   while (buf < sp->so_target.num_buffers) {
+      sp->so_target.so_count[buf] += vertices;
+      buf++;
+   }
+
+   sp->so_stats.num_primitives_written = primitives;
+
+   /* 4 bytes per component (float or int32) times 4 components (x,y,z,w) */
+   sp->so_stats.primitives_storage_needed = vertices * 4 * 4;
+}
+
+
+/**
+ * Destroy callback: releases the vertex buffer (if one is attached), the
+ * setup context and finally the softpipe_vbuf_render object itself.
+ */
+static void
+sp_vbuf_destroy(struct vbuf_render *vbr)
+{
+   struct softpipe_vbuf_render *render = softpipe_vbuf_render(vbr);
+
+   /* the vertex buffer is optional; only free it when present */
+   if (render->vertex_buffer) {
+      align_free(render->vertex_buffer);
+   }
+
+   sp_setup_destroy_context(render->setup);
+
+   FREE(render);
+}
+
+
+/**
+ * Create the post-transform vertex handler for the given context.
+ *
+ * Allocates a softpipe_vbuf_render with CALLOC_STRUCT (presumably
+ * zero-filled: sp_vbuf_destroy relies on vertex_buffer starting out
+ * unset), fills in the size limits and the vbuf_render callback table,
+ * and creates the setup context used by the draw callbacks.
+ *
+ * \param sp  the owning softpipe context; sp->draw must already be set
+ * \return pointer to the embedded vbuf_render base, or NULL if the
+ *         object or its setup context could not be allocated
+ */
+struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *sp)
+{
+   struct softpipe_vbuf_render *cvbr = CALLOC_STRUCT(softpipe_vbuf_render);
+
+   assert(sp->draw);
+
+   /* allocation failed: report it instead of dereferencing NULL below */
+   if (!cvbr)
+      return NULL;
+
+   cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
+
+   cvbr->base.get_vertex_info = sp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = sp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = sp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = sp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = sp_vbuf_set_primitive;
+   cvbr->base.draw_elements = sp_vbuf_draw_elements;
+   cvbr->base.draw_arrays = sp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = sp_vbuf_release_vertices;
+   cvbr->base.set_stream_output_info = sp_vbuf_so_info;
+   cvbr->base.destroy = sp_vbuf_destroy;
+
+   cvbr->softpipe = sp;
+
+   cvbr->setup = sp_setup_create_context(cvbr->softpipe);
+   if (!cvbr->setup) {
+      /* Without a setup context the draw callbacks cannot work; undo the
+       * allocation above so nothing leaks.  No vertex buffer has been
+       * attached yet, so freeing the struct is sufficient.
+       */
+      FREE(cvbr);
+      return NULL;
+   }
+
+   return &cvbr->base;
+}
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
new file mode 100644
index 0000000000..ad01cc2f28
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_VBUF_H
+#define SP_VBUF_H
+
+
+struct softpipe_context;
+
+/**
+ * Create the post-transform vertex handler (vbuf_render backend) for the
+ * given softpipe context.  Implemented in sp_prim_vbuf.c.
+ */
+extern struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *softpipe);
+
+
+#endif /* SP_VBUF_H */
diff --git a/src/gallium/drivers/softpipe/sp_public.h b/src/gallium/drivers/softpipe/sp_public.h
new file mode 100644
index 0000000000..62d0903d87
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_public.h
@@ -0,0 +1,10 @@
+#ifndef SP_PUBLIC_H
+#define SP_PUBLIC_H
+
+struct pipe_screen;
+struct sw_winsys;
+
+/**
+ * Public entry point of the softpipe driver: takes a software winsys and
+ * returns a pipe_screen.
+ * NOTE(review): the implementation is not part of this header; ownership
+ * of 'winsys' and the return value on failure should be confirmed there.
+ */
+struct pipe_screen *
+softpipe_create_screen(struct sw_winsys *winsys);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
new file mode 100644
index 0000000000..a3236bd116
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -0,0 +1,106 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_QUAD_H
+#define SP_QUAD_H
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_exec.h"
+
+
+/* Kind of primitive a quad belongs to; stored in
+ * quad_header_input::prim, which is a 2-bit field, so the values must
+ * stay in the range 0..3.
+ */
+#define QUAD_PRIM_POINT 1
+#define QUAD_PRIM_LINE  2
+#define QUAD_PRIM_TRI   3
+
+
+/* The rasterizer generates 2x2 quads of fragments and feeds them to
+ * the current fp_machine.
+ * NOTE(review): nothing named fp_machine is declared in this header, so
+ * the original "(see below)" reference appears to be stale.
+ * Remember that Y=0=top with Y increasing down the window.
+ *
+ * QUAD_* are the indices of the four fragments within a quad.
+ */
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
+/* One bit per fragment, at the bit position given by its QUAD_* index.
+ * MASK_ALL (0xf) is the OR of all four bits.
+ */
+#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
+#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
+#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
+#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
+#define MASK_ALL          0xf
+
+
+/**
+ * Quad stage inputs (pos, coverage, front/back face, etc)
+ *
+ * QUAD_SIZE is not defined in this header; it presumably comes from one
+ * of the headers included above (tgsi/tgsi_exec.h) -- TODO confirm.
+ */
+struct quad_header_input
+{
+   int x0, y0;                /**< quad window pos, always even */
+   float coverage[QUAD_SIZE]; /**< fragment coverage for antialiasing */
+   unsigned facing:1;         /**< Front (0) or back (1) facing? */
+   unsigned prim:2;           /**< QUAD_PRIM_POINT, LINE, TRI */
+};
+
+
+/**
+ * Quad stage inputs/outputs.
+ */
+struct quad_header_inout
+{
+   /* 4 bits wide, matching the four MASK_* fragment bits defined above.
+    * NOTE(review): presumably the set of fragments in the quad that are
+    * still live, updated by the stages -- confirm against the quad stages.
+    */
+   unsigned mask:4;
+};
+
+
+/**
+ * Quad stage outputs (color & depth).
+ */
+struct quad_header_output
+{
+   /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
+   /* indexed [color buffer][channel][fragment within the quad] */
+   float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+   /* one depth value per fragment within the quad */
+   float depth[QUAD_SIZE];
+};
+
+
+/**
+ * Encodes everything we need to know about a 2x2 pixel block.  Uses
+ * "Channel-Serial" or "SoA" layout.  
+ *
+ * Groups the three parts declared above: input, inout and output.
+ */
+struct quad_header {
+   struct quad_header_input input;
+   struct quad_header_inout inout;
+   struct quad_header_output output;
+
+   /* Redundant/duplicated:
+    */
+   /* Read-only pointers to tgsi_interp_coef data, which this struct does
+    * not embed.  NOTE(review): presumably the position and attribute
+    * interpolation coefficients set up by the rasterizer -- confirm.
+    */
+   const struct tgsi_interp_coef *posCoef;
+   const struct tgsi_interp_coef *coef;
+};
+
+#endif /* SP_QUAD_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
new file mode 100644
index 0000000000..00187febf0
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -0,0 +1,1054 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * quad blending
+ * \author Brian Paul
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "sp_context.h"
+#include "sp_quad.h"
+#include "sp_tile_cache.h"
+#include "sp_quad_pipe.h"
+
+
+/*
+ * Component-wise helpers operating on four consecutive elements
+ * (indices 0..3) of their array arguments.
+ *
+ * Every use of a macro parameter is parenthesized so that expression
+ * arguments such as "buf + 4" are indexed as a whole.  Arguments are
+ * still expanded several times, so they must be free of side effects.
+ * The destination may alias a source operand: each element is read
+ * before the same element is written.
+ */
+
+/** DST[i] = SRC[i] */
+#define VEC4_COPY(DST, SRC) \
+do { \
+    (DST)[0] = (SRC)[0]; \
+    (DST)[1] = (SRC)[1]; \
+    (DST)[2] = (SRC)[2]; \
+    (DST)[3] = (SRC)[3]; \
+} while(0)
+
+/** DST[i] = SRC, where SRC is a scalar expression */
+#define VEC4_SCALAR(DST, SRC) \
+do { \
+    (DST)[0] = (SRC); \
+    (DST)[1] = (SRC); \
+    (DST)[2] = (SRC); \
+    (DST)[3] = (SRC); \
+} while(0)
+
+/** R[i] = A[i] + B[i] */
+#define VEC4_ADD(R, A, B) \
+do { \
+   (R)[0] = (A)[0] + (B)[0]; \
+   (R)[1] = (A)[1] + (B)[1]; \
+   (R)[2] = (A)[2] + (B)[2]; \
+   (R)[3] = (A)[3] + (B)[3]; \
+} while (0)
+
+/** R[i] = A[i] - B[i] */
+#define VEC4_SUB(R, A, B) \
+do { \
+   (R)[0] = (A)[0] - (B)[0]; \
+   (R)[1] = (A)[1] - (B)[1]; \
+   (R)[2] = (A)[2] - (B)[2]; \
+   (R)[3] = (A)[3] - (B)[3]; \
+} while (0)
+
+/** Add and limit result to ceiling of 1.0 */
+#define VEC4_ADD_SAT(R, A, B) \
+do { \
+   (R)[0] = (A)[0] + (B)[0];  if ((R)[0] > 1.0f) (R)[0] = 1.0f; \
+   (R)[1] = (A)[1] + (B)[1];  if ((R)[1] > 1.0f) (R)[1] = 1.0f; \
+   (R)[2] = (A)[2] + (B)[2];  if ((R)[2] > 1.0f) (R)[2] = 1.0f; \
+   (R)[3] = (A)[3] + (B)[3];  if ((R)[3] > 1.0f) (R)[3] = 1.0f; \
+} while (0)
+
+/** Subtract and limit result to floor of 0.0 */
+#define VEC4_SUB_SAT(R, A, B) \
+do { \
+   (R)[0] = (A)[0] - (B)[0];  if ((R)[0] < 0.0f) (R)[0] = 0.0f; \
+   (R)[1] = (A)[1] - (B)[1];  if ((R)[1] < 0.0f) (R)[1] = 0.0f; \
+   (R)[2] = (A)[2] - (B)[2];  if ((R)[2] < 0.0f) (R)[2] = 0.0f; \
+   (R)[3] = (A)[3] - (B)[3];  if ((R)[3] < 0.0f) (R)[3] = 0.0f; \
+} while (0)
+
+/** R[i] = A[i] * B[i] */
+#define VEC4_MUL(R, A, B) \
+do { \
+   (R)[0] = (A)[0] * (B)[0]; \
+   (R)[1] = (A)[1] * (B)[1]; \
+   (R)[2] = (A)[2] * (B)[2]; \
+   (R)[3] = (A)[3] * (B)[3]; \
+} while (0)
+
+/** R[i] = the smaller of A[i] and B[i] (B[i] when they compare equal) */
+#define VEC4_MIN(R, A, B) \
+do { \
+   (R)[0] = ((A)[0] < (B)[0]) ? (A)[0] : (B)[0]; \
+   (R)[1] = ((A)[1] < (B)[1]) ? (A)[1] : (B)[1]; \
+   (R)[2] = ((A)[2] < (B)[2]) ? (A)[2] : (B)[2]; \
+   (R)[3] = ((A)[3] < (B)[3]) ? (A)[3] : (B)[3]; \
+} while (0)
+
+/** R[i] = the larger of A[i] and B[i] (B[i] when they compare equal) */
+#define VEC4_MAX(R, A, B) \
+do { \
+   (R)[0] = ((A)[0] > (B)[0]) ? (A)[0] : (B)[0]; \
+   (R)[1] = ((A)[1] > (B)[1]) ? (A)[1] : (B)[1]; \
+   (R)[2] = ((A)[2] > (B)[2]) ? (A)[2] : (B)[2]; \
+   (R)[3] = ((A)[3] > (B)[3]) ? (A)[3] : (B)[3]; \
+} while (0)
+
+
+
+/**
+ * Apply the current logic op to one quad.
+ *
+ * Both colors are converted to one ubyte per value with float_to_ubyte(),
+ * combined bitwise according to softpipe->blend->logicop_func, and the
+ * result is converted back with ubyte_to_float() and stored in quadColor.
+ *
+ * The bytes are combined one at a time.  Bitwise operators act on each
+ * bit independently, so this produces the same bytes as combining four
+ * of them packed in a uint, without reading ubyte arrays through a
+ * uint pointer (which breaks strict-aliasing rules and assumes an
+ * alignment the arrays are not guaranteed to have).
+ *
+ * \param qs         quad stage; only qs->softpipe is read
+ * \param quadColor  in/out: incoming colors indexed [channel R,G,B,A]
+ *                   [pixel 0..3]; overwritten with the result
+ * \param dest       destination colors in the same layout; read only
+ */
+static void
+logicop_quad(struct quad_stage *qs,
+             float (*quadColor)[4],
+             float (*dest)[4])
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   /* flattened [channel][pixel] -> channel * 4 + pixel */
+   ubyte src[16], dst[16], res[16];
+   uint j, p, k;
+
+
+   /* convert to ubyte */
+   for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
+      for (p = 0; p < 4; p++) { /* loop over pixels P0..P3 */
+         dst[j * 4 + p] = float_to_ubyte(dest[j][p]);
+         src[j * 4 + p] = float_to_ubyte(quadColor[j][p]);
+      }
+   }
+
+   switch (softpipe->blend->logicop_func) {
+   case PIPE_LOGICOP_CLEAR:
+      for (k = 0; k < 16; k++)
+         res[k] = 0;
+      break;
+   case PIPE_LOGICOP_NOR:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) ~(src[k] | dst[k]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (~src[k] & dst[k]);
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) ~src[k];
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (src[k] & ~dst[k]);
+      break;
+   case PIPE_LOGICOP_INVERT:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) ~dst[k];
+      break;
+   case PIPE_LOGICOP_XOR:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (dst[k] ^ src[k]);
+      break;
+   case PIPE_LOGICOP_NAND:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) ~(src[k] & dst[k]);
+      break;
+   case PIPE_LOGICOP_AND:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (src[k] & dst[k]);
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) ~(src[k] ^ dst[k]);
+      break;
+   case PIPE_LOGICOP_NOOP:
+      for (k = 0; k < 16; k++)
+         res[k] = dst[k];
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (~src[k] | dst[k]);
+      break;
+   case PIPE_LOGICOP_COPY:
+      for (k = 0; k < 16; k++)
+         res[k] = src[k];
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (src[k] | ~dst[k]);
+      break;
+   case PIPE_LOGICOP_OR:
+      for (k = 0; k < 16; k++)
+         res[k] = (ubyte) (src[k] | dst[k]);
+      break;
+   case PIPE_LOGICOP_SET:
+      for (k = 0; k < 16; k++)
+         res[k] = 0xff;
+      break;
+   default:
+      assert(0);
+      /* With asserts compiled out, res[] would be uninitialized here:
+       * leave quadColor untouched rather than writing garbage back.
+       */
+      return;
+   }
+
+   /* convert the result back to float, in place */
+   for (j = 0; j < 4; j++) {
+      for (p = 0; p < 4; p++) {
+         quadColor[j][p] = ubyte_to_float(res[j * 4 + p]);
+      }
+   }
+}
+
+
+
+static void
+blend_quad(struct quad_stage *qs, 
+           float (*quadColor)[4],
+           float (*dest)[4],
+           unsigned cbuf,
+           boolean has_dst_alpha)
+{
+   static const float zero[4] = { 0, 0, 0, 0 };
+   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
+   float source[4][QUAD_SIZE] = { { 0 } };
+
+   /*
+    * Compute src/first term RGB
+    */
+   switch (softpipe->blend->rt[cbuf].rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[0], quadColor[0]); /* R */
+      VEC4_COPY(source[1], quadColor[1]); /* G */
+      VEC4_COPY(source[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      if (has_dst_alpha) {
+         const float *alpha = dest[3];
+         VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      } 
+      else {
+         VEC4_COPY(source[0], quadColor[0]); /* R */
+         VEC4_COPY(source[1], quadColor[1]); /* G */
+         VEC4_COPY(source[2], quadColor[2]); /* B */
+      }
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if (has_dst_alpha) {
+         const float *alpha = quadColor[3];
+         float diff[4], temp[4];
+         VEC4_SUB(diff, one, dest[3]);
+         VEC4_MIN(temp, alpha, diff);
+         VEC4_MUL(source[0], quadColor[0], temp); /* R */
+         VEC4_MUL(source[1], quadColor[1], temp); /* G */
+         VEC4_MUL(source[2], quadColor[2], temp); /* B */
+      }
+      else {
+         VEC4_COPY(source[0], zero); /* R */
+         VEC4_COPY(source[1], zero); /* G */
+         VEC4_COPY(source[2], zero); /* B */
+      }
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float alpha[4];
+      VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[0], zero); /* R */
+      VEC4_COPY(source[1], zero); /* G */
+      VEC4_COPY(source[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if (has_dst_alpha) {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, dest[3]);
+         VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      }
+      else {
+         VEC4_COPY(source[0], zero); /* R */
+         VEC4_COPY(source[1], zero); /* G */
+         VEC4_COPY(source[2], zero); /* B */
+      }
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(source[0], quadColor[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(source[1], quadColor[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(source[2], quadColor[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute src/first term A
+    */
+   switch (softpipe->blend->rt[cbuf].alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      if (has_dst_alpha)
+         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
+      else
+         VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* multiply alpha by 1.0 */
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(source[3], quadColor[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if (has_dst_alpha) {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, dest[3]);
+         VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+      }
+      else {
+         VEC4_COPY(source[3], zero); /* A */
+      }
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      /* A */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
+   }
+
+
+   /*
+    * Compute dest/second term RGB
+    */
+   switch (softpipe->blend->rt[cbuf].rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      if (has_dst_alpha) {
+         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
+         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
+         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+      }
+      else {
+         /* dest = dest * 1   NO-OP, leave dest as-is */
+      }
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if (has_dst_alpha) {
+         const float *alpha = quadColor[3];
+         float diff[4], temp[4];
+         VEC4_SUB(diff, one, dest[3]);
+         VEC4_MIN(temp, alpha, diff);
+         VEC4_MUL(dest[0], quadColor[0], temp); /* R */
+         VEC4_MUL(dest[1], quadColor[1], temp); /* G */
+         VEC4_MUL(dest[2], quadColor[2], temp); /* B */
+      }
+      else {
+         VEC4_COPY(dest[0], zero); /* R */
+         VEC4_COPY(dest[1], zero); /* G */
+         VEC4_COPY(dest[2], zero); /* B */
+      }
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[0], zero); /* R */
+      VEC4_COPY(dest[1], zero); /* G */
+      VEC4_COPY(dest[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      /* XXX what are these? */
+      assert(0);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if (has_dst_alpha) {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, dest[3]); /* A */
+         VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+         VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+         VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+      }
+      else {
+         VEC4_COPY(dest[0], zero); /* R */
+         VEC4_COPY(dest[1], zero); /* G */
+         VEC4_COPY(dest[2], zero); /* B */
+      }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* XXX what are these? */
+      assert(0);
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute dest/second term A
+    */
+   switch (softpipe->blend->rt[cbuf].alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      if (has_dst_alpha) {
+         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+      }
+      else {
+         /* dest = dest * 1   NO-OP, leave dest as-is */
+      }
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[3], dest[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if (has_dst_alpha) {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, dest[3]); /* A */
+         VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
+      }
+      else {
+         VEC4_COPY(dest[3], zero); /* A */
+      }
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[3], dest[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine RGB terms
+    */
+   switch (softpipe->blend->rt[cbuf].rgb_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine A terms
+    */
+   switch (softpipe->blend->rt[cbuf].alpha_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
+
+static void
+colormask_quad(unsigned colormask,
+               float (*quadColor)[4],
+               float (*dest)[4])
+{
+   /* For each channel whose PIPE_MASK_x bit is clear in colormask,
+    * apply COPY_4V(quadColor[chan], dest[chan]) to that channel's row.
+    */
+   if ((colormask & PIPE_MASK_R) == 0) {
+      COPY_4V(quadColor[0], dest[0]); /* R */
+   }
+   if ((colormask & PIPE_MASK_G) == 0) {
+      COPY_4V(quadColor[1], dest[1]); /* G */
+   }
+   if ((colormask & PIPE_MASK_B) == 0) {
+      COPY_4V(quadColor[2], dest[2]); /* B */
+   }
+   if ((colormask & PIPE_MASK_A) == 0) {
+      COPY_4V(quadColor[3], dest[3]); /* A */
+   }
+}
+
+/** General path: any logicop/blend/colormask state, any number of cbufs. */
+static void
+blend_fallback(struct quad_stage *qs, 
+               struct quad_header *quads[],
+               unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   const struct pipe_blend_state *blend = softpipe->blend;
+   unsigned cbuf;
+   /* quads[0] is read for the tile lookup below, so nr is assumed >= 1 */
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
+   {
+      /* which blend/mask state index to use: */
+      const uint blend_buf = blend->independent_blend_enable ? cbuf : 0;
+      float dest[4][QUAD_SIZE];
+      struct softpipe_cached_tile *tile
+         = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      boolean has_dst_alpha
+         = util_format_has_alpha(softpipe->framebuffer.cbufs[cbuf]->format);
+      uint q, i, j;
+
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+         const int itx = (quad->input.x0 & (TILE_SIZE-1));
+         const int ity = (quad->input.y0 & (TILE_SIZE-1));
+
+         /* get/swizzle dest colors 
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
+         }
+         /* Per-target state must be read from rt[blend_buf]: rt[cbuf] is
+          * only meaningful when independent_blend_enable is set. */
+         if (blend->logicop_enable) {
+            logicop_quad( qs, quadColor, dest );
+         }
+         else if (blend->rt[blend_buf].blend_enable) {
+            blend_quad( qs, quadColor, dest, blend_buf, has_dst_alpha );
+         }
+
+         if (blend->rt[blend_buf].colormask != 0xf)
+            colormask_quad( blend->rt[blend_buf].colormask, quadColor, dest);
+   
+         /* Output color values
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            if (quad->inout.mask & (1 << j)) {
+               int x = itx + (j & 1);
+               int y = ity + (j >> 1);
+               for (i = 0; i < 4; i++) { /* loop over color chans */
+                  tile->data.color[y][x][i] = quadColor[i][j];
+               }
+            }
+         }
+      }
+   }
+}
+
+/** Fast path, one colour buffer: ADD blend, SRC_ALPHA / INV_SRC_ALPHA. */
+static void
+blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs, 
+                                         struct quad_header *quads[],
+                                         unsigned nr)
+{
+   static const float one[4] = { 1, 1, 1, 1 };
+   float one_minus_alpha[QUAD_SIZE];
+   float dest[4][QUAD_SIZE];
+   float source[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const float *alpha = quadColor[3];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      /* get/swizzle dest colors */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
+         for (i = 0; i < 4; i++) {
+            dest[i][j] = tile->data.color[y][x][i];
+         }
+      }
+      /* source term: each quad channel combined with the quad's alpha row */
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+      /* dest term: each dest channel combined with one_minus_alpha */
+      VEC4_SUB(one_minus_alpha, one, alpha);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+      /* combine both terms with VEC4_ADD_SAT; result lands in quadColor */
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      /* write back only the pixels whose bit is set in the quad mask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
+         }
+      }
+   }
+}
+/** Fast path, one colour buffer: ADD blend with factors ONE / ONE. */
+static void
+blend_single_add_one_one(struct quad_stage *qs, 
+                         struct quad_header *quads[],
+                         unsigned nr)
+{
+   float dest[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      /* get/swizzle dest colors */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
+         for (i = 0; i < 4; i++) {
+            dest[i][j] = tile->data.color[y][x][i];
+         }
+      }
+      /* quad colour and dest colour combined in place via VEC4_ADD_SAT */
+      VEC4_ADD_SAT(quadColor[0], quadColor[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], quadColor[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], quadColor[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], quadColor[3], dest[3]); /* A */
+      /* write back only the pixels whose bit is set in the quad mask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
+         }
+      }
+   }
+}
+
+/** Fast path, one colour buffer, blending off: store quad colours as-is. */
+static void
+single_output_color(struct quad_stage *qs, 
+                    struct quad_header *quads[],
+                    unsigned nr)
+{
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+   uint chan, pix, n;
+
+   for (n = 0; n < nr; n++) {
+      struct quad_header *quad = quads[n];
+      float (*quadColor)[4] = quad->output.color[0];
+      /* quad origin expressed in tile-local coordinates */
+      const int tile_x = (quad->input.x0 & (TILE_SIZE-1));
+      const int tile_y = (quad->input.y0 & (TILE_SIZE-1));
+
+      for (pix = 0; pix < QUAD_SIZE; pix++) {
+         int x, y;
+         if (!(quad->inout.mask & (1 << pix)))
+            continue;   /* mask bit clear: leave the tile untouched */
+         x = tile_x + (pix & 1);
+         y = tile_y + (pix >> 1);
+         for (chan = 0; chan < 4; chan++)
+            tile->data.color[y][x][chan] = quadColor[chan][pix];
+      }
+   }
+}
+/** Installed when there are no colour buffers: intentionally does nothing. */
+static void
+blend_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+}
+
+/** Pick the blend routine matching current state, install it, and run it. */
+static void
+choose_blend_quad(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   struct softpipe_context *sp = qs->softpipe;
+   const struct pipe_blend_state *blend = sp->blend;
+
+   /* anything not recognised below goes through the general path */
+   qs->run = blend_fallback;
+
+   if (sp->framebuffer.nr_cbufs == 0) {
+      qs->run = blend_noop;
+   }
+   else if (sp->framebuffer.nr_cbufs == 1 &&
+            blend->rt[0].colormask == 0xf &&
+            !blend->logicop_enable) {
+      /* one colour buffer, all channels written, no logic op */
+      if (!blend->rt[0].blend_enable) {
+         qs->run = single_output_color;
+      }
+      else if (blend->rt[0].alpha_func == PIPE_BLEND_ADD &&
+               blend->rt[0].rgb_func == blend->rt[0].alpha_func &&
+               blend->rt[0].rgb_src_factor == blend->rt[0].alpha_src_factor &&
+               blend->rt[0].rgb_dst_factor == blend->rt[0].alpha_dst_factor) {
+         /* RGB and A share one ADD equation: try the specialised cases */
+         if (blend->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
+             blend->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
+            qs->run = blend_single_add_one_one;
+         }
+         else if (blend->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
+                  blend->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA) {
+            qs->run = blend_single_add_src_alpha_inv_src_alpha;
+         }
+      }
+   }
+
+   qs->run(qs, quads, nr);
+}
+
+/** begin() hook: re-arm choose_blend_quad so the next run re-selects a path. */
+static void blend_begin(struct quad_stage *qs)
+{
+   qs->run = choose_blend_quad;
+}
+
+/** destroy() hook: release the stage object with FREE(). */
+static void blend_destroy(struct quad_stage *qs)
+{
+   FREE( qs );
+}
+
+/** Create the blend quad stage for softpipe; returns NULL if out of memory. */
+struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe )
+{
+   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
+   if (!stage)
+      return NULL;   /* allocation failed: do not dereference NULL below */
+   stage->softpipe = softpipe;
+   stage->begin = blend_begin;
+   stage->run = choose_blend_quad;   /* real routine is chosen on first run */
+   stage->destroy = blend_destroy;
+   return stage;
+}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
new file mode 100644
index 0000000000..72117c233e
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -0,0 +1,889 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Quad depth / stencil testing
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_scan.h"
+#include "sp_context.h"
+#include "sp_quad.h"
+#include "sp_quad_pipe.h"
+#include "sp_tile_cache.h"
+#include "sp_state.h"           /* for sp_fragment_shader */
+
+/** Per-quad working state passed between the depth/stencil helpers. */
+struct depth_data {
+   struct pipe_surface *ps;    /**< presumably the Z/stencil surface; not used in the helpers shown here -- TODO confirm */
+   enum pipe_format format;    /**< selects the Z/stencil bit layout in the helpers' switches */
+   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
+   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
+   ubyte stencilVals[QUAD_SIZE]; /**< stencil values fetched from / written to the buffer */
+   struct softpipe_cached_tile *tile; /**< cached tile that is read and written */
+};
+
+
+/** Fetch the quad's Z values (and stencil byte, 24/8 formats) from data->tile. */
+static void
+get_depth_stencil_values( struct depth_data *data,
+                          const struct quad_header *quad )
+{
+   unsigned j;
+   const struct softpipe_cached_tile *tile = data->tile;
+
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth16[y][x];
+      }
+      break; /* stencilVals[] is not written for this format */
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x];
+      }
+      break; /* stencilVals[] is not written for this format */
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] & 0xffffff; /* Z: low 24 bits */
+         data->stencilVals[j] = tile->data.depth32[y][x] >> 24; /* bits 24 and up */
+      }
+   break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] >> 8; /* Z: bits 8 and up */
+         data->stencilVals[j] = tile->data.depth32[y][x] & 0xff; /* low 8 bits */
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+/**
+ * If the shader has not been run, interpolate the depth values ourselves:
+ * evaluate the Z plane (posCoef a0/dadx/dady, component 2) at the quad.
+ */
+static void
+interpolate_quad_depth( struct quad_header *quad )
+{
+   const float fx = (float) quad->input.x0;
+   const float fy = (float) quad->input.y0;
+   const float dzdx = quad->posCoef->dadx[2];
+   const float dzdy = quad->posCoef->dady[2];
+   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+
+   quad->output.depth[0] = z0;               /* pixel (x0,   y0)   */
+   quad->output.depth[1] = z0 + dzdx;        /* pixel (x0+1, y0)   */
+   quad->output.depth[2] = z0 + dzdy;        /* pixel (x0,   y0+1) */
+   quad->output.depth[3] = z0 + dzdx + dzdy; /* pixel (x0+1, y0+1) */
+}
+
+
+/**
+ * Compute the depth_data::qzzzz[] values from the float fragment Z values.
+ */
+static void
+convert_quad_depth( struct depth_data *data, 
+                    const struct quad_header *quad )
+{
+   unsigned j;
+
+   /* Convert quad's float depth values to int depth values (qzzzz).
+    * If the Z buffer stores integer values, we _have_ to do the depth
+    * compares with integers (not floats).  Otherwise, the float->int->float
+    * conversion of Z values (which isn't an identity function) will cause
+    * Z-fighting errors.
+    */
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      {
+         float scale = 65535.0; /* 2^16 - 1 */
+         /* the (unsigned) cast truncates; depth is not clamped here */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+         }
+      }
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      {
+         double scale = (double) (uint) ~0UL; /* largest uint value, as a double */
+
+         for (j = 0; j < QUAD_SIZE; j++) {
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+         }
+      }
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+      {
+         float scale = (float) ((1 << 24) - 1); /* 2^24 - 1 */
+
+         for (j = 0; j < QUAD_SIZE; j++) {
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+         }
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      {
+         float scale = (float) ((1 << 24) - 1); /* same 24-bit scale as above */
+
+         for (j = 0; j < QUAD_SIZE; j++) {
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+/**
+ * Write data->bzzzz[] values and data->stencilVals into the Z/stencil buffer
+ * tile (data->tile) at the quad's position, packed according to data->format.
+ */
+static void
+write_depth_stencil_values( struct depth_data *data,
+                            struct quad_header *quad )
+{
+   struct softpipe_cached_tile *tile = data->tile;
+   unsigned j;
+   /* x, y below: pixel position inside the tile (quad origin % TILE_SIZE) */
+   /* put updated Z values back into cached tile */
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth16[y][x] = (ushort) data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z32_UNORM: /* both: bzzzz[] is stored unmodified */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED: /* stencil at bit 24; shift as unsigned */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = ((unsigned) data->stencilVals[j] << 24) | data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM: /* Z shifted up 8, stencil in low byte */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->bzzzz[j] << 8) | data->stencilVals[j];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM: /* Z shifted up 8, low byte written as zero */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j] << 8;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+/** Largest stencil value: only 8-bit stencil is supported */
+#define STENCIL_MAX 0xff
+
+
+/**
+ * Do the basic stencil test (compare stencil buffer values against the
+ * reference value).  The test is "ref FUNC stencil", both masked by valMask.
+ *
+ * \param data->stencilVals  the stencil values from the stencil buffer
+ * \param func  the stencil func (PIPE_FUNC_x)
+ * \param ref  the stencil reference value
+ * \param valMask  the stencil value mask indicating which bits of the stencil
+ *                 values and ref value are to be used.
+ * \return mask indicating which pixels passed the stencil test
+ */
+static unsigned
+do_stencil_test(struct depth_data *data,
+                unsigned func,
+                unsigned ref, unsigned valMask)
+{
+   unsigned passMask = 0x0;
+   unsigned j;
+
+   ref &= valMask; /* mask ref once; stencil values are masked per pixel */
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      /* passMask = 0x0 */
+      break;
+   case PIPE_FUNC_LESS:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref < (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_EQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref == (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_LEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref <= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GREATER:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref > (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref != (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref >= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_ALWAYS:
+      passMask = MASK_ALL;
+      break;
+   default:
+      assert(0);
+   }
+
+   return passMask;
+}
+
+
+/**
+ * Apply the stencil operator to stencil values.
+ *
+ * \param data->stencilVals  the stencil buffer values (read and written)
+ * \param mask  indicates which pixels to update
+ * \param op  the stencil operator (PIPE_STENCIL_OP_x)
+ * \param ref  the stencil reference value
+ * \param wrtMask  writemask controlling which bits are changed in the
+ *                 stencil values
+ */
+static void
+apply_stencil_op(struct depth_data *data,
+                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
+{
+   unsigned j;
+   ubyte newstencil[QUAD_SIZE];
+   /* start from the current values so unselected pixels stay unchanged */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      newstencil[j] = data->stencilVals[j];
+   }
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* no-op */
+      break;
+   case PIPE_STENCIL_OP_ZERO:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = 0;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ref;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] < STENCIL_MAX) { /* saturate at max */
+               newstencil[j] = data->stencilVals[j] + 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] > 0) { /* saturate at zero */
+               newstencil[j] = data->stencilVals[j] - 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] + 1; /* wraps on store to ubyte */
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] - 1; /* wraps on store to ubyte */
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ~data->stencilVals[j];
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * update the stencil values
+    */
+   if (wrtMask != STENCIL_MAX) {
+      /* apply bit-wise stencil buffer writemask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & data->stencilVals[j]);
+      }
+   }
+   else {
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = newstencil[j];
+      }
+   }
+}
+
+   
+
+/**
+ * Depth-test the quad (qzzzz vs. bzzzz), clearing failed bits in its mask.
+ * To increase efficiency, we should probably have multiple versions of this
+ * function specifically for Z16, Z32 and FP Z buffers (maybe via codegen).
+ */
+static boolean
+depth_test_quad(struct quad_stage *qs, 
+                struct depth_data *data,
+                struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned zmask = 0;
+   unsigned j;
+
+   switch (softpipe->depth_stencil->depth.func) {
+   case PIPE_FUNC_NEVER:
+      /* zmask = 0 */
+      break;
+   case PIPE_FUNC_LESS:
+      /* Note this is pretty much a single sse or cell instruction.  
+       * Like this:  quad->mask &= (quad->outputs.depth < zzzz);
+       */
+      for (j = 0; j < QUAD_SIZE; j++) {
+	 if (data->qzzzz[j] < data->bzzzz[j]) 
+	    zmask |= 1 << j;
+      }
+      break;
+   case PIPE_FUNC_EQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+	 if (data->qzzzz[j] == data->bzzzz[j]) 
+	    zmask |= 1 << j;
+      }
+      break;
+   case PIPE_FUNC_LEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+	 if (data->qzzzz[j] <= data->bzzzz[j]) 
+	    zmask |= (1 << j);
+      }
+      break;
+   case PIPE_FUNC_GREATER:
+      for (j = 0; j < QUAD_SIZE; j++) {
+	 if (data->qzzzz[j] > data->bzzzz[j]) 
+	    zmask |= (1 << j);
+      }
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+	 if (data->qzzzz[j] != data->bzzzz[j]) 
+	    zmask |= (1 << j);
+      }
+      break;
+   case PIPE_FUNC_GEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+	 if (data->qzzzz[j] >= data->bzzzz[j]) 
+	    zmask |= (1 << j);
+      }
+      break;
+   case PIPE_FUNC_ALWAYS:
+      zmask = MASK_ALL;
+      break;
+   default:
+      assert(0);
+   }
+
+   quad->inout.mask &= zmask;
+   if (quad->inout.mask == 0)
+      return FALSE; /* no pixel of the quad is left */
+
+   /* Update our internal copy only if writemask set.  Even if
+    * depth.writemask is FALSE, may still need to write out buffer
+    * data due to stencil changes.
+    */
+   if (softpipe->depth_stencil->depth.writemask) {
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            data->bzzzz[j] = data->qzzzz[j];
+         }
+      }
+   }
+
+   return TRUE; /* at least one pixel passed */
+}
+
+
+
+/**
+ * Do stencil (and depth) testing: updates quad->inout.mask and may update
+ * data->stencilVals / data->bzzzz.  Stenciling depends on the depth result.
+ */
+static void
+depth_stencil_test_quad(struct quad_stage *qs, 
+                        struct depth_data *data,
+                        struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned func, zFailOp, zPassOp, failOp;
+   ubyte ref, wrtMask, valMask;
+   uint face = quad->input.facing;
+
+   if (!softpipe->depth_stencil->stencil[1].enabled) {
+      /* single-sided stencil test, use front (face=0) state */
+      face = 0;
+   }
+
+   /* 0 = front-face, 1 = back-face */
+   assert(face == 0 || face == 1);
+
+   /* choose front or back face function, operator, etc */
+   /* XXX we could do these initializations once per primitive */
+   func    = softpipe->depth_stencil->stencil[face].func;
+   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
+   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
+   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
+   ref     = softpipe->stencil_ref.ref_value[face];
+   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
+   valMask = softpipe->depth_stencil->stencil[face].valuemask;
+
+   /* do the stencil test first */
+   {
+      unsigned passMask, failMask;
+      passMask = do_stencil_test(data, func, ref, valMask);
+      failMask = quad->inout.mask & ~passMask; /* live pixels that failed */
+      quad->inout.mask &= passMask;            /* live pixels that passed */
+
+      if (failOp != PIPE_STENCIL_OP_KEEP) {
+         apply_stencil_op(data, failMask, failOp, ref, wrtMask);
+      }
+   }
+
+   if (quad->inout.mask) {
+      /* now the pixels that passed the stencil test are depth tested */
+      if (softpipe->depth_stencil->depth.enabled) {
+         const unsigned origMask = quad->inout.mask; /* pixels entering Z test */
+
+         depth_test_quad(qs, data, quad);  /* quad->mask is updated */
+
+         /* update stencil buffer values according to z pass/fail result */
+         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zFailMask = origMask & ~quad->inout.mask; /* failed Z */
+            apply_stencil_op(data, zFailMask, zFailOp, ref, wrtMask);
+         }
+
+         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zPassMask = origMask & quad->inout.mask; /* passed Z */
+            apply_stencil_op(data, zPassMask, zPassOp, ref, wrtMask);
+         }
+      }
+      else {
+         /* no depth test, apply Zpass operator to stencil buffer values */
+         apply_stencil_op(data, quad->inout.mask, zPassOp, ref, wrtMask);
+      }
+   }
+}
+
+
+/*
+ * Generate alpha_test_quads_<FUNC>(): compare the alpha channel of colour
+ * output 0 of every quad against the reference value using COMP, clear
+ * the mask bits of the fragments that fail, and compact the quads that
+ * still have live fragments to the front of the array.
+ * The generated function returns the number of surviving quads.
+ */
+#define ALPHATEST( FUNC, COMP )                                         \
+   static int                                                           \
+   alpha_test_quads_##FUNC( struct quad_stage *qs,                      \
+                           struct quad_header *quads[],                 \
+                           unsigned nr )                                \
+   {                                                                    \
+      const float ref = qs->softpipe->depth_stencil->alpha.ref_value;   \
+      const uint cbuf = 0; /* only output[0].alpha is tested */         \
+      unsigned kept = 0;                                                \
+      unsigned q, j;                                                    \
+                                                                        \
+      for (q = 0; q < nr; q++) {                                        \
+         struct quad_header *quad = quads[q];                           \
+         const float *alpha = quad->output.color[cbuf][3];              \
+         unsigned passMask = 0;                                         \
+                                                                        \
+         /* one mask bit per fragment of the quad */                    \
+         for (j = 0; j < 4; j++) {                                      \
+            if (alpha[j] COMP ref)                                      \
+               passMask |= (1 << j);                                    \
+         }                                                              \
+                                                                        \
+         quad->inout.mask &= passMask;                                  \
+                                                                        \
+         if (quad->inout.mask)                                          \
+            quads[kept++] = quad;                                       \
+      }                                                                 \
+                                                                        \
+      return kept;                                                      \
+   }
+
+
+/* one specialised function per comparison operator */
+ALPHATEST( LESS,     < )
+ALPHATEST( EQUAL,    == )
+ALPHATEST( LEQUAL,   <= )
+ALPHATEST( GREATER,  > )
+ALPHATEST( NOTEQUAL, != )
+ALPHATEST( GEQUAL,   >= )
+
+
+/**
+ * Run the alpha test selected by the current alpha.func on \p nr quads.
+ * \return number of quads left at the front of \p quads
+ *
+ * XXX: Incorporate into shader using KILP.
+ */
+static int
+alpha_test_quads(struct quad_stage *qs,
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   int (*test)(struct quad_stage *, struct quad_header *[], unsigned);
+
+   switch (qs->softpipe->depth_stencil->alpha.func) {
+   case PIPE_FUNC_LESS:
+      test = alpha_test_quads_LESS;
+      break;
+   case PIPE_FUNC_EQUAL:
+      test = alpha_test_quads_EQUAL;
+      break;
+   case PIPE_FUNC_LEQUAL:
+      test = alpha_test_quads_LEQUAL;
+      break;
+   case PIPE_FUNC_GREATER:
+      test = alpha_test_quads_GREATER;
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      test = alpha_test_quads_NOTEQUAL;
+      break;
+   case PIPE_FUNC_GEQUAL:
+      test = alpha_test_quads_GEQUAL;
+      break;
+   case PIPE_FUNC_ALWAYS:
+      /* every quad passes untouched */
+      return nr;
+   case PIPE_FUNC_NEVER:
+   default:
+      /* nothing passes */
+      return 0;
+   }
+
+   return test(qs, quads, nr);
+}
+
+
+/**
+ * Number of set bits for every possible 4-bit quad mask, indexed by the
+ * mask value.  Read-only lookup table.
+ */
+static const unsigned mask_count[16] =
+{
+   0,                           /* 0x0 */
+   1,                           /* 0x1 */
+   1,                           /* 0x2 */
+   2,                           /* 0x3 */
+   1,                           /* 0x4 */
+   2,                           /* 0x5 */
+   2,                           /* 0x6 */
+   3,                           /* 0x7 */
+   1,                           /* 0x8 */
+   2,                           /* 0x9 */
+   2,                           /* 0xa */
+   3,                           /* 0xb */
+   2,                           /* 0xc */
+   3,                           /* 0xd */
+   3,                           /* 0xe */
+   4,                           /* 0xf */
+};
+
+
+
+/**
+ * Return the number of Z bits of the bound depth/stencil surface,
+ * or 0 when no such surface is bound.
+ */
+static unsigned
+get_depth_bits(struct quad_stage *qs)
+{
+   const struct pipe_surface *zsurf = qs->softpipe->framebuffer.zsbuf;
+
+   if (!zsurf)
+      return 0;
+
+   return util_format_get_component_bits(zsurf->format,
+                                         UTIL_FORMAT_COLORSPACE_ZS, 0);
+}
+
+
+
+/**
+ * General alpha/depth/stencil test path, used when no specialised
+ * function applies.  Surviving quads are compacted to the front of
+ * \p quads and handed to the next stage.
+ */
+static void
+depth_test_quads_fallback(struct quad_stage *qs,
+                          struct quad_header *quads[],
+                          unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   const struct sp_fragment_shader *fs = softpipe->fs;
+   const boolean interp_depth = !fs->info.writes_z;
+   struct depth_data data;
+   unsigned i;
+
+   if (softpipe->depth_stencil->alpha.enabled)
+      nr = alpha_test_quads(qs, quads, nr);
+
+   if (get_depth_bits(qs) > 0 &&
+       (softpipe->depth_stencil->depth.enabled ||
+        softpipe->depth_stencil->stencil[0].enabled)) {
+      unsigned kept = 0;
+
+      data.ps = softpipe->framebuffer.zsbuf;
+      data.format = data.ps->format;
+      /* the tile is looked up once, from the first quad's position */
+      data.tile = sp_get_cached_tile(softpipe->zsbuf_cache,
+                                     quads[0]->input.x0,
+                                     quads[0]->input.y0);
+
+      for (i = 0; i < nr; i++) {
+         struct quad_header *quad = quads[i];
+
+         get_depth_stencil_values(&data, quad);
+
+         if (softpipe->depth_stencil->depth.enabled) {
+            /* use interpolated Z unless the shader wrote its own */
+            if (interp_depth)
+               interpolate_quad_depth(quad);
+
+            convert_quad_depth(&data, quad);
+         }
+
+         if (softpipe->depth_stencil->stencil[0].enabled) {
+            depth_stencil_test_quad(qs, &data, quad);
+            write_depth_stencil_values(&data, quad);
+         }
+         else {
+            /* depth only: a quad with no passing fragment is dropped */
+            if (!depth_test_quad(qs, &data, quad))
+               continue;
+
+            if (softpipe->depth_stencil->depth.writemask)
+               write_depth_stencil_values(&data, quad);
+         }
+
+         quads[kept++] = quad;
+      }
+
+      nr = kept;
+   }
+
+   /* tally surviving fragments while active_query_count is non-zero */
+   if (softpipe->active_query_count) {
+      for (i = 0; i < nr; i++)
+         softpipe->occlusion_count += mask_count[quads[i]->inout.mask];
+   }
+
+   if (nr)
+      qs->next->run(qs->next, quads, nr);
+}
+
+
+/**
+ * Special-case Z testing for 16-bit Zbuffer and Z buffer writes enabled.
+ *
+ * Each group below instantiates sp_quad_depth_test_tmp.h once: NAME is the
+ * name of the generated function and OPERATOR the comparison between the
+ * incoming and the stored depth (or ALWAYS for an unconditional write).
+ * The template #undef's NAME, OPERATOR and ALWAYS itself, so the macros
+ * can simply be redefined for the next instance.
+ */
+
+#define NAME depth_interp_z16_less_write
+#define OPERATOR <
+#include "sp_quad_depth_test_tmp.h"
+
+#define NAME depth_interp_z16_equal_write
+#define OPERATOR ==
+#include "sp_quad_depth_test_tmp.h"
+
+#define NAME depth_interp_z16_lequal_write
+#define OPERATOR <=
+#include "sp_quad_depth_test_tmp.h"
+
+#define NAME depth_interp_z16_greater_write
+#define OPERATOR >
+#include "sp_quad_depth_test_tmp.h"
+
+#define NAME depth_interp_z16_notequal_write
+#define OPERATOR !=
+#include "sp_quad_depth_test_tmp.h"
+
+#define NAME depth_interp_z16_gequal_write
+#define OPERATOR >=
+#include "sp_quad_depth_test_tmp.h"
+
+/* no comparison at all: every covered fragment passes and is written */
+#define NAME depth_interp_z16_always_write
+#define ALWAYS 1
+#include "sp_quad_depth_test_tmp.h"
+
+
+
+/**
+ * Pass-through run() variant: hands all quads to the next stage
+ * without testing anything.
+ */
+static void
+depth_noop(struct quad_stage *qs,
+           struct quad_header *quads[],
+           unsigned nr)
+{
+   struct quad_stage *next = qs->next;
+
+   next->run(next, quads, nr);
+}
+
+
+
+/**
+ * Dispatcher installed as the stage's run() hook.
+ *
+ * Inspects the current alpha/depth/stencil state, picks the cheapest
+ * implementation able to handle it, stores that in qs->run so that
+ * following calls go straight to it, and then runs it on the given quads.
+ *
+ * \param qs     the depth-test stage
+ * \param quads  quads to process
+ * \param nr     number of entries in \p quads
+ */
+static void
+choose_depth_test(struct quad_stage *qs,
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+
+   const boolean interp_depth = !softpipe->fs->info.writes_z;
+
+   const boolean alpha = softpipe->depth_stencil->alpha.enabled;
+
+   const boolean depth = (get_depth_bits(qs) > 0 &&
+                          softpipe->depth_stencil->depth.enabled);
+
+   const unsigned depthfunc = softpipe->depth_stencil->depth.func;
+
+   const boolean stencil = softpipe->depth_stencil->stencil[0].enabled;
+
+   const boolean depthwrite = softpipe->depth_stencil->depth.writemask;
+
+   /* Compare explicitly: storing the raw counter in a boolean could
+    * truncate a non-zero count to FALSE if boolean is a narrow type.
+    */
+   const boolean occlusion = (softpipe->active_query_count != 0);
+
+   /* default: the general path copes with every state combination */
+   qs->run = depth_test_quads_fallback;
+
+   /* look for special cases */
+   if (!alpha &&
+       !depth &&
+       !stencil) {
+      /* nothing to test at all */
+      qs->run = depth_noop;
+   }
+   else if (!alpha &&
+            interp_depth &&
+            depth &&
+            depthwrite &&
+            !occlusion &&
+            !stencil &&
+            /* 'depth' implies a Z/S surface is bound, so zsbuf is valid */
+            softpipe->framebuffer.zsbuf->format == PIPE_FORMAT_Z16_UNORM)
+   {
+      switch (depthfunc) {
+      case PIPE_FUNC_LESS:
+         qs->run = depth_interp_z16_less_write;
+         break;
+      case PIPE_FUNC_EQUAL:
+         qs->run = depth_interp_z16_equal_write;
+         break;
+      case PIPE_FUNC_LEQUAL:
+         qs->run = depth_interp_z16_lequal_write;
+         break;
+      case PIPE_FUNC_GREATER:
+         qs->run = depth_interp_z16_greater_write;
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         qs->run = depth_interp_z16_notequal_write;
+         break;
+      case PIPE_FUNC_GEQUAL:
+         qs->run = depth_interp_z16_gequal_write;
+         break;
+      case PIPE_FUNC_ALWAYS:
+         qs->run = depth_interp_z16_always_write;
+         break;
+      case PIPE_FUNC_NEVER:
+      default:
+         /* no fast path: keep the fallback selected above */
+         break;
+      }
+   }
+
+   /* run the implementation that was just selected */
+   qs->run( qs, quads, nr );
+}
+
+
+
+/**
+ * begin() hook: re-arm the dispatcher so the test variant is chosen
+ * again on the next run(), then forward begin() to the next stage.
+ */
+static void
+depth_test_begin(struct quad_stage *qs)
+{
+   struct quad_stage *next = qs->next;
+
+   qs->run = choose_depth_test;
+   next->begin(next);
+}
+
+
+/** destroy() hook: release the stage object. */
+static void
+depth_test_destroy(struct quad_stage *qs)
+{
+   FREE(qs);
+}
+
+
+/**
+ * Create the depth/stencil/alpha test quad stage.
+ *
+ * \param softpipe  context the stage is bound to
+ * \return the new stage, or NULL if the allocation failed
+ */
+struct quad_stage *
+sp_quad_depth_test_stage(struct softpipe_context *softpipe)
+{
+   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
+
+   /* don't dereference a failed allocation */
+   if (!stage)
+      return NULL;
+
+   stage->softpipe = softpipe;
+   stage->begin = depth_test_begin;
+   /* the first run() picks the actual test function */
+   stage->run = choose_depth_test;
+   stage->destroy = depth_test_destroy;
+
+   return stage;
+}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test_tmp.h b/src/gallium/drivers/softpipe/sp_quad_depth_test_tmp.h
new file mode 100644
index 0000000000..25af415c25
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test_tmp.h
@@ -0,0 +1,147 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/*
+ * Template for generating Z test functions
+ * Only PIPE_FORMAT_Z16_UNORM supported at this time.
+ *
+ * Before including this file, define:
+ *   NAME      - name of the function to generate
+ *   OPERATOR  - comparison applied as "incoming OPERATOR stored", or
+ *   ALWAYS    - instead of OPERATOR, to write without comparing
+ */
+
+
+#ifndef NAME
+#error "NAME is not defined!"
+#endif
+
+#if !defined(OPERATOR) && !defined(ALWAYS)
+#error "neither OPERATOR nor ALWAYS is defined!"
+#endif
+
+
+/*
+ * Generated Z16 depth test with depth writes.
+ *
+ * Depth is interpolated from the position coefficients of quads[0] and
+ * stepped in X for the other quads.  Every covered fragment that passes
+ * the comparison has its depth written to the cached tile; the quad
+ * masks are reduced to the passing fragments and quads with live
+ * fragments are forwarded to the next stage.
+ *
+ * NOTE: there's no guarantee that the quads are sequentially side by
+ * side.  The fragment shader may have culled some quads, etc.  Sliver
+ * triangles may generate non-sequential quads.
+ *
+ * NOTE(review): only the X offset of each quad relative to quads[0] is
+ * used, so all quads are presumably on the same row and inside the same
+ * tile as quads[0] -- confirm against the caller.
+ */
+static void
+NAME(struct quad_stage *qs,
+     struct quad_header *quads[],
+     unsigned nr)
+{
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort init_idepth[4], idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   /* compute scaled depth of the four pixels in first quad */
+   init_idepth[0] = (ushort)((z0) * scale);
+   init_idepth[1] = (ushort)((z0 + dzdx) * scale);
+   init_idepth[2] = (ushort)((z0 + dzdy) * scale);
+   init_idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   /* NOTE(review): a negative dzdx is out of range for this float to
+    * ushort conversion; the result then relies on wrap-around -- confirm.
+    */
+   depth_step = (ushort)(dzdx * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   for (i = 0; i < nr; i++) {
+      const unsigned outmask = quads[i]->inout.mask;
+      const int dx = quads[i]->input.x0 - ix;
+      unsigned mask = 0;
+
+      /* compute depth for this quad */
+      idepth[0] = init_idepth[0] + dx * depth_step;
+      idepth[1] = init_idepth[1] + dx * depth_step;
+      idepth[2] = init_idepth[2] + dx * depth_step;
+      idepth[3] = init_idepth[3] + dx * depth_step;
+
+      /* view of the tile starting at this quad's top-left pixel:
+       * depth16[row][col] with row, col in 0..1
+       */
+      depth16 = (ushort (*)[TILE_SIZE])
+         &tile->data.depth16[iy % TILE_SIZE][(ix + dx)% TILE_SIZE];
+
+#ifdef ALWAYS
+      if (outmask & 1) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if (outmask & 2) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if (outmask & 4) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if (outmask & 8) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
+      }
+#else
+      /* Note: OPERATOR appears here: */
+      if ((outmask & 1) && (idepth[0] OPERATOR depth16[0][0])) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && (idepth[1] OPERATOR depth16[0][1])) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && (idepth[2] OPERATOR depth16[1][0])) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && (idepth[3] OPERATOR depth16[1][1])) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
+      }
+#endif
+
+      /* keep the quad only if at least one fragment passed */
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
+   }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
+}
+
+
+/* reset the template parameters so this file can be included again */
+#undef NAME
+#undef OPERATOR
+#undef ALWAYS
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
new file mode 100644
index 0000000000..907e94b59b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -0,0 +1,175 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008 VMware, Inc.  All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Vertices are just an array of floats, with all the attributes
+ * packed.  We currently assume a layout like:
+ *
+ * attr[0][0..3] - window position
+ * attr[1..n][0..3] - remaining attributes.
+ *
+ * Attributes are assumed to be 4 floats wide but are packed so that
+ * all the enabled attributes run contiguously.
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_quad.h"
+#include "sp_quad_pipe.h"
+
+
+/**
+ * Fragment-shading quad stage.  The embedded quad_stage is the first
+ * member, so a pointer to one can be cast to the other.
+ */
+struct quad_shade_stage
+{
+   struct quad_stage stage;  /**< base class; must stay the first member */
+
+   /* no other fields at this time */
+};
+
+
+/** cast wrapper: base quad_stage pointer to quad_shade_stage pointer */
+static INLINE struct quad_shade_stage *
+quad_shade_stage(struct quad_stage *qs)
+{
+   /* valid because 'stage' is the first member of quad_shade_stage */
+   return (struct quad_shade_stage *) qs;
+}
+
+
+/**
+ * Run the bound fragment shader on the four fragments of one quad.
+ * \return TRUE if quad is alive, FALSE if all four pixels are killed
+ */
+static INLINE boolean
+shade_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *sp = qs->softpipe;
+
+   /* the shader variant's run() hook does the work */
+   return sp->fs->run(sp->fs, sp->fs_machine, quad);
+}
+
+
+
+/**
+ * Scale the alpha of every colour output of the quad by the
+ * per-fragment coverage values.
+ */
+static void
+coverage_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   uint cbuf;
+
+   /* loop over colorbuffer outputs */
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+      /* channel 3 holds the alpha values of the quad's fragments */
+      float *alpha = quad->output.color[cbuf][3];
+      unsigned frag;
+
+      for (frag = 0; frag < QUAD_SIZE; frag++) {
+         assert(quad->input.coverage[frag] >= 0.0);
+         assert(quad->input.coverage[frag] <= 1.0);
+         alpha[frag] *= quad->input.coverage[frag];
+      }
+   }
+}
+
+
+/**
+ * run() hook of the shade stage: execute the fragment shader on each
+ * quad, drop the quads it kills entirely and forward the rest to the
+ * next stage.
+ */
+static void
+shade_quads(struct quad_stage *qs,
+            struct quad_header *quads[],
+            unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   struct tgsi_exec_machine *machine = softpipe->fs_machine;
+   unsigned buf, q;
+   unsigned alive = 0;
+
+   /* point the machine at the mapped fragment-shader constant buffers */
+   for (buf = 0; buf < PIPE_MAX_CONSTANT_BUFFERS; buf++)
+      machine->Consts[buf] = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT][buf];
+
+   /* interpolation coefficients are taken from the first quad */
+   machine->InterpCoefs = quads[0]->coef;
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+
+      /* a quad that is totally culled/killed is simply not kept */
+      if (shade_quad(qs, quad)) {
+         if (/*do_coverage*/ 0)
+            coverage_quad( qs, quad );
+
+         quads[alive++] = quad;
+      }
+   }
+
+   if (alive)
+      qs->next->run(qs->next, quads, alive);
+}
+   
+
+/**
+ * begin() hook of the shade stage: let the fragment shader prepare the
+ * TGSI machine and samplers, then forward begin() to the next stage.
+ */
+static void
+shade_begin(struct quad_stage *qs)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   struct tgsi_sampler **samplers =
+      (struct tgsi_sampler **) softpipe->tgsi.frag_samplers_list;
+
+   softpipe->fs->prepare(softpipe->fs, softpipe->fs_machine, samplers);
+
+   qs->next->begin(qs->next);
+}
+
+
+/** destroy() hook of the shade stage: release the stage object. */
+static void
+shade_destroy(struct quad_stage *qs)
+{
+   FREE(qs);
+}
+
+
+/**
+ * Create the fragment-shading quad stage.
+ *
+ * \param softpipe  context the stage is bound to
+ * \return the new stage, or NULL if the allocation failed
+ */
+struct quad_stage *
+sp_quad_shade_stage( struct softpipe_context *softpipe )
+{
+   struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage);
+
+   /* nothing has been allocated yet, so there is nothing to clean up */
+   if (!qss)
+      return NULL;
+
+   qss->stage.softpipe = softpipe;
+   qss->stage.begin = shade_begin;
+   qss->stage.run = shade_quads;
+   qss->stage.destroy = shade_destroy;
+
+   return &qss->stage;
+}
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
new file mode 100644
index 0000000000..43b8e88e33
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -0,0 +1,68 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "pipe/p_shader_tokens.h"
+
+/** Link \p quad in front of the current first stage of the pipeline. */
+static void
+sp_push_quad_first( struct softpipe_context *sp,
+                    struct quad_stage *quad )
+{
+   struct quad_stage *old_first = sp->quad.first;
+
+   quad->next = old_first;
+   sp->quad.first = quad;
+}
+
+
+/**
+ * (Re)build the linked list of quad stages from the current state.
+ * The depth test is placed ahead of shading when the shader and the
+ * alpha test cannot change which fragments survive or their depth.
+ */
+void
+sp_build_quad_pipeline(struct softpipe_context *sp)
+{
+   const boolean early_depth_test =
+      sp->depth_stencil->depth.enabled &&
+      sp->framebuffer.zsbuf &&
+      !sp->depth_stencil->alpha.enabled &&
+      !sp->fs->info.uses_kill &&
+      !sp->fs->info.writes_z;
+
+   /* the list is built back to front: blending always runs last */
+   sp->quad.first = sp->quad.blend;
+
+   /* early Z: depth_test -> shade -> blend
+    * late Z:  shade -> depth_test -> blend
+    */
+   sp_push_quad_first( sp, early_depth_test ? sp->quad.shade
+                                            : sp->quad.depth_test );
+   sp_push_quad_first( sp, early_depth_test ? sp->quad.depth_test
+                                            : sp->quad.shade );
+
+#if !DO_PSTIPPLE_IN_DRAW_MODULE
+   if (sp->rasterizer->poly_stipple_enable)
+      sp_push_quad_first( sp, sp->quad.pstipple );
+#endif
+}
+
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.h b/src/gallium/drivers/softpipe/sp_quad_pipe.h
new file mode 100644
index 0000000000..c0aa134831
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.h
@@ -0,0 +1,72 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_QUAD_PIPE_H
+#define SP_QUAD_PIPE_H
+
+
+struct softpipe_context;
+struct quad_header;
+
+
+/**
+ * Fragment processing is performed on 2x2 blocks of pixels called "quads".
+ * Quad processing is performed with a pipeline of stages represented by
+ * this type.
+ */
+struct quad_stage {
+   /** context this stage belongs to */
+   struct softpipe_context *softpipe;
+
+   /** stage that receives the quads this stage lets through */
+   struct quad_stage *next;
+
+   /**
+    * Setup hook.  Implementations are expected to forward the call to
+    * next->begin() -- TODO confirm for all stages.
+    */
+   void (*begin)(struct quad_stage *qs);
+
+   /** the stage action: process the nr quads in quad[] */
+   void (*run)(struct quad_stage *qs, struct quad_header *quad[], unsigned nr);
+
+   /** release the stage */
+   void (*destroy)(struct quad_stage *qs);
+};
+
+
+/*
+ * Stage constructors: each returns a stage bound to the given context.
+ * NOTE(review): presumably NULL can be returned on allocation failure
+ * and not every stage declared here still has an implementation --
+ * confirm against the .c files.
+ */
+struct quad_stage *sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_earlyz_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_alpha_test_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe );
+struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe );
+
+/* link the context's quad stages into a pipeline for the current state */
+void sp_build_quad_pipeline(struct softpipe_context *sp);
+
+#endif /* SP_QUAD_PIPE_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
new file mode 100644
index 0000000000..a0527a596a
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -0,0 +1,81 @@
+
+/**
+ * quad polygon stipple stage
+ */
+
+#include "sp_context.h"
+#include "sp_quad.h"
+#include "sp_quad_pipe.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+
+
+/**
+ * Apply polygon stipple to quads produced by triangle rasterization.
+ *
+ * Stipple rows are selected with y % 32 and bits with x % 32, bit 31
+ * being column 0.  Fragments whose stipple bit is clear are removed
+ * from quad->inout.mask; quads left without live fragments are dropped
+ * and the remaining ones are passed to the next stage.
+ *
+ * \param qs     the stipple stage
+ * \param quads  quads to process; compacted in place
+ * \param nr     number of entries in \p quads
+ */
+static void
+stipple_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
+{
+   /* unsigned constants: shifting a signed 1 into bit 31 is undefined */
+   static const uint bit31 = 1u << 31;
+   static const uint bit30 = 1u << 30;
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned pass = 0;
+   unsigned q;
+
+   for (q = 0; q < nr; q++)  {
+      struct quad_header *quad = quads[q];
+
+      const int col0 = quad->input.x0 % 32;
+      const int y0 = quad->input.y0;
+      const int y1 = y0 + 1;
+      const uint stipple0 = softpipe->poly_stipple.stipple[y0 % 32];
+      const uint stipple1 = softpipe->poly_stipple.stipple[y1 % 32];
+
+      /* turn off quad mask bits that fail the stipple test */
+      if ((stipple0 & (bit31 >> col0)) == 0)
+         quad->inout.mask &= ~MASK_TOP_LEFT;
+
+      if ((stipple0 & (bit30 >> col0)) == 0)
+         quad->inout.mask &= ~MASK_TOP_RIGHT;
+
+      if ((stipple1 & (bit31 >> col0)) == 0)
+         quad->inout.mask &= ~MASK_BOTTOM_LEFT;
+
+      if ((stipple1 & (bit30 >> col0)) == 0)
+         quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
+
+      if (quad->inout.mask)
+         quads[pass++] = quad;
+   }
+
+   /* like the other stages, don't run the next stage with no quads */
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
+}
+
+
+/** begin() hook: no setup of its own, just forward to the next stage. */
+static void stipple_begin(struct quad_stage *qs)
+{
+   struct quad_stage *next = qs->next;
+
+   next->begin(next);
+}
+
+
+/** destroy() hook: release the stage object. */
+static void stipple_destroy(struct quad_stage *qs)
+{
+   FREE(qs);
+}
+
+
+/**
+ * Create the polygon stipple quad stage.
+ *
+ * \param softpipe  context the stage is bound to
+ * \return the new stage, or NULL if the allocation failed
+ */
+struct quad_stage *
+sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe )
+{
+   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
+
+   /* don't dereference a failed allocation */
+   if (!stage)
+      return NULL;
+
+   stage->softpipe = softpipe;
+   stage->begin = stipple_begin;
+   stage->run = stipple_quad;
+   stage->destroy = stipple_destroy;
+
+   return stage;
+}
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
new file mode 100644
index 0000000000..4ae69c1c2b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -0,0 +1,207 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "draw/draw_context.h"
+#include "os/os_time.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "sp_context.h"
+#include "sp_query.h"
+#include "sp_state.h"
+
+/**
+ * Softpipe's query object.  Handed out to callers as an opaque
+ * struct pipe_query pointer (see the casts in softpipe_query() and
+ * softpipe_create_query()).
+ */
+struct softpipe_query {
+   unsigned type;   /**< PIPE_QUERY_x value given to create_query */
+   uint64_t start;  /**< counter/time sampled in begin_query */
+   uint64_t end;    /**< counter/time sampled in end_query */
+   struct pipe_query_data_so_statistics so;  /**< result of SO_STATISTICS queries */
+};
+
+
+/** Cast wrapper: view an opaque pipe_query as a softpipe_query. */
+static struct softpipe_query *softpipe_query( struct pipe_query *p )
+{
+   struct softpipe_query *sq = (struct softpipe_query *) p;
+
+   return sq;
+}
+
+/**
+ * Allocate a new query object of the given type.
+ *
+ * \param pipe  the context (unused)
+ * \param type  one of the PIPE_QUERY_x values listed in the assert below
+ * \return the new query, or NULL if the allocation failed
+ */
+static struct pipe_query *
+softpipe_create_query(struct pipe_context *pipe, 
+		      unsigned type)
+{
+   struct softpipe_query* sq;
+
+   assert(type == PIPE_QUERY_OCCLUSION_COUNTER ||
+          type == PIPE_QUERY_TIME_ELAPSED ||
+          type == PIPE_QUERY_SO_STATISTICS ||
+          type == PIPE_QUERY_GPU_FINISHED ||
+          type == PIPE_QUERY_TIMESTAMP_DISJOINT);
+   sq = CALLOC_STRUCT( softpipe_query );
+   /* don't dereference a failed allocation */
+   if (!sq)
+      return NULL;
+
+   sq->type = type;
+
+   return (struct pipe_query *)sq;
+}
+
+
+/**
+ * Release a query object created by softpipe_create_query().
+ */
+static void
+softpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct softpipe_query *sq = softpipe_query(q);
+
+   FREE(sq);
+}
+
+
+/**
+ * Start a query: sample the starting value for the query's type, bump the
+ * context's active query count and flag the query state as dirty.
+ *
+ * \param pipe  the context
+ * \param q     query created by softpipe_create_query()
+ */
+static void
+softpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct softpipe_context *softpipe = softpipe_context( pipe );
+   struct softpipe_query *sq = softpipe_query(q);
+
+   switch (sq->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      sq->start = softpipe->occlusion_count;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      sq->start = 1000*os_time_get();
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      sq->so.num_primitives_written = 0;
+      sq->so.primitives_storage_needed = 0;
+      break;
+   case PIPE_QUERY_GPU_FINISHED:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* Nothing to sample at begin time.  TIMESTAMP_DISJOINT is a valid
+       * type (create, end and get_result all accept it), so it must not
+       * fall into the assert below.
+       */
+      break;
+   default:
+      assert(0);
+      break;
+   }
+   softpipe->active_query_count++;
+   softpipe->dirty |= SP_NEW_QUERY;
+}
+
+
+/**
+ * Finish a query: drop the context's active query count, sample the end
+ * value for the query's type and flag the query state as dirty.
+ */
+static void
+softpipe_end_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   struct softpipe_query *query = softpipe_query(q);
+
+   sp->active_query_count--;
+
+   switch (query->type) {
+   case PIPE_QUERY_GPU_FINISHED:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* nothing to record */
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      query->end = 1000*os_time_get();
+      break;
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      query->end = sp->occlusion_count;
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      /* snapshot the context's stream-output counters */
+      query->so.num_primitives_written =
+         sp->so_stats.num_primitives_written;
+      query->so.primitives_storage_needed =
+         sp->so_stats.primitives_storage_needed;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   sp->dirty |= SP_NEW_QUERY;
+}
+
+
+/**
+ * Fetch the result of a query.
+ *
+ * What is written to vresult depends on the query type:
+ *  - SO_STATISTICS:       a struct pipe_query_data_so_statistics
+ *  - TIMESTAMP_DISJOINT:  a struct pipe_query_data_timestamp_disjoint
+ *  - GPU_FINISHED:        TRUE, stored as a uint64_t
+ *  - everything else:     end - start, stored as a uint64_t
+ *
+ * \param pipe     the context (unused)
+ * \param q        the query to read
+ * \param wait     unused; this function never reports "not ready"
+ * \param vresult  destination buffer, must be large enough for the type
+ * \return always TRUE
+ */
+static boolean
+softpipe_get_query_result(struct pipe_context *pipe, 
+			  struct pipe_query *q,
+			  boolean wait,
+			  void *vresult)
+{
+   struct softpipe_query *sq = softpipe_query(q);
+   uint64_t *result = (uint64_t*)vresult;
+
+   switch (sq->type) {
+   case PIPE_QUERY_SO_STATISTICS:
+      memcpy(vresult, &sq->so,
+             sizeof(struct pipe_query_data_so_statistics));
+      break;
+   case PIPE_QUERY_GPU_FINISHED:
+      *result = TRUE;
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT: {
+      struct pipe_query_data_timestamp_disjoint td;
+      /*os_get_time is in microseconds*/
+      td.frequency = 1000000;
+      td.disjoint = FALSE;
+      /* copy the struct just filled in (not the unrelated sq->so) */
+      memcpy(vresult, &td, sizeof(td));
+   }
+      break;
+   default:
+      *result = sq->end - sq->start;
+      break;
+   }
+   return TRUE;
+}
+
+
+/**
+ * Called by rendering functions to evaluate conditional rendering.
+ * With no predicate query bound, rendering always proceeds; otherwise the
+ * query result decides, and an unavailable result means "render".
+ * \return TRUE if we should render, FALSE if we should skip rendering
+ */
+boolean
+softpipe_check_render_cond(struct softpipe_context *sp)
+{
+   struct pipe_context *pipe = &sp->pipe;
+   uint64_t result;
+   boolean wait;
+
+   /* no query predicate, draw normally */
+   if (!sp->render_cond_query)
+      return TRUE;
+
+   /* only the two WAIT modes ask get_query_result() to block */
+   switch (sp->render_cond_mode) {
+   case PIPE_RENDER_COND_WAIT:
+   case PIPE_RENDER_COND_BY_REGION_WAIT:
+      wait = 1;
+      break;
+   default:
+      wait = 0;
+      break;
+   }
+
+   if (!pipe->get_query_result(pipe, sp->render_cond_query, wait, &result))
+      return TRUE;
+
+   return result > 0;
+}
+
+
+/**
+ * Plug the query entry points of this file into the context's
+ * pipe_context function table.
+ */
+void softpipe_init_query_funcs(struct softpipe_context *softpipe )
+{
+   struct pipe_context *pipe = &softpipe->pipe;
+
+   pipe->create_query     = softpipe_create_query;
+   pipe->destroy_query    = softpipe_destroy_query;
+   pipe->begin_query      = softpipe_begin_query;
+   pipe->end_query        = softpipe_end_query;
+   pipe->get_query_result = softpipe_get_query_result;
+}
+
+
diff --git a/src/gallium/drivers/softpipe/sp_query.h b/src/gallium/drivers/softpipe/sp_query.h
new file mode 100644
index 0000000000..736c033897
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_query.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Author:
+ *    Keith Whitwell
+ */
+
+#ifndef SP_QUERY_H
+#define SP_QUERY_H
+
+/* Forward declaration.  It must precede every prototype that mentions the
+ * type; otherwise the struct gets prototype scope and the declaration is
+ * incompatible with the definition.
+ */
+struct softpipe_context;
+
+/** Evaluate conditional rendering: TRUE = render, FALSE = skip. */
+extern boolean
+softpipe_check_render_cond(struct softpipe_context *sp);
+
+/** Install the query functions into the context's function table. */
+extern void softpipe_init_query_funcs(struct softpipe_context * );
+
+
+#endif /* SP_QUERY_H */
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
new file mode 100644
index 0000000000..fc57d3eb61
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -0,0 +1,330 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+
+#include "state_tracker/sw_winsys.h"
+#include "tgsi/tgsi_exec.h"
+
+#include "sp_texture.h"
+#include "sp_screen.h"
+#include "sp_context.h"
+#include "sp_fence.h"
+#include "sp_public.h"
+
+
+/** Return the driver's vendor string. */
+static const char *
+softpipe_get_vendor(struct pipe_screen *screen)
+{
+   static const char vendor[] = "VMware, Inc.";
+
+   return vendor;
+}
+
+
+/** Return the driver's name string. */
+static const char *
+softpipe_get_name(struct pipe_screen *screen)
+{
+   static const char name[] = "softpipe";
+
+   return name;
+}
+
+
+/**
+ * Report an integer capability/limit of the driver.
+ * Cases are grouped by the value they return; unknown caps report 0.
+ */
+static int
+softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   /* sampler counts */
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return PIPE_MAX_SAMPLERS;
+   case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+      return PIPE_MAX_VERTEX_SAMPLERS;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
+
+   /* features reported as supported */
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_GLSL:
+   case PIPE_CAP_SM3:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_TIMER_QUERY:
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+   case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+   case PIPE_CAP_TGSI_CONT_SUPPORTED:
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+   case PIPE_CAP_STREAM_OUTPUT:
+      return 1;
+
+   /* features reported as unsupported */
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+      return 0;
+
+   /* render target and texture limits */
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return PIPE_MAX_COLOR_BUFS;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return SP_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return SP_MAX_TEXTURE_3D_LEVELS;
+
+   /* constant buffers */
+   case PIPE_CAP_MAX_CONST_BUFFERS:
+      return PIPE_MAX_CONSTANT_BUFFERS;
+   case PIPE_CAP_MAX_CONST_BUFFER_SIZE:
+      return 4096 * 4 * sizeof(float);
+
+   /* shader limits */
+   case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+   case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS:
+      /* There is no limit in number of instructions beyond available memory */
+      return 32768;
+   case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+   case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH:
+      return TGSI_EXEC_MAX_NESTING;
+   case PIPE_CAP_MAX_VS_INPUTS:
+   case PIPE_CAP_MAX_FS_INPUTS:
+      return TGSI_EXEC_MAX_INPUT_ATTRIBS;
+   case PIPE_CAP_MAX_VS_CONSTS:
+   case PIPE_CAP_MAX_FS_CONSTS:
+      return TGSI_EXEC_MAX_CONST_BUFFER;
+   case PIPE_CAP_MAX_VS_TEMPS:
+   case PIPE_CAP_MAX_FS_TEMPS:
+      return TGSI_EXEC_NUM_TEMPS;
+   case PIPE_CAP_MAX_VS_ADDRS:
+   case PIPE_CAP_MAX_FS_ADDRS:
+      return TGSI_EXEC_NUM_ADDRS;
+   case PIPE_CAP_MAX_VS_PREDS:
+   case PIPE_CAP_MAX_FS_PREDS:
+      return TGSI_EXEC_NUM_PREDS;
+
+   default:
+      return 0;
+   }
+}
+
+
+/**
+ * Report a floating-point capability/limit of the driver.
+ * Unknown caps report 0.
+ */
+static float
+softpipe_get_paramf(struct pipe_screen *screen, enum pipe_cap param)
+{
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+   case PIPE_CAP_MAX_POINT_WIDTH:
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      /* arbitrary */
+      return 255.0;
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      /* not actually significant at this time */
+      return 16.0;
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      /* arbitrary */
+      return 16.0;
+   default:
+      return 0;
+   }
+}
+
+
+/**
+ * Query format support for creating a texture, drawing surface, etc.
+ * \param screen        the screen; its winsys is consulted for display targets
+ * \param format        the format to test
+ * \param target        resource target (buffer, 1D/2D/3D/cube texture)
+ * \param sample_count  requested sample count; anything above 1 is rejected
+ * \param bind          bitmask of PIPE_BIND_x usages the format must support
+ * \param geom_flags    not examined by this function
+ * \return TRUE if the format can be used for all requested bindings
+ */
+static boolean
+softpipe_is_format_supported( struct pipe_screen *screen,
+                              enum pipe_format format,
+                              enum pipe_texture_target target,
+                              unsigned sample_count,
+                              unsigned bind,
+                              unsigned geom_flags )
+{
+   struct sw_winsys *winsys = softpipe_screen(screen)->winsys;
+   const struct util_format_description *format_desc;
+
+   assert(target == PIPE_BUFFER ||
+          target == PIPE_TEXTURE_1D ||
+          target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_3D ||
+          target == PIPE_TEXTURE_CUBE);
+
+   /* formats without a description cannot be handled at all */
+   format_desc = util_format_description(format);
+   if (!format_desc)
+      return FALSE;
+
+   /* no multisampling */
+   if (sample_count > 1)
+      return FALSE;
+
+   /* anything that is displayed or shared must be accepted by the winsys */
+   if (bind & (PIPE_BIND_DISPLAY_TARGET |
+               PIPE_BIND_SCANOUT |
+               PIPE_BIND_SHARED)) {
+      if(!winsys->is_displaytarget_format_supported(winsys, bind, format))
+         return FALSE;
+   }
+
+   if (bind & PIPE_BIND_RENDER_TARGET) {
+      /* depth/stencil formats are not color render targets */
+      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+
+      /*
+       * Although possible, it is unnatural to render into compressed or YUV
+       * surfaces. So disable these here to avoid going into weird paths
+       * inside the state trackers.
+       */
+      if (format_desc->block.width != 1 ||
+          format_desc->block.height != 1)
+         return FALSE;
+
+      /*
+       * TODO: Unfortunately we cannot render into anything more than 32 bits
+       * because we encode color clear values into a 32bit word.
+       */
+      if (format_desc->block.bits > 32)
+         return FALSE;
+   }
+
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
+      /* only depth/stencil formats qualify */
+      if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+
+      /*
+       * TODO: Unfortunately we cannot render into anything more than 32 bits
+       * because we encode depth and stencil clear values into a 32bit word.
+       */
+      if (format_desc->block.bits > 32)
+         return FALSE;
+
+      /*
+       * TODO: eliminate this restriction
+       */
+      if (format == PIPE_FORMAT_Z32_FLOAT)
+         return FALSE;
+   }
+
+   /*
+    * All other operations (sampling, transfer, etc).
+    */
+
+   /* S3TC support depends on the util_format_s3tc_enabled flag */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      return util_format_s3tc_enabled;
+   }
+
+   /*
+    * Everything else should be supported by u_format.
+    */
+   return TRUE;
+}
+
+
+/**
+ * Destroy the screen: give the winsys a chance to clean up (its destroy
+ * hook is optional), then free the screen object.
+ */
+static void
+softpipe_destroy_screen( struct pipe_screen *screen )
+{
+   struct sw_winsys *ws = softpipe_screen(screen)->winsys;
+
+   if (ws->destroy)
+      ws->destroy(ws);
+
+   FREE(screen);
+}
+
+
+/**
+ * Present a surface through the winsys display target.
+ * This is often overridden by the co-state tracker.
+ */
+static void
+softpipe_flush_frontbuffer(struct pipe_screen *_screen,
+                           struct pipe_surface *surface,
+                           void *context_private)
+{
+   struct softpipe_resource *spr = softpipe_resource(surface->texture);
+   struct sw_winsys *ws = softpipe_screen(_screen)->winsys;
+
+   /* only resources backed by a display target can be shown */
+   assert(spr->dt);
+   if (!spr->dt)
+      return;
+
+   ws->displaytarget_display(ws, spr->dt, context_private);
+}
+
+/**
+ * Create a new pipe_screen object.
+ * The screen is a softpipe_screen (a subclass of pipe_screen that also
+ * remembers the sw_winsys); the embedded base object is what gets returned.
+ * \return the new screen, or NULL if the allocation failed
+ */
+struct pipe_screen *
+softpipe_create_screen(struct sw_winsys *winsys)
+{
+   struct softpipe_screen *sp_screen = CALLOC_STRUCT(softpipe_screen);
+   struct pipe_screen *base;
+
+   if (!sp_screen)
+      return NULL;
+
+   base = &sp_screen->base;
+
+   sp_screen->winsys = winsys;
+   base->winsys = NULL;
+
+   /* screen function table */
+   base->destroy = softpipe_destroy_screen;
+   base->get_name = softpipe_get_name;
+   base->get_vendor = softpipe_get_vendor;
+   base->get_param = softpipe_get_param;
+   base->get_paramf = softpipe_get_paramf;
+   base->is_format_supported = softpipe_is_format_supported;
+   base->context_create = softpipe_create_context;
+   base->flush_frontbuffer = softpipe_flush_frontbuffer;
+
+   util_format_s3tc_init();
+
+   softpipe_init_screen_texture_funcs(base);
+   softpipe_init_screen_fence_funcs(base);
+
+   return base;
+}
diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h
new file mode 100644
index 0000000000..f741454c9e
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_screen.h
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_SCREEN_H
+#define SP_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+
+
+struct sw_winsys;
+
+/**
+ * Softpipe's subclass of pipe_screen.  The base object must stay the first
+ * member: softpipe_screen() converts a pipe_screen pointer by plain cast.
+ */
+struct softpipe_screen {
+   struct pipe_screen base;
+
+   /* Window-system interface this screen was created with. */
+   struct sw_winsys *winsys;
+
+   /* Increments whenever textures are modified.  Contexts can track
+    * this.
+    */
+   unsigned timestamp;          
+};
+
+
+
+
+/** Cast wrapper: view a pipe_screen as the softpipe_screen containing it. */
+static INLINE struct softpipe_screen *
+softpipe_screen( struct pipe_screen *pipe )
+{
+   struct softpipe_screen *sp_screen = (struct softpipe_screen *) pipe;
+
+   return sp_screen;
+}
+
+
+
+#endif /* SP_SCREEN_H */
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
new file mode 100644
index 0000000000..5d727dc00d
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -0,0 +1,1447 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \brief  Primitive rasterization/rendering (points, lines, triangles)
+ *
+ * \author  Keith Whitwell <keith@tungstengraphics.com>
+ * \author  Brian Paul
+ */
+
+#include "sp_context.h"
+#include "sp_quad.h"
+#include "sp_quad_pipe.h"
+#include "sp_setup.h"
+#include "sp_state.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+
+/* Compile-time debug switches (0 = off):
+ *  DEBUG_VERTS  compiles in print_vertex()
+ *  DEBUG_FRAGS  adds per-primitive fragment counters to setup_context
+ */
+#define DEBUG_VERTS 0
+#define DEBUG_FRAGS 0
+
+
+/**
+ * Triangle edge info.
+ * dx and dy of the three edges (ebot, emaj, etop) are filled in by
+ * setup_sort_vertices().
+ */
+struct edge {
+   float dx;		/**< X(v1) - X(v0), used only during setup */
+   float dy;		/**< Y(v1) - Y(v0), used only during setup */
+   float dxdy;		/**< dx/dy */
+   float sx, sy;	/**< first sample point coord */
+   int lines;		/**< number of lines on this edge */
+};
+
+
+/**
+ * Max number of quads (2x2 pixel blocks) to process per batch.
+ * This can't be arbitrarily increased since we depend on some 32-bit
+ * bitmasks (two bits per quad).
+ *
+ * Note: flush_spans() also uses this value as its horizontal step in
+ * pixels, and block_x() hard-codes the same 16-pixel alignment, so the
+ * three must be changed together.
+ */
+#define MAX_QUADS 16
+
+
+/**
+ * Triangle setup info.
+ * Also used for line drawing (taking some liberties).
+ */
+struct setup_context {
+   struct softpipe_context *softpipe;   /**< owning context */
+
+   /* Vertices are just an array of floats making up each attribute in
+    * turn.  Currently fixed at 4 floats, but should change in time.
+    * Codegen will help cope with this.
+    *
+    * vmin/vmid/vmax are the triangle's vertices sorted by Y (attribute
+    * slot 0, component 1); vprovoke is the provoking vertex used for
+    * constant (flat) coefficients.  All four are set by
+    * setup_sort_vertices().
+    */
+   const float (*vmax)[4];
+   const float (*vmid)[4];
+   const float (*vmin)[4];
+   const float (*vprovoke)[4];
+
+   struct edge ebot;   /**< edge vmin -> vmid */
+   struct edge etop;   /**< edge vmid -> vmax */
+   struct edge emaj;   /**< edge vmin -> vmax */
+
+   float oneoverarea;  /**< 1 / triangle area */
+   int facing;         /**< 0 = front-facing, 1 = back-facing */
+
+   float pixel_offset; /**< 0.5 with GL rasterization rules, else 0.0 */
+
+   /* Quad storage and the pointer array handed to the quad pipeline;
+    * filled in by flush_spans().
+    */
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;     /* NOTE(review): not referenced in the code visible here */
+
+   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
+   struct tgsi_interp_coef posCoef;  /* For Z, W */
+
+   /* The pair of scanlines (one row of quads) accumulated for flush_spans() */
+   struct {
+      int left[2];   /**< [0] = row0, [1] = row1 */
+      int right[2];
+      int y;
+   } span;
+
+#if DEBUG_FRAGS
+   uint numFragsEmitted;  /**< per primitive */
+   uint numFragsWritten;  /**< per primitive */
+#endif
+
+   unsigned cull_face;		/* which faces to cull (tested against PIPE_FACE_x) */
+   unsigned nr_vertex_attrs;	/* number of attribute slots per vertex */
+};
+
+
+
+
+
+
+
+/**
+ * Clip a quad against the scissor/surface bounds (softpipe->cliprect).
+ * A quad completely outside the rectangle gets an empty mask; a quad that
+ * straddles an edge loses the pixels on the outside of that edge.
+ */
+static INLINE void
+quad_clip(struct setup_context *setup, struct quad_header *quad)
+{
+   const struct pipe_scissor_state *rect = &setup->softpipe->cliprect;
+   const int x_lo = (int) rect->minx;
+   const int x_hi = (int) rect->maxx;
+   const int y_lo = (int) rect->miny;
+   const int y_hi = (int) rect->maxy;
+   unsigned keep = ~0u;   /* pixels allowed to survive */
+
+   if (quad->input.x0 >= x_hi ||
+       quad->input.y0 >= y_hi ||
+       quad->input.x0 + 1 < x_lo ||
+       quad->input.y0 + 1 < y_lo) {
+      /* totally clipped */
+      quad->inout.mask = 0x0;
+      return;
+   }
+
+   /* left column is outside */
+   if (quad->input.x0 < x_lo)
+      keep &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+
+   /* top row is outside */
+   if (quad->input.y0 < y_lo)
+      keep &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+
+   /* right column is outside */
+   if (quad->input.x0 == x_hi - 1)
+      keep &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+
+   /* bottom row is outside */
+   if (quad->input.y0 == y_hi - 1)
+      keep &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+
+   quad->inout.mask &= keep;
+}
+
+
+/**
+ * Clip a single quad and, if any of its pixels survive, pass it to the
+ * first stage of the quad pipeline.
+ */
+static INLINE void
+clip_emit_quad(struct setup_context *setup, struct quad_header *quad)
+{
+   struct quad_stage *first;
+
+   quad_clip( setup, quad );
+
+   /* nothing left to draw */
+   if (!quad->inout.mask)
+      return;
+
+   first = setup->softpipe->quad.first;
+   first->run( first, &quad, 1 );
+}
+
+
+
+/**
+ * Round an X or Y coordinate down to the origin of the quad (2-pixel
+ * block) that contains it.
+ */
+static INLINE int
+block(int x)
+{
+   return x & ~1;
+}
+
+
+/**
+ * Round an X coordinate down to a multiple of 16.
+ */
+static INLINE int
+block_x(int x)
+{
+   return x & ~15;
+}
+
+
+/**
+ * Render a horizontal span of quads.
+ *
+ * setup->span describes two adjacent scanlines (row 0 and row 1), each as
+ * a [left, right) pixel range.  The union of both ranges is walked in
+ * chunks of 'step' pixels; per chunk a coverage bitmask is built for each
+ * row (bit i = pixel x + i is inside the row's range), the masks are
+ * consumed two bits at a time to form the 4-bit quad masks, and all
+ * non-empty quads of the chunk go to the quad pipeline in one run() call.
+ * Afterwards the span state is reset to "empty".
+ */
+static void
+flush_spans(struct setup_context *setup)
+{
+   const int step = MAX_QUADS;
+   const int xleft0 = setup->span.left[0];
+   const int xleft1 = setup->span.left[1];
+   const int xright0 = setup->span.right[0];
+   const int xright1 = setup->span.right[1];
+   struct quad_stage *pipe = setup->softpipe->quad.first;
+
+   /* start on a 16-pixel boundary; stop at the rightmost covered pixel */
+   const int minleft = block_x(MIN2(xleft0, xleft1));
+   const int maxright = MAX2(xright0, xright1);
+   int x;
+
+   /* process quads in horizontal chunks of 16 */
+   for (x = minleft; x < maxright; x += step) {
+      /* number of pixels to drop at the left/right end of this chunk,
+       * per row, clamped to the chunk size
+       */
+      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
+      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
+      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
+      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
+      unsigned lx = x;   /* x of the quad currently being built */
+      unsigned q = 0;    /* number of quads emitted for this chunk */
+
+      /* low skip_left bits set */
+      unsigned skipmask_left0 = (1U << skip_left0) - 1U;
+      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
+
+      /* These calculations fail when step == 32 and skip_right == 0.
+       * (The shift count would then equal the width of the type.)  All
+       * bits from (step - skip_right) upwards are set, which also masks
+       * off everything beyond the chunk.
+       */
+      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
+      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
+
+      /* per-row coverage: one bit per pixel of the chunk */
+      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
+      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
+
+      if (mask0 | mask1) {
+         do {
+            /* bits 0-1: two pixels of row 0, bits 2-3: two pixels of row 1 */
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
+            if (quadmask) {
+               setup->quad[q].input.x0 = lx;
+               setup->quad[q].input.y0 = setup->span.y;
+               setup->quad[q].input.facing = setup->facing;
+               setup->quad[q].inout.mask = quadmask;
+               setup->quad_ptrs[q] = &setup->quad[q];
+               q++;
+            }
+            /* advance one quad (two pixels) to the right */
+            mask0 >>= 2;
+            mask1 >>= 2;
+            lx += 2;
+         } while (mask0 | mask1);
+
+         pipe->run( pipe, setup->quad_ptrs, q );
+      }
+   }
+
+
+   /* reset to an empty span: left > right on both rows */
+   setup->span.y = 0;
+   setup->span.right[0] = 0;
+   setup->span.right[1] = 0;
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
+}
+
+
+#if DEBUG_VERTS
+/**
+ * Debug helper: dump every attribute slot of a vertex, flagging slots
+ * whose first component is inf/NaN.
+ */
+static void
+print_vertex(const struct setup_context *setup,
+             const float (*v)[4])
+{
+   int slot;
+
+   debug_printf("   Vertex: (%p)\n", (void *) v);
+
+   for (slot = 0; slot < setup->nr_vertex_attrs; slot++) {
+      const float *attr = v[slot];
+
+      debug_printf("     %d: %f %f %f %f\n",  slot,
+                   attr[0], attr[1], attr[2], attr[3]);
+
+      if (util_is_inf_or_nan(attr[0]))
+         debug_printf("   NaN!\n");
+   }
+}
+#endif
+
+
+/**
+ * Sort the vertices by Y (vmin has the smallest Y, vmax the largest),
+ * setting up the triangle edge fields (ebot, emaj, etop), 1/area, the
+ * facing flag, the provoking vertex and the pixel offset.
+ * \param setup  the setup context to fill in
+ * \param det    signed determinant of the triangle; only its sign is used,
+ *               to determine facing
+ * \param v0     first vertex; slot 0 holds the position
+ * \param v1     second vertex
+ * \param v2     third vertex
+ * \return FALSE if the triangle must be skipped (1/area is inf/nan, or the
+ *         triangle's face is being culled), TRUE otherwise
+ */
+static boolean
+setup_sort_vertices(struct setup_context *setup,
+                    float det,
+                    const float (*v0)[4],
+                    const float (*v1)[4],
+                    const float (*v2)[4])
+{
+   /* pick the provoking vertex (source of flat-shaded attributes) */
+   if (setup->softpipe->rasterizer->flatshade_first)
+      setup->vprovoke = v0;
+   else
+      setup->vprovoke = v2;
+
+   /* determine bottom to top order of vertices */
+   {
+      float y0 = v0[0][1];
+      float y1 = v1[0][1];
+      float y2 = v2[0][1];
+      if (y0 <= y1) {
+	 if (y1 <= y2) {
+	    /* y0<=y1<=y2 */
+	    setup->vmin = v0;
+	    setup->vmid = v1;
+	    setup->vmax = v2;
+	 }
+	 else if (y2 <= y0) {
+	    /* y2<=y0<=y1 */
+	    setup->vmin = v2;
+	    setup->vmid = v0;
+	    setup->vmax = v1;
+	 }
+	 else {
+	    /* y0<=y2<=y1 */
+	    setup->vmin = v0;
+	    setup->vmid = v2;
+	    setup->vmax = v1;
+	 }
+      }
+      else {
+	 if (y0 <= y2) {
+	    /* y1<=y0<=y2 */
+	    setup->vmin = v1;
+	    setup->vmid = v0;
+	    setup->vmax = v2;
+	 }
+	 else if (y2 <= y1) {
+	    /* y2<=y1<=y0 */
+	    setup->vmin = v2;
+	    setup->vmid = v1;
+	    setup->vmax = v0;
+	 }
+	 else {
+	    /* y1<=y2<=y0 */
+	    setup->vmin = v1;
+	    setup->vmid = v2;
+	    setup->vmax = v0;
+	 }
+      }
+   }
+
+   /* edge deltas: ebot = vmin->vmid, emaj = vmin->vmax, etop = vmid->vmax */
+   setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0];
+   setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1];
+   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
+   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
+   setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0];
+   setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1];
+
+   /*
+    * Compute triangle's area.  Use 1/area to compute partial
+    * derivatives of attributes later.
+    *
+    * The area will be the same as det, but the sign may be
+    * different depending on how the vertices get sorted above.
+    *
+    * To determine whether the primitive is front or back facing we
+    * use the det value because its sign is correct.
+    */
+   {
+      const float area = (setup->emaj.dx * setup->ebot.dy -
+			    setup->ebot.dx * setup->emaj.dy);
+
+      setup->oneoverarea = 1.0f / area;
+
+      /*
+      debug_printf("%s one-over-area %f  area %f  det %f\n",
+                   __FUNCTION__, setup->oneoverarea, area, det );
+      */
+      /* zero-area or inf/nan coordinates: skip the triangle */
+      if (util_is_inf_or_nan(setup->oneoverarea))
+         return FALSE;
+   }
+
+   /* We need to know if this is a front or back-facing triangle for:
+    *  - the GLSL gl_FrontFacing fragment attribute (bool)
+    *  - two-sided stencil test
+    * 0 = front-facing, 1 = back-facing
+    */
+   setup->facing = 
+      ((det < 0.0) ^ 
+       (setup->softpipe->rasterizer->front_ccw));
+
+   /* face culling */
+   {
+      unsigned face = setup->facing == 0 ? PIPE_FACE_FRONT : PIPE_FACE_BACK;
+
+      if (face & setup->cull_face)
+	 return FALSE;
+   }
+
+
+   /* Prepare pixel offset for rasterisation:
+    *  - pixel center (0.5, 0.5) for GL, or
+    *  - assume (0.0, 0.0) for other APIs.
+    */
+   if (setup->softpipe->rasterizer->gl_rasterization_rules) {
+      setup->pixel_offset = 0.5f;
+   } else {
+      setup->pixel_offset = 0.0f;
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Optionally apply cylindrical wrapping to one attribute channel of a
+ * triangle.
+ *
+ * The pairs (v0,v1), (v1,v2) and (v2,v0) are examined in that order; when
+ * a pair differs by more than 0.5 the smaller of the two is raised by 1.0.
+ * Later pairs see adjustments made for earlier ones.
+ * Inputs must be in [0, 1], otherwise results are undefined.  Some
+ * combinations of coordinates give invalid results; this is accepted.
+ *
+ * \param cylindrical_wrap  non-zero to wrap, zero to copy values through
+ * \param output  receives the (possibly adjusted) v0, v1, v2
+ */
+static void
+tri_apply_cylindrical_wrap(float v0,
+                           float v1,
+                           float v2,
+                           uint cylindrical_wrap,
+                           float output[3])
+{
+   if (cylindrical_wrap) {
+      float d = v1 - v0;          /* pair (v0, v1) */
+      if (d < -0.5f)
+         v1 += 1.0f;
+      else if (d > 0.5f)
+         v0 += 1.0f;
+
+      d = v2 - v1;                /* pair (v1, v2) */
+      if (d < -0.5f)
+         v2 += 1.0f;
+      else if (d > 0.5f)
+         v1 += 1.0f;
+
+      d = v0 - v2;                /* pair (v2, v0) */
+      if (d < -0.5f)
+         v0 += 1.0f;
+      else if (d > 0.5f)
+         v2 += 1.0f;
+   }
+
+   output[0] = v0;
+   output[1] = v1;
+   output[2] = v2;
+}
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ * The value comes from setup->vprovoke[vertSlot][i]; the result is
+ * stored in coef->a0[i] and both derivatives are set to zero.
+ * \param vertSlot  which vertex attribute slot to read
+ * \param i  which component of the slot (0..3)
+ */
+static void
+const_coeff(struct setup_context *setup,
+            struct tgsi_interp_coef *coef,
+            uint vertSlot, uint i)
+{
+   assert(i <= 3);
+
+   coef->dadx[i] = 0;
+   coef->dady[i] = 0;
+
+   /* flat attributes take their value from the provoking vertex,
+    * which the caller must have stored in setup->vprovoke */
+   coef->a0[i] = setup->vprovoke[vertSlot][i];
+}
+
+
+/**
+ * Compute a0, dadx and dady of channel \p i for a linearly interpolated
+ * triangle attribute, using setup's ebot/emaj deltas and oneoverarea.
+ * v[0], v[1] and v[2] are the values at vmin, vmid and vmax, respectively.
+ */
+static void
+tri_linear_coeff(struct setup_context *setup,
+                 struct tgsi_interp_coef *coef,
+                 uint i,
+                 const float v[3])
+{
+   float botda = v[1] - v[0];   /* attribute change along the bottom edge */
+   float majda = v[2] - v[0];   /* attribute change along the major edge */
+   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float dadx = a * setup->oneoverarea;
+   float dady = b * setup->oneoverarea;
+
+   assert(i <= 3);
+
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+
+   /* calculate a0 as the value which would be sampled for the
+    * fragment at (0,0), taking into account that we want to sample at
+    * pixel centers, in other words (pixel_offset, pixel_offset).
+    *
+    * this is neat but unfortunately not a good way to do things for
+    * triangles with very large values of dadx or dady as it will
+    * result in the subtraction and re-addition from a0 of a very
+    * large number, which means we'll end up losing a lot of the
+    * fractional bits and precision from a0.  the way to fix this is
+    * to define a0 as the sample at a pixel center somewhere near vmin
+    * instead - i'll switch to this later.
+    */
+   coef->a0[i] = (v[0] -
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
+
+   /*
+   debug_printf("attr.%c: %f dx:%f dy:%f\n",
+                "xyzw"[i],
+                coef->a0[i],
+                coef->dadx[i],
+                coef->dady[i]);
+   */
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ * v[0], v[1] and v[2] are vmin, vmid and vmax, respectively.
+ */
+static void
+tri_persp_coeff(struct setup_context *setup,
+                struct tgsi_interp_coef *coef,
+                uint i,
+                const float v[3])
+{
+   /* premultiply each value by component 3 of its vertex's position
+    * (slot 0); NOTE(review): presumably that component holds 1/w - confirm */
+   float mina = v[0] * setup->vmin[0][3];
+   float mida = v[1] * setup->vmid[0][3];
+   float maxa = v[2] * setup->vmax[0][3];
+   float botda = mida - mina;
+   float majda = maxa - mina;
+   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float dadx = a * setup->oneoverarea;
+   float dady = b * setup->oneoverarea;
+
+   /*
+   debug_printf("tri persp %d: %f %f %f\n", i,
+                v[0],
+                v[1],
+                v[2]
+          );
+   */
+   assert(i <= 3);
+
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   coef->a0[i] = (mina -
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord, written to setup->coef[slot].
+ * X and Y follow the shader's pixel_center_integer / origin_lower_left
+ * flags; Z and W are copied from posCoef, which must already be computed.
+ * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ */
+static void
+setup_fragcoord_coeff(struct setup_context *setup, uint slot)
+{
+   struct sp_fragment_shader* spfs = setup->softpipe->fs;
+   /* X: value at x=0 is the pixel-center offset, slope 1 along x */
+   setup->coef[slot].a0[0] = spfs->pixel_center_integer ? 0.0 : 0.5;
+   setup->coef[slot].dadx[0] = 1.0;
+   setup->coef[slot].dady[0] = 0.0;
+   /* Y: starts at the framebuffer height and counts down for lower-left */
+   setup->coef[slot].a0[1] =
+		   (spfs->origin_lower_left ? setup->softpipe->framebuffer.height : 0)
+		   + (spfs->pixel_center_integer ? 0.0 : 0.5);
+   setup->coef[slot].dadx[1] = 0.0;
+   setup->coef[slot].dady[1] = spfs->origin_lower_left ? -1.0 : 1.0;
+   /* Z: same plane as the position's z */
+   setup->coef[slot].a0[2] = setup->posCoef.a0[2];
+   setup->coef[slot].dadx[2] = setup->posCoef.dadx[2];
+   setup->coef[slot].dady[2] = setup->posCoef.dady[2];
+   /* W: same plane as the position's w */
+   setup->coef[slot].a0[3] = setup->posCoef.a0[3];
+   setup->coef[slot].dadx[3] = setup->posCoef.dadx[3];
+   setup->coef[slot].dady[3] = setup->posCoef.dady[3];
+}
+
+
+
+/**
+ * Fill setup->posCoef (z, w) and setup->coef[] (one entry per fragment
+ * shader input).  Must run after vmin,vmid,vmax,vprovoke are initialized.
+ */
+static void
+setup_tri_coefficients(struct setup_context *setup)
+{
+   struct softpipe_context *softpipe = setup->softpipe;
+   const struct sp_fragment_shader *spfs = softpipe->fs;
+   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   uint fragSlot;
+   float v[3];
+
+   /* position z (channel 2) and w (channel 3) of slot 0 are done by
+    * linear interpolation: */
+   v[0] = setup->vmin[0][2];
+   v[1] = setup->vmid[0][2];
+   v[2] = setup->vmax[0][2];
+   tri_linear_coeff(setup, &setup->posCoef, 2, v);
+
+   v[0] = setup->vmin[0][3];
+   v[1] = setup->vmid[0][3];
+   v[2] = setup->vmax[0][3];
+   tri_linear_coeff(setup, &setup->posCoef, 3, v);
+
+   /* setup interpolation for all the remaining attributes; vinfo maps
+    * each fragment shader input to its vertex slot and interp mode: */
+   for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      uint j;
+
+      switch (vinfo->attrib[fragSlot].interp_mode) {
+      case INTERP_CONSTANT:
+         for (j = 0; j < NUM_CHANNELS; j++)
+            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         break;
+      case INTERP_LINEAR:
+         for (j = 0; j < NUM_CHANNELS; j++) {
+            tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
+                                       setup->vmid[vertSlot][j],
+                                       setup->vmax[vertSlot][j],
+                                       spfs->info.input_cylindrical_wrap[fragSlot] & (1 << j),
+                                       v);
+            tri_linear_coeff(setup, &setup->coef[fragSlot], j, v);
+         }
+         break;
+      case INTERP_PERSPECTIVE:
+         for (j = 0; j < NUM_CHANNELS; j++) {
+            tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
+                                       setup->vmid[vertSlot][j],
+                                       setup->vmax[vertSlot][j],
+                                       spfs->info.input_cylindrical_wrap[fragSlot] & (1 << j),
+                                       v);
+            tri_persp_coeff(setup, &setup->coef[fragSlot], j, v);
+         }
+         break;
+      case INTERP_POS:
+         setup_fragcoord_coeff(setup, fragSlot);
+         break;
+      default:
+         assert(0);
+      }
+
+      if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
+         /* override channel 0 with a constant: facing 0 -> 1.0, 1 -> -1.0 */
+         setup->coef[fragSlot].a0[0] = setup->facing * -2.0f + 1.0f;
+         setup->coef[fragSlot].dadx[0] = 0.0;
+         setup->coef[fragSlot].dady[0] = 0.0;
+      }
+   }
+}
+
+/** Compute sy, lines, dxdy and sx for the emaj, etop and ebot edges. */
+static void
+setup_tri_edges(struct setup_context *setup)
+{
+   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;   /* x: offset added */
+   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
+
+   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;   /* y: offset subtracted */
+   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
+   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
+
+   setup->emaj.sy = ceilf(vmin_y);   /* major edge: vmin to vmax */
+   setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
+   setup->emaj.dxdy = setup->emaj.dy ? setup->emaj.dx / setup->emaj.dy : .0f;   /* 0 if dy == 0 */
+   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;   /* x at row sy */
+
+   setup->etop.sy = ceilf(vmid_y);   /* top edge: vmid to vmax */
+   setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy);
+   setup->etop.dxdy = setup->etop.dy ? setup->etop.dx / setup->etop.dy : .0f;
+   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
+
+   setup->ebot.sy = ceilf(vmin_y);   /* bottom edge: vmin to vmid */
+   setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy);
+   setup->ebot.dxdy = setup->ebot.dy ? setup->ebot.dx / setup->ebot.dy : .0f;
+   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
+}
+
+
+/**
+ * Render the upper or lower half of a triangle: record a left/right span
+ * for each of \p lines scanlines between eleft and eright, clipped to
+ * the softpipe cliprect.  Both edges are advanced by \p lines on return. */
+static void
+subtriangle(struct setup_context *setup,
+            struct edge *eleft,
+            struct edge *eright,
+            int lines)
+{
+   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   const int minx = (int) cliprect->minx;
+   const int maxx = (int) cliprect->maxx;
+   const int miny = (int) cliprect->miny;
+   const int maxy = (int) cliprect->maxy;
+   int y, start_y, finish_y;
+   int sy = (int)eleft->sy;
+
+   assert((int)eleft->sy == (int) eright->sy);
+   assert(lines >= 0);
+
+   /* clip the scanline range [sy, sy + lines) to [miny, maxy) */
+   start_y = sy;
+   if (start_y < miny)
+      start_y = miny;
+
+   finish_y = sy + lines;
+   if (finish_y > maxy)
+      finish_y = maxy;
+
+   start_y -= sy;    /* from here on both bounds are relative to sy */
+   finish_y -= sy;
+
+   /*
+   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
+   */
+
+   for (y = start_y; y < finish_y; y++) {
+
+      /* avoid accumulating adds as floats don't have the precision to
+       * accurately iterate large triangle edges that way.  luckily we
+       * can just multiply these days.
+       *
+       * this is all drowned out by the attribute interpolation anyway.
+       */
+      int left = (int)(eleft->sx + y * eleft->dxdy);
+      int right = (int)(eright->sx + y * eright->dxdy);
+
+      /* clip left/right */
+      if (left < minx)
+         left = minx;
+      if (right > maxx)
+         right = maxx;
+
+      if (left < right) {
+         int _y = sy + y;   /* absolute scanline */
+         if (block(_y) != setup->span.y) {
+            flush_spans(setup);   /* scanline belongs to a new block row */
+            setup->span.y = block(_y);
+         }
+
+         setup->span.left[_y&1] = left;    /* spans are indexed by scanline parity */
+         setup->span.right[_y&1] = right;
+      }
+   }
+
+
+   /* save the values so that emaj can be restarted:
+    */
+   eleft->sx += lines * eleft->dxdy;
+   eright->sx += lines * eright->dxdy;
+   eleft->sy += lines;
+   eright->sy += lines;
+}
+
+
+/**
+ * Compute the determinant of the triangle (v0, v1, v2) from the x/y of
+ * each vertex's position (attribute 0).  The vbuf_render interface does
+ * not hand this value to us, so it is recomputed here.
+ */
+static float
+calc_det(const float (*v0)[4],
+         const float (*v1)[4],
+         const float (*v2)[4])
+{
+   const float *p0 = v0[0], *p1 = v1[0], *p2 = v2[0];
+
+   /* two edges sharing v2: a = v0 - v2, b = v1 - v2 */
+   const float ax = p0[0] - p2[0], ay = p0[1] - p2[1];
+   const float bx = p1[0] - p2[0], by = p1[1] - p2[1];
+
+   /* z component of cross(a, b) */
+   return ax * by - ay * bx;
+}
+
+
+/**
+ * Do setup for triangle rasterization, then render the triangle (v0,v1,v2).
+ */
+void
+sp_setup_tri(struct setup_context *setup,
+             const float (*v0)[4],
+             const float (*v1)[4],
+             const float (*v2)[4])
+{
+   float det;
+
+#if DEBUG_VERTS
+   debug_printf("Setup triangle:\n");
+   print_vertex(setup, v0);
+   print_vertex(setup, v1);
+   print_vertex(setup, v2);
+#endif
+
+   if (setup->softpipe->no_rast)
+      return;
+   /* det is passed to setup_sort_vertices(), which derives facing from it */
+   det = calc_det(v0, v1, v2);
+   /*
+   debug_printf("%s\n", __FUNCTION__ );
+   */
+
+#if DEBUG_FRAGS
+   setup->numFragsEmitted = 0;
+   setup->numFragsWritten = 0;
+#endif
+
+   if (!setup_sort_vertices( setup, det, v0, v1, v2 ))
+      return;   /* rejected: e.g. culled, or 1/area is inf or NaN */
+
+   setup_tri_coefficients( setup );
+   setup_tri_edges( setup );
+
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
+
+   setup->span.y = 0;
+   setup->span.right[0] = 0;
+   setup->span.right[1] = 0;
+   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+
+   /*   init_constant_attribs( setup ); */
+
+   if (setup->oneoverarea < 0.0) {
+      /* emaj on left: bottom half first, then top half
+       */
+      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
+      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+   }
+   else {
+      /* emaj on right: bottom half first, then top half
+       */
+      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
+      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+   }
+
+   flush_spans( setup );   /* emit whatever spans are still pending */
+
+#if DEBUG_FRAGS
+   printf("Tri: %u frags emitted, %u written\n",
+          setup->numFragsEmitted,
+          setup->numFragsWritten);
+#endif
+}
+
+
+/**
+ * Optionally apply cylindrical wrapping to one attribute channel of a line.
+ * When enabled and the endpoints differ by more than 0.5, the smaller one
+ * is raised by 1.0.  Inputs must be in [0, 1], otherwise results are
+ * undefined.
+ */
+static void
+line_apply_cylindrical_wrap(float v0,
+                            float v1,
+                            uint cylindrical_wrap,
+                            float output[2])
+{
+   if (cylindrical_wrap) {
+      const float d = v1 - v0;
+
+      if (d < -0.5f)
+         v1 += 1.0f;
+      else if (d > 0.5f)
+         v0 += 1.0f;
+   }
+
+   output[0] = v0;
+   output[1] = v1;
+}
+
+
+/**
+ * Compute a0, dadx and dady of channel \p i for a linearly interpolated
+ * line attribute.  For lines, setup->oneoverarea is 1/(dx*dx + dy*dy) as
+ * set by setup_line_coefficients().  v[0], v[1] are the vmin, vmax values.
+ */
+static void
+line_linear_coeff(const struct setup_context *setup,
+                  struct tgsi_interp_coef *coef,
+                  uint i,
+                  const float v[2])
+{
+   const float da = v[1] - v[0];   /* total change from vmin to vmax */
+   const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+   const float dady = da * setup->emaj.dy * setup->oneoverarea;
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   coef->a0[i] = (v[0] -   /* back-project from vmin to (pixel_offset, pixel_offset) */
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Compute a0, dadx and dady of channel \p i for a perspective-corrected
+ * line attribute.  Each value is first multiplied by component 3 of its
+ * vertex's position.  v[0] and v[1] are vmin and vmax, respectively.
+ */
+static void
+line_persp_coeff(const struct setup_context *setup,
+                 struct tgsi_interp_coef *coef,
+                 uint i,
+                 const float v[2])
+{
+   const float a0 = v[0] * setup->vmin[0][3];   /* premultiplied start value */
+   const float a1 = v[1] * setup->vmax[0][3];   /* premultiplied end value */
+   const float da = a1 - a0;
+   const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+   const float dady = da * setup->emaj.dy * setup->oneoverarea;
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   coef->a0[i] = (a0 -   /* back-project from vmin to (pixel_offset, pixel_offset) */
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Point setup->vmin/vmax/vprovoke at the line's vertices and compute the
+ * posCoef and coef[] values.  Returns FALSE for a degenerate line.
+ */
+static boolean
+setup_line_coefficients(struct setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4])
+{
+   struct softpipe_context *softpipe = setup->softpipe;
+   const struct sp_fragment_shader *spfs = softpipe->fs;
+   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   uint fragSlot;
+   float area;
+   float v[2];
+
+   /* use setup->vmin, vmax to point to vertices */
+   if (softpipe->rasterizer->flatshade_first)
+      setup->vprovoke = v0;
+   else
+      setup->vprovoke = v1;
+   setup->vmin = v0;
+   setup->vmax = v1;
+
+   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
+   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
+
+   /* NOTE: this is not really area but the squared length of the line */
+   area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy;
+   if (area == 0.0f || util_is_inf_or_nan(area))
+      return FALSE;
+   setup->oneoverarea = 1.0f / area;
+
+   /* z and w are done by linear interpolation:
+    */
+   v[0] = setup->vmin[0][2];
+   v[1] = setup->vmax[0][2];
+   line_linear_coeff(setup, &setup->posCoef, 2, v);
+
+   v[0] = setup->vmin[0][3];
+   v[1] = setup->vmax[0][3];
+   line_linear_coeff(setup, &setup->posCoef, 3, v);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      uint j;
+
+      switch (vinfo->attrib[fragSlot].interp_mode) {
+      case INTERP_CONSTANT:
+         for (j = 0; j < NUM_CHANNELS; j++)
+            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         break;
+      case INTERP_LINEAR:
+         for (j = 0; j < NUM_CHANNELS; j++) {
+            line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
+                                        setup->vmax[vertSlot][j],
+                                        spfs->info.input_cylindrical_wrap[fragSlot] & (1 << j),
+                                        v);
+            line_linear_coeff(setup, &setup->coef[fragSlot], j, v);
+         }
+         break;
+      case INTERP_PERSPECTIVE:
+         for (j = 0; j < NUM_CHANNELS; j++) {
+            line_apply_cylindrical_wrap(setup->vmin[vertSlot][j],
+                                        setup->vmax[vertSlot][j],
+                                        spfs->info.input_cylindrical_wrap[fragSlot] & (1 << j),
+                                        v);
+            line_persp_coeff(setup, &setup->coef[fragSlot], j, v);
+         }
+         break;
+      case INTERP_POS:
+         setup_fragcoord_coeff(setup, fragSlot);
+         break;
+      default:
+         assert(0);
+      }
+
+      if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
+         /* convert 0 to 1.0 and 1 to -1.0; NOTE(review): setup->facing is
+          * not written in this function - confirm its value for lines */
+         setup->coef[fragSlot].a0[0] = setup->facing * -2.0f + 1.0f;
+         setup->coef[fragSlot].dadx[0] = 0.0;
+         setup->coef[fragSlot].dady[0] = 0.0;
+      }
+   }
+   return TRUE;
+}
+
+
+/**
+ * Plot one pixel of a line: set its bit in the mask of the 2x2 quad being
+ * accumulated in setup->quad[0].  If the pixel lies in a different quad,
+ * the pending quad (if any) is emitted first and a new one is started.
+ */
+static INLINE void
+plot(struct setup_context *setup, int x, int y)
+{
+   const int subX = x & 1;      /* column within the quad */
+   const int subY = y & 1;      /* row within the quad */
+   const int originX = x - subX;
+   const int originY = y - subY;
+
+   if (!(originX == setup->quad[0].input.x0 &&
+         originY == setup->quad[0].input.y0)) {
+      /* entering another quad; x0 == -1 means none was started yet */
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
+
+      setup->quad[0].inout.mask = 0x0;
+      setup->quad[0].input.x0 = originX;
+      setup->quad[0].input.y0 = originY;
+   }
+
+   /* bit index within the quad is subX + 2 * subY */
+   setup->quad[0].inout.mask |= (1 << subX) << (2 * subY);
+}
+
+
+/**
+ * Do setup for line rasterization, then render the line.
+ * Single-pixel width, no stipple, etc.  We rely on the 'draw' module
+ * to handle stippling and wide lines.
+ */
+void
+sp_setup_line(struct setup_context *setup,
+              const float (*v0)[4],
+              const float (*v1)[4])
+{
+   int x0 = (int) v0[0][0];   /* endpoints truncated to integer pixels */
+   int x1 = (int) v1[0][0];
+   int y0 = (int) v0[0][1];
+   int y1 = (int) v1[0][1];
+   int dx = x1 - x0;
+   int dy = y1 - y0;
+   int xstep, ystep;
+
+#if DEBUG_VERTS
+   debug_printf("Setup line:\n");
+   print_vertex(setup, v0);
+   print_vertex(setup, v1);
+#endif
+
+   if (setup->softpipe->no_rast)
+      return;
+
+   if (dx == 0 && dy == 0)
+      return;   /* both endpoints truncate to the same pixel: draw nothing */
+
+   if (!setup_line_coefficients(setup, v0, v1))
+      return;
+
+   assert(v0[0][0] < 1.0e9);
+   assert(v0[0][1] < 1.0e9);
+   assert(v1[0][0] < 1.0e9);
+   assert(v1[0][1] < 1.0e9);
+
+   if (dx < 0) {
+      dx = -dx;   /* make positive */
+      xstep = -1;
+   }
+   else {
+      xstep = 1;
+   }
+
+   if (dy < 0) {
+      dy = -dy;   /* make positive */
+      ystep = -1;
+   }
+   else {
+      ystep = 1;
+   }
+
+   assert(dx >= 0);
+   assert(dy >= 0);
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;   /* no quad started yet */
+   setup->quad[0].inout.mask = 0x0;
+
+   /* XXX temporary: set coverage to 1.0 so the line appears
+    * if AA mode happens to be enabled.
+    */
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
+
+   if (dx > dy) {
+      /*** X-major line: one pixel per x step, dx pixels in total ***/
+      int i;
+      const int errorInc = dy + dy;
+      int error = errorInc - dx;
+      const int errorDec = error - dx;
+
+      for (i = 0; i < dx; i++) {   /* the pixel at (x1, y1) is not plotted */
+         plot(setup, x0, y0);
+
+         x0 += xstep;
+         if (error < 0) {
+            error += errorInc;
+         }
+         else {
+            error += errorDec;
+            y0 += ystep;
+         }
+      }
+   }
+   else {
+      /*** Y-major line: one pixel per y step, dy pixels in total ***/
+      int i;
+      const int errorInc = dx + dx;
+      int error = errorInc - dy;
+      const int errorDec = error - dy;
+
+      for (i = 0; i < dy; i++) {   /* the pixel at (x1, y1) is not plotted */
+         plot(setup, x0, y0);
+
+         y0 += ystep;
+         if (error < 0) {
+            error += errorInc;
+         }
+         else {
+            error += errorDec;
+            x0 += xstep;
+         }
+      }
+   }
+
+   /* draw final quad */
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
+   }
+}
+
+/** Constant coefficient for a point: a0 = attribute * position[3]. */
+static void
+point_persp_coeff(const struct setup_context *setup,
+                  const float (*vert)[4],
+                  struct tgsi_interp_coef *coef,
+                  uint vertSlot, uint i)
+{
+   assert(i <= 3);
+   coef->dadx[i] = 0.0F;   /* no variation across the point */
+   coef->dady[i] = 0.0F;
+   coef->a0[i] = vert[vertSlot][i] * vert[0][3];   /* setup itself is not read here */
+}
+
+
+/**
+ * Do setup for point rasterization, then render the point as quads.
+ * Handles 1-pixel points, round (point_smooth) points with per-pixel
+ * coverage, and square points.  All attributes are constant over the point.
+ */
+void
+sp_setup_point(struct setup_context *setup,
+               const float (*v0)[4])
+{
+   struct softpipe_context *softpipe = setup->softpipe;
+   const struct sp_fragment_shader *spfs = softpipe->fs;
+   const int sizeAttr = setup->softpipe->psize_slot;
+   const float size   /* per-vertex size if psize_slot > 0, else the rasterizer's */
+      = sizeAttr > 0 ? v0[sizeAttr][0]
+      : setup->softpipe->rasterizer->point_size;
+   const float halfSize = 0.5F * size;
+   const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth;
+   const float x = v0[0][0];  /* Note: data[0] is always position */
+   const float y = v0[0][1];
+   const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
+   uint fragSlot;
+
+#if DEBUG_VERTS
+   debug_printf("Setup point:\n");
+   print_vertex(setup, v0);
+#endif
+
+   if (softpipe->no_rast)
+      return;
+
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_POINTS);
+
+   /* For points, all interpolants are constant-valued.
+    * However, for point sprites, we'll need to setup texcoords appropriately.
+    * XXX: which coefficients are the texcoords???
+    * We may do point sprites as textured quads...
+    *
+    * KW: We don't know which coefficients are texcoords - ultimately
+    * the choice of what interpolation mode to use for each attribute
+    * should be determined by the fragment program, using
+    * per-attribute declaration statements that include interpolation
+    * mode as a parameter.  So either the fragment program will have
+    * to be adjusted for pointsprite vs normal point behaviour, or
+    * otherwise a special interpolation mode will have to be defined
+    * which matches the required behaviour for point sprites.  But -
+    * the latter is not a feature of normal hardware, and as such
+    * probably should be ruled out on that basis.
+    */
+   setup->vprovoke = v0;
+
+   /* setup Z, W */
+   const_coeff(setup, &setup->posCoef, 0, 2);
+   const_coeff(setup, &setup->posCoef, 0, 3);
+
+   for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
+      uint j;
+
+      switch (vinfo->attrib[fragSlot].interp_mode) {
+      case INTERP_CONSTANT:
+         /* fall-through */
+      case INTERP_LINEAR:
+         for (j = 0; j < NUM_CHANNELS; j++)
+            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         break;
+      case INTERP_PERSPECTIVE:
+         for (j = 0; j < NUM_CHANNELS; j++)
+            point_persp_coeff(setup, setup->vprovoke,
+                              &setup->coef[fragSlot], vertSlot, j);
+         break;
+      case INTERP_POS:
+         setup_fragcoord_coeff(setup, fragSlot);
+         break;
+      default:
+         assert(0);
+      }
+
+      if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
+         /* convert 0 to 1.0 and 1 to -1.0; NOTE(review): setup->facing is
+          * not written in this function - confirm its value for points */
+         setup->coef[fragSlot].a0[0] = setup->facing * -2.0f + 1.0f;
+         setup->coef[fragSlot].dadx[0] = 0.0;
+         setup->coef[fragSlot].dady[0] = 0.0;
+      }
+   }
+
+
+   if (halfSize <= 0.5 && !round) {
+      /* special case for 1-pixel points: a single quad with one bit set */
+      const int ix = ((int) x) & 1;
+      const int iy = ((int) y) & 1;
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
+   }
+   else {
+      if (round) {
+         /* rounded points: coverage falls off between radius rmin and rmax */
+         const int ixmin = block((int) (x - halfSize));
+         const int ixmax = block((int) (x + halfSize));
+         const int iymin = block((int) (y - halfSize));
+         const int iymax = block((int) (y + halfSize));
+         const float rmin = halfSize - 0.7071F;  /* 0.7071 = sqrt(2)/2 */
+         const float rmax = halfSize + 0.7071F;
+         const float rmin2 = MAX2(0.0F, rmin * rmin);
+         const float rmax2 = rmax * rmax;
+         const float cscale = 1.0F / (rmax2 - rmin2);
+         int ix, iy;
+
+         for (iy = iymin; iy <= iymax; iy += 2) {
+            for (ix = ixmin; ix <= ixmax; ix += 2) {
+               float dx, dy, dist2, cover;
+
+               setup->quad[0].inout.mask = 0x0;
+
+               dx = (ix + 0.5f) - x;   /* top-left pixel center */
+               dy = (iy + 0.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
+               }
+
+               dx = (ix + 1.5f) - x;   /* top-right pixel center */
+               dy = (iy + 0.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
+               }
+
+               dx = (ix + 0.5f) - x;   /* bottom-left pixel center */
+               dy = (iy + 1.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
+               }
+
+               dx = (ix + 1.5f) - x;   /* bottom-right pixel center */
+               dy = (iy + 1.5f) - y;
+               dist2 = dx * dx + dy * dy;
+               if (dist2 <= rmax2) {
+                  cover = 1.0F - (dist2 - rmin2) * cscale;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
+               }
+
+               if (setup->quad[0].inout.mask) {
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
+               }
+            }
+         }
+      }
+      else {
+         /* square points: pixels [xmin, xmax) x [ymin, ymax) */
+         const int xmin = (int) (x + 0.75 - halfSize);
+         const int ymin = (int) (y + 0.25 - halfSize);
+         const int xmax = xmin + (int) size;
+         const int ymax = ymin + (int) size;
+         /* XXX could apply scissor to xmin,ymin,xmax,ymax now */
+         const int ixmin = block(xmin);
+         const int ixmax = block(xmax - 1);
+         const int iymin = block(ymin);
+         const int iymax = block(ymax - 1);
+         int ix, iy;
+
+         /*
+         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
+         */
+         for (iy = iymin; iy <= iymax; iy += 2) {
+            uint rowMask = 0xf;
+            if (iy < ymin) {
+               /* quad's top row is above the point: keep bottom row only */
+               rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+            }
+            if (iy + 1 >= ymax) {
+               /* quad's bottom row is below the point: keep top row only */
+               rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+            }
+
+            for (ix = ixmin; ix <= ixmax; ix += 2) {
+               uint mask = rowMask;
+
+               if (ix < xmin) {
+                  /* fragment is past left edge of point, turn off left bits */
+                  mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+               }
+               if (ix + 1 >= xmax) {
+                  /* past the right edge, turn off right bits */
+                  mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+               }
+
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
+            }
+         }
+      }
+   }
+}
+
+
+/**
+ * Called by vbuf code just before we start buffering primitives.
+ */
+void
+sp_setup_prepare(struct setup_context *setup)
+{
+   struct softpipe_context *sp = setup->softpipe;
+
+   if (sp->dirty) {
+      softpipe_update_derived(sp);   /* refresh derived state before use */
+   }
+
+   /* Note: nr_attrs is only used for debugging (vertex printing) */
+   setup->nr_vertex_attrs = draw_num_shader_outputs(sp->draw);
+
+   sp->quad.first->begin( sp->quad.first );   /* call begin() on the first quad stage */
+
+   if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
+       sp->rasterizer->fill_front == PIPE_POLYGON_MODE_FILL &&
+       sp->rasterizer->fill_back == PIPE_POLYGON_MODE_FILL) {
+      /* filled triangles on both faces: we'll do culling */
+      setup->cull_face = sp->rasterizer->cull_face;
+   }
+   else {
+      /* 'draw' will do culling */
+      setup->cull_face = PIPE_FACE_NONE;
+   }
+}
+
+/** Free the memory of a setup context. */
+void
+sp_setup_destroy_context(struct setup_context *setup)
+{
+   FREE( setup );
+}
+
+
+/**
+ * Create a new primitive setup/render stage; NULL if allocation fails.
+ */
+struct setup_context *
+sp_setup_create_context(struct softpipe_context *softpipe)
+{
+   struct setup_context *setup = CALLOC_STRUCT(setup_context);
+   unsigned i;
+
+   if (!setup)
+      return NULL;   /* out of memory: nothing to initialize */
+   setup->softpipe = softpipe;
+   for (i = 0; i < MAX_QUADS; i++) {   /* all quads share the context's coefs */
+      setup->quad[i].coef = setup->coef;
+      setup->quad[i].posCoef = &setup->posCoef;
+   }
+
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
+   return setup;
+}
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
new file mode 100644
index 0000000000..9c8844d2e8
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -0,0 +1,53 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef SP_SETUP_H
+#define SP_SETUP_H
+
+struct setup_context;
+struct softpipe_context;
+
+/*
+ * Primitive setup entry points, one per primitive type.
+ * Each vertex argument is a pointer to float[4].
+ */
+void
+sp_setup_tri(struct setup_context *setup,
+             const float (*v0)[4],
+             const float (*v1)[4],
+             const float (*v2)[4]);
+
+void
+sp_setup_line(struct setup_context *setup,
+              const float (*v0)[4],
+              const float (*v1)[4]);
+
+void
+sp_setup_point(struct setup_context *setup,
+               const float (*v0)[4]);
+
+
+/* Setup context creation, per-batch preparation and destruction. */
+struct setup_context *
+sp_setup_create_context(struct softpipe_context *softpipe);
+
+void
+sp_setup_prepare(struct setup_context *setup);
+
+void
+sp_setup_destroy_context(struct setup_context *setup);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
new file mode 100644
index 0000000000..7d6b86dce0
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -0,0 +1,300 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_STATE_H
+#define SP_STATE_H
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+
+/*
+ * State-change flags.  The state setters OR these bits into
+ * softpipe_context::dirty; softpipe_update_derived() tests them and then
+ * resets the field to zero.
+ */
+#define SP_NEW_VIEWPORT            (1 << 0)
+#define SP_NEW_RASTERIZER          (1 << 1)
+#define SP_NEW_FS                  (1 << 2)
+#define SP_NEW_BLEND               (1 << 3)
+#define SP_NEW_CLIP                (1 << 4)
+#define SP_NEW_SCISSOR             (1 << 5)
+#define SP_NEW_STIPPLE             (1 << 6)
+#define SP_NEW_FRAMEBUFFER         (1 << 7)
+#define SP_NEW_DEPTH_STENCIL_ALPHA (1 << 8)
+#define SP_NEW_CONSTANTS           (1 << 9)
+#define SP_NEW_SAMPLER             (1 << 10)
+#define SP_NEW_TEXTURE             (1 << 11)
+#define SP_NEW_VERTEX              (1 << 12)
+#define SP_NEW_VS                  (1 << 13)
+#define SP_NEW_QUERY               (1 << 14)
+#define SP_NEW_GS                  (1 << 15)
+#define SP_NEW_SO                  (1 << 16)
+#define SP_NEW_SO_BUFFERS          (1 << 17)
+
+
+/*
+ * Forward declarations of every type this header only uses through
+ * pointers.  Without them a tag that first appears inside a prototype's
+ * parameter list (e.g. quad_header, softpipe_context) gets prototype
+ * scope, so the header would only be correct when the includer had
+ * already pulled in the defining headers.
+ */
+struct pipe_context;
+struct quad_header;
+struct softpipe_context;
+struct tgsi_exec_machine;
+struct tgsi_sampler;
+struct vertex_info;
+
+
+/**
+ * Subclass of pipe_shader_state (though it doesn't really need to be).
+ *
+ * This is starting to look an awful lot like a quad pipeline stage...
+ *
+ * The prepare/run/delete members are function pointers filled in by
+ * whatever code creates the shader object (not visible in this header).
+ */
+struct sp_fragment_shader {
+   struct pipe_shader_state shader;
+
+   /* Shader summary; softpipe_get_vertex_info() reads num_inputs,
+    * input_interpolate[], input_semantic_name[] and
+    * input_semantic_index[] from here.
+    */
+   struct tgsi_shader_info info;
+
+   boolean origin_lower_left; /**< fragment shader uses lower left position origin? */
+   boolean pixel_center_integer; /**< fragment shader uses integer pixel center? */
+
+   /* NOTE(review): presumably called once before run() to set up the
+    * machine and samplers - confirm against the implementations.
+    */
+   void (*prepare)( const struct sp_fragment_shader *shader,
+		    struct tgsi_exec_machine *machine,
+		    struct tgsi_sampler **samplers);
+
+   /* Run the shader - this interface will get cleaned up in the
+    * future:
+    */
+   unsigned (*run)( const struct sp_fragment_shader *shader,
+		    struct tgsi_exec_machine *machine,
+		    struct quad_header *quad );
+
+
+   /* Destructor hook.  Note that 'delete' is a C++ keyword, so this
+    * header cannot be compiled as C++.
+    */
+   void (*delete)( struct sp_fragment_shader * );
+};
+
+
+/**
+ * Subclass of pipe_shader_state.
+ * Pairs the shader template with the draw module's vertex shader object.
+ */
+struct sp_vertex_shader {
+   struct pipe_shader_state shader;
+   struct draw_vertex_shader *draw_data;
+   int max_sampler;             /* -1 if no samplers */
+};
+
+/**
+ * Subclass of pipe_shader_state.
+ * Pairs the shader template with the draw module's geometry shader object.
+ */
+struct sp_geometry_shader {
+   struct pipe_shader_state shader;
+   struct draw_geometry_shader *draw_data;
+   /* NOTE(review): presumably -1 if no samplers, as in sp_vertex_shader - confirm */
+   int max_sampler;
+};
+
+/**
+ * Vertex elements state object: an array of up to PIPE_MAX_ATTRIBS
+ * vertex elements plus a count.
+ */
+struct sp_velems_state {
+   /* NOTE(review): presumably the number of valid entries in velem[] - confirm */
+   unsigned count;
+   struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
+};
+
+/** Stream output state object; currently just wraps the pipe state. */
+struct sp_so_state {
+   struct pipe_stream_output_state base;
+};
+
+
+/*
+ * Constant state objects: create / bind / delete triples.
+ * The create functions return an opaque pointer that is later handed
+ * back to the matching bind and delete functions.
+ */
+
+/* Blend state */
+void *
+softpipe_create_blend_state(struct pipe_context *,
+                            const struct pipe_blend_state *);
+void softpipe_bind_blend_state(struct pipe_context *,
+                               void *);
+void softpipe_delete_blend_state(struct pipe_context *,
+                                 void *);
+
+/* Sampler state (fragment, vertex and geometry shader bindings) */
+void *
+softpipe_create_sampler_state(struct pipe_context *,
+                              const struct pipe_sampler_state *);
+void softpipe_bind_sampler_states(struct pipe_context *, unsigned, void **);
+void
+softpipe_bind_vertex_sampler_states(struct pipe_context *,
+                                    unsigned num_samplers,
+                                    void **samplers);
+void
+softpipe_bind_geometry_sampler_states(struct pipe_context *,
+                                      unsigned num_samplers,
+                                      void **samplers);
+void softpipe_delete_sampler_state(struct pipe_context *, void *);
+
+/* Depth/stencil/alpha state */
+void *
+softpipe_create_depth_stencil_state(struct pipe_context *,
+                                    const struct pipe_depth_stencil_alpha_state *);
+void softpipe_bind_depth_stencil_state(struct pipe_context *, void *);
+void softpipe_delete_depth_stencil_state(struct pipe_context *, void *);
+
+/* Rasterizer state */
+void *
+softpipe_create_rasterizer_state(struct pipe_context *,
+                                 const struct pipe_rasterizer_state *);
+void softpipe_bind_rasterizer_state(struct pipe_context *, void *);
+void softpipe_delete_rasterizer_state(struct pipe_context *, void *);
+
+/*
+ * Immediate ("set") state: these take the new value directly rather
+ * than a previously created state object.
+ */
+void softpipe_set_framebuffer_state( struct pipe_context *,
+                                     const struct pipe_framebuffer_state * );
+
+void softpipe_set_blend_color( struct pipe_context *pipe,
+                               const struct pipe_blend_color *blend_color );
+
+void softpipe_set_stencil_ref( struct pipe_context *pipe,
+                               const struct pipe_stencil_ref *stencil_ref );
+
+void softpipe_set_clip_state( struct pipe_context *,
+                              const struct pipe_clip_state * );
+
+void softpipe_set_sample_mask( struct pipe_context *,
+                               unsigned sample_mask );
+
+void softpipe_set_constant_buffer(struct pipe_context *,
+                                  uint shader, uint index,
+                                  struct pipe_resource *buf);
+
+/* Fragment, vertex and geometry shader state objects */
+void *softpipe_create_fs_state(struct pipe_context *,
+                               const struct pipe_shader_state *);
+void softpipe_bind_fs_state(struct pipe_context *, void *);
+void softpipe_delete_fs_state(struct pipe_context *, void *);
+void *softpipe_create_vs_state(struct pipe_context *,
+                               const struct pipe_shader_state *);
+void softpipe_bind_vs_state(struct pipe_context *, void *);
+void softpipe_delete_vs_state(struct pipe_context *, void *);
+void *softpipe_create_gs_state(struct pipe_context *,
+                               const struct pipe_shader_state *);
+void softpipe_bind_gs_state(struct pipe_context *, void *);
+void softpipe_delete_gs_state(struct pipe_context *, void *);
+
+/* Vertex elements state object */
+void *softpipe_create_vertex_elements_state(struct pipe_context *,
+                                            unsigned count,
+                                            const struct pipe_vertex_element *);
+void softpipe_bind_vertex_elements_state(struct pipe_context *, void *);
+void softpipe_delete_vertex_elements_state(struct pipe_context *, void *);
+
+/* Polygon stipple and scissor rectangle */
+void softpipe_set_polygon_stipple( struct pipe_context *,
+                                   const struct pipe_poly_stipple * );
+
+void softpipe_set_scissor_state( struct pipe_context *,
+                                 const struct pipe_scissor_state * );
+
+/* Sampler views (fragment, vertex and geometry shader bindings) */
+void softpipe_set_sampler_views( struct pipe_context *,
+                                 unsigned num,
+                                 struct pipe_sampler_view ** );
+
+void
+softpipe_set_vertex_sampler_views(struct pipe_context *,
+                                  unsigned num,
+                                  struct pipe_sampler_view **);
+
+void
+softpipe_set_geometry_sampler_views(struct pipe_context *,
+                                    unsigned num,
+                                    struct pipe_sampler_view **);
+
+/* Sampler view creation and destruction */
+struct pipe_sampler_view *
+softpipe_create_sampler_view(struct pipe_context *pipe,
+                             struct pipe_resource *texture,
+                             const struct pipe_sampler_view *templ);
+
+void
+softpipe_sampler_view_destroy(struct pipe_context *pipe,
+                              struct pipe_sampler_view *view);
+
+/* Viewport and vertex buffers */
+void softpipe_set_viewport_state( struct pipe_context *,
+                                  const struct pipe_viewport_state * );
+
+void softpipe_set_vertex_buffers(struct pipe_context *,
+                                 unsigned count,
+                                 const struct pipe_vertex_buffer *);
+
+
+/* Recompute state derived from the flags accumulated in the context's
+ * 'dirty' field, then clear that field.
+ */
+void softpipe_update_derived( struct softpipe_context *softpipe );
+
+
+/* Drawing entry points */
+void softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                          unsigned start, unsigned count);
+
+void softpipe_draw_elements(struct pipe_context *pipe,
+                            struct pipe_resource *indexBuffer,
+                            unsigned indexSize, int indexBias,
+                            unsigned mode, unsigned start, unsigned count);
+void
+softpipe_draw_range_elements(struct pipe_context *pipe,
+                             struct pipe_resource *indexBuffer,
+                             unsigned indexSize,
+                             int indexBias,
+                             unsigned min_index,
+                             unsigned max_index,
+                             unsigned mode, unsigned start, unsigned count);
+
+/* Instanced variants */
+void
+softpipe_draw_arrays_instanced(struct pipe_context *pipe,
+                               unsigned mode,
+                               unsigned start,
+                               unsigned count,
+                               unsigned startInstance,
+                               unsigned instanceCount);
+
+void
+softpipe_draw_elements_instanced(struct pipe_context *pipe,
+                                 struct pipe_resource *indexBuffer,
+                                 unsigned indexSize,
+                                 int indexBias,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count,
+                                 unsigned startInstance,
+                                 unsigned instanceCount);
+
+void softpipe_draw_stream_output(struct pipe_context *pipe, unsigned mode);
+
+/* Mapping / unmapping of transfers and texture surfaces */
+void
+softpipe_map_transfers(struct softpipe_context *sp);
+
+void
+softpipe_unmap_transfers(struct softpipe_context *sp);
+
+void
+softpipe_map_texture_surfaces(struct softpipe_context *sp);
+
+void
+softpipe_unmap_texture_surfaces(struct softpipe_context *sp);
+
+
+/* Vertex layout queries: the first returns the layout used for setup,
+ * the second the pass-through layout used by the vbuf module.
+ */
+struct vertex_info *
+softpipe_get_vertex_info(struct softpipe_context *softpipe);
+
+struct vertex_info *
+softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe);
+
+/* Stream output state object and buffers */
+void *
+softpipe_create_stream_output_state(
+   struct pipe_context *pipe,
+   const struct pipe_stream_output_state *templ);
+void
+softpipe_bind_stream_output_state(struct pipe_context *pipe,
+                                  void *so);
+void
+softpipe_delete_stream_output_state(struct pipe_context *pipe, void *so);
+
+void
+softpipe_set_stream_output_buffers(struct pipe_context *pipe,
+                                   struct pipe_resource **buffers,
+                                   int *offsets,
+                                   int num_buffers);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
new file mode 100644
index 0000000000..2a203f44e5
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -0,0 +1,120 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+#include "sp_context.h"
+#include "sp_state.h"
+
+
+/**
+ * Create a blend state object: a copy of the caller's template made
+ * with mem_dup().
+ */
+void *
+softpipe_create_blend_state(struct pipe_context *pipe,
+                            const struct pipe_blend_state *blend)
+{
+   const unsigned size = sizeof(struct pipe_blend_state);
+
+   return mem_dup(blend, size);
+}
+
+/**
+ * Make the given blend state object current and flag SP_NEW_BLEND.
+ * The draw module is flushed before the state changes.
+ */
+void
+softpipe_bind_blend_state(struct pipe_context *pipe, void *blend)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   draw_flush(sp->draw);
+
+   sp->blend = (struct pipe_blend_state *) blend;
+   sp->dirty |= SP_NEW_BLEND;
+}
+
+/**
+ * Delete a blend state object by releasing its memory with FREE().
+ */
+void
+softpipe_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend);
+}
+
+
+/**
+ * Store a copy of the blend color in the context and flag SP_NEW_BLEND.
+ * The draw module is flushed before the state changes.
+ */
+void
+softpipe_set_blend_color(struct pipe_context *pipe,
+                         const struct pipe_blend_color *blend_color)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   draw_flush(sp->draw);
+
+   sp->blend_color = *blend_color;   /* struct copy */
+   sp->dirty |= SP_NEW_BLEND;
+}
+
+
+/** XXX move someday?  Or consolidate all these simple state setters
+ * into one file.
+ */
+
+
+/**
+ * Create a depth/stencil/alpha state object: a copy of the caller's
+ * template made with mem_dup().
+ */
+void *
+softpipe_create_depth_stencil_state(struct pipe_context *pipe,
+                                    const struct pipe_depth_stencil_alpha_state *depth_stencil)
+{
+   const unsigned size = sizeof(struct pipe_depth_stencil_alpha_state);
+
+   return mem_dup(depth_stencil, size);
+}
+
+/**
+ * Make the given depth/stencil/alpha state object current and flag
+ * SP_NEW_DEPTH_STENCIL_ALPHA.
+ */
+void
+softpipe_bind_depth_stencil_state(struct pipe_context *pipe,
+                                  void *depth_stencil)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   sp->depth_stencil = (struct pipe_depth_stencil_alpha_state *) depth_stencil;
+   sp->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA;
+}
+
+/**
+ * Delete a depth/stencil/alpha state object by releasing its memory
+ * with FREE().
+ */
+void
+softpipe_delete_depth_stencil_state(struct pipe_context *pipe, void *depth)
+{
+   FREE(depth);
+}
+
+/**
+ * Store a copy of the stencil reference values in the context and flag
+ * SP_NEW_DEPTH_STENCIL_ALPHA.
+ */
+void
+softpipe_set_stencil_ref(struct pipe_context *pipe,
+                         const struct pipe_stencil_ref *stencil_ref)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   sp->stencil_ref = *stencil_ref;   /* struct copy */
+   sp->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA;
+}
+
+/**
+ * Set the sample mask.
+ *
+ * Intentionally empty: both arguments are ignored and no context state
+ * is touched.
+ */
+void
+softpipe_set_sample_mask(struct pipe_context *pipe,
+                         unsigned sample_mask)
+{
+}
+
diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c
new file mode 100644
index 0000000000..4946c776e3
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_clip.c
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+#include "sp_context.h"
+#include "sp_state.h"
+#include "draw/draw_context.h"
+
+
+/**
+ * Set the clip state.  Softpipe keeps no copy of it; the state is just
+ * forwarded to the draw module.
+ */
+void
+softpipe_set_clip_state(struct pipe_context *pipe,
+                        const struct pipe_clip_state *clip)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   draw_set_clip_state(sp->draw, clip);
+}
+
+
+/**
+ * Set the viewport: forward it to the draw module, keep a copy in the
+ * context and flag SP_NEW_VIEWPORT.
+ */
+void
+softpipe_set_viewport_state(struct pipe_context *pipe,
+                            const struct pipe_viewport_state *viewport)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   /* the draw module needs the viewport info too */
+   draw_set_viewport_state(sp->draw, viewport);
+
+   sp->viewport = *viewport;   /* struct copy */
+   sp->dirty |= SP_NEW_VIEWPORT;
+}
+
+
+/**
+ * Set the scissor rectangle: keep a copy in the context and flag
+ * SP_NEW_SCISSOR.  The draw module is flushed before the state changes.
+ */
+void
+softpipe_set_scissor_state(struct pipe_context *pipe,
+                           const struct pipe_scissor_state *scissor)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   draw_flush(sp->draw);
+
+   sp->scissor = *scissor;   /* struct copy */
+   sp->dirty |= SP_NEW_SCISSOR;
+}
+
+
+/**
+ * Set the polygon stipple pattern: keep a copy in the context and flag
+ * SP_NEW_STIPPLE.  The draw module is flushed before the state changes.
+ */
+void
+softpipe_set_polygon_stipple(struct pipe_context *pipe,
+                             const struct pipe_poly_stipple *stipple)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   draw_flush(sp->draw);
+
+   sp->poly_stipple = *stipple;   /* struct copy */
+   sp->dirty |= SP_NEW_STIPPLE;
+}
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
new file mode 100644
index 0000000000..3ba4d934fd
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -0,0 +1,281 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "sp_context.h"
+#include "sp_screen.h"
+#include "sp_state.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
+
+
+/**
+ * Flag the current vertex layout as stale by zeroing its attribute count.
+ * softpipe_get_vertex_info() rebuilds the layout the next time it is
+ * called and finds the count at zero.
+ */
+static void
+invalidate_vertex_layout(struct softpipe_context *softpipe)
+{
+   struct vertex_info *vinfo = &softpipe->vertex_info;
+
+   vinfo->num_attribs = 0;
+}
+
+
+/**
+ * Return the vertex_info that tells how the post-transformed vertices
+ * (simple float[][4]) produced by the 'draw' module are converted into
+ * vertices for rasterization.
+ *
+ * If the layout is currently invalid (attribute count of zero) it is
+ * recomputed first, together with the pass-through layout used by the
+ * vbuf module.
+ */
+struct vertex_info *
+softpipe_get_vertex_info(struct softpipe_context *softpipe)
+{
+   struct vertex_info *vinfo = &softpipe->vertex_info;
+   struct vertex_info *vbuf_info = &softpipe->vertex_info_vbuf;
+   const struct sp_fragment_shader *fs;
+   uint num_outputs;
+   uint slot;
+
+   /* A non-zero attribute count means the layout is still valid. */
+   if (vinfo->num_attribs != 0)
+      return vinfo;
+
+   fs = softpipe->fs;
+   num_outputs = draw_num_shader_outputs(softpipe->draw);
+
+   /* Tell draw_vbuf to simply emit the whole post-xform vertex
+    * as-is.  No longer any need to try and emit draw vertex_header
+    * info.
+    */
+   vbuf_info->num_attribs = 0;
+   for (slot = 0; slot < num_outputs; slot++)
+      draw_emit_vertex_attr(vbuf_info, EMIT_4F, INTERP_PERSPECTIVE, slot);
+   draw_compute_vertex_size(vbuf_info);
+
+   /*
+    * For every fragment shader input, look up the matching vertex
+    * shader output and pick its interpolation mode.
+    */
+   vinfo->num_attribs = 0;
+   for (slot = 0; slot < fs->info.num_inputs; slot++) {
+      const uint sem_name = fs->info.input_semantic_name[slot];
+      const uint sem_index = fs->info.input_semantic_index[slot];
+      enum interp_mode interp;
+      int src;
+
+      switch (fs->info.input_interpolate[slot]) {
+      case TGSI_INTERPOLATE_CONSTANT:
+         interp = INTERP_CONSTANT;
+         break;
+      case TGSI_INTERPOLATE_LINEAR:
+         interp = INTERP_LINEAR;
+         break;
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+         interp = INTERP_PERSPECTIVE;
+         break;
+      default:
+         assert(0);
+         interp = INTERP_LINEAR;
+         break;
+      }
+
+      /* The semantic can override the declared interpolation:
+       * position always uses INTERP_POS, and colors are constant
+       * when flat shading is enabled.
+       */
+      if (sem_name == TGSI_SEMANTIC_POSITION) {
+         interp = INTERP_POS;
+      }
+      else if (sem_name == TGSI_SEMANTIC_COLOR &&
+               softpipe->rasterizer->flatshade) {
+         interp = INTERP_CONSTANT;
+      }
+
+      /* this includes texcoords and varying vars */
+      src = draw_find_shader_output(softpipe->draw, sem_name, sem_index);
+      draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
+   }
+
+   /* Append the point size output when it was found in a slot > 0. */
+   softpipe->psize_slot = draw_find_shader_output(softpipe->draw,
+                                                  TGSI_SEMANTIC_PSIZE, 0);
+   if (softpipe->psize_slot > 0)
+      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
+                            softpipe->psize_slot);
+
+   draw_compute_vertex_size(vinfo);
+
+   return vinfo;
+}
+
+
+/**
+ * Called from vbuf module.
+ *
+ * Note that there are actually two different vertex layouts in softpipe.
+ *
+ * The normal one is computed in softpipe_get_vertex_info() above and is
+ * used by the point/line/tri "setup" code.
+ *
+ * The other one (returned here) is only used by the vbuf module (which
+ * is not normally used by default but used in testing).  For the vbuf
+ * module, we basically want to pass-through the draw module's vertex
+ * layout as-is.  When the softpipe vbuf code begins drawing, the normal
+ * vertex layout will come into play again.
+ */
+struct vertex_info *
+softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
+{
+   /* Called only for its side effect: it (re)validates both layouts. */
+   (void) softpipe_get_vertex_info(softpipe);
+
+   return &softpipe->vertex_info_vbuf;
+}
+
+
+/**
+ * Recompute the clip rectangle from the framebuffer size and, when the
+ * rasterizer enables scissoring, the scissor bounds.
+ */
+static void
+compute_cliprect(struct softpipe_context *sp)
+{
+   /* SP_NEW_FRAMEBUFFER */
+   const uint fb_width = sp->framebuffer.width;
+   const uint fb_height = sp->framebuffer.height;
+
+   /* SP_NEW_RASTERIZER */
+   if (!sp->rasterizer->scissor) {
+      /* no scissoring: clip to surface bounds */
+      sp->cliprect.minx = 0;
+      sp->cliprect.miny = 0;
+      sp->cliprect.maxx = fb_width;
+      sp->cliprect.maxy = fb_height;
+      return;
+   }
+
+   /* SP_NEW_SCISSOR
+    *
+    * clip to the scissor rect, itself limited to the surface:
+    */
+   sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
+   sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
+   sp->cliprect.maxx = MIN2(sp->scissor.maxx, fb_width);
+   sp->cliprect.maxy = MIN2(sp->scissor.maxy, fb_height);
+}
+
+
+/**
+ * If the tile cache has a texture whose timestamp differs from the
+ * cache's, re-validate the cache and adopt the texture's timestamp.
+ */
+static void
+validate_tex_cache_timestamp(struct softpipe_tex_tile_cache *tc)
+{
+   struct softpipe_resource *spt;
+
+   if (!tc->texture)
+      return;
+
+   spt = softpipe_resource(tc->texture);
+   if (spt->timestamp == tc->timestamp)
+      return;
+
+   sp_tex_tile_cache_validate_texture(tc);
+   tc->timestamp = spt->timestamp;
+}
+
+
+/**
+ * Reset the sampler variants, then bring the fragment, vertex and
+ * geometry texture tile caches up to date with their textures.
+ */
+static void
+update_tgsi_samplers( struct softpipe_context *softpipe )
+{
+   unsigned unit;
+
+   softpipe_reset_sampler_varients(softpipe);
+
+   for (unit = 0; unit < PIPE_MAX_SAMPLERS; unit++)
+      validate_tex_cache_timestamp(softpipe->tex_cache[unit]);
+
+   for (unit = 0; unit < PIPE_MAX_VERTEX_SAMPLERS; unit++)
+      validate_tex_cache_timestamp(softpipe->vertex_tex_cache[unit]);
+
+   for (unit = 0; unit < PIPE_MAX_GEOMETRY_SAMPLERS; unit++)
+      validate_tex_cache_timestamp(softpipe->geometry_tex_cache[unit]);
+}
+
+
+/**
+ * Recompute everything that depends on the state flagged in
+ * softpipe->dirty, then clear the dirty flags.
+ *
+ * Hopefully this will remain quite simple, otherwise need to pull in
+ * something like the state tracker mechanism.
+ */
+void
+softpipe_update_derived(struct softpipe_context *softpipe)
+{
+   /* Which dirty bits each derived-state step depends on. */
+   const unsigned sampler_deps = (SP_NEW_SAMPLER | SP_NEW_TEXTURE |
+                                  SP_NEW_FS | SP_NEW_VS);
+   const unsigned layout_deps = (SP_NEW_RASTERIZER | SP_NEW_FS | SP_NEW_VS);
+   const unsigned cliprect_deps = (SP_NEW_SCISSOR | SP_NEW_RASTERIZER |
+                                   SP_NEW_FRAMEBUFFER);
+   const unsigned quad_deps = (SP_NEW_BLEND | SP_NEW_DEPTH_STENCIL_ALPHA |
+                               SP_NEW_FRAMEBUFFER | SP_NEW_FS);
+   struct softpipe_screen *sp_screen = softpipe_screen(softpipe->pipe.screen);
+
+   /* A changed screen timestamp means textures were updated. */
+   if (softpipe->tex_timestamp != sp_screen->timestamp) {
+      softpipe->tex_timestamp = sp_screen->timestamp;
+      softpipe->dirty |= SP_NEW_TEXTURE;
+   }
+
+   if (softpipe->dirty & sampler_deps) {
+      update_tgsi_samplers(softpipe);
+   }
+
+   if (softpipe->dirty & layout_deps) {
+      invalidate_vertex_layout(softpipe);
+   }
+
+   if (softpipe->dirty & cliprect_deps) {
+      compute_cliprect(softpipe);
+   }
+
+   if (softpipe->dirty & quad_deps) {
+      sp_build_quad_pipeline(softpipe);
+   }
+
+   softpipe->dirty = 0;
+}
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
new file mode 100644
index 0000000000..3fbf1f2578
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -0,0 +1,266 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_fs.h"
+#include "sp_texture.h"
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vs.h"
+#include "draw/draw_gs.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/**
+ * Create a fragment shader state object from a shader template.
+ *
+ * softpipe_create_fs_sse() is tried first; if it returns NULL,
+ * softpipe_create_fs_exec() is used instead.  The shader is then scanned
+ * with tgsi_scan_shader() and the FS_COORD_ORIGIN / FS_COORD_PIXEL_CENTER
+ * properties are cached on the returned object.
+ *
+ * \param pipe   the softpipe context
+ * \param templ  shader template; its tokens are dumped when dump_fs is set
+ * \return the new sp_fragment_shader, or NULL if neither back-end could
+ *         create one
+ */
+void *
+softpipe_create_fs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_fragment_shader *state;
+   unsigned i;
+
+   /* debug */
+   if (softpipe->dump_fs) 
+      tgsi_dump(templ->tokens, 0);
+
+   /* codegen */
+   state = softpipe_create_fs_sse( softpipe, templ );
+   if (!state) {
+      state = softpipe_create_fs_exec( softpipe, templ );
+   }
+
+   /* Still flag the failure loudly in debug builds, but do not
+    * dereference a NULL pointer when asserts are compiled out.
+    */
+   assert(state);
+   if (!state)
+      return NULL;
+
+   /* get/save the summary info for this shader */
+   tgsi_scan_shader(templ->tokens, &state->info);
+
+   for (i = 0; i < state->info.num_properties; ++i) {
+      if (state->info.properties[i].name == TGSI_PROPERTY_FS_COORD_ORIGIN)
+         state->origin_lower_left = state->info.properties[i].data[0];
+      else if (state->info.properties[i].name == TGSI_PROPERTY_FS_COORD_PIXEL_CENTER)
+         state->pixel_center_integer = state->info.properties[i].data[0];
+   }
+
+   return state;
+}
+
+
+/**
+ * Bind a fragment shader state object.
+ *
+ * Re-binding the currently bound shader is a no-op.  Otherwise the draw
+ * module is flushed once before the new shader is installed and
+ * SP_NEW_FS is flagged.
+ *
+ * \param pipe  the softpipe context
+ * \param fs    fragment shader object to bind (stored as-is)
+ */
+void
+softpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   /* Check for no-op first: nothing changes, so nothing needs flushing. */
+   if (softpipe->fs == fs)
+      return;
+
+   draw_flush(softpipe->draw);
+
+   softpipe->fs = fs;
+
+   softpipe->dirty |= SP_NEW_FS;
+}
+
+
+/**
+ * Destroy a fragment shader state object.
+ *
+ * The shader must not be the one currently bound to the context.  If the
+ * context's TGSI machine still points at this shader's tokens, the
+ * machine is unbound first; the object is then released through its own
+ * delete() callback.
+ */
+void
+softpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_fragment_shader *shader = fs;
+
+   assert(fs != softpipe->fs);
+
+   /* Don't leave the executor referencing tokens we are about to drop. */
+   if (shader->shader.tokens == softpipe->fs_machine->Tokens) {
+      tgsi_exec_machine_bind_shader(softpipe->fs_machine, NULL, 0, NULL);
+   }
+
+   shader->delete( shader );
+}
+
+
+/**
+ * Create a vertex shader state object.
+ *
+ * The template's tokens are duplicated, a draw-module vertex shader is
+ * created from the template, and the highest sampler index used by the
+ * shader is cached in max_sampler.
+ *
+ * \return the new sp_vertex_shader, or NULL on any failure
+ */
+void *
+softpipe_create_vs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_vertex_shader *vs = CALLOC_STRUCT(sp_vertex_shader);
+
+   if (!vs)
+      return NULL;
+
+   /* The caller's tokens will go away, so keep a private copy. */
+   vs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (vs->shader.tokens != NULL) {
+      vs->draw_data = draw_create_vertex_shader(softpipe->draw, templ);
+      if (vs->draw_data != NULL) {
+         vs->max_sampler = vs->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+         return vs;
+      }
+   }
+
+   /* Failure: release whatever was set up.  draw_data is always NULL
+    * when we get here.
+    */
+   FREE( (void *)vs->shader.tokens );
+   FREE( vs->draw_data );
+   FREE( vs );
+   return NULL;
+}
+
+
+/**
+ * Bind a vertex shader state object (may be NULL).
+ *
+ * The shader is recorded on the context, its draw-module counterpart (or
+ * NULL) is bound in the draw module, and SP_NEW_VS is flagged.
+ */
+void
+softpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_vertex_shader *shader = (struct sp_vertex_shader *) vs;
+
+   softpipe->vs = shader;
+
+   if (shader) {
+      draw_bind_vertex_shader(softpipe->draw, shader->draw_data);
+   }
+   else {
+      draw_bind_vertex_shader(softpipe->draw, NULL);
+   }
+
+   softpipe->dirty |= SP_NEW_VS;
+}
+
+
+/**
+ * Destroy a vertex shader state object: delete its draw-module shader,
+ * then free the private token copy and the object itself.
+ */
+void
+softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
+{
+   struct sp_vertex_shader *shader = (struct sp_vertex_shader *) vs;
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   draw_delete_vertex_shader(softpipe->draw, shader->draw_data);
+
+   FREE( (void *)shader->shader.tokens );
+   FREE( shader );
+}
+
+/**
+ * Set (or clear, when constants is NULL) a constant buffer slot.
+ *
+ * The draw module is flushed, the context takes a reference on the new
+ * resource, and the resource's data pointer is recorded in
+ * mapped_constants.  For vertex and geometry shaders the data pointer and
+ * size are also handed to the draw module.
+ *
+ * \param shader     shader stage; must be below PIPE_SHADER_TYPES
+ * \param index      constant buffer slot within that stage
+ * \param constants  the buffer resource, or NULL
+ */
+void
+softpipe_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             struct pipe_resource *constants)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   const void *data = NULL;
+   unsigned size = 0;
+
+   if (constants) {
+      size = constants->width0;
+      data = softpipe_resource(constants)->data;
+   }
+
+   assert(shader < PIPE_SHADER_TYPES);
+
+   draw_flush(softpipe->draw);
+
+   /* note: reference counting */
+   pipe_resource_reference(&softpipe->constants[shader][index], constants);
+
+   /* Only these two stages are forwarded to the draw module. */
+   if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) {
+      draw_set_mapped_constant_buffer(softpipe->draw, shader, index, data, size);
+   }
+
+   softpipe->mapped_constants[shader][index] = data;
+   softpipe->dirty |= SP_NEW_CONSTANTS;
+}
+
+
+/**
+ * Create a geometry shader state object.
+ *
+ * The template's tokens are duplicated (and dumped when dump_gs is set),
+ * a draw-module geometry shader is created from the template, and the
+ * highest sampler index used by the shader is cached in max_sampler.
+ *
+ * \return the new sp_geometry_shader, or NULL on any failure
+ */
+void *
+softpipe_create_gs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_geometry_shader *gs = CALLOC_STRUCT(sp_geometry_shader);
+
+   if (!gs)
+      return NULL;
+
+   /* debug */
+   if (softpipe->dump_gs) {
+      tgsi_dump(templ->tokens, 0);
+   }
+
+   /* The caller's tokens will go away, so keep a private copy. */
+   gs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (gs->shader.tokens != NULL) {
+      gs->draw_data = draw_create_geometry_shader(softpipe->draw, templ);
+      if (gs->draw_data != NULL) {
+         gs->max_sampler = gs->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+         return gs;
+      }
+   }
+
+   /* Failure: release whatever was set up.  draw_data is always NULL
+    * when we get here.
+    */
+   FREE( (void *)gs->shader.tokens );
+   FREE( gs->draw_data );
+   FREE( gs );
+   return NULL;
+}
+
+
+/**
+ * Bind a geometry shader state object (may be NULL).
+ *
+ * The shader is recorded on the context, its draw-module counterpart (or
+ * NULL) is bound in the draw module, and SP_NEW_GS is flagged.
+ */
+void
+softpipe_bind_gs_state(struct pipe_context *pipe, void *gs)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_geometry_shader *shader = (struct sp_geometry_shader *) gs;
+
+   softpipe->gs = shader;
+
+   if (shader) {
+      draw_bind_geometry_shader(softpipe->draw, shader->draw_data);
+   }
+   else {
+      draw_bind_geometry_shader(softpipe->draw, NULL);
+   }
+
+   softpipe->dirty |= SP_NEW_GS;
+}
+
+
+/**
+ * Destroy a geometry shader state object.
+ *
+ * Deletes the draw-module geometry shader, then frees the private copy
+ * of the TGSI tokens made by softpipe_create_gs_state() and the object
+ * itself.  A NULL gs is tolerated; the draw module is then called with a
+ * null shader, as before.
+ */
+void
+softpipe_delete_gs_state(struct pipe_context *pipe, void *gs)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   struct sp_geometry_shader *state =
+      (struct sp_geometry_shader *)gs;
+
+   draw_delete_geometry_shader(softpipe->draw,
+                               (state) ? state->draw_data : 0);
+
+   if (state) {
+      /* The tokens were duplicated at create time and are owned by us;
+       * without this they leak on every shader delete.
+       */
+      FREE( (void *)state->shader.tokens );
+      FREE(state);
+   }
+}
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
new file mode 100644
index 0000000000..c9ede09f26
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -0,0 +1,65 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "sp_context.h"
+#include "sp_state.h"
+#include "draw/draw_context.h"
+
+
+
+/**
+ * Create a rasterizer state object: a mem_dup() copy of the caller's
+ * pipe_rasterizer_state.  Returns whatever mem_dup() returns.
+ */
+void *
+softpipe_create_rasterizer_state(struct pipe_context *pipe,
+                                 const struct pipe_rasterizer_state *rast)
+{
+   void *copy = mem_dup(rast, sizeof(struct pipe_rasterizer_state));
+
+   return copy;
+}
+
+/**
+ * Bind a rasterizer state object.
+ *
+ * Re-binding the current object does nothing.  Otherwise the state is
+ * passed on to the draw module, recorded on the context, and
+ * SP_NEW_RASTERIZER is flagged.
+ */
+void softpipe_bind_rasterizer_state(struct pipe_context *pipe,
+                                    void *rasterizer)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   if (softpipe->rasterizer != rasterizer) {
+      /* pass-through to draw module */
+      draw_set_rasterizer_state(softpipe->draw, rasterizer, rasterizer);
+
+      softpipe->rasterizer = rasterizer;
+      softpipe->dirty |= SP_NEW_RASTERIZER;
+   }
+}
+
+/**
+ * Destroy a rasterizer state object by freeing it.  The context is not
+ * consulted.
+ */
+void softpipe_delete_rasterizer_state(struct pipe_context *pipe,
+                                      void *rasterizer)
+{
+   (void) pipe;
+
+   FREE( rasterizer );
+}
+
+
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
new file mode 100644
index 0000000000..79d9516ad9
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -0,0 +1,416 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:
+ *  Brian Paul
+ */
+
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_context.h"
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_texture.h"
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
+
+
+/**
+ * Softpipe's sampler state object: the gallium sampler state plus the
+ * sampler varients that have been created for it so far.
+ */
+struct sp_sampler {
+   struct pipe_sampler_state base;       /* must stay first: sp_sampler() casts a pipe_sampler_state pointer to this struct */
+   struct sp_sampler_varient *varients;  /* head of the list of varients, linked through ->next */
+   struct sp_sampler_varient *current;   /* varient most recently returned by get_sampler_varient() */
+};
+
+/** Cast wrapper: view a pipe_sampler_state pointer as its sp_sampler. */
+static struct sp_sampler *sp_sampler( struct pipe_sampler_state *sampler )
+{
+   struct sp_sampler *sp = (struct sp_sampler *) sampler;
+
+   return sp;
+}
+
+
+/**
+ * Create a sampler state object holding a copy of the given sampler
+ * state and an initially empty list of varients.
+ *
+ * \param pipe     the context (unused)
+ * \param sampler  sampler state to copy
+ * \return the new sp_sampler, or NULL if allocation fails
+ */
+void *
+softpipe_create_sampler_state(struct pipe_context *pipe,
+                              const struct pipe_sampler_state *sampler)
+{
+   struct sp_sampler *sp_sampler = CALLOC_STRUCT(sp_sampler);
+
+   /* Allocation can fail; don't write through a NULL pointer. */
+   if (!sp_sampler)
+      return NULL;
+
+   sp_sampler->base = *sampler;
+   sp_sampler->varients = NULL;
+
+   return (void *)sp_sampler;
+}
+
+
+/**
+ * Bind the fragment sampler state objects.
+ *
+ * The first num slots take the given samplers and every remaining slot
+ * is cleared.  Nothing happens when the count and the first num pointers
+ * already match what is bound.
+ */
+void
+softpipe_bind_sampler_states(struct pipe_context *pipe,
+                             unsigned num, void **sampler)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == softpipe->num_samplers &&
+       memcmp(softpipe->sampler, sampler, num * sizeof(void *)) == 0) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (slot = 0; slot < PIPE_MAX_SAMPLERS; ++slot) {
+      if (slot < num)
+         softpipe->sampler[slot] = sampler[slot];
+      else
+         softpipe->sampler[slot] = NULL;
+   }
+
+   softpipe->num_samplers = num;
+   softpipe->dirty |= SP_NEW_SAMPLER;
+}
+
+
+/**
+ * Bind the vertex sampler state objects.
+ *
+ * The first num_samplers slots take the given samplers and every
+ * remaining slot is cleared.  Nothing happens when the count and the
+ * first num_samplers pointers already match what is bound.
+ */
+void
+softpipe_bind_vertex_sampler_states(struct pipe_context *pipe,
+                                    unsigned num_samplers,
+                                    void **samplers)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned slot;
+
+   assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num_samplers == softpipe->num_vertex_samplers &&
+       memcmp(softpipe->vertex_samplers, samplers,
+              num_samplers * sizeof(void *)) == 0) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (slot = 0; slot < PIPE_MAX_VERTEX_SAMPLERS; ++slot) {
+      if (slot < num_samplers)
+         softpipe->vertex_samplers[slot] = samplers[slot];
+      else
+         softpipe->vertex_samplers[slot] = NULL;
+   }
+
+   softpipe->num_vertex_samplers = num_samplers;
+   softpipe->dirty |= SP_NEW_SAMPLER;
+}
+
+/**
+ * Bind the geometry sampler state objects.
+ *
+ * The first num_samplers slots take the given samplers and every
+ * remaining slot is cleared.  Nothing happens when the count and the
+ * first num_samplers pointers already match what is bound.
+ */
+void
+softpipe_bind_geometry_sampler_states(struct pipe_context *pipe,
+                                      unsigned num_samplers,
+                                      void **samplers)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned slot;
+
+   assert(num_samplers <= PIPE_MAX_GEOMETRY_SAMPLERS);
+
+   /* Check for no-op */
+   if (num_samplers == softpipe->num_geometry_samplers &&
+       memcmp(softpipe->geometry_samplers, samplers,
+              num_samplers * sizeof(void *)) == 0) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (slot = 0; slot < PIPE_MAX_GEOMETRY_SAMPLERS; ++slot) {
+      if (slot < num_samplers)
+         softpipe->geometry_samplers[slot] = samplers[slot];
+      else
+         softpipe->geometry_samplers[slot] = NULL;
+   }
+
+   softpipe->num_geometry_samplers = num_samplers;
+   softpipe->dirty |= SP_NEW_SAMPLER;
+}
+
+
+/**
+ * Create a sampler view of a resource.
+ *
+ * The view starts as a copy of templ with a reference count of one, holds
+ * its own reference on resource, and records pipe as its context.
+ *
+ * \return the new view, or NULL if allocation fails
+ */
+struct pipe_sampler_view *
+softpipe_create_sampler_view(struct pipe_context *pipe,
+                             struct pipe_resource *resource,
+                             const struct pipe_sampler_view *templ)
+{
+   struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
+
+   if (!view)
+      return NULL;
+
+   *view = *templ;
+   view->reference.count = 1;
+
+   /* Clear the pointer copied from templ before taking our own
+    * reference through pipe_resource_reference().
+    */
+   view->texture = NULL;
+   pipe_resource_reference(&view->texture, resource);
+
+   view->context = pipe;
+
+   return view;
+}
+
+
+/**
+ * Destroy a sampler view: drop its reference on the texture, then free
+ * the view itself.  The context is not consulted.
+ */
+void
+softpipe_sampler_view_destroy(struct pipe_context *pipe,
+                              struct pipe_sampler_view *view)
+{
+   (void) pipe;
+
+   pipe_resource_reference(&view->texture, NULL);
+
+   FREE(view);
+}
+
+
+/**
+ * Set the fragment sampler views.
+ *
+ * Every slot is updated: the first num take the given views and the rest
+ * are cleared.  Each slot's texture tile cache is pointed at the same
+ * view.  Nothing happens when the count and the first num pointers
+ * already match what is bound.
+ */
+void
+softpipe_set_sampler_views(struct pipe_context *pipe,
+                           unsigned num,
+                           struct pipe_sampler_view **views)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == softpipe->num_sampler_views &&
+       memcmp(softpipe->sampler_views, views,
+              num * sizeof(struct pipe_sampler_view *)) == 0) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (slot = 0; slot < PIPE_MAX_SAMPLERS; slot++) {
+      struct pipe_sampler_view *view = NULL;
+
+      if (slot < num)
+         view = views[slot];
+
+      pipe_sampler_view_reference(&softpipe->sampler_views[slot], view);
+      sp_tex_tile_cache_set_sampler_view(softpipe->tex_cache[slot], view);
+   }
+
+   softpipe->num_sampler_views = num;
+   softpipe->dirty |= SP_NEW_TEXTURE;
+}
+
+
+/**
+ * Set the vertex sampler views.
+ *
+ * Every slot is updated: the first num take the given views and the rest
+ * are cleared.  Each slot's vertex texture tile cache is pointed at the
+ * same view.  Nothing happens when the count and the first num pointers
+ * already match what is bound.
+ */
+void
+softpipe_set_vertex_sampler_views(struct pipe_context *pipe,
+                                  unsigned num,
+                                  struct pipe_sampler_view **views)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == softpipe->num_vertex_sampler_views &&
+       memcmp(softpipe->vertex_sampler_views, views,
+              num * sizeof(struct pipe_sampler_view *)) == 0) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (slot = 0; slot < PIPE_MAX_VERTEX_SAMPLERS; slot++) {
+      struct pipe_sampler_view *view = NULL;
+
+      if (slot < num)
+         view = views[slot];
+
+      pipe_sampler_view_reference(&softpipe->vertex_sampler_views[slot], view);
+      sp_tex_tile_cache_set_sampler_view(softpipe->vertex_tex_cache[slot], view);
+   }
+
+   softpipe->num_vertex_sampler_views = num;
+   softpipe->dirty |= SP_NEW_TEXTURE;
+}
+
+/**
+ * Set the geometry sampler views.
+ *
+ * Every slot is updated: the first num take the given views and the rest
+ * are cleared.  Each slot's geometry texture tile cache is pointed at the
+ * same view.  Nothing happens when the count and the first num pointers
+ * already match what is bound.
+ */
+void
+softpipe_set_geometry_sampler_views(struct pipe_context *pipe,
+                                    unsigned num,
+                                    struct pipe_sampler_view **views)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   uint slot;
+
+   assert(num <= PIPE_MAX_GEOMETRY_SAMPLERS);
+
+   /* Check for no-op */
+   if (num == softpipe->num_geometry_sampler_views &&
+       memcmp(softpipe->geometry_sampler_views, views,
+              num * sizeof(struct pipe_sampler_view *)) == 0) {
+      return;
+   }
+
+   draw_flush(softpipe->draw);
+
+   for (slot = 0; slot < PIPE_MAX_GEOMETRY_SAMPLERS; slot++) {
+      struct pipe_sampler_view *view = NULL;
+
+      if (slot < num)
+         view = views[slot];
+
+      pipe_sampler_view_reference(&softpipe->geometry_sampler_views[slot], view);
+      sp_tex_tile_cache_set_sampler_view(softpipe->geometry_tex_cache[slot], view);
+   }
+
+   softpipe->num_geometry_sampler_views = num;
+   softpipe->dirty |= SP_NEW_TEXTURE;
+}
+
+
+/**
+ * Look up, or create on first use, the sp_sampler_varient that matches
+ * the given sampler, texture, texture unit and shader processor.
+ *
+ * The texture unit is part of the lookup key on purpose: a varient holds
+ * the texture object pointer, so one varient cannot serve several units.
+ * Were that pointer kept outside the varient, varients could be shared
+ * between units.
+ *
+ * The result is also remembered in sampler->current so that a repeated
+ * request with the same key skips the list walk.
+ *
+ * NOTE(review): resource is dereferenced unconditionally to build the
+ * key, yet softpipe_reset_sampler_varients() passes NULL when a sampler
+ * is bound without a sampler view.  The result of
+ * sp_create_sampler_varient() is not checked for NULL either.  Confirm
+ * how both cases should be handled.
+ */
+static struct sp_sampler_varient *
+get_sampler_varient( unsigned unit,
+                     struct sp_sampler *sampler,
+                     struct pipe_resource *resource,
+                     unsigned processor )
+{
+   struct softpipe_resource *sp_texture = softpipe_resource(resource);
+   struct sp_sampler_varient *found;
+   union sp_sampler_key key;
+
+   /* if this fails, widen the key.unit field and update this assertion */
+   assert(PIPE_MAX_SAMPLERS <= 16);
+
+   key.bits.target = sp_texture->base.target;
+   key.bits.is_pot = sp_texture->pot;
+   key.bits.processor = processor;
+   key.bits.unit = unit;
+   key.bits.pad = 0;
+
+   /* Fast path: same key as last time. */
+   if (sampler->current &&
+       sampler->current->key.value == key.value) {
+      return sampler->current;
+   }
+
+   /* Search the varients created so far. */
+   found = sampler->varients;
+   while (found && found->key.value != key.value) {
+      found = found->next;
+   }
+
+   /* None matches: build a new one and push it on the front of the list. */
+   if (!found) {
+      found = sp_create_sampler_varient( &sampler->base, key );
+      found->next = sampler->varients;
+      sampler->varients = found;
+   }
+
+   sampler->current = found;
+   return found;
+}
+
+
+
+
+/**
+ * Refresh the sampler varients used by the vertex, geometry and fragment
+ * shaders.
+ *
+ * For each stage, every sampler index from 0 up to the highest one the
+ * shader uses is visited.  If a sampler state is bound at that index, the
+ * matching varient is fetched with get_sampler_varient(), stored in the
+ * stage's softpipe->tgsi.*_samplers_list[] entry, and bound to that
+ * index's texture tile cache and texture.  Indices with no sampler bound
+ * leave their list entry untouched.
+ *
+ * NOTE(review): softpipe->vs and softpipe->fs are dereferenced without a
+ * NULL check, unlike softpipe->gs -- presumably callers guarantee both
+ * are bound; confirm.
+ */
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe)
+{
+   int i;
+
+   /* It's a bit hard to build these samplers ahead of time -- don't
+    * really know which samplers are going to be used for vertex and
+    * fragment programs.
+    */
+   for (i = 0; i <= softpipe->vs->max_sampler; i++) {
+      if (softpipe->vertex_samplers[i]) {
+         /* stays NULL when no sampler view is bound at this index */
+         struct pipe_resource *texture = NULL;
+
+         if (softpipe->vertex_sampler_views[i]) {
+            texture = softpipe->vertex_sampler_views[i]->texture;
+         }
+
+         softpipe->tgsi.vert_samplers_list[i] = 
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->vertex_samplers[i]),
+                                 texture,
+                                 TGSI_PROCESSOR_VERTEX );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.vert_samplers_list[i], 
+                                          softpipe->vertex_tex_cache[i],
+                                          texture );
+      }
+   }
+
+   /* the geometry stage is handled only when a geometry shader is bound */
+   if (softpipe->gs) {
+      for (i = 0; i <= softpipe->gs->max_sampler; i++) {
+         if (softpipe->geometry_samplers[i]) {
+            /* stays NULL when no sampler view is bound at this index */
+            struct pipe_resource *texture = NULL;
+
+            if (softpipe->geometry_sampler_views[i]) {
+               texture = softpipe->geometry_sampler_views[i]->texture;
+            }
+
+            softpipe->tgsi.geom_samplers_list[i] =
+               get_sampler_varient(
+                  i,
+                  sp_sampler(softpipe->geometry_samplers[i]),
+                  texture,
+                  TGSI_PROCESSOR_GEOMETRY );
+
+            sp_sampler_varient_bind_texture(
+               softpipe->tgsi.geom_samplers_list[i],
+               softpipe->geometry_tex_cache[i],
+               texture );
+         }
+      }
+   }
+
+   /* the fragment stage takes its upper bound from the shader's scan info */
+   for (i = 0; i <= softpipe->fs->info.file_max[TGSI_FILE_SAMPLER]; i++) {
+      if (softpipe->sampler[i]) {
+         /* stays NULL when no sampler view is bound at this index */
+         struct pipe_resource *texture = NULL;
+
+         if (softpipe->sampler_views[i]) {
+            texture = softpipe->sampler_views[i]->texture;
+         }
+
+         softpipe->tgsi.frag_samplers_list[i] =
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 texture,
+                                 TGSI_PROCESSOR_FRAGMENT );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.frag_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          texture );
+      }
+   }
+}
+
+
+
+/**
+ * Destroy a sampler state object: destroy every varient on its list,
+ * then free the object itself.
+ */
+void
+softpipe_delete_sampler_state(struct pipe_context *pipe,
+                              void *sampler)
+{
+   struct sp_sampler *samp = (struct sp_sampler *)sampler;
+   struct sp_sampler_varient *v = samp->varients;
+
+   while (v) {
+      /* fetch the link before the node is destroyed */
+      struct sp_sampler_varient *next = v->next;
+
+      sp_sampler_varient_destroy(v);
+      v = next;
+   }
+
+   FREE( sampler );
+}
+
+
+
diff --git a/src/gallium/drivers/softpipe/sp_state_so.c b/src/gallium/drivers/softpipe/sp_state_so.c
new file mode 100644
index 0000000000..cfe23f9e84
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_so.c
@@ -0,0 +1,124 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_texture.h"
+
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+
+/**
+ * Create a stream output state object.
+ *
+ * Copies num_outputs, stride and the first num_outputs entries of the
+ * output_buffer, register_index and register_mask arrays from templ.
+ *
+ * \return the new sp_so_state, or NULL if allocation fails
+ */
+void *
+softpipe_create_stream_output_state(struct pipe_context *pipe,
+                                    const struct pipe_stream_output_state *templ)
+{
+   struct sp_so_state *so = (struct sp_so_state *) CALLOC_STRUCT(sp_so_state);
+
+   if (!so)
+      return NULL;
+
+   so->base.stride = templ->stride;
+   so->base.num_outputs = templ->num_outputs;
+
+   memcpy(so->base.output_buffer, templ->output_buffer,
+          sizeof(int) * templ->num_outputs);
+   memcpy(so->base.register_index, templ->register_index,
+          sizeof(int) * templ->num_outputs);
+   memcpy(so->base.register_mask, templ->register_mask,
+          sizeof(ubyte) * templ->num_outputs);
+
+   return so;
+}
+
+/**
+ * Bind a stream output state object (may be NULL).
+ *
+ * The object is recorded on the context and SP_NEW_SO is flagged.  The
+ * draw module is only told about it when the object is non-NULL.
+ */
+void
+softpipe_bind_stream_output_state(struct pipe_context *pipe,
+                                  void *so)
+{
+   struct sp_so_state *sp_so = (struct sp_so_state *) so;
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   softpipe->so = sp_so;
+   softpipe->dirty |= SP_NEW_SO;
+
+   if (sp_so != NULL) {
+      draw_set_so_state(softpipe->draw, &sp_so->base);
+   }
+}
+
+/**
+ * Destroy a stream output state object by freeing it.  The context is
+ * not consulted.
+ */
+void
+softpipe_delete_stream_output_state(struct pipe_context *pipe, void *so)
+{
+   (void) pipe;
+
+   FREE( so );
+}
+
+/**
+ * Set the stream output target buffers.
+ *
+ * num_buffers is clamped to PIPE_MAX_SO_BUFFERS.  Each buffer and its
+ * offset are recorded in softpipe->so_target with a zeroed so_count, and
+ * the buffer's data pointer advanced by the offset is handed to the draw
+ * module.  A NULL buffer invalidates the whole call: the target count is
+ * reset to zero, the draw module's mapping is cleared, and the function
+ * returns early.  Negative offsets (buffer append) are not implemented.
+ */
+void
+softpipe_set_stream_output_buffers(struct pipe_context *pipe,
+                                   struct pipe_resource **buffers,
+                                   int *offsets,
+                                   int num_buffers)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   void *map_buffers[PIPE_MAX_SO_BUFFERS];
+   int i;
+
+   assert(num_buffers <= PIPE_MAX_SO_BUFFERS);
+   if (num_buffers > PIPE_MAX_SO_BUFFERS) {
+      num_buffers = PIPE_MAX_SO_BUFFERS;
+   }
+
+   softpipe->dirty |= SP_NEW_SO_BUFFERS;
+
+   for (i = 0; i < num_buffers; ++i) {
+      struct softpipe_resource *res = softpipe_resource(buffers[i]);
+      char *base;
+      int offset;
+
+      if (!res) {
+         /* the whole call is invalid, bail out */
+         softpipe->so_target.num_buffers = 0;
+         draw_set_mapped_so_buffers(softpipe->draw, 0, 0);
+         return;
+      }
+
+      offset = offsets[i];
+
+      softpipe->so_target.buffer[i] = res;
+      softpipe->so_target.offset[i] = offset;
+      softpipe->so_target.so_count[i] = 0;
+
+      base = (char *) res->data;
+      if (offset < 0) {
+         /* this is a buffer append */
+         assert(!"appending not implemented");
+         map_buffers[i] = base;
+      }
+      else {
+         map_buffers[i] = base + offset;
+      }
+   }
+
+   softpipe->so_target.num_buffers = num_buffers;
+
+   draw_set_mapped_so_buffers(softpipe->draw, map_buffers, num_buffers);
+}
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
new file mode 100644
index 0000000000..2db6faeca4
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -0,0 +1,104 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_tile_cache.h"
+
+#include "draw/draw_context.h"
+
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+
+
+/**
+ * Set the framebuffer state: color buffers, depth/stencil buffer and
+ * dimensions.
+ *
+ * For every surface that actually changes, the old tile cache contents
+ * are flushed, the context's reference is moved to the new surface, and
+ * the tile cache is pointed at it.  When a new, non-NULL depth/stencil
+ * surface is installed, the draw module is also given an MRD value chosen
+ * from the surface's depth bit count.
+ *
+ * XXX this might get moved someday
+ */
+void
+softpipe_set_framebuffer_state(struct pipe_context *pipe,
+                               const struct pipe_framebuffer_state *fb)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   uint buf;
+
+   draw_flush(sp->draw);
+
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      struct pipe_surface *new_cbuf = fb->cbufs[buf];
+
+      /* unchanged color buffers are left alone */
+      if (sp->framebuffer.cbufs[buf] == new_cbuf)
+         continue;
+
+      /* flush old, assign new, update cache */
+      sp_flush_tile_cache(sp->cbuf_cache[buf]);
+      pipe_surface_reference(&sp->framebuffer.cbufs[buf], new_cbuf);
+      sp_tile_cache_set_surface(sp->cbuf_cache[buf], new_cbuf);
+   }
+
+   sp->framebuffer.nr_cbufs = fb->nr_cbufs;
+
+   /* Same treatment for the depth/stencil buffer. */
+   if (sp->framebuffer.zsbuf != fb->zsbuf) {
+      sp_flush_tile_cache(sp->zsbuf_cache);
+      pipe_surface_reference(&sp->framebuffer.zsbuf, fb->zsbuf);
+      sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf);
+
+      /* Tell draw module how deep the Z/depth buffer is */
+      if (sp->framebuffer.zsbuf) {
+         const int depth_bits =
+            util_format_get_component_bits(sp->framebuffer.zsbuf->format,
+                                           UTIL_FORMAT_COLORSPACE_ZS,
+                                           0);
+         const double mrd = (depth_bits > 16) ? 0.0000001 : 0.00002;
+
+         draw_set_mrd(sp->draw, mrd);
+      }
+   }
+
+   sp->framebuffer.width = fb->width;
+   sp->framebuffer.height = fb->height;
+
+   sp->dirty |= SP_NEW_FRAMEBUFFER;
+}
diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c
new file mode 100644
index 0000000000..462f4d2655
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_vertex.c
@@ -0,0 +1,90 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "sp_context.h"
+#include "sp_state.h"
+
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+
+void *
+softpipe_create_vertex_elements_state(struct pipe_context *pipe,
+                                      unsigned count,
+                                      const struct pipe_vertex_element *attribs)
+{
+   struct sp_velems_state *state;
+   assert(count <= PIPE_MAX_ATTRIBS);
+   state = (struct sp_velems_state *) MALLOC(sizeof *state);
+   if (!state)
+      return state;   /* allocation failed: the NULL goes back to the caller */
+   state->count = count;
+   memcpy(state->velem, attribs, count * sizeof attribs[0]);
+   return state;
+}
+
+void
+softpipe_bind_vertex_elements_state(struct pipe_context *pipe,
+                                    void *velems)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   struct sp_velems_state *state = (struct sp_velems_state *) velems;
+
+   /* Record the binding (which may be NULL) and mark vertex state dirty. */
+   sp->dirty |= SP_NEW_VERTEX;
+   sp->velems = state;
+   /* The draw module is only told about non-NULL bindings. */
+   if (state)
+      draw_set_vertex_elements(sp->draw, state->count, state->velem);
+}
+
+void
+softpipe_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   FREE( velems );   /* only the state object is released; 'pipe' is unused */
+}
+
+void
+softpipe_set_vertex_buffers(struct pipe_context *pipe,
+                            unsigned count,
+                            const struct pipe_vertex_buffer *buffers)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   /* Keep a private copy of the descriptors and their count... */
+   sp->num_vertex_buffers = count;
+   memcpy(sp->vertex_buffer, buffers, sizeof(*buffers) * count);
+   sp->dirty |= SP_NEW_VERTEX;
+   /* ...and hand the caller's array to the draw module as well. */
+   draw_set_vertex_buffers(sp->draw, count, buffers);
+}
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
new file mode 100644
index 0000000000..55b27e6010
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "util/u_surface.h"
+#include "sp_context.h"
+#include "sp_surface.h"
+/* Install the shared util_* helpers as this context's surface hooks. */
+void
+sp_init_surface_functions(struct softpipe_context *sp)
+{
+   sp->pipe.resource_copy_region = util_resource_copy_region;
+   sp->pipe.clear_render_target = util_clear_render_target;
+   sp->pipe.clear_depth_stencil = util_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/softpipe/sp_surface.h b/src/gallium/drivers/softpipe/sp_surface.h
new file mode 100644
index 0000000000..22de3ba43f
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_surface.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef SP_SURFACE_H
+#define SP_SURFACE_H
+
+/* Forward declaration; this header only uses the type through a pointer. */
+struct softpipe_context;
+
+/* Sets the resource-copy and clear hooks of sp->pipe (see sp_surface.c). */
+extern void
+sp_init_surface_functions(struct softpipe_context *sp);
+
+
+#endif /* SP_SURFACE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
new file mode 100644
index 0000000000..ff83c66d8b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -0,0 +1,1986 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008-2010 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling
+ *
+ * Authors:
+ *   Brian Paul
+ *   Keith Whitwell
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "sp_quad.h"   /* only for #define QUAD_* tokens */
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
+
+
+
+/*
+ * Fractional part of 'f' (f - floorf(f)), used for interpolation weights.
+ * Because floorf() rounds down, negative inputs also yield a value >= 0.
+ * Note, if this function isn't perfect you'll sometimes see 1-pixel bands
+ * of improperly weighted linear-filtered textures (tests/texwrap.c shows it).
+ */
+static INLINE float
+frac(float f)
+{
+   const float whole = floorf(f);
+   return f - whole;
+}
+
+
+
+/**
+ * Linear interpolation from v0 (at a = 0) towards v1 (at a = 1).
+ */
+static INLINE float
+lerp(float a, float v0, float v1)
+{
+   return (v1 - v0) * a + v0;
+}
+
+
+/**
+ * Do 2D/bilinear interpolation of float values.
+ * v00, v10, v01 and v11 are typically four texture samples in a square/box.
+ * 'a' is the horizontal interpolant: it blends v00 with v10 and v01 with
+ * v11.  'b' is the vertical interpolant: it blends those two results.
+ * It's important that this function is inlined when compiled with
+ * optimization!  If we find that's not true on some systems, use a macro.
+ */
+static INLINE float
+lerp_2d(float a, float b,
+        float v00, float v10, float v01, float v11)
+{
+   return lerp(b,
+               lerp(a, v00, v10),
+               lerp(a, v01, v11));
+}
+
+
+/**
+ * Trilinear interpolation of 8 values: two lerp_2d() results blended by c.
+ */
+static INLINE float
+lerp_3d(float a, float b, float c,
+        float v000, float v100, float v010, float v110,
+        float v001, float v101, float v011, float v111)
+{
+   return lerp(c,
+               lerp_2d(a, b, v000, v100, v010, v110),
+               lerp_2d(a, b, v001, v101, v011, v111));
+}
+
+
+
+/**
+ * Compute coord mod size for repeat wrap modes (result in [0, size-1]).
+ * Casting a negative coord to unsigned before '%' is only correct when
+ * size is a power of two, so fold a negative remainder back up instead.
+ */
+static INLINE int
+repeat(int coord, unsigned size)
+{
+   const int rem = coord % (int) size;
+   return (rem < 0) ? rem + (int) size : rem;
+}
+
+
+/**
+ * Nearest-filter wrap functions: apply one texture coord wrapping mode
+ * and return integer texture indexes for a vector of four texcoords
+ * (S or T or P).  Each wrap mode has its own function.
+ * \param s  the incoming texcoords
+ * \param size  the texture image size
+ * \param icoord  returns the integer texcoords
+ *
+ * This one implements the repeat mode.
+ */
+static void
+wrap_nearest_repeat(const float s[4], unsigned size, int icoord[4])
+{
+   uint i;
+   /* every result lands in [0, size-1] */
+   for (i = 0; i < 4; i++) {
+      const int texel = util_ifloor(s[i] * size);
+      icoord[i] = repeat(texel, size);
+   }
+}
+
+
+/* Nearest sampling, clamp wrap mode: texcoords are clamped to [0,1]. */
+static void
+wrap_nearest_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint i;
+
+   /* i limited to [0,size-1] */
+   for (i = 0; i < 4; i++) {
+      const float coord = s[i];
+
+      icoord[i] = (coord <= 0.0F) ? 0
+                : (coord >= 1.0F) ? (int) (size - 1)
+                : util_ifloor(coord * size);
+   }
+}
+
+
+/* Nearest sampling, clamp-to-edge wrap mode. */
+static void
+wrap_nearest_clamp_to_edge(const float s[4], unsigned size, int icoord[4])
+{
+   /* texcoords of the centers of the first and last texel */
+   const float lo = 1.0F / (2.0F * size);
+   const float hi = 1.0F - lo;
+   uint i;
+
+   /* i limited to [0, size-1] */
+   for (i = 0; i < 4; i++) {
+      const float coord = s[i];
+      icoord[i] = (coord < lo) ? 0
+                : (coord > hi) ? (int) (size - 1)
+                : util_ifloor(coord * size);
+   }
+}
+
+
+/* Nearest sampling, clamp-to-border wrap mode. */
+static void
+wrap_nearest_clamp_to_border(const float s[4], unsigned size, int icoord[4])
+{
+   /* half a texel outside each end of the [0,1] range */
+   const float lo = -1.0F / (2.0F * size);
+   const float hi = 1.0F - lo;
+   uint i;
+
+   /* i limited to [-1, size]; -1 and size lie outside the image */
+   for (i = 0; i < 4; i++) {
+      const float coord = s[i];
+      icoord[i] = (coord <= lo) ? -1
+                : (coord >= hi) ? (int) size
+                : util_ifloor(coord * size);
+   }
+}
+
+
+/* Nearest sampling, mirror-repeat wrap mode. */
+static void
+wrap_nearest_mirror_repeat(const float s[4], unsigned size, int icoord[4])
+{
+   const float lo = 1.0F / (2.0F * size);
+   const float hi = 1.0F - lo;
+   uint i;
+
+   for (i = 0; i < 4; i++) {
+      /* odd integer periods of the coordinate run backwards */
+      const int odd_period = util_ifloor(s[i]) & 1;
+      const float f = frac(s[i]);
+      const float u = odd_period ? 1.0F - f : f;
+
+      icoord[i] = (u < lo) ? 0
+                : (u > hi) ? (int) (size - 1)
+                : util_ifloor(u * size);
+   }
+}
+
+
+/* Nearest sampling, mirror-clamp wrap mode: |s| is clamped to [0,1]. */
+static void
+wrap_nearest_mirror_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint i;
+
+   /* the mirroring is done by taking |s|;
+    * i limited to [0,size-1] */
+   for (i = 0; i < 4; i++) {
+      const float mag = fabsf(s[i]);
+
+      icoord[i] = (mag <= 0.0F) ? 0
+                : (mag >= 1.0F) ? (int) (size - 1)
+                : util_ifloor(mag * size);
+   }
+}
+
+
+/* Nearest sampling, mirror-clamp-to-edge wrap mode (works on |s|). */
+static void
+wrap_nearest_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                  int icoord[4])
+{
+   /* texcoords of the centers of the first and last texel */
+   const float lo = 1.0F / (2.0F * size);
+   const float hi = 1.0F - lo;
+   uint i;
+
+   /* i limited to [0, size-1] */
+   for (i = 0; i < 4; i++) {
+      const float mag = fabsf(s[i]);
+
+      icoord[i] = (mag < lo) ? 0
+                : (mag > hi) ? (int) (size - 1)
+                : util_ifloor(mag * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_border(const float s[4], unsigned size,
+                                    int icoord[4])
+{
+   uint ch;
+   /* u = |s| is effectively limited to [0,max] */
+   /* i limited to [0, size]; size is one past the last texel */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)   /* NOTE(review): min < 0 <= u, so this never holds */
+         icoord[ch] = -1;
+      else if (u > max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+/**
+ * Linear-filter wrap functions (one per wrap mode): compute the two texel
+ * locations and the blend weight for linear sampling of four texcoords.
+ * \param s  the texcoords
+ * \param size  the texture image size
+ * \param icoord0  returns first texture indexes
+ * \param icoord1  returns second texture indexes (usually icoord0 + 1)
+ * \param w  returns blend factor/weight between texture indexes
+ */
+static void
+wrap_linear_repeat(const float s[4], unsigned size,
+                   int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const float u = s[i] * size - 0.5F;
+      const int first = repeat(util_ifloor(u), size);
+      icoord0[i] = first;
+      icoord1[i] = repeat(first + 1, size);
+      w[i] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp(const float s[4], unsigned size,
+                  int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const float clamped = CLAMP(s[i], 0.0F, 1.0F);
+      const float u = clamped * size - 0.5f;   /* texel space */
+      icoord0[i] = util_ifloor(u);
+      icoord1[i] = icoord0[i] + 1;
+      w[i] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp_to_edge(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const float clamped = CLAMP(s[i], 0.0F, 1.0F);
+      const float u = clamped * size - 0.5f;
+      const int first = util_ifloor(u);
+      const int second = first + 1;
+
+      /* icoord0 is limited from below, icoord1 from above */
+      icoord0[i] = (first < 0) ? 0 : first;
+      icoord1[i] = (second >= (int) size) ? (int) (size - 1) : second;
+      w[i] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp_to_border(const float s[4], unsigned size,
+                            int icoord0[4], int icoord1[4], float w[4])
+{
+   const float lo = -1.0F / (2.0F * size);   /* half a texel below 0 */
+   const float hi = 1.0F - lo;
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const float clamped = CLAMP(s[i], lo, hi);
+      const float u = clamped * size - 0.5f;
+      icoord0[i] = util_ifloor(u);
+      icoord1[i] = icoord0[i] + 1;
+      w[i] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_repeat(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++) {
+      /* odd integer periods of the coordinate run backwards */
+      const int odd_period = util_ifloor(s[i]) & 1;
+      const float f = frac(s[i]);
+      const float mirrored = odd_period ? 1.0F - f : f;
+      const float u = mirrored * size - 0.5F;
+      const int first = util_ifloor(u);
+      const int second = first + 1;
+
+      /* icoord0 is limited from below, icoord1 from above */
+      icoord0[i] = (first < 0) ? 0 : first;
+      icoord1[i] = (second >= (int) size) ? (int) (size - 1) : second;
+      w[i] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp(const float s[4], unsigned size,
+                         int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+
+   for (i = 0; i < 4; i++) {
+      const float mag = fabsf(s[i]);
+      /* |s| >= 1 maps to the far end of the image, 'size' */
+      const float scaled = (mag >= 1.0F) ? (float) size : mag * size;
+      const float u = scaled - 0.5F;
+
+      icoord0[i] = util_ifloor(u);
+      icoord1[i] = icoord0[i] + 1;
+      w[i] = frac(u);
+   }
+}
+
+
+/* Linear sampling, mirror-clamp-to-edge wrap mode (works on |s|). */
+static void
+wrap_linear_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                 int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+
+   for (i = 0; i < 4; i++) {
+      const float mag = fabsf(s[i]);
+      /* |s| >= 1 maps to the far end of the image, 'size' */
+      const float scaled = (mag >= 1.0F) ? (float) size : mag * size;
+      const float u = scaled - 0.5F;
+      const int first = util_ifloor(u);
+      const int second = first + 1;
+
+      /* icoord0 is limited from below, icoord1 from above */
+      icoord0[i] = (first < 0) ? 0 : first;
+      icoord1[i] = (second >= (int) size) ? (int) (size - 1) : second;
+      w[i] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);   /* half a texel below 0 */
+   const float max = 1.0F - min;              /* half a texel above 1 */
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u <= min)   /* NOTE(review): min < 0 <= u, so this never holds */
+         u = min * size;
+      else if (u >= max)
+         u = max * size;
+      else
+         u *= size;
+      u -= 0.5F;      /* texel centers now sit on integer values of u */
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+/**
+ * PIPE_TEX_WRAP_CLAMP, nearest sampling, unnormalized coords: floor + clamp.
+ */
+static void
+wrap_nearest_unorm_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const int texel = util_ifloor(s[i]);
+      icoord[i] = CLAMP(texel, 0, (int) size - 1);
+   }
+}
+
+
+/**
+ * PIPE_TEX_WRAP_CLAMP_TO_BORDER, nearest, unnormalized coords: the floor
+ * of each coord after clamping it to [-0.5, size + 0.5].
+ */
+static void
+wrap_nearest_unorm_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++)
+      icoord[i] = util_ifloor(CLAMP(s[i], -0.5F, (float) size + 0.5F));
+}
+
+
+/**
+ * PIPE_TEX_WRAP_CLAMP_TO_EDGE, nearest, unnormalized coords: the floor
+ * of each coord after clamping it to [0.5, size - 0.5].
+ */
+static void
+wrap_nearest_unorm_clamp_to_edge(const float s[4], unsigned size,
+                                 int icoord[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++)
+      icoord[i] = util_ifloor(CLAMP(s[i], 0.5F, (float) size - 0.5F));
+}
+
+
+/**
+ * PIPE_TEX_WRAP_CLAMP for linear sampling, unnormalized coords.
+ */
+static void
+wrap_linear_unorm_clamp(const float s[4], unsigned size,
+                        int icoord0[4], int icoord1[4], float w[4])
+{
+   uint i;
+   for (i = 0; i < 4; i++) {
+      /* Not exactly what the spec says, but it matches NVIDIA output */
+      const float u = CLAMP(s[i] - 0.5F, 0.0f, (float) size - 1.0f);
+      icoord0[i] = util_ifloor(u);
+      icoord1[i] = icoord0[i] + 1;
+      w[i] = frac(u);
+   }
+}
+
+
+/**
+ * PIPE_TEX_WRAP_CLAMP_TO_BORDER for linear sampling, unnormalized coords.
+ */
+static void
+wrap_linear_unorm_clamp_to_border(const float s[4], unsigned size,
+                                  int icoord0[4], int icoord1[4], float w[4])
+{
+   const int last = (int) size - 1;
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const float clamped = CLAMP(s[i], -0.5F, (float) size + 0.5F);
+      const float u = clamped - 0.5F;
+      const int first = util_ifloor(u);
+      icoord0[i] = first;
+      icoord1[i] = (first + 1 > last) ? last : first + 1;
+      w[i] = frac(u);
+   }
+}
+
+
+/**
+ * PIPE_TEX_WRAP_CLAMP_TO_EDGE for linear sampling, unnormalized coords.
+ */
+static void
+wrap_linear_unorm_clamp_to_edge(const float s[4], unsigned size,
+                                int icoord0[4], int icoord1[4], float w[4])
+{
+   const int last = (int) size - 1;
+   uint i;
+   for (i = 0; i < 4; i++) {
+      const float clamped = CLAMP(s[i], +0.5F, (float) size - 0.5F);
+      const float u = clamped - 0.5F;
+      const int first = util_ifloor(u);
+      icoord0[i] = first;
+      icoord1[i] = (first + 1 > last) ? last : first + 1;
+      w[i] = frac(u);
+   }
+}
+
+
+
+/**
+ * Examine the quad's texture coordinates to compute the partial
+ * derivatives w.r.t X and Y, then compute lambda (level of detail).
+ * 1D version: only 's' is looked at.
+ */
+static float
+compute_lambda_1d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE])
+{
+   const float s0 = s[QUAD_BOTTOM_LEFT];
+   const float ds_dx = fabsf(s[QUAD_BOTTOM_RIGHT] - s0);
+   const float ds_dy = fabsf(s[QUAD_TOP_LEFT] - s0);
+   const float rho = MAX2(ds_dx, ds_dy) * samp->texture->width0;
+   return util_fast_log2(rho);
+}
+
+
+static float
+compute_lambda_2d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE])
+{
+   const float s0 = s[QUAD_BOTTOM_LEFT];
+   const float t0 = t[QUAD_BOTTOM_LEFT];
+   const float ds_dx = fabsf(s[QUAD_BOTTOM_RIGHT] - s0);
+   const float ds_dy = fabsf(s[QUAD_TOP_LEFT] - s0);
+   const float dt_dx = fabsf(t[QUAD_BOTTOM_RIGHT] - t0);
+   const float dt_dy = fabsf(t[QUAD_TOP_LEFT] - t0);
+   /* scale each axis by the texture's width0 / height0 */
+   const float rho_s = MAX2(ds_dx, ds_dy) * samp->texture->width0;
+   const float rho_t = MAX2(dt_dx, dt_dy) * samp->texture->height0;
+   return util_fast_log2(MAX2(rho_s, rho_t));
+}
+
+
+static float
+compute_lambda_3d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE])
+{
+   const struct pipe_resource *tex = samp->texture;
+   const float s0 = s[QUAD_BOTTOM_LEFT];
+   const float t0 = t[QUAD_BOTTOM_LEFT];
+   const float p0 = p[QUAD_BOTTOM_LEFT];
+   const float ds_dx = fabsf(s[QUAD_BOTTOM_RIGHT] - s0);
+   const float ds_dy = fabsf(s[QUAD_TOP_LEFT] - s0);
+   const float dt_dx = fabsf(t[QUAD_BOTTOM_RIGHT] - t0);
+   const float dt_dy = fabsf(t[QUAD_TOP_LEFT] - t0);
+   const float dp_dx = fabsf(p[QUAD_BOTTOM_RIGHT] - p0);
+   const float dp_dy = fabsf(p[QUAD_TOP_LEFT] - p0);
+   /* scale each axis by the texture's width0 / height0 / depth0 */
+   const float rho_s = MAX2(ds_dx, ds_dy) * tex->width0;
+   const float rho_t = MAX2(dt_dx, dt_dy) * tex->height0;
+   const float rho_p = MAX2(dp_dx, dp_dy) * tex->depth0;
+   const float rho_st = MAX2(rho_s, rho_t);
+   return util_fast_log2(MAX2(rho_st, rho_p));
+}
+
+
+/**
+ * Compute lambda for a vertex texture sampler: there are no derivatives
+ * to use, so all arguments are ignored and lambda is always 0.
+ */
+static float
+compute_lambda_vert(const struct sp_sampler_varient *samp,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE])
+{
+   return 0.0f;
+}
+
+
+
+/**
+ * Get a texel from a texture, using the texture tile cache.
+ *
+ * \param addr  the template tex address containing cube, z, face info.
+ * \param x  the x coord of texel within 2D image
+ * \param y  the y coord of texel within 2D image
+ * \return  pointer to the texel's color values
+ *
+ * XXX maybe move this into sp_tex_tile_cache.c and merge with the
+ * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
+ */
+
+
+
+
+static INLINE const float *
+get_texel_2d_no_border(const struct sp_sampler_varient *samp,
+                       union tex_tile_address addr, int x, int y)
+{
+   /* position of the texel inside its tile */
+   const int tx = x % TILE_SIZE;
+   const int ty = y % TILE_SIZE;
+   const struct softpipe_tex_cached_tile *tile;
+
+   /* 'addr' is a by-value copy; complete it with the tile's coords */
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+   return &tile->data.color[ty][tx][0];
+}
+
+
+/* Like get_texel_2d_no_border(), but with a bounds check first. */
+static INLINE const float *
+get_texel_2d(const struct sp_sampler_varient *samp,
+             union tex_tile_address addr, int x, int y)
+{
+   const unsigned level = addr.bits.level;
+   const int width = (int) u_minify(samp->texture->width0, level);
+   const int height = (int) u_minify(samp->texture->height0, level);
+
+   /* texels outside the mipmap level read as the border color */
+   if (x < 0 || x >= width || y < 0 || y >= height)
+      return samp->sampler->border_color;
+
+   return get_texel_2d_no_border(samp, addr, x, y);
+}
+
+
+/* Gather a quad of adjacent texels within a tile.
+ * The caller must make sure (x+1, y+1) is still inside the same tile.
+ */
+static INLINE void
+get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_varient *samp,
+                                        union tex_tile_address addr,
+                                        unsigned x, unsigned y,
+                                        const float *out[4])
+{
+   const unsigned tx = x % TILE_SIZE;
+   const unsigned ty = y % TILE_SIZE;
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   out[0] = &tile->data.color[ty    ][tx    ][0];
+   out[1] = &tile->data.color[ty    ][tx + 1][0];
+   out[2] = &tile->data.color[ty + 1][tx    ][0];
+   out[3] = &tile->data.color[ty + 1][tx + 1][0];
+}
+
+
+/* Gather a quad of potentially non-adjacent texels, one lookup per texel:
+ * out[0]=(x0,y0), out[1]=(x1,y0), out[2]=(x0,y1), out[3]=(x1,y1). */
+static INLINE void
+get_texel_quad_2d_no_border(const struct sp_sampler_varient *samp,
+			    union tex_tile_address addr,
+			    int x0, int y0, 
+			    int x1, int y1,
+			    const float *out[4])
+{
+   out[0] = get_texel_2d_no_border( samp, addr, x0, y0 );
+   out[1] = get_texel_2d_no_border( samp, addr, x1, y0 );
+   out[2] = get_texel_2d_no_border( samp, addr, x0, y1 );
+   out[3] = get_texel_2d_no_border( samp, addr, x1, y1 );
+}
+
+/* Same gather, but through get_texel_2d(): each texel is bounds-checked,
+ * which can involve a lot of unnecessary checks for border color. */
+static INLINE void
+get_texel_quad_2d(const struct sp_sampler_varient *samp,
+		  union tex_tile_address addr,
+		  int x0, int y0, 
+		  int x1, int y1,
+		  const float *out[4])
+{
+   out[0] = get_texel_2d( samp, addr, x0, y0 );
+   out[1] = get_texel_2d( samp, addr, x1, y0 );
+   out[3] = get_texel_2d( samp, addr, x1, y1 );
+   out[2] = get_texel_2d( samp, addr, x0, y1 );
+}
+
+
+
+/* 3d variants:
+ */
+static INLINE const float *
+get_texel_3d_no_border(const struct sp_sampler_varient *samp,
+                       union tex_tile_address addr, int x, int y, int z)
+{
+   /* position of the texel inside its tile */
+   const int tx = x % TILE_SIZE;
+   const int ty = y % TILE_SIZE;
+   const struct softpipe_tex_cached_tile *tile;
+
+   /* 'addr' is a by-value copy; complete it with the tile coords and z */
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+   return &tile->data.color[ty][tx][0];
+}
+
+
+/* Like get_texel_3d_no_border(), but with a bounds check first. */
+static INLINE const float *
+get_texel_3d(const struct sp_sampler_varient *samp,
+             union tex_tile_address addr, int x, int y, int z)
+{
+   const struct pipe_resource *tex = samp->texture;
+   const unsigned level = addr.bits.level;
+   const int width = (int) u_minify(tex->width0, level);
+   const int height = (int) u_minify(tex->height0, level);
+   const int depth = (int) u_minify(tex->depth0, level);
+
+   /* texels outside the mipmap level read as the border color */
+   if (x < 0 || x >= width || y < 0 || y >= height || z < 0 || z >= depth)
+      return samp->sampler->border_color;
+   return get_texel_3d_no_border(samp, addr, x, y, z);
+}
+
+
+/**
+ * Size (in texels) of a mipmap level, given the log2 of the base level's
+ * size: base_pot = 8 (level 0 is 256 wide) and level = 2 give 64.
+ * Levels beyond base_pot all have size 1.
+ */
+static INLINE unsigned
+pot_level_size(unsigned base_pot, unsigned level)
+{
+   if (level > base_pot)
+      return 1;
+   return 1 << (base_pot - level);
+}
+
+
+/* Some image-filter fastpaths:
+ * 2D, linear filter, repeat wrap, power-of-two sizes (p, c0, control unused).
+ */
+static INLINE void
+img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                const float c0[QUAD_SIZE],
+                                enum tgsi_sampler_control control,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   unsigned xmax = (xpot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, xpot) - 1; */
+   unsigned ymax = (ypot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, ypot) - 1; */
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      /* texel-space coords, shifted so texel centers sit on integers */
+      float u = s[j] * xpot - 0.5F;
+      float v = t[j] * ypot - 0.5F;
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+      /* interpolation weights: the fractional parts of u and v */
+      float xw = u - (float)uflr;
+      float yw = v - (float)vflr;
+      /* repeat wrap: sizes are powers of two, so a mask does the modulo */
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+      const float *tx[4];
+
+      /* Can we fetch all four at once?  Only the position inside the tile
+       * matters, so textures larger than one tile take the fast path too. */
+      if ((unsigned) x0 % TILE_SIZE < xmax &&
+          (unsigned) y0 % TILE_SIZE < ymax) {
+         get_texel_quad_2d_no_border_single_tile(samp, addr, x0, y0, tx);
+      }
+      else {
+         unsigned x1 = (x0 + 1) & (xpot - 1);
+         unsigned y1 = (y0 + 1) & (ypot - 1);
+         get_texel_quad_2d_no_border(samp, addr, x0, y0, x1, y1, tx);
+      }
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw, yw,
+                              tx[0][c], tx[1][c], tx[2][c], tx[3][c]);
+      }
+   }
+}
+
+
+/* 2D, nearest filter, repeat wrap, power-of-two sizes
+ * (p, c0 and control are unused).
+ */
+static INLINE void
+img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                 const float s[QUAD_SIZE],
+                                 const float t[QUAD_SIZE],
+                                 const float p[QUAD_SIZE],
+                                 const float c0[QUAD_SIZE],
+                                 enum tgsi_sampler_control control,
+                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const unsigned level = samp->level;
+   const unsigned width = pot_level_size(samp->xpot, level);
+   const unsigned height = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+   unsigned quad;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (quad = 0; quad < QUAD_SIZE; quad++) {
+      /* scale to texel space and round down to the containing texel */
+      const int col = util_ifloor(s[quad] * width);
+      const int row = util_ifloor(t[quad] * height);
+
+      /* repeat wrap: sizes are powers of two, so a mask does the modulo */
+      const int x0 = col & (width - 1);
+      const int y0 = row & (height - 1);
+
+      const float *texel = get_texel_2d_no_border(samp, addr, x0, y0);
+      int chan;
+
+      for (chan = 0; chan < 4; chan++)
+         rgba[chan][quad] = texel[chan];
+   }
+}
+
+
+/**
+ * Nearest-texel image filter specialised for 2D power-of-two textures
+ * whose s and t coordinates are both clamped.
+ *
+ * The floored texel coordinate is clamped to [0, size - 1] on each axis
+ * before the texel is fetched.
+ *
+ * \param s, t   texture coordinates of the four quad pixels
+ * \param p, c0, control   not used by this filter
+ * \param rgba   output texels, indexed [channel][pixel]
+ */
+static INLINE void
+img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                const float c0[QUAD_SIZE],
+                                enum tgsi_sampler_control control,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const unsigned level = samp->level;
+   const unsigned xpot = pot_level_size(samp->xpot, level);
+   const unsigned ypot = pot_level_size(samp->ypot, level);
+   const unsigned xlast = xpot - 1;
+   const unsigned ylast = ypot - 1;
+   union tex_tile_address addr;
+   unsigned pix;
+
+   addr.value = 0;
+   addr.bits.level = level;
+
+   for (pix = 0; pix < QUAD_SIZE; pix++) {
+      const float u = s[pix] * xpot;
+      const float v = t[pix] * ypot;
+      const float *texel;
+      int col, row, ch;
+
+      /* clamp the column; the upper-bound test is done in unsigned
+       * arithmetic, which is safe because col is non-negative there
+       */
+      col = util_ifloor(u);
+      if (col < 0) {
+         col = 0;
+      }
+      else if ((unsigned) col > xlast) {
+         col = xlast;
+      }
+
+      /* clamp the row the same way */
+      row = util_ifloor(v);
+      if (row < 0) {
+         row = 0;
+      }
+      else if ((unsigned) row > ylast) {
+         row = ylast;
+      }
+
+      texel = get_texel_2d_no_border(samp, addr, col, row);
+
+      for (ch = 0; ch < 4; ch++) {
+         rgba[ch][pix] = texel[ch];
+      }
+   }
+}
+
+
+/**
+ * Nearest-texel image filter for 1D textures.
+ *
+ * The s coordinates are turned into integer texel positions by the
+ * sampler's nearest_texcoord_s wrap function; row 0 is always fetched.
+ *
+ * \param s      texture coordinates of the four quad pixels
+ * \param t, p, c0, control   not used by this filter
+ * \param rgba   output texels, indexed [channel][pixel]
+ */
+static void
+img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        const float c0[QUAD_SIZE],
+                        enum tgsi_sampler_control control,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const unsigned level = samp->level;
+   const int width = u_minify(samp->texture->width0, level);
+   union tex_tile_address addr;
+   int col[4];
+   unsigned pix;
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = level;
+
+   /* wrap/clamp all four coordinates in one call */
+   samp->nearest_texcoord_s(s, width, col);
+
+   for (pix = 0; pix < QUAD_SIZE; pix++) {
+      const float *texel = get_texel_2d(samp, addr, col[pix], 0);
+      int ch;
+
+      for (ch = 0; ch < 4; ch++) {
+         rgba[ch][pix] = texel[ch];
+      }
+   }
+}
+
+
+/**
+ * General nearest-texel image filter for 2D textures.
+ *
+ * The s and t coordinates are turned into integer texel positions by the
+ * sampler's nearest_texcoord_s / nearest_texcoord_t wrap functions.
+ *
+ * \param s, t   texture coordinates of the four quad pixels
+ * \param p, c0, control   not used by this filter
+ * \param rgba   output texels, indexed [channel][pixel]
+ */
+static void
+img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      const float c0[QUAD_SIZE],
+                      enum tgsi_sampler_control control,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const unsigned level = samp->level;
+   const int width = u_minify(samp->texture->width0, level);
+   const int height = u_minify(samp->texture->height0, level);
+   union tex_tile_address addr;
+   int col[4], row[4];
+   unsigned pix;
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = level;
+
+   /* wrap/clamp all four coordinate pairs */
+   samp->nearest_texcoord_s(s, width, col);
+   samp->nearest_texcoord_t(t, height, row);
+
+   for (pix = 0; pix < QUAD_SIZE; pix++) {
+      const float *texel = get_texel_2d(samp, addr, col[pix], row[pix]);
+      int ch;
+
+      for (ch = 0; ch < 4; ch++) {
+         rgba[ch][pix] = texel[ch];
+      }
+   }
+}
+
+
+/**
+ * Return a copy of \p addr whose face field is set to \p face.
+ * The caller's address is passed by value and is not modified.
+ */
+static INLINE union tex_tile_address
+face(union tex_tile_address addr, unsigned face )
+{
+   union tex_tile_address with_face = addr;
+
+   with_face.bits.face = face;
+   return with_face;
+}
+
+
+/**
+ * Nearest-texel image filter for cube map faces.
+ *
+ * Works like the 2D nearest filter, except that each pixel's texel is
+ * fetched from the face recorded for it in samp->faces[].
+ *
+ * \param s, t   per-face texture coordinates of the four quad pixels
+ * \param p, c0, control   not used by this filter
+ * \param rgba   output texels, indexed [channel][pixel]
+ */
+static void
+img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        const float c0[QUAD_SIZE],
+                        enum tgsi_sampler_control control,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   const unsigned level = samp->level;
+   const int width = u_minify(samp->texture->width0, level);
+   const int height = u_minify(samp->texture->height0, level);
+   union tex_tile_address addr;
+   int col[4], row[4];
+   unsigned pix;
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = level;
+
+   samp->nearest_texcoord_s(s, width, col);
+   samp->nearest_texcoord_t(t, height, row);
+
+   for (pix = 0; pix < QUAD_SIZE; pix++) {
+      /* select this pixel's cube face before fetching */
+      const union tex_tile_address face_addr = face(addr, faces[pix]);
+      const float *texel = get_texel_2d(samp, face_addr, col[pix], row[pix]);
+      int ch;
+
+      for (ch = 0; ch < 4; ch++) {
+         rgba[ch][pix] = texel[ch];
+      }
+   }
+}
+
+
+/**
+ * Nearest-texel image filter for 3D textures.
+ *
+ * The s, t and p coordinates are turned into integer texel positions by
+ * the sampler's nearest_texcoord_s / _t / _p wrap functions.
+ *
+ * \param s, t, p   texture coordinates of the four quad pixels
+ * \param c0, control   not used by this filter
+ * \param rgba   output texels, indexed [channel][pixel]
+ */
+static void
+img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      const float c0[QUAD_SIZE],
+                      enum tgsi_sampler_control control,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const unsigned level = samp->level;
+   const int width = u_minify(samp->texture->width0, level);
+   const int height = u_minify(samp->texture->height0, level);
+   const int depth = u_minify(samp->texture->depth0, level);
+   union tex_tile_address addr;
+   int col[4], row[4], slice[4];
+   unsigned pix;
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   addr.value = 0;
+   addr.bits.level = level;
+
+   samp->nearest_texcoord_s(s, width,  col);
+   samp->nearest_texcoord_t(t, height, row);
+   samp->nearest_texcoord_p(p, depth,  slice);
+
+   for (pix = 0; pix < QUAD_SIZE; pix++) {
+      const float *texel = get_texel_3d(samp, addr,
+                                        col[pix], row[pix], slice[pix]);
+      int ch;
+
+      for (ch = 0; ch < 4; ch++) {
+         rgba[ch][pix] = texel[ch];
+      }
+   }
+}
+
+
+/**
+ * Linear image filter for 1D textures.
+ *
+ * For each quad pixel, two texels (columns x0 and x1 of row 0) are
+ * fetched and blended per channel with lerp() using the weight xw.
+ *
+ * \param s      texture coordinates of the four quad pixels
+ * \param t, p, c0, control   not used by this filter
+ * \param rgba   output colors, indexed [channel][pixel]
+ */
+static void
+img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x0[4], x1[4];
+   float xw[4]; /* weights */
+   union tex_tile_address addr;
+
+   /* size of the currently selected mipmap level */
+   level0 = samp->level;
+   width = u_minify(texture->width0, level0);
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   /* the wrap function yields both texel columns and the blend weight */
+   samp->linear_texcoord_s(s, width, x0, x1, xw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], 0);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], 0);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp(xw[j], tx0[c], tx1[c]);
+      }
+   }
+}
+
+
+/**
+ * General linear (bilinear) image filter for 2D textures.
+ *
+ * For each quad pixel, the four texels at (x0,y0), (x1,y0), (x0,y1) and
+ * (x1,y1) are fetched and combined per channel by lerp_2d() using the
+ * weights xw and yw.
+ *
+ * \param s, t   texture coordinates of the four quad pixels
+ * \param p, c0, control   not used by this filter
+ * \param rgba   output colors, indexed [channel][pixel]
+ */
+static void
+img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   /* size of the currently selected mipmap level */
+   level0 = samp->level;
+   width = u_minify(texture->width0, level0);
+   height = u_minify(texture->height0, level0);
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   /* per axis: the two texel indices and the blend weight between them */
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addr, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addr, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
+      }
+   }
+}
+
+
+/**
+ * Linear (bilinear) image filter for cube map faces.
+ *
+ * Works like the 2D linear filter, except that all four texels of a
+ * pixel are fetched from the face recorded for it in samp->faces[].
+ *
+ * \param s, t   per-face texture coordinates of the four quad pixels
+ * \param p, c0, control   not used by this filter
+ * \param rgba   output colors, indexed [channel][pixel]
+ */
+static void
+img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       const float c0[QUAD_SIZE],
+                       enum tgsi_sampler_control control,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   /* size of the currently selected mipmap level */
+   level0 = samp->level;
+   width = u_minify(texture->width0, level0);
+   height = u_minify(texture->height0, level0);
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   /* per axis: the two texel indices and the blend weight between them */
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      /* address of this pixel's cube face at the current level */
+      union tex_tile_address addrj = face(addr, faces[j]);
+      const float *tx0 = get_texel_2d(samp, addrj, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addrj, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addrj, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addrj, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
+      }
+   }
+}
+
+
+/**
+ * Linear (trilinear within one level) image filter for 3D textures.
+ *
+ * For each quad pixel, eight texels are fetched: the four corners
+ * (x0/x1, y0/y1) in slice z0 and the same four in slice z1.  They are
+ * combined per channel by lerp_3d() using the weights xw, yw and zw.
+ *
+ * \param s, t, p   texture coordinates of the four quad pixels
+ * \param c0, control   not used by this filter
+ * \param rgba   output colors, indexed [channel][pixel]
+ */
+static void
+img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
+   float xw[4], yw[4], zw[4]; /* interpolation weights */
+   union tex_tile_address addr;
+
+   /* size of the currently selected mipmap level */
+   level0 = samp->level;
+   width = u_minify(texture->width0, level0);
+   height = u_minify(texture->height0, level0);
+   depth = u_minify(texture->depth0, level0);
+
+   addr.value = 0;
+   addr.bits.level = level0;
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   /* per axis: the two texel indices and the blend weight between them */
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+   samp->linear_texcoord_p(p, depth,  z0, z1, zw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      /* four corners in slice z0 */
+      const float *tx00 = get_texel_3d(samp, addr, x0[j], y0[j], z0[j]);
+      const float *tx01 = get_texel_3d(samp, addr, x1[j], y0[j], z0[j]);
+      const float *tx02 = get_texel_3d(samp, addr, x0[j], y1[j], z0[j]);
+      const float *tx03 = get_texel_3d(samp, addr, x1[j], y1[j], z0[j]);
+      
+      /* the same four corners in slice z1 */
+      const float *tx10 = get_texel_3d(samp, addr, x0[j], y0[j], z1[j]);
+      const float *tx11 = get_texel_3d(samp, addr, x1[j], y0[j], z1[j]);
+      const float *tx12 = get_texel_3d(samp, addr, x0[j], y1[j], z1[j]);
+      const float *tx13 = get_texel_3d(samp, addr, x1[j], y1[j], z1[j]);
+      
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                              tx00[c], tx01[c],
+                              tx02[c], tx03[c],
+                              tx10[c], tx11[c],
+                              tx12[c], tx13[c]);
+      }
+   }
+}
+
+
+/**
+ * Compute the level of detail for each fragment of a quad.
+ *
+ * Each result is biased_lambda plus that fragment's own bias, clamped to
+ * the sampler's [min_lod, max_lod] range.  The caller has already added
+ * the global LOD bias to biased_lambda.
+ *
+ * \param sampler        supplies min_lod and max_lod
+ * \param biased_lambda  lambda with the global LOD bias applied
+ * \param lodbias        per-fragment LOD bias
+ * \param lod            receives the per-fragment LOD values
+ */
+static INLINE void
+compute_lod(const struct pipe_sampler_state *sampler,
+            const float biased_lambda,
+            const float lodbias[QUAD_SIZE],
+            float lod[QUAD_SIZE])
+{
+   uint pix;
+
+   for (pix = 0; pix < QUAD_SIZE; pix++) {
+      const float biased = biased_lambda + lodbias[pix];
+
+      lod[pix] = CLAMP(biased, sampler->min_lod, sampler->max_lod);
+   }
+}
+
+
+/**
+ * Mipmap filter that blends linearly between two adjacent mipmap levels.
+ *
+ * The LOD is either derived from the texcoords plus biases
+ * (tgsi_sampler_lod_bias) or taken directly from c0
+ * (tgsi_sampler_lod_explicit).  Only lod[0] is used for the whole quad.
+ *
+ *  - lambda < 0:            level 0 is sampled with the mag image filter
+ *  - level >= last_level:   the last level is sampled with the min filter
+ *  - otherwise:             levels level0 and level0+1 are sampled with
+ *                           the min filter and blended by the fractional
+ *                           part of lambda
+ *
+ * samp->level is overwritten as a side effect; the image filters read it
+ * to know which level to sample.
+ *
+ * \param c0  the LOD bias factors, or absolute LODs (depending on control)
+ */
+static void
+mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  const float c0[QUAD_SIZE],
+                  enum tgsi_sampler_control control,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   int level0;
+   float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      /* derive lambda from the texcoords, add the sampler's global bias,
+       * then add the per-fragment bias and clamp
+       */
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      /* c0 already holds the LODs; they are used as given */
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
+   level0 = (int)lambda;
+
+   if (lambda < 0.0) { 
+      /* magnification: base level, mag filter */
+      samp->level = 0;
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
+   }
+   else if (level0 >= texture->last_level) {
+      /* at or beyond the smallest level: nothing to blend with */
+      samp->level = texture->last_level;
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
+   }
+   else {
+      /* fractional part of lambda is the weight of the second level */
+      float levelBlend = lambda - level0;
+      float rgba0[4][4];
+      float rgba1[4][4];
+      int c,j;
+
+      samp->level = level0;
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
+
+      samp->level = level0+1;
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
+         }
+      }
+   }
+}
+
+
+/**
+ * Compute nearest mipmap level from texcoords.
+ * Then sample the texture level for four elements of a quad.
+ *
+ * Only lod[0] is used to pick the level for the whole quad.  A negative
+ * lambda samples level 0 with the mag image filter; otherwise lambda is
+ * rounded to the nearest level, limited to texture->last_level, and
+ * sampled with the min image filter.
+ *
+ * samp->level is overwritten as a side effect; the image filters read it
+ * to know which level to sample.
+ *
+ * \param c0  the LOD bias factors, or absolute LODs (depending on control)
+ */
+static void
+mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
+                   const float s[QUAD_SIZE],
+                   const float t[QUAD_SIZE],
+                   const float p[QUAD_SIZE],
+                   const float c0[QUAD_SIZE],
+                   enum tgsi_sampler_control control,
+                   float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      /* derive lambda from the texcoords, add the sampler's global bias,
+       * then add the per-fragment bias and clamp
+       */
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      /* c0 already holds the LODs; they are used as given */
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
+
+   if (lambda < 0.0) { 
+      /* magnification: base level, mag filter */
+      samp->level = 0;
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
+   }
+   else {
+      /* round to the nearest level, then limit to the last level */
+      samp->level = (int)(lambda + 0.5) ;
+      samp->level = MIN2(samp->level, (int)texture->last_level);
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
+   }
+
+   /* disabled debug dump of the four resulting colors */
+#if 0
+   printf("RGBA %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
+          rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
+          rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
+          rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
+          rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
+#endif
+}
+
+
+/**
+ * Mipmap "filter" for samplers without mipmapping: it only decides
+ * between the magnification and minification image filters.
+ *
+ * The LOD is computed (or taken from c0 when control is
+ * tgsi_sampler_lod_explicit) and the sign of lod[0] selects the filter
+ * for the whole quad.  samp->level is left untouched.
+ *
+ * \param c0  the LOD bias factors, or absolute LODs (depending on control)
+ */
+static void
+mip_filter_none(struct tgsi_sampler *tgsi_sampler,
+                const float s[QUAD_SIZE],
+                const float t[QUAD_SIZE],
+                const float p[QUAD_SIZE],
+                const float c0[QUAD_SIZE],
+                enum tgsi_sampler_control control,
+                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   float lod[QUAD_SIZE];
+   filter_func img_filter;
+
+   if (control != tgsi_sampler_lod_bias) {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+   else {
+      const float biased_lambda =
+         samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+
+      compute_lod(samp->sampler, biased_lambda, c0, lod);
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   img_filter = (lod[0] < 0.0) ? samp->mag_img_filter : samp->min_img_filter;
+
+   img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
+}
+
+
+
+/**
+ * Specialized version of mip_filter_linear with a hard-wired call to the
+ * 2d_linear_repeat_POT image filter (used for both minification and
+ * magnification, so the sampler's min and mag image filters must match).
+ *
+ * Only lod[0] is used for the whole quad:
+ *  - lambda < 0:            level 0 only
+ *  - level >= last_level:   last level only
+ *  - otherwise:             blend of levels level0 and level0+1, weighted
+ *                           by the fractional part of lambda
+ *
+ * samp->level is overwritten as a side effect; the image filter reads it
+ * to know which level to sample.
+ *
+ * \param c0  the LOD bias factors, or absolute LODs (depending on control)
+ */
+static void
+mip_filter_linear_2d_linear_repeat_POT(
+   struct tgsi_sampler *tgsi_sampler,
+   const float s[QUAD_SIZE],
+   const float t[QUAD_SIZE],
+   const float p[QUAD_SIZE],
+   const float c0[QUAD_SIZE],
+   enum tgsi_sampler_control control,
+   float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_resource *texture = samp->texture;
+   int level0;
+   float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
+   level0 = (int)lambda;
+
+   /* Test lambda itself for negativity, not level0: the int conversion
+    * truncates towards zero, so any lambda in (-1, 0) yields level0 == 0
+    * and would otherwise reach the blend path with a negative weight,
+    * extrapolating between levels 0 and 1.
+    */
+   if (lambda < 0.0F || level0 >= (int) texture->last_level) {
+      if (lambda < 0.0F)
+         samp->level = 0;
+      else
+         samp->level = texture->last_level;
+
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
+   }
+   else {
+      /* 0 <= levelBlend < 1: weight of the second (smaller) level */
+      float levelBlend = lambda - level0;
+      float rgba0[4][4];
+      float rgba1[4][4];
+      int c,j;
+
+      samp->level = level0;
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
+
+      samp->level = level0+1;
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
+         }
+      }
+   }
+}
+
+
+
+/**
+ * Do shadow/depth comparisons.
+ *
+ * The texture is first sampled through samp->mip_filter.  Then, for each
+ * pixel of the quad, texcoord 'p' (aka R) is compared against the red
+ * channel of that pixel's sample using sampler->compare_func.  (When the
+ * depth texture was sampled, the depth value was put into all RGBA
+ * channels, so looking at red is sufficient.)
+ *
+ * Each pixel receives its own pass/fail result: 1.0 on pass, 0.0 on
+ * fail.  The four quad entries are four different pixels, so their
+ * results must not be averaged together.
+ *
+ * \param p     per-pixel reference values to compare against
+ * \param c0    the LOD bias factors, or absolute LODs (depending on control)
+ * \param rgba  on return: R = G = B = comparison result, A = 1.0
+ */
+static void
+sample_compare(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               const float c0[QUAD_SIZE],
+               enum tgsi_sampler_control control,
+               float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   int j;
+
+   samp->mip_filter(tgsi_sampler, s, t, p, c0, control, rgba);
+
+   /* compare each pixel's texcoord against that pixel's texture sample */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float ref = p[j];
+      const float depth = rgba[0][j];
+      int pass;
+
+      switch (sampler->compare_func) {
+      case PIPE_FUNC_LESS:
+         pass = ref < depth;
+         break;
+      case PIPE_FUNC_LEQUAL:
+         pass = ref <= depth;
+         break;
+      case PIPE_FUNC_GREATER:
+         pass = ref > depth;
+         break;
+      case PIPE_FUNC_GEQUAL:
+         pass = ref >= depth;
+         break;
+      case PIPE_FUNC_EQUAL:
+         pass = ref == depth;
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         pass = ref != depth;
+         break;
+      case PIPE_FUNC_ALWAYS:
+         pass = 1;
+         break;
+      case PIPE_FUNC_NEVER:
+         pass = 0;
+         break;
+      default:
+         pass = 0;
+         assert(0);
+         break;
+      }
+
+      /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+      rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) pass;
+      rgba[3][j] = 1.0F;
+   }
+}
+
+
+/**
+ * Use 3D texcoords to choose a cube face, then sample the 2D cube faces.
+ * Put face info into the sampler faces[] array.
+ *
+ * One face is chosen for the whole quad from the average direction of
+ * the four pixels; per-pixel 2D coordinates on that face are then
+ * computed and handed to the next stage (samp->compare).
+ *
+ * \param s, t, p  direction vector components (rx, ry, rz) per quad pixel
+ * \param c0       the LOD bias factors, or absolute LODs (depending on
+ *                 control); forwarded unchanged
+ * \param rgba     output colors, filled in by the next stage
+ */
+static void
+sample_cube(struct tgsi_sampler *tgsi_sampler,
+            const float s[QUAD_SIZE],
+            const float t[QUAD_SIZE],
+            const float p[QUAD_SIZE],
+            const float c0[QUAD_SIZE],
+            enum tgsi_sampler_control control,
+            float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned j;
+   float ssss[4], tttt[4];
+
+   /*
+     major axis
+     direction    target                             sc     tc    ma
+     ----------   -------------------------------    ---    ---   ---
+     +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+     -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+     +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+     -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+     +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+     -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+
+   /* Choose the cube face and compute new s/t coords for the 2D face.
+    *
+    * Use the same cube face for all four pixels in the quad.
+    *
+    * This isn't ideal, but if we want to use a different cube face
+    * per pixel in the quad, we'd have to also compute the per-face
+    * LOD here too.  That's because the four post-face-selection
+    * texcoords are no longer related to each other (they're
+    * per-face!)  so we can't use subtraction to compute the partial
+    * derivatives to compute the LOD.  Doing so (near cube edges
+    * anyway) gives us pretty much random values.
+    */
+   {
+      /* use the average of the four pixel's texcoords to choose the face */
+      const float rx = 0.25 * (s[0] + s[1] + s[2] + s[3]);
+      const float ry = 0.25 * (t[0] + t[1] + t[2] + t[3]);
+      const float rz = 0.25 * (p[0] + p[1] + p[2] + p[3]);
+      const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
+
+      /* NOTE(review): in each branch below the divisor is the individual
+       * pixel's major-axis component, not the average; nothing here
+       * guards against that component being zero.
+       */
+      if (arx >= ary && arx >= arz) {
+         /* X is the major axis: sc = -/+rz, tc = -ry, ma = rx */
+         float sign = (rx >= 0.0F) ? 1.0F : -1.0F;
+         uint face = (rx >= 0.0F) ? PIPE_TEX_FACE_POS_X : PIPE_TEX_FACE_NEG_X;
+         for (j = 0; j < QUAD_SIZE; j++) {
+            /* -0.5/|ma| maps sc/tc from [-|ma|, |ma|] into [0, 1] after
+             * the +0.5 offset, and supplies the table's negative signs
+             */
+            const float ima = -0.5F / fabsf(s[j]);
+            ssss[j] = sign *  p[j] * ima + 0.5F;
+            tttt[j] =         t[j] * ima + 0.5F;
+            samp->faces[j] = face;
+         }
+      }
+      else if (ary >= arx && ary >= arz) {
+         /* Y is the major axis: sc = +rx, tc = +/-rz, ma = ry */
+         float sign = (ry >= 0.0F) ? 1.0F : -1.0F;
+         uint face = (ry >= 0.0F) ? PIPE_TEX_FACE_POS_Y : PIPE_TEX_FACE_NEG_Y;
+         for (j = 0; j < QUAD_SIZE; j++) {
+            const float ima = -0.5F / fabsf(t[j]);
+            ssss[j] =        -s[j] * ima + 0.5F;
+            tttt[j] = sign * -p[j] * ima + 0.5F;
+            samp->faces[j] = face;
+         }
+      }
+      else {
+         /* Z is the major axis: sc = +/-rx, tc = -ry, ma = rz */
+         float sign = (rz >= 0.0F) ? 1.0F : -1.0F;
+         uint face = (rz >= 0.0F) ? PIPE_TEX_FACE_POS_Z : PIPE_TEX_FACE_NEG_Z;
+         for (j = 0; j < QUAD_SIZE; j++) {
+            const float ima = -0.5 / fabsf(p[j]);
+            ssss[j] = sign * -s[j] * ima + 0.5F;
+            tttt[j] =         t[j] * ima + 0.5F;
+            samp->faces[j] = face;
+         }
+      }
+   }
+
+   /* In our little pipeline, the compare stage is next.  If compare
+    * is not active, this will point somewhere deeper into the
+    * pipeline, eg. to mip_filter or even img_filter.
+    *
+    * NOTE(review): the third texcoord is passed as NULL.  When
+    * samp->compare is sample_compare, that function reads p[0..3], so a
+    * cube map sampler with a compare mode enabled would dereference
+    * NULL -- confirm whether that combination can reach this point.
+    */
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, c0, control, rgba);
+}
+
+
+
+/**
+ * Select the nearest-filter wrap function for unnormalized texcoords.
+ * Only the three CLAMP modes are handled; any other mode asserts and
+ * falls back to wrap_nearest_unorm_clamp.
+ */
+static wrap_nearest_func
+get_nearest_unorm_wrap(unsigned mode)
+{
+   wrap_nearest_func wrap;
+
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      wrap = wrap_nearest_unorm_clamp;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      wrap = wrap_nearest_unorm_clamp_to_edge;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      wrap = wrap_nearest_unorm_clamp_to_border;
+      break;
+   default:
+      assert(0);
+      wrap = wrap_nearest_unorm_clamp;
+      break;
+   }
+
+   return wrap;
+}
+
+
+/**
+ * Select the nearest-filter wrap function for normalized texcoords.
+ * An unknown mode asserts and falls back to wrap_nearest_repeat.
+ */
+static wrap_nearest_func
+get_nearest_wrap(unsigned mode)
+{
+   wrap_nearest_func wrap;
+
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      wrap = wrap_nearest_repeat;
+      break;
+   case PIPE_TEX_WRAP_CLAMP:
+      wrap = wrap_nearest_clamp;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      wrap = wrap_nearest_clamp_to_edge;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      wrap = wrap_nearest_clamp_to_border;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      wrap = wrap_nearest_mirror_repeat;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      wrap = wrap_nearest_mirror_clamp;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      wrap = wrap_nearest_mirror_clamp_to_edge;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      wrap = wrap_nearest_mirror_clamp_to_border;
+      break;
+   default:
+      assert(0);
+      wrap = wrap_nearest_repeat;
+      break;
+   }
+
+   return wrap;
+}
+
+
+/**
+ * Select the linear-filter wrap function for unnormalized texcoords.
+ * Only the three CLAMP modes are handled; any other mode asserts and
+ * falls back to wrap_linear_unorm_clamp.
+ */
+static wrap_linear_func
+get_linear_unorm_wrap(unsigned mode)
+{
+   wrap_linear_func wrap;
+
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      wrap = wrap_linear_unorm_clamp;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      wrap = wrap_linear_unorm_clamp_to_edge;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      wrap = wrap_linear_unorm_clamp_to_border;
+      break;
+   default:
+      assert(0);
+      wrap = wrap_linear_unorm_clamp;
+      break;
+   }
+
+   return wrap;
+}
+
+
+/**
+ * Select the linear-filter wrap function for normalized texcoords.
+ * An unknown mode asserts and falls back to wrap_linear_repeat.
+ */
+static wrap_linear_func
+get_linear_wrap(unsigned mode)
+{
+   wrap_linear_func wrap;
+
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      wrap = wrap_linear_repeat;
+      break;
+   case PIPE_TEX_WRAP_CLAMP:
+      wrap = wrap_linear_clamp;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      wrap = wrap_linear_clamp_to_edge;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      wrap = wrap_linear_clamp_to_border;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      wrap = wrap_linear_mirror_repeat;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      wrap = wrap_linear_mirror_clamp;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      wrap = wrap_linear_mirror_clamp_to_edge;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      wrap = wrap_linear_mirror_clamp_to_border;
+      break;
+   default:
+      assert(0);
+      wrap = wrap_linear_repeat;
+      break;
+   }
+
+   return wrap;
+}
+
+
+/**
+ * Select the lambda (LOD) computation function for a sampler key.
+ *
+ * Vertex-processor samplers always get compute_lambda_vert.  Otherwise
+ * the choice depends on the texture target; cube maps share the 2D
+ * version.  An unknown target asserts and falls back to the 1D version.
+ */
+static compute_lambda_func
+get_lambda_func(const union sp_sampler_key key)
+{
+   const unsigned target = key.bits.target;
+
+   if (key.bits.processor == TGSI_PROCESSOR_VERTEX) {
+      return compute_lambda_vert;
+   }
+
+   if (target == PIPE_TEXTURE_1D) {
+      return compute_lambda_1d;
+   }
+   else if (target == PIPE_TEXTURE_2D || target == PIPE_TEXTURE_CUBE) {
+      return compute_lambda_2d;
+   }
+   else if (target == PIPE_TEXTURE_3D) {
+      return compute_lambda_3d;
+   }
+
+   assert(0);
+   return compute_lambda_1d;
+}
+
+
+/**
+ * Select the image filter function for a sampler key and filter mode.
+ *
+ * For 2D power-of-two textures with normalized coordinates and identical
+ * s/t wrap modes, a specialised fast path is returned when one exists:
+ *   REPEAT + NEAREST, REPEAT + LINEAR, or CLAMP + NEAREST.
+ * Everything else gets the general per-target nearest or linear filter
+ * (any filter value other than NEAREST is treated as linear).
+ *
+ * \param key      sampler key supplying the texture target and is_pot
+ * \param filter   one of PIPE_TEX_FILTER_x
+ * \param sampler  supplies the wrap modes and normalized_coords
+ */
+static filter_func
+get_img_filter(const union sp_sampler_key key,
+               unsigned filter,
+               const struct pipe_sampler_state *sampler)
+{
+   const int want_nearest = (filter == PIPE_TEX_FILTER_NEAREST);
+
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      return want_nearest ? img_filter_1d_nearest : img_filter_1d_linear;
+
+   case PIPE_TEXTURE_2D:
+      /* Try for fast path:
+       */
+      if (key.bits.is_pot &&
+          sampler->normalized_coords &&
+          sampler->wrap_s == sampler->wrap_t)
+      {
+         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
+            if (filter == PIPE_TEX_FILTER_NEAREST)
+               return img_filter_2d_nearest_repeat_POT;
+            if (filter == PIPE_TEX_FILTER_LINEAR)
+               return img_filter_2d_linear_repeat_POT;
+         }
+         else if (sampler->wrap_s == PIPE_TEX_WRAP_CLAMP) {
+            if (filter == PIPE_TEX_FILTER_NEAREST)
+               return img_filter_2d_nearest_clamp_POT;
+         }
+      }
+      /* Otherwise use default versions:
+       */
+      return want_nearest ? img_filter_2d_nearest : img_filter_2d_linear;
+
+   case PIPE_TEXTURE_CUBE:
+      return want_nearest ? img_filter_cube_nearest : img_filter_cube_linear;
+
+   case PIPE_TEXTURE_3D:
+      return want_nearest ? img_filter_3d_nearest : img_filter_3d_linear;
+
+   default:
+      assert(0);
+      return img_filter_1d_nearest;
+   }
+}
+
+
+/**
+ * Bind the given texture object and texture cache to the sampler varient.
+ *
+ * Besides storing the two pointers, this caches the log2 of the base
+ * width and height (xpot/ypot) and resets the current mipmap level to
+ * the sampler's min_lod, clamped to [0, last_level].
+ */
+void
+sp_sampler_varient_bind_texture( struct sp_sampler_varient *samp,
+                                 struct softpipe_tex_tile_cache *tex_cache,
+                                 const struct pipe_resource *texture )
+{
+   const int min_level = (int) samp->sampler->min_lod;
+   const int last_level = (int) texture->last_level;
+
+   samp->cache = tex_cache;
+   samp->texture = texture;
+
+   samp->xpot = util_unsigned_logbase2( texture->width0 );
+   samp->ypot = util_unsigned_logbase2( texture->height0 );
+
+   samp->level = CLAMP(min_level, 0, last_level);
+}
+
+
+/**
+ * Free a sampler varient.  Only the varient structure itself is
+ * released with FREE(); nothing it points to is freed here.
+ */
+void
+sp_sampler_varient_destroy( struct sp_sampler_varient *samp )
+{
+   FREE(samp);
+}
+
+
+/**
+ * Create a sampler varient for a given set of non-orthogonal state.
+ *
+ * The varient is a small pipeline of function pointers:
+ *
+ *    get_samples -> [cube face selection] -> [compare] -> mip_filter
+ *                -> img_filter -> texcoord wrap functions
+ *
+ * Stages that are not needed are skipped by promoting the next stage's
+ * function pointer into the earlier slot.
+ *
+ * \param sampler  the sampler state; the pointer is stored, not copied
+ * \param key      texture target, processor type and is_pot flag
+ * \return the new varient, or NULL if allocation fails
+ */
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key )
+{
+   struct sp_sampler_varient *samp = CALLOC_STRUCT(sp_sampler_varient);
+   if (!samp)
+      return NULL;
+
+   samp->sampler = sampler;
+   samp->key = key;
+
+   /* Note that (for instance) linear_texcoord_s and
+    * nearest_texcoord_s may be active at the same time, if the
+    * sampler min_img_filter differs from its mag_img_filter.
+    */
+   if (sampler->normalized_coords) {
+      samp->linear_texcoord_s = get_linear_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_wrap( sampler->wrap_r );
+   }
+   else {
+      samp->linear_texcoord_s = get_linear_unorm_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_unorm_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_unorm_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_unorm_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_unorm_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_unorm_wrap( sampler->wrap_r );
+   }
+   
+   samp->compute_lambda = get_lambda_func( key );
+
+   samp->min_img_filter = get_img_filter(key, sampler->min_img_filter, sampler);
+   samp->mag_img_filter = get_img_filter(key, sampler->mag_img_filter, sampler);
+
+   switch (sampler->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NONE:
+      /* With identical min and mag filters there is no decision left to
+       * make, so the image filter can be called directly.
+       */
+      if (sampler->min_img_filter == sampler->mag_img_filter) 
+         samp->mip_filter = samp->min_img_filter;         
+      else
+         samp->mip_filter = mip_filter_none;
+      break;
+
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      samp->mip_filter = mip_filter_nearest;
+      break;
+
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      /* The specialised path hard-wires the 2D POT image filter, so it
+       * must only be chosen for 2D textures; 1D, 3D and cube targets
+       * need their own image filters even when the texture is POT.
+       */
+      if (key.bits.is_pot &&
+          key.bits.target == PIPE_TEXTURE_2D &&
+          sampler->min_img_filter == sampler->mag_img_filter &&
+          sampler->normalized_coords &&
+          sampler->wrap_s == PIPE_TEX_WRAP_REPEAT &&
+          sampler->wrap_t == PIPE_TEX_WRAP_REPEAT &&
+          sampler->min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      {
+         samp->mip_filter = mip_filter_linear_2d_linear_repeat_POT;
+      }
+      else 
+      {
+         samp->mip_filter = mip_filter_linear;
+      }
+      break;
+
+   default:
+      /* Never leave mip_filter NULL: it is called unconditionally. */
+      assert(0);
+      samp->mip_filter = mip_filter_none;
+      break;
+   }
+
+   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
+      samp->compare = sample_compare;
+   }
+   else {
+      /* Skip compare operation by promoting the mip_filter function
+       * pointer:
+       */
+      samp->compare = samp->mip_filter;
+   }
+   
+   if (key.bits.target == PIPE_TEXTURE_CUBE) {
+      samp->base.get_samples = sample_cube;
+   }
+   else {
+      samp->faces[0] = 0;
+      samp->faces[1] = 0;
+      samp->faces[2] = 0;
+      samp->faces[3] = 0;
+
+      /* Skip cube face determination by promoting the compare
+       * function pointer:
+       */
+      samp->base.get_samples = samp->compare;
+   }
+
+   return samp;
+}
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
new file mode 100644
index 0000000000..6114acf737
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -0,0 +1,153 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2010 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEX_SAMPLE_H
+#define SP_TEX_SAMPLE_H
+
+
+#include "tgsi/tgsi_exec.h"
+
+struct sp_sampler_varient;
+/* Texcoord wrap callback, nearest flavour: takes four coords `s` and a `size`, fills icoord[4]. */
+typedef void (*wrap_nearest_func)(const float s[4],
+                                  unsigned size,
+                                  int icoord[4]);
+/* Linear flavour: fills two integer coord sets plus w[4] (presumably the blend weights -- confirm). */
+typedef void (*wrap_linear_func)(const float s[4], 
+                                 unsigned size,
+                                 int icoord0[4],
+                                 int icoord1[4],
+                                 float w[4]);
+/* Returns one float for a quad of s/t/p coords (named "lambda"; presumably the LOD -- confirm). */
+typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE]);
+/* Shared signature of the sampling stages stored below: img filters, mip filter, compare. */
+typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
+                            const float s[QUAD_SIZE],
+                            const float t[QUAD_SIZE],
+                            const float p[QUAD_SIZE],
+                            const float c0[QUAD_SIZE],
+                            enum tgsi_sampler_control control,
+                            float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+union sp_sampler_key {
+   struct {
+      unsigned target:3;     /* compared against PIPE_TEXTURE_x when a varient is created */
+      unsigned is_pot:1;     /* a precondition for the *_POT fast-path filters */
+      unsigned processor:2;
+      unsigned unit:4;
+      unsigned pad:22;       /* the fields above plus pad add up to 32 bits */
+   } bits;
+   unsigned value;           /* the same bits viewed as a single unsigned */
+};
+
+/**
+ * Subclass of tgsi_sampler.  `base` must stay first: sp_sampler_varient() casts a tgsi_sampler pointer to this type.
+ */
+struct sp_sampler_varient
+{
+   struct tgsi_sampler base;  /**< base class; base.get_samples is set when the varient is created */
+
+   union sp_sampler_key key;
+
+   /* The owner of this struct:
+    */
+   const struct pipe_sampler_state *sampler;
+
+
+   /* Currently bound texture:
+    */
+   const struct pipe_resource *texture;
+   struct softpipe_tex_tile_cache *cache;
+
+   unsigned processor;
+
+   /* For sp_get_samples_2d_linear_POT:
+    */
+   unsigned xpot;
+   unsigned ypot;
+   unsigned level;
+
+   unsigned faces[4];         /* zeroed at creation for non-cube targets */
+   
+   wrap_nearest_func nearest_texcoord_s;
+   wrap_nearest_func nearest_texcoord_t;
+   wrap_nearest_func nearest_texcoord_p;
+
+   wrap_linear_func linear_texcoord_s;
+   wrap_linear_func linear_texcoord_t;
+   wrap_linear_func linear_texcoord_p;
+
+   filter_func min_img_filter;
+   filter_func mag_img_filter;
+
+   compute_lambda_func compute_lambda;
+
+   filter_func mip_filter;    /* chosen from the sampler's min_mip_filter; may alias min_img_filter */
+   filter_func compare;       /* sample_compare, or mip_filter itself when compare_mode is NONE */
+   
+   /* Linked list:
+    */
+   struct sp_sampler_varient *next;
+};
+
+struct sp_sampler;
+
+/* Create a sampler varient for a given set of non-orthogonal state:
+ * the sampler state plus the key bits. */
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key );
+/* Attach a texture, and the tile cache used to fetch its texels, to a varient. */
+void sp_sampler_varient_bind_texture( struct sp_sampler_varient *varient,
+                                      struct softpipe_tex_tile_cache *tex_cache,
+                                      const struct pipe_resource *tex );
+/* Destroy a varient (presumably one made by sp_create_sampler_varient() -- confirm). */
+void sp_sampler_varient_destroy( struct sp_sampler_varient * );
+
+
+
+static INLINE struct sp_sampler_varient *
+sp_sampler_varient(const struct tgsi_sampler *sampler)
+{  /* downcast helper; relies on `base` being the first member, and drops the const qualifier */
+   return (struct sp_sampler_varient *) sampler;
+}
+
+extern void
+sp_get_samples(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,   /* NOTE(review): filter_func takes c0/control instead -- confirm this prototype is still defined/used */
+               float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+#endif /* SP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
new file mode 100644
index 0000000000..b3e1c49406
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -0,0 +1,298 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "util/u_math.h"
+#include "sp_context.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
+
+   
+
+struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_context *pipe )
+{
+   /* calloc'ed, so every transfer/texture pointer starts out NULL */
+   struct softpipe_tex_tile_cache *cache = CALLOC_STRUCT( softpipe_tex_tile_cache );
+   uint i = 0;
+
+   if (!cache)
+      return NULL;
+
+   cache->pipe = pipe;
+   while (i < NUM_ENTRIES)
+      cache->entries[i++].addr.bits.invalid = 1;   /* nothing cached yet */
+   cache->last_tile = &cache->entries[0];          /* any tile */
+   return cache;
+}
+
+
+void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   /* Release everything the cache owns, then the cache itself.  NULL is a no-op. */
+   if (!tc)
+      return;
+   if (tc->transfer)
+      tc->pipe->transfer_destroy(tc->pipe, tc->transfer);
+   if (tc->tex_trans) {
+      if (tc->tex_trans_map)   /* never destroy a transfer that is still mapped */
+         tc->pipe->transfer_unmap(tc->pipe, tc->tex_trans);
+      tc->pipe->transfer_destroy(tc->pipe, tc->tex_trans);
+   }
+   /* drop the texture reference taken by sp_tex_tile_cache_set_sampler_view() */
+   pipe_resource_reference(&tc->texture, NULL);
+   FREE( tc );
+}
+
+
+
+
+void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (!tc->tex_trans || tc->tex_trans_map) return;  /* nothing to map / mapped already */
+   tc->tex_trans_map = tc->pipe->transfer_map(tc->pipe, tc->tex_trans);
+}
+
+
+void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (!tc->tex_trans_map)
+      return;   /* not mapped: nothing to undo */
+   tc->pipe->transfer_unmap(tc->pipe, tc->tex_trans);
+   tc->tex_trans_map = NULL;
+}
+
+/**
+ * Mark every cache entry invalid so that tiles are fetched again on lookup.
+ * Call this after the contents of the cached texture have been modified.
+ */
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc)
+{
+   struct softpipe_tex_cached_tile *tile;
+
+   assert(tc);
+   assert(tc->texture);
+
+   for (tile = tc->entries; tile != tc->entries + NUM_ENTRIES; tile++) {
+      tile->addr.bits.invalid = 1;
+   }
+}
+
+static boolean
+sp_tex_tile_is_compat_view(struct softpipe_tex_tile_cache *tc,
+                           struct pipe_sampler_view *view)
+{
+   /* TRUE only if the cache was set up for this texture, format and swizzle */
+   if (!view || tc->texture != view->texture || tc->format != view->format)
+      return FALSE;
+   if (tc->swizzle_r != view->swizzle_r || tc->swizzle_g != view->swizzle_g)
+      return FALSE;
+   if (tc->swizzle_b != view->swizzle_b || tc->swizzle_a != view->swizzle_a)
+      return FALSE;
+   return TRUE;
+}
+
+/**
+ * Switch the cache over to a (possibly NULL) sampler view.
+ */
+void
+sp_tex_tile_cache_set_sampler_view(struct softpipe_tex_tile_cache *tc,
+                                   struct pipe_sampler_view *view)
+{
+   struct softpipe_tex_cached_tile *tile;
+
+   assert(!tc->transfer);
+
+   /* same texture, format and swizzle as before: cached tiles stay usable */
+   if (sp_tex_tile_is_compat_view(tc, view))
+      return;
+
+   /* retarget the cache's texture reference (NULL when there is no view) */
+   pipe_resource_reference(&tc->texture, view ? view->texture : NULL);
+
+   /* the open transfer belongs to the previous texture; get rid of it */
+   if (tc->tex_trans) {
+      if (tc->tex_trans_map) {
+         tc->pipe->transfer_unmap(tc->pipe, tc->tex_trans);
+         tc->tex_trans_map = NULL;
+      }
+      tc->pipe->transfer_destroy(tc->pipe, tc->tex_trans);
+      tc->tex_trans = NULL;
+   }
+
+   if (view) {
+      tc->format = view->format;
+      tc->swizzle_r = view->swizzle_r;
+      tc->swizzle_g = view->swizzle_g;
+      tc->swizzle_b = view->swizzle_b;
+      tc->swizzle_a = view->swizzle_a;
+   }
+
+   /* XXX we should try to avoid this when the teximage hasn't changed */
+   for (tile = tc->entries; tile != tc->entries + NUM_ENTRIES; tile++)
+      tile->addr.bits.invalid = 1;
+
+   tc->tex_face = -1; /* any invalid value here */
+}
+
+
+
+
+/**
+ * Flush the texture tile cache.  Texture tiles are only ever read, so there
+ * is nothing to write back: every entry is simply marked invalid.
+ */
+void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   int i;
+
+   if (!tc->texture)
+      return;   /* no texture bound: entries are left alone */
+
+   for (i = NUM_ENTRIES - 1; i >= 0; i--)
+      tc->entries[i].addr.bits.invalid = 1;
+
+   /* no real face equals -1, so the next miss opens a fresh transfer */
+   tc->tex_face = -1;
+}
+
+
+/**
+ * Hash a tile address to the slot of the cache where that tile would live.
+ * Each address maps to exactly one slot (direct-mapped), so two tiles that
+ * hash alike evict each other.
+ * XXX There's probably lots of ways in which we can improve this.
+ */
+static INLINE uint
+tex_cache_pos( union tex_tile_address addr )
+{
+   uint hash = addr.bits.x;
+
+   hash += addr.bits.y * 9;
+   hash += addr.bits.z * 3;
+   hash += addr.bits.face;
+   hash += addr.bits.level * 7;
+
+   return hash % NUM_ENTRIES;
+}
+
+/**
+ * Look up a texture tile by address, fetching it from the texture on a miss.
+ * Tiles are read-only; the tile returned is also remembered in tc->last_tile.
+ */
+const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                        union tex_tile_address addr )
+{
+   struct softpipe_tex_cached_tile *tile;
+   /* direct-mapped: exactly one candidate slot per address */
+   tile = tc->entries + tex_cache_pos( addr );
+
+   if (addr.value != tile->addr.value) {
+
+      /* cache miss.  Most misses are because we've invalidated the
+       * texture cache previously -- most commonly on binding a new
+       * texture.  Currently we effectively flush the cache on texture
+       * bind.
+       */
+#if 0
+      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
+                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
+                    pos, x/TILE_SIZE, y/TILE_SIZE, z, face, level,
+                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
+#endif
+
+      /* the open transfer covers one face/level/slice; replace it if this tile lies elsewhere */
+      if (!tc->tex_trans ||
+          tc->tex_face != addr.bits.face ||
+          tc->tex_level != addr.bits.level ||
+          tc->tex_z != addr.bits.z) {
+         /* get new transfer (view into texture) */
+
+         if (tc->tex_trans) {
+            if (tc->tex_trans_map) {
+               tc->pipe->transfer_unmap(tc->pipe, tc->tex_trans);
+               tc->tex_trans_map = NULL;
+            }
+
+            tc->pipe->transfer_destroy(tc->pipe, tc->tex_trans);
+            tc->tex_trans = NULL;
+         }
+
+         tc->tex_trans = 
+            pipe_get_transfer(tc->pipe, tc->texture, 
+			      addr.bits.face, 
+			      addr.bits.level, 
+			      addr.bits.z, 
+			      PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED,
+			      0, 0,
+			      u_minify(tc->texture->width0, addr.bits.level),
+			      u_minify(tc->texture->height0, addr.bits.level));
+         /* NOTE(review): a NULL transfer (allocation failure) is not handled before the map below -- confirm */
+         tc->tex_trans_map = tc->pipe->transfer_map(tc->pipe, tc->tex_trans);
+
+         tc->tex_face = addr.bits.face;
+         tc->tex_level = addr.bits.level;
+         tc->tex_z = addr.bits.z;
+      }
+
+      /* copy the TILE_SIZE x TILE_SIZE block out of the transfer into the slot, as swizzled floats */
+      pipe_get_tile_swizzle(tc->pipe,
+			    tc->tex_trans,
+                            addr.bits.x * TILE_SIZE, 
+                            addr.bits.y * TILE_SIZE,
+                            TILE_SIZE,
+                            TILE_SIZE,
+                            tc->swizzle_r,
+                            tc->swizzle_g,
+                            tc->swizzle_b,
+                            tc->swizzle_a,
+                            tc->format,
+                            (float *) tile->data.color);
+      tile->addr = addr;
+   }
+
+   tc->last_tile = tile;
+   return tile;
+}
+
+
+
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
new file mode 100644
index 0000000000..0794ffa0c5
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -0,0 +1,161 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEX_TILE_CACHE_H
+#define SP_TEX_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct softpipe_context;
+struct softpipe_tex_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+/* Identifies one cached tile; the bitfields total 32 bits and are compared through `value`.
+ * If we need to support > 4096, just expand this to be a 64 bit union, or consider tiling in Z as well.
+ */
+union tex_tile_address {
+   struct {
+      unsigned x:6;             /* tile column: 4096 / TILE_SIZE = 64 values */
+      unsigned y:6;             /* tile row: 4096 / TILE_SIZE = 64 values */
+      unsigned z:12;            /* 4096 -- z not tiled */
+      unsigned face:3;
+      unsigned level:4;
+      unsigned invalid:1;       /* set on empty slots; tex_tile_address() builds addresses with it clear */
+   } bits;
+   unsigned value;              /* all fields as one word, used for equality tests */
+};
+
+
+struct softpipe_tex_cached_tile
+{
+   union tex_tile_address addr;   /* which tile this slot holds (invalid bit set when empty) */
+   union {
+      float color[TILE_SIZE][TILE_SIZE][4];   /* four floats per texel, filled by pipe_get_tile_swizzle() */
+   } data;
+};
+
+#define NUM_ENTRIES 50   /* slots in the cache; tex_cache_pos() reduces its hash modulo this */
+
+struct softpipe_tex_tile_cache
+{
+   struct pipe_context *pipe;      /* context whose transfer_* hooks are used */
+   struct pipe_transfer *transfer; /* NOTE(review): never assigned in sp_tex_tile_cache.c, only asserted NULL / destroyed -- confirm it is still needed */
+   void *transfer_map;
+
+   struct pipe_resource *texture;  /**< if caching a texture */
+   unsigned timestamp;
+
+   struct softpipe_tex_cached_tile entries[NUM_ENTRIES];
+
+   struct pipe_transfer *tex_trans;   /* transfer covering the face/level/slice named below */
+   void *tex_trans_map;               /* non-NULL while tex_trans is mapped */
+   int tex_face, tex_level, tex_z;    /* tex_face == -1 forces a new transfer on the next miss */
+
+   unsigned swizzle_r;                /* swizzles and format are copied from the bound sampler view */
+   unsigned swizzle_g;
+   unsigned swizzle_b;
+   unsigned swizzle_a;
+   unsigned format;
+
+   struct softpipe_tex_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_context *pipe );
+
+extern void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+/* Map / unmap the cache's current texture transfer, if it has one. */
+extern void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_set_sampler_view(struct softpipe_tex_tile_cache *tc,
+                                   struct pipe_sampler_view *view);
+/* NOTE: despite its name, this *invalidates* every cached tile. */
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+/* Full lookup (fetches the tile on a miss); sp_get_cached_tile_tex() wraps it with a last-tile shortcut. */
+extern const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr );
+
+static INLINE union tex_tile_address
+tex_tile_address( unsigned x,
+                  unsigned y,
+                  unsigned z,
+                  unsigned face,
+                  unsigned level )
+{
+   union tex_tile_address tile_addr;
+
+   tile_addr.value = 0;                  /* also leaves the `invalid` bit clear */
+   tile_addr.bits.level = level;
+   tile_addr.bits.face = face;
+   tile_addr.bits.z = z;
+   tile_addr.bits.y = y / TILE_SIZE;     /* texel coordinate -> tile row */
+   tile_addr.bits.x = x / TILE_SIZE;     /* texel coordinate -> tile column */
+
+   return tile_addr;
+}
+
+/* Fast path: reuse the tile from the previous lookup when the address
+ * matches, otherwise fall back to the full cache search. */
+static INLINE const struct softpipe_tex_cached_tile *
+sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr )
+{
+   const struct softpipe_tex_cached_tile *recent = tc->last_tile;
+
+   return (recent->addr.value != addr.value) ? sp_find_cached_tile_tex( tc, addr )
+                                             : recent;
+}
+
+
+
+
+
+#endif /* SP_TEX_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
new file mode 100644
index 0000000000..4e6123fbd0
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -0,0 +1,488 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Michel Dänzer <michel@tungstengraphics.com>
+  */
+
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_transfer.h"
+
+#include "sp_context.h"
+#include "sp_flush.h"
+#include "sp_texture.h"
+#include "sp_screen.h"
+
+#include "state_tracker/sw_winsys.h"
+
+
+/**
+ * Conventional allocation path for non-display textures:
+ * all mipmap levels are packed back to back into one malloc'ed buffer.
+ */
+static boolean
+softpipe_resource_layout(struct pipe_screen *screen,
+                         struct softpipe_resource *spr)
+{
+   struct pipe_resource *pt = &spr->base;
+   /* cube maps store six faces per level, everything else `depth` slices */
+   const boolean is_cube = (pt->target == PIPE_TEXTURE_CUBE);
+   unsigned w = pt->width0, h = pt->height0, d = pt->depth0;
+   unsigned total = 0;
+   unsigned lvl = 0;
+
+   while (lvl <= pt->last_level) {
+      const unsigned slices = is_cube ? 6 : d;
+      const unsigned rows = util_format_get_nblocksy(pt->format, h);
+
+      spr->stride[lvl] = util_format_get_stride(pt->format, w);
+      spr->level_offset[lvl] = total;   /* level starts where the previous one ended */
+      total += rows * slices * spr->stride[lvl];
+
+      /* step width/height/depth down to the next mipmap level */
+      w = u_minify(w, 1);
+      h = u_minify(h, 1);
+      d = u_minify(d, 1);
+      lvl++;
+   }
+
+   spr->data = align_malloc(total, 16);
+   return spr->data != NULL;
+}
+
+
+/**
+ * Texture layout for simple color buffers: storage comes from the winsys.
+ */
+static boolean
+softpipe_displaytarget_layout(struct pipe_screen *screen,
+                              struct softpipe_resource *spr)
+{
+   struct sw_winsys *ws = softpipe_screen(screen)->winsys;
+   const struct pipe_resource *res = &spr->base;
+
+   /* Round up the surface size to a multiple of the tile size?
+    */
+   spr->dt = ws->displaytarget_create(ws, res->bind, res->format,
+                                      res->width0, res->height0,
+                                      16,                /* alignment request -- presumably bytes; confirm */
+                                      &spr->stride[0]);  /* receives the level-0 stride */
+
+   if (!spr->dt)
+      return FALSE;
+   return TRUE;
+}
+
+
+/**
+ * Allocate a softpipe_resource described by `templat`, with its reference
+ * count initialised to 1.  Returns NULL if any allocation fails.
+ */
+static struct pipe_resource *
+softpipe_resource_create(struct pipe_screen *screen,
+                         const struct pipe_resource *templat)
+{
+   const unsigned winsys_binds =
+      PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED;
+   struct softpipe_resource *spr = CALLOC_STRUCT(softpipe_resource);
+   boolean allocated;
+
+   if (!spr)
+      return NULL;
+
+   assert(templat->format != PIPE_FORMAT_NONE);
+
+   spr->base = *templat;   /* start from a copy of the template, then fix up ownership */
+   pipe_reference_init(&spr->base.reference, 1);
+   spr->base.screen = screen;
+
+   spr->pot = util_is_power_of_two(templat->width0)
+      && util_is_power_of_two(templat->height0)
+      && util_is_power_of_two(templat->depth0);
+
+   /* display-target style bindings get winsys storage, the rest plain memory */
+   if (spr->base.bind & winsys_binds)
+      allocated = softpipe_displaytarget_layout(screen, spr);
+   else
+      allocated = softpipe_resource_layout(screen, spr);
+
+   if (!allocated) {
+      FREE(spr);
+      return NULL;
+   }
+   return &spr->base;
+}
+
+
+static void
+softpipe_resource_destroy(struct pipe_screen *pscreen,
+                          struct pipe_resource *pt)
+{
+   struct softpipe_resource *spr = softpipe_resource(pt);
+   /* Release the backing store according to who owns it. */
+   if (spr->dt) {
+      /* display target: storage belongs to the winsys */
+      struct sw_winsys *ws = softpipe_screen(pscreen)->winsys;
+      ws->displaytarget_destroy(ws, spr->dt);
+   }
+   else if (spr->userBuffer) {
+      /* user buffer: spr->data is the caller's pointer, not ours to free */
+   }
+   else {
+      align_free(spr->data);   /* regular texture */
+   }
+   FREE(spr);
+}
+
+
+static struct pipe_resource *
+softpipe_resource_from_handle(struct pipe_screen *screen,
+                              const struct pipe_resource *templat,
+                              struct winsys_handle *whandle)
+{
+   struct sw_winsys *ws = softpipe_screen(screen)->winsys;
+   struct softpipe_resource *spr = CALLOC_STRUCT(softpipe_resource);
+
+   /* Wrap the display target the winsys returns for `whandle` in a new resource. */
+   if (!spr)
+      return NULL;
+
+   spr->base = *templat;
+   pipe_reference_init(&spr->base.reference, 1);
+   spr->base.screen = screen;
+
+   spr->pot = util_is_power_of_two(templat->width0)
+      && util_is_power_of_two(templat->height0)
+      && util_is_power_of_two(templat->depth0);
+
+   /* the winsys resolves the handle and fills in the level-0 stride */
+   spr->dt = ws->displaytarget_from_handle(ws, templat, whandle,
+                                           &spr->stride[0]);
+
+   if (spr->dt)
+      return &spr->base;
+
+   /* import failed: nothing else was allocated, just drop the wrapper */
+   FREE(spr);
+   return NULL;
+}
+
+
+static boolean
+softpipe_resource_get_handle(struct pipe_screen *screen,
+                             struct pipe_resource *pt,
+                             struct winsys_handle *whandle)
+{
+   struct softpipe_resource *spr = softpipe_resource(pt);
+   struct sw_winsys *ws;
+
+   /* only display-target resources have a handle: assert, and fail if asserts are off */
+   assert(spr->dt);
+   if (spr->dt == NULL) return FALSE;
+   ws = softpipe_screen(screen)->winsys;
+   return ws->displaytarget_get_handle(ws, spr->dt, whandle);
+}
+
+
+/**
+ * Byte offset, from the start of the resource's buffer, of one 2D image:
+ * the given mipmap level plus a cube face or 3D slice within that level.
+ */
+static unsigned
+sp_get_tex_image_offset(const struct softpipe_resource *spr,
+                        unsigned level, unsigned face, unsigned zslice)
+{
+   const unsigned rows = util_format_get_nblocksy(spr->base.format,
+                            u_minify(spr->base.height0, level));
+   unsigned image = 0;   /* index of the 2D image inside the level */
+
+   switch (spr->base.target) {
+   case PIPE_TEXTURE_CUBE:
+      assert(zslice == 0);
+      image = face;
+      break;
+   case PIPE_TEXTURE_3D:
+      assert(face == 0);
+      image = zslice;
+      break;
+   default:
+      assert(face == 0);
+      assert(zslice == 0);
+   }
+   return spr->level_offset[level] + image * rows * spr->stride[level];
+}
+
+
+/**
+ * Get a pipe_surface "view" into a texture resource.
+ */
+static struct pipe_surface *
+softpipe_get_tex_surface(struct pipe_screen *screen,
+                         struct pipe_resource *pt,
+                         unsigned face, unsigned level, unsigned zslice,
+                         unsigned usage)
+{
+   struct pipe_surface *surf;
+
+   assert(level <= pt->last_level);
+
+   surf = CALLOC_STRUCT(pipe_surface);
+   if (!surf)
+      return NULL;
+
+   pipe_reference_init(&surf->reference, 1);
+   pipe_resource_reference(&surf->texture, pt);   /* the surface holds a texture reference */
+   /* which image of the texture this surface looks at */
+   surf->face = face;
+   surf->level = level;
+   surf->zslice = zslice;
+   surf->offset = sp_get_tex_image_offset(softpipe_resource(pt), level, face, zslice);
+   surf->format = pt->format;
+   surf->width = u_minify(pt->width0, level);
+   surf->height = u_minify(pt->height0, level);
+   surf->usage = usage;
+   return surf;
+}
+
+
+/**
+ * Release a surface from softpipe_get_tex_surface(): drop its texture
+ * reference, then free the surface itself.
+ */
+static void
+softpipe_tex_surface_destroy(struct pipe_surface *surf)
+{
+   struct pipe_resource **tex = &surf->texture;
+   /* If texture images ever needed post-processing into a hardware layout,
+    * this is where it would happen.  For softpipe, nothing to do. */
+   assert(*tex);
+   pipe_resource_reference(tex, NULL);
+   FREE(surf);
+}
+
+
+/**
+ * Get a pipe_transfer object which is used for moving data in/out of
+ * a resource object; NULL on allocation failure or if it would block.
+ * \param pipe  rendering context
+ * \param resource  the resource to transfer in/out of
+ * \param sr  selects the cube face and mipmap level (the 3D slice is box->z)
+ * \param usage  bitmask of PIPE_TRANSFER_x flags
+ * \param box  the 1D/2D/3D region of interest
+ */
+static struct pipe_transfer *
+softpipe_get_transfer(struct pipe_context *pipe,
+		      struct pipe_resource *resource,
+		      struct pipe_subresource sr,
+		      unsigned usage,
+		      const struct pipe_box *box)
+{
+   struct softpipe_resource *spr = softpipe_resource(resource);
+   struct softpipe_transfer *spt;
+
+   assert(resource);
+   assert(sr.level <= resource->last_level);
+
+   /* make sure the requested region is in the image bounds */
+   assert(box->x + box->width <= u_minify(resource->width0, sr.level));
+   assert(box->y + box->height <= u_minify(resource->height0, sr.level));
+   assert(box->z + box->depth <= u_minify(resource->depth0, sr.level));
+
+   /*
+    * Transfers, like other pipe operations, must happen in order, so flush the
+    * context if necessary.
+    */
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+      boolean read_only = !(usage & PIPE_TRANSFER_WRITE);
+      boolean do_not_block = !!(usage & PIPE_TRANSFER_DONTBLOCK);
+      if (!softpipe_flush_resource(pipe, resource,
+                                   sr.face, sr.level,
+                                   0, /* flush_flags */
+                                   read_only,
+                                   TRUE, /* cpu_access */
+                                   do_not_block)) {
+         /*
+          * It would have blocked, but state tracker requested not to.
+          */
+         assert(do_not_block);
+         return NULL;
+      }
+   }
+
+   spt = CALLOC_STRUCT(softpipe_transfer);
+   if (spt) {
+      struct pipe_transfer *pt = &spt->base;
+      enum pipe_format format = resource->format;
+      const unsigned hgt = u_minify(spr->base.height0, sr.level);
+      const unsigned nblocksy = util_format_get_nblocksy(format, hgt);
+
+      pipe_resource_reference(&pt->resource, resource);   /* released in softpipe_transfer_destroy() */
+      pt->sr = sr;
+      pt->usage = usage;
+      pt->box = *box;
+      pt->stride = spr->stride[sr.level];
+      pt->slice_stride = pt->stride * nblocksy;           /* bytes in one 2D image of this level */
+      /* start of the selected face/slice image within the level ... */
+      spt->offset = sp_get_tex_image_offset(spr, sr.level, sr.face, box->z);
+      /* ... then advance to the box origin inside that image, in whole format blocks */
+      spt->offset += 
+	 box->y / util_format_get_blockheight(format) * spt->base.stride +
+	 box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+
+      return pt;
+   }
+   return NULL;
+}
+
+
+/**
+ * Free a pipe_transfer object which was created with
+ * softpipe_get_transfer().  `pipe` is unused.
+ */
+static void 
+softpipe_transfer_destroy(struct pipe_context *pipe,
+                          struct pipe_transfer *transfer)
+{
+   pipe_resource_reference(&transfer->resource, NULL);   /* drop the reference taken at creation */
+   FREE(transfer);
+}
+
+
+/**
+ * Return a CPU pointer to the first byte of the transfer's region,
+ * or NULL if the backing store could not be mapped.
+ */
+static void *
+softpipe_transfer_map(struct pipe_context *pipe,
+                      struct pipe_transfer *transfer)
+{
+   struct softpipe_resource *spr = softpipe_resource(transfer->resource);
+   uint8_t *base;
+
+   if (spr->dt) {
+      /* display targets are mapped through the winsys */
+      struct sw_winsys *ws = softpipe_screen(pipe->screen)->winsys;
+      base = ws->displaytarget_map(ws, spr->dt, transfer->usage);
+   }
+   else {
+      /* everything else is addressed directly through spr->data */
+      base = spr->data;
+   }
+
+   if (!base)
+      return NULL;
+   /* the byte offset was computed in softpipe_get_transfer() */
+   return base + softpipe_transfer(transfer)->offset;
+}
+
+
+/**
+ * Undo softpipe_transfer_map() and, for write transfers, flag the
+ * resource as modified.
+ */
+static void
+softpipe_transfer_unmap(struct pipe_context *pipe,
+                        struct pipe_transfer *transfer)
+{
+   struct softpipe_resource *spr;
+
+   assert(transfer->resource);
+   spr = softpipe_resource(transfer->resource);
+
+   /* only display targets were mapped through the winsys */
+   if (spr->dt) {
+      struct sw_winsys *ws = softpipe_screen(pipe->screen)->winsys;
+      ws->displaytarget_unmap(ws, spr->dt);
+   }
+
+   /* Mark the texture as dirty to expire the tile caches. */
+   if (transfer->usage & PIPE_TRANSFER_WRITE)
+      spr->timestamp++;
+}
+
+/**
+ * Wrap caller-owned memory in a resource; the pointer is stored, not copied.
+ */
+static struct pipe_resource *
+softpipe_user_buffer_create(struct pipe_screen *screen,
+                            void *ptr,
+                            unsigned bytes,
+                            unsigned bind_flags)
+{
+   struct softpipe_resource *spr = CALLOC_STRUCT(softpipe_resource);
+   struct pipe_resource *buf;
+   if (!spr)
+      return NULL;
+   buf = &spr->base;
+
+   /* describe the memory as a bytes x 1 x 1 array of R8 elements */
+   buf->format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   buf->width0 = bytes;
+   buf->height0 = 1;
+   buf->depth0 = 1;
+   buf->bind = bind_flags;
+   buf->usage = PIPE_USAGE_IMMUTABLE;
+   buf->flags = 0;
+   buf->screen = screen;
+   pipe_reference_init(&buf->reference, 1);
+   spr->userBuffer = TRUE;   /* tells softpipe_resource_destroy() not to free `data` */
+   spr->data = ptr;
+   return buf;
+}
+
+
+void
+softpipe_init_texture_funcs(struct pipe_context *pipe)
+{
+   pipe->transfer_unmap = softpipe_transfer_unmap;
+   pipe->transfer_map = softpipe_transfer_map;
+   pipe->transfer_destroy = softpipe_transfer_destroy;
+   pipe->get_transfer = softpipe_get_transfer;
+   /* the remaining transfer hooks use the shared u_default_* helpers */
+   pipe->transfer_inline_write = u_default_transfer_inline_write;
+   pipe->transfer_flush_region = u_default_transfer_flush_region;
+}
+
+
+void
+softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
+{
+   /* surface "views" into a texture */
+   screen->tex_surface_destroy = softpipe_tex_surface_destroy;
+   screen->get_tex_surface = softpipe_get_tex_surface;
+   screen->user_buffer_create = softpipe_user_buffer_create;
+   screen->resource_get_handle = softpipe_resource_get_handle;
+   screen->resource_from_handle = softpipe_resource_from_handle;
+   screen->resource_destroy = softpipe_resource_destroy;
+   screen->resource_create = softpipe_resource_create;
+}
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
new file mode 100644
index 0000000000..6b205dc532
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEXTURE_H
+#define SP_TEXTURE_H
+
+
+#include "pipe/p_state.h"
+
+
+#define SP_MAX_TEXTURE_2D_LEVELS 13  /* 4K x 4K */
+#define SP_MAX_TEXTURE_3D_LEVELS 9   /* 512 x 512 x 512 */
+
+
+struct pipe_context;
+struct pipe_screen;
+struct softpipe_context;
+
+
+/**
+ * Subclass of pipe_resource; base is the first member so the softpipe_resource() cast works.
+ */
+struct softpipe_resource
+{
+   struct pipe_resource base;
+
+   unsigned long level_offset[SP_MAX_TEXTURE_2D_LEVELS];  /* sized for the 2D level maximum; presumably per-mipmap-level offsets into data -- confirm units */
+   unsigned stride[SP_MAX_TEXTURE_2D_LEVELS];             /* presumably the row stride of each mipmap level -- confirm */
+
+   /**
+    * Display target, only valid for PIPE_TEXTURE_2D with the
+    * PIPE_BIND_DISPLAY_TARGET usage.
+    */
+   struct sw_displaytarget *dt;
+
+   /**
+    * Malloc'ed data for regular buffers and textures, a mapping to dt above, or the caller's pointer when userBuffer is set.
+    */
+   void *data;
+
+   /* True if texture images are power-of-two in all dimensions:
+    */
+   boolean pot;
+   boolean userBuffer;   /* set to TRUE by softpipe_user_buffer_create(), where data is the caller's pointer */
+
+   unsigned timestamp;   /* incremented when the texture is marked dirty, to expire the tile caches */
+};
+
+
+/**
+ * Subclass of pipe_transfer; base is the first member so the softpipe_transfer() cast works.
+ */
+struct softpipe_transfer
+{
+   struct pipe_transfer base;
+
+   unsigned long offset;   /* NOTE(review): presumably the byte offset of the transfer's start within the resource data -- confirm in sp_texture.c */
+};
+
+
+
+/** cast wrappers: plain pointer downcasts from the base type to the softpipe subclass; no type check is performed */
+static INLINE struct softpipe_resource *
+softpipe_resource(struct pipe_resource *pt)
+{
+   return (struct softpipe_resource *) pt;   /* relies on base being the first member of softpipe_resource */
+}
+
+static INLINE struct softpipe_transfer *
+softpipe_transfer(struct pipe_transfer *pt)
+{
+   return (struct softpipe_transfer *) pt;   /* unchecked downcast; relies on base being the first member of softpipe_transfer */
+}
+
+
+extern void
+softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
+
+extern void
+softpipe_init_texture_funcs(struct pipe_context *pipe);
+
+
+#endif /* SP_TEXTURE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
new file mode 100644
index 0000000000..f4db6f6ef0
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -0,0 +1,457 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Render target tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "sp_tile_cache.h"
+
+
+
+/**
+ * Return the position in the cache for the tile at tile coordinates (x,y).
+ * We currently use a direct mapped cache so this is like a hash key.
+ * At some point we should investigate something more sophisticated, like
+ * a LRU replacement policy.
+ */
+#define CACHE_POS(x, y) \
+   (((x) + (y) * 5) % NUM_ENTRIES)
+
+
+
+/**
+ * Is the tile at addr in cleared state?
+ * Returns the tile's bit from bitvec: nonzero if flagged as cleared, 0 otherwise.
+ */
+static INLINE uint
+is_clear_flag_set(const uint *bitvec, union tile_address addr)
+{
+   /* one flag bit per tile, tiles numbered row-major */
+   const int pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
+   assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
+   return bitvec[pos / 32] & (1u << (pos & 31));   /* 1u: a signed 1 << 31 is undefined */
+}
+   
+
+/**
+ * Mark the tile at addr as not cleared (resets its bit in bitvec).
+ */
+static INLINE void
+clear_clear_flag(uint *bitvec, union tile_address addr)
+{
+   /* one flag bit per tile, tiles numbered row-major */
+   const int pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
+   assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
+   bitvec[pos / 32] &= ~(1u << (pos & 31));   /* 1u: a signed 1 << 31 is undefined */
+}
+   
+
+struct softpipe_tile_cache *
+sp_create_tile_cache( struct pipe_context *pipe )
+{
+   struct softpipe_tile_cache *tc;
+   uint pos;
+   int maxLevels, maxTexSize;   /* only consumed by the assert below */
+
+   /* sanity checking: make sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = pipe->screen->get_param(pipe->screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);   /* NOTE(review): only MAX_WIDTH is checked, not MAX_HEIGHT */
+
+   tc = CALLOC_STRUCT( softpipe_tile_cache );
+   if (tc) {
+      tc->pipe = pipe;
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;   /* every entry starts out empty */
+      }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+
+      /* XXX this code prevents valgrind warnings about use of uninitialized
+       * memory in programs that don't clear the surface before rendering.
+       * However, it breaks clearing in other situations (such as in
+       * progs/tests/drawbuffers, see bug 24402).
+       */
+#if 0
+      /* set flags to indicate all the tiles are cleared */
+      memset(tc->clear_flags, 255, sizeof(tc->clear_flags));
+#endif
+   }
+   return tc;   /* NULL when the allocation failed */
+}
+
+
+/** Destroy a tile cache: release its transfer (unmapping it first if mapped) and free tc; cached tiles are not written back.  NULL is a no-op. */
+void
+sp_destroy_tile_cache(struct softpipe_tile_cache *tc)
+{
+   if (!tc)
+      return;   /* sp_create_tile_cache() returns NULL on allocation failure */
+   if (tc->transfer) {
+      /* same teardown order as sp_tile_cache_set_surface(): unmap, then destroy */
+      if (tc->transfer_map)
+         tc->pipe->transfer_unmap(tc->pipe, tc->transfer);
+      tc->pipe->transfer_destroy(tc->pipe, tc->transfer);
+   }
+   FREE( tc );
+}
+
+
+/**
+ * Specify the surface to cache (ps may be NULL): drops any existing transfer and creates one covering the whole of ps.
+ */
+void
+sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
+                          struct pipe_surface *ps)
+{
+   struct pipe_context *pipe = tc->pipe;
+
+   if (tc->transfer) {
+      if (ps == tc->surface)
+         return;   /* same surface as before: keep the existing transfer */
+
+      if (tc->transfer_map) {
+         pipe->transfer_unmap(pipe, tc->transfer);   /* unmap before destroying */
+         tc->transfer_map = NULL;
+      }
+
+      pipe->transfer_destroy(pipe, tc->transfer);
+      tc->transfer = NULL;
+   }
+
+   tc->surface = ps;
+
+   if (ps) {
+      tc->transfer = pipe_get_transfer(pipe, ps->texture, ps->face,
+					   ps->level, ps->zslice,
+					   PIPE_TRANSFER_READ_WRITE |
+					   PIPE_TRANSFER_UNSYNCHRONIZED,
+					   0, 0, ps->width, ps->height);   /* the transfer is created here but not mapped */
+
+      tc->depth_stencil = (ps->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
+                           ps->format == PIPE_FORMAT_Z24X8_UNORM ||
+                           ps->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM ||
+                           ps->format == PIPE_FORMAT_X8Z24_UNORM ||
+                           ps->format == PIPE_FORMAT_Z16_UNORM ||
+                           ps->format == PIPE_FORMAT_Z32_UNORM ||
+                           ps->format == PIPE_FORMAT_S8_USCALED);   /* selects the raw vs. rgba tile paths used elsewhere in this file */
+   }
+}
+
+
+/**
+ * Return the surface being cached, as last given to sp_tile_cache_set_surface() (may be NULL).
+ */
+struct pipe_surface *
+sp_tile_cache_get_surface(struct softpipe_tile_cache *tc)
+{
+   return tc->surface;
+}
+
+
+void
+sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc)
+{
+   if (!tc->transfer || tc->transfer_map) return;   /* nothing to map, or already mapped */
+   tc->transfer_map = tc->pipe->transfer_map(tc->pipe, tc->transfer);
+}
+
+
+void
+sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc)
+{
+   if (tc->transfer_map == NULL)
+      return;   /* not mapped: nothing to undo */
+   tc->pipe->transfer_unmap(tc->pipe, tc->transfer);
+   tc->transfer_map = NULL;
+}
+
+
+/**
+ * Fill a tile's float RGBA storage with one clear color (format is not consulted).
+ */
+static void
+clear_tile_rgba(struct softpipe_cached_tile *tile,
+                enum pipe_format format,
+                const float clear_value[4])
+{
+   uint row, col, chan;
+   int nonzero = 0;
+
+   for (chan = 0; chan < 4; chan++)
+      nonzero |= (clear_value[chan] != 0.0);
+
+   if (!nonzero) {
+      /* all four channels compare equal to zero: a single memset does it */
+      memset(tile->data.color, 0, sizeof(tile->data.color));
+      return;
+   }
+   for (row = 0; row < TILE_SIZE; row++) {
+      for (col = 0; col < TILE_SIZE; col++) {
+         for (chan = 0; chan < 4; chan++)
+            tile->data.color[row][col][chan] = clear_value[chan];
+      }
+   }
+}
+
+
+/**
+ * Set a tile to a solid packed value.
+ * The pixel size comes from util_format_get_blocksize(format); sizes of
+ * 1, 2 and 4 bytes are handled, anything else trips the assert.
+ */
+static void
+clear_tile(struct softpipe_cached_tile *tile,
+           enum pipe_format format,
+           uint clear_value)
+{
+   const uint bytes_per_pixel = util_format_get_blocksize(format);
+   uint row, col;
+
+   if (bytes_per_pixel == 1 ||
+       (clear_value == 0 &&
+        (bytes_per_pixel == 2 || bytes_per_pixel == 4))) {
+      /* byte-sized pixels, or an all-zero wider value: a flat memset works */
+      memset(tile->data.any, clear_value,
+             bytes_per_pixel * TILE_SIZE * TILE_SIZE);
+      return;
+   }
+
+   if (bytes_per_pixel == 2) {
+      const ushort value16 = (ushort) clear_value;
+      for (row = 0; row < TILE_SIZE; row++) {
+         for (col = 0; col < TILE_SIZE; col++) {
+            tile->data.depth16[row][col] = value16;
+         }
+      }
+   }
+   else if (bytes_per_pixel == 4) {
+      for (row = 0; row < TILE_SIZE; row++) {
+         for (col = 0; col < TILE_SIZE; col++) {
+            tile->data.color32[row][col] = clear_value;
+         }
+      }
+   }
+   else {
+      /* no other pixel size is supported here */
+      assert(0);
+   }
+}
+
+
+/**
+ * Actually clear the tiles which were flagged as being in a clear state, then reset all flags.  Requires tc->transfer != NULL.
+ */
+static void
+sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc)
+{
+   struct pipe_transfer *pt = tc->transfer;
+   const uint w = tc->transfer->box.width;    /* tc->transfer is dereferenced unconditionally */
+   const uint h = tc->transfer->box.height;
+   uint x, y;
+   uint numCleared = 0;   /* only reported by the disabled debug_printf below */
+
+   assert(pt->resource);
+   /* clear the scratch tile to the clear value */
+   clear_tile(&tc->tile, pt->resource->format, tc->clear_val);
+
+   /* push the tile to all positions marked as clear */
+   for (y = 0; y < h; y += TILE_SIZE) {
+      for (x = 0; x < w; x += TILE_SIZE) {
+         union tile_address addr = tile_address(x, y);
+
+         if (is_clear_flag_set(tc->clear_flags, addr)) {
+            pipe_put_tile_raw(tc->pipe,
+                              pt,
+                              x, y, TILE_SIZE, TILE_SIZE,
+                              tc->tile.data.color32, 0/*STRIDE*/);   /* color32 shares the data union that clear_tile() filled */
+
+            numCleared++;
+         }
+      }
+   }
+
+   /* reset all clear flags to zero */
+   memset(tc->clear_flags, 0, sizeof(tc->clear_flags));
+
+#if 0
+   debug_printf("num cleared: %u\n", numCleared);
+#endif
+}
+
+
+/**
+ * Flush the tile cache: write all valid cached tiles back to the transfer and mark them empty.
+ * any tiles "flagged" as cleared will be "really" cleared.  Does nothing when there is no transfer.
+ */
+void
+sp_flush_tile_cache(struct softpipe_tile_cache *tc)
+{
+   struct pipe_transfer *pt = tc->transfer;
+   int inuse = 0, pos;   /* inuse is only reported by the disabled debug_printf below */
+
+   if (pt) {
+      /* caching a drawing transfer */
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         struct softpipe_cached_tile *tile = tc->entries + pos;
+         if (!tile->addr.bits.invalid) {
+            if (tc->depth_stencil) {
+               pipe_put_tile_raw(tc->pipe, pt,
+                                 tile->addr.bits.x * TILE_SIZE, 
+                                 tile->addr.bits.y * TILE_SIZE, 
+                                 TILE_SIZE, TILE_SIZE,
+                                 tile->data.depth32, 0/*STRIDE*/);   /* depth/stencil tiles are stored raw */
+            }
+            else {
+               pipe_put_tile_rgba(tc->pipe, pt,
+                                  tile->addr.bits.x * TILE_SIZE, 
+                                  tile->addr.bits.y * TILE_SIZE, 
+                                  TILE_SIZE, TILE_SIZE,
+                                  (float *) tile->data.color);   /* color tiles are stored as float RGBA */
+            }
+            tile->addr.bits.invalid = 1;  /* mark as empty */
+            inuse++;
+         }
+      }
+
+      sp_tile_cache_flush_clear(tc);   /* runs after the write-back loop above */
+   }
+
+#if 0
+   debug_printf("flushed tiles in use: %d\n", inuse);
+#endif
+}
+
+
+/**
+ * Get a tile from the cache, evicting and refilling the direct-mapped slot on a miss.
+ * \param addr  tile address (tile-granular x/y), e.g. as built by tile_address()
+ */
+struct softpipe_cached_tile *
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr )
+{
+   struct pipe_transfer *pt = tc->transfer;
+   
+   /* cache pos/entry: */
+   const int pos = CACHE_POS(addr.bits.x,
+                             addr.bits.y);
+   struct softpipe_cached_tile *tile = tc->entries + pos;
+
+   if (addr.value != tile->addr.value) {   /* miss: the slot holds another tile or is invalid */
+
+      assert(pt->resource);
+      if (tile->addr.bits.invalid == 0) {
+         /* put dirty tile back in framebuffer */
+         if (tc->depth_stencil) {
+            pipe_put_tile_raw(tc->pipe, pt,
+                              tile->addr.bits.x * TILE_SIZE,
+                              tile->addr.bits.y * TILE_SIZE,
+                              TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
+         }
+         else {
+            pipe_put_tile_rgba(tc->pipe, pt,
+                               tile->addr.bits.x * TILE_SIZE,
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
+                               (float *) tile->data.color);
+         }
+      }
+
+      tile->addr = addr;   /* the slot now belongs to the requested tile */
+
+      if (is_clear_flag_set(tc->clear_flags, addr)) {
+         /* don't get tile from framebuffer, just clear it */
+         if (tc->depth_stencil) {
+            clear_tile(tile, pt->resource->format, tc->clear_val);
+         }
+         else {
+            clear_tile_rgba(tile, pt->resource->format, tc->clear_color);
+         }
+         clear_clear_flag(tc->clear_flags, addr);   /* the cached copy now carries the cleared contents */
+      }
+      else {
+         /* get new tile data from transfer */
+         if (tc->depth_stencil) {
+            pipe_get_tile_raw(tc->pipe, pt,
+                              tile->addr.bits.x * TILE_SIZE, 
+                              tile->addr.bits.y * TILE_SIZE, 
+                              TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
+         }
+         else {
+            pipe_get_tile_rgba(tc->pipe, pt,
+                               tile->addr.bits.x * TILE_SIZE, 
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
+                               (float *) tile->data.color);
+         }
+      }
+   }
+
+   tc->last_tile = tile;   /* consulted first by sp_get_cached_tile() */
+   return tile;
+}
+
+
+
+
+
+/**
+ * Record a whole-surface clear without writing to the surface.
+ * The clear color/value are saved, every tile's clear flag is set and all
+ * cache entries are invalidated (their contents are dropped, not written
+ * back); sp_find_cached_tile() then clears tiles lazily on lookup.
+ */
+void
+sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
+                    uint clearValue)
+{
+   uint i;
+
+   /* drop whatever the cache currently holds */
+   for (i = 0; i < NUM_ENTRIES; i++) {
+      tc->entries[i].addr.bits.invalid = 1;
+   }
+
+   /* 255 == all bits set: every tile is now flagged as cleared */
+   memset(tc->clear_flags, 255, sizeof(tc->clear_flags));
+
+   /* remember both forms of the clear value */
+   tc->clear_val = clearValue;
+   for (i = 0; i < 4; i++) {
+      tc->clear_color[i] = rgba[i];
+   }
+}
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
new file mode 100644
index 0000000000..e03d53eb24
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -0,0 +1,160 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TILE_CACHE_H
+#define SP_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct softpipe_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+/* Packed address of one cache tile.  If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tile_address {
+   struct {
+      unsigned x:6;             /* tile column (pixel x / TILE_SIZE); 6 bits cover 4096 / TILE_SIZE */
+      unsigned y:6;             /* tile row (pixel y / TILE_SIZE); 6 bits cover 4096 / TILE_SIZE */
+      unsigned invalid:1;       /* 1 = the cache entry holds no tile */
+      unsigned pad:19;          /* 6 + 6 + 1 + 19 = 32 bits */
+   } bits;
+   unsigned value;              /* all fields at once, so two addresses compare with a single == */
+};
+
+
+struct softpipe_cached_tile
+{
+   union tile_address addr;   /* which tile this entry holds; addr.bits.invalid marks it empty */
+   union {
+      float color[TILE_SIZE][TILE_SIZE][4];   /* float RGBA, used for non depth/stencil surfaces */
+      uint color32[TILE_SIZE][TILE_SIZE];     /* 4-byte packed pixels, written by clear_tile() */
+      uint depth32[TILE_SIZE][TILE_SIZE];     /* raw view passed to pipe_get/put_tile_raw for depth/stencil */
+      ushort depth16[TILE_SIZE][TILE_SIZE];   /* 2-byte pixels, written by clear_tile() */
+      ubyte stencil8[TILE_SIZE][TILE_SIZE];
+      ubyte any[1];                           /* byte view of the storage, used by clear_tile()'s memset */
+   } data;
+};
+
+#define NUM_ENTRIES 50
+
+
+/** XXX move these */
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+struct softpipe_tile_cache
+{
+   struct pipe_context *pipe;   /* context used for all transfer calls */
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;   /* whole-surface transfer, created by sp_tile_cache_set_surface() */
+   void *transfer_map;   /* non-NULL while the transfer is mapped */
+
+   struct softpipe_cached_tile entries[NUM_ENTRIES];   /* direct-mapped, indexed by CACHE_POS() */
+   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];   /* one "cleared" bit per tile */
+   float clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
+
+   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
+
+   struct softpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct softpipe_tile_cache *
+sp_create_tile_cache( struct pipe_context *pipe );
+
+extern void
+sp_destroy_tile_cache(struct softpipe_tile_cache *tc);
+
+extern void
+sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
+                          struct pipe_surface *sps);
+
+extern struct pipe_surface *
+sp_tile_cache_get_surface(struct softpipe_tile_cache *tc);
+
+extern void
+sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc);
+
+extern void
+sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc);
+
+extern void
+sp_flush_tile_cache(struct softpipe_tile_cache *tc);
+
+extern void
+sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
+                    uint clearValue);
+
+extern struct softpipe_cached_tile *
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr );
+
+
+static INLINE union tile_address
+tile_address( unsigned x,
+              unsigned y )
+{
+   union tile_address addr;   /* built from pixel coordinates x, y */
+
+   addr.value = 0;              /* zeroes invalid and pad as well */
+   addr.bits.x = x / TILE_SIZE; /* pixel -> tile column */
+   addr.bits.y = y / TILE_SIZE; /* pixel -> tile row */
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup; x, y are pixel coordinates.
+ */
+static INLINE struct softpipe_cached_tile *
+sp_get_cached_tile(struct softpipe_tile_cache *tc, 
+                   int x, int y )
+{
+   union tile_address addr = tile_address( x, y );   /* int x, y are passed to unsigned parameters */
+
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;   /* fast path: same tile as the previous lookup */
+
+   return sp_find_cached_tile( tc, addr );   /* slow path: direct-mapped lookup, may evict and refill */
+}
+
+
+
+
+#endif /* SP_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/svga/Makefile b/src/gallium/drivers/svga/Makefile
new file mode 100644
index 0000000000..27287793bd
--- /dev/null
+++ b/src/gallium/drivers/svga/Makefile
@@ -0,0 +1,65 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = svga
+
+C_SOURCES = \
+	svgadump/svga_shader_dump.c \
+	svgadump/svga_shader_op.c \
+	svgadump/svga_dump.c \
+	svga_cmd.c \
+	svga_context.c \
+	svga_draw.c \
+	svga_draw_arrays.c \
+	svga_draw_elements.c \
+	svga_pipe_blend.c \
+	svga_pipe_blit.c \
+	svga_pipe_clear.c \
+	svga_pipe_constants.c \
+	svga_pipe_depthstencil.c \
+	svga_pipe_draw.c \
+	svga_pipe_flush.c \
+	svga_pipe_fs.c \
+	svga_pipe_misc.c \
+	svga_pipe_query.c \
+	svga_pipe_rasterizer.c \
+	svga_pipe_sampler.c \
+	svga_pipe_vertex.c \
+	svga_pipe_vs.c \
+	svga_screen.c \
+	svga_screen_cache.c \
+	svga_state.c \
+	svga_state_need_swtnl.c \
+	svga_state_constants.c \
+	svga_state_framebuffer.c \
+	svga_state_rss.c \
+	svga_state_tss.c \
+	svga_state_vdecl.c \
+	svga_state_fs.c \
+	svga_state_vs.c \
+	svga_swtnl_backend.c \
+	svga_swtnl_draw.c \
+	svga_swtnl_state.c \
+	svga_tgsi.c \
+	svga_tgsi_decl_sm20.c \
+	svga_tgsi_decl_sm30.c \
+	svga_tgsi_insn.c \
+	svga_sampler_view.c \
+	svga_surface.c \
+	svga_resource.c \
+	svga_resource_texture.c \
+	svga_resource_buffer.c \
+	svga_resource_buffer_upload.c 
+
+
+LIBRARY_INCLUDES = \
+	-I$(TOP)/src/gallium/drivers/svga/include
+
+# With linux-debug we get lots of warnings; filter out the bad flags.
+CFLAGS := $(filter-out -ansi -pedantic,$(CFLAGS))
+
+LIBRARY_DEFINES = \
+	-std=gnu99 -fvisibility=hidden \
+	-DHAVE_STDINT_H -DHAVE_SYS_TYPES_H
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript
new file mode 100644
index 0000000000..12ce4732d1
--- /dev/null
+++ b/src/gallium/drivers/svga/SConscript
@@ -0,0 +1,76 @@
+Import('*')
+
+env = env.Clone()
+
+if env['platform'] in ['linux']:  # only applied when the platform is 'linux'
+	env.Append(CCFLAGS = ['-fvisibility=hidden'])  # same flag the svga Makefile passes
+
+if env['gcc']:  # these defines are only added for gcc builds
+	env.Append(CPPDEFINES = [
+		'HAVE_STDINT_H', 
+		'HAVE_SYS_TYPES_H',
+	])
+	
+env.Prepend(CPPPATH = [
+	'include',  # the bundled SVGA DDK headers (see include/README)
+])
+
+env.Append(CPPDEFINES = [  # currently an empty list: appends nothing
+])
+
+sources = [
+    'svga_cmd.c',
+    'svga_context.c',
+    'svga_draw.c',
+    'svga_draw_arrays.c',
+    'svga_draw_elements.c',
+    'svga_pipe_blend.c',
+    'svga_pipe_blit.c',
+    'svga_pipe_clear.c',
+    'svga_pipe_constants.c',
+    'svga_pipe_depthstencil.c',
+    'svga_pipe_draw.c',
+    'svga_pipe_flush.c',
+    'svga_pipe_fs.c',
+    'svga_pipe_misc.c',
+    'svga_pipe_query.c',
+    'svga_pipe_rasterizer.c',
+    'svga_pipe_sampler.c',
+    'svga_pipe_vertex.c',
+    'svga_pipe_vs.c',
+    'svga_resource.c',
+    'svga_resource_buffer.c',
+    'svga_resource_buffer_upload.c',
+    'svga_resource_texture.c',
+    'svga_sampler_view.c',
+    'svga_screen.c',
+    'svga_screen_cache.c',
+    'svga_state.c',
+    'svga_state_constants.c',
+    'svga_state_framebuffer.c',
+    'svga_state_need_swtnl.c',
+    'svga_state_rss.c',
+    'svga_state_tss.c',
+    'svga_state_vdecl.c',
+    'svga_state_fs.c',
+    'svga_state_vs.c',
+    'svga_surface.c',
+    'svga_swtnl_backend.c',
+    'svga_swtnl_draw.c',
+    'svga_swtnl_state.c',
+    'svga_tgsi.c',
+    'svga_tgsi_decl_sm20.c',
+    'svga_tgsi_decl_sm30.c',
+    'svga_tgsi_insn.c',
+    
+    'svgadump/svga_dump.c',
+    'svgadump/svga_shader_dump.c',
+    'svgadump/svga_shader_op.c',
+]
+
+svga = env.ConvenienceLibrary(
+	target = 'svga',
+	source = sources,
+)
+
+Export('svga')
diff --git a/src/gallium/drivers/svga/include/README b/src/gallium/drivers/svga/include/README
new file mode 100644
index 0000000000..a0b8916104
--- /dev/null
+++ b/src/gallium/drivers/svga/include/README
@@ -0,0 +1,3 @@
+This directory contains the headers from the VMware SVGA Device Developer Kit:
+
+   https://vmware-svga.svn.sourceforge.net/svnroot/vmware-svga/trunk/lib/vmware/
diff --git a/src/gallium/drivers/svga/include/svga3d_caps.h b/src/gallium/drivers/svga/include/svga3d_caps.h
new file mode 100644
index 0000000000..714ce9f45f
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga3d_caps.h
@@ -0,0 +1,139 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga3d_caps.h --
+ *
+ *       Definitions for SVGA3D hardware capabilities.  Capabilities
+ *       are used to query for optional rendering features during
+ *       driver initialization. The capability data is stored as very
+ *       basic key/value dictionary within the "FIFO register" memory
+ *       area at the beginning of BAR2.
+ *
+ *       Note that these definitions are only for 3D capabilities.
+ *       The SVGA device also has "device capabilities" and "FIFO
+ *       capabilities", which are non-3D-specific and are stored as
+ *       bitfields rather than key/value pairs.
+ */
+
+#ifndef _SVGA3D_CAPS_H_
+#define _SVGA3D_CAPS_H_
+
+#define SVGA_FIFO_3D_CAPS_SIZE   (SVGA_FIFO_3D_CAPS_LAST - \
+                                  SVGA_FIFO_3D_CAPS + 1)
+
+
+/*
+ * SVGA3dCapsRecordType
+ *
+ *    Record types that can be found in the caps block.
+ *    Related record types are grouped together numerically so that
+ *    SVGA3dCaps_FindRecord() can be applied on a range of record
+ *    types.
+ */
+
+typedef enum {
+   SVGA3DCAPS_RECORD_UNKNOWN        = 0,
+   SVGA3DCAPS_RECORD_DEVCAPS_MIN    = 0x100,   /* lowest type value in the DEVCAPS group */
+   SVGA3DCAPS_RECORD_DEVCAPS        = 0x100,   /* same value as DEVCAPS_MIN */
+   SVGA3DCAPS_RECORD_DEVCAPS_MAX    = 0x1ff,   /* highest type value in the DEVCAPS group */
+} SVGA3dCapsRecordType;
+
+
+/*
+ * SVGA3dCapsRecordHeader
+ *
+ *    Header field leading each caps block record. Contains the offset (in
+ *    register words, NOT bytes) to the next caps block record (or the end
+ *    of caps block records which will be a zero word) and the record type
+ *    as defined above.
+ */
+
+typedef
+struct SVGA3dCapsRecordHeader {
+   uint32 length;               /* offset to the next record, in 32-bit words; SVGA3dCaps_FindRecord() advances by it */
+   SVGA3dCapsRecordType type;   /* record type, compared against the requested range */
+}
+SVGA3dCapsRecordHeader;
+
+
+/*
+ * SVGA3dCapsRecord
+ *
+ *    Caps block record; "data" is a placeholder for the actual data structure
+ *    contained within the record; for example a record containing a FOOBAR
+ *    structure would be of size "sizeof(SVGA3dCapsRecordHeader) +
+ *    sizeof(FOOBAR)".
+ */
+
+typedef
+struct SVGA3dCapsRecord {
+   SVGA3dCapsRecordHeader header;
+   uint32 data[1];   /* placeholder only: the real payload size depends on the record */
+}
+SVGA3dCapsRecord;
+
+
+typedef uint32 SVGA3dCapPair[2];
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3dCaps_FindRecord
+ *
+ *    Finds the record with the highest-valued type within the given range
+ *    (recordTypeMin..recordTypeMax, inclusive) in the caps block.
+ *
+ *    Result: pointer to found record, or NULL if not found.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE SVGA3dCapsRecord *
+SVGA3dCaps_FindRecord(const uint32 *capsBlock,
+                      SVGA3dCapsRecordType recordTypeMin,
+                      SVGA3dCapsRecordType recordTypeMax)
+{
+   SVGA3dCapsRecord *record, *found = NULL;
+   uint32 offset;   /* index into capsBlock, in 32-bit words */
+
+   /*
+    * Search linearly through the caps block records for the specified type.
+    */
+   for (offset = 0; capsBlock[offset] != 0; offset += capsBlock[offset]) {   /* a zero length word ends the walk; no other bound is checked */
+      record = (SVGA3dCapsRecord *) (capsBlock + offset);   /* note: the cast drops the const of capsBlock */
+      if ((record->header.type >= recordTypeMin) &&
+          (record->header.type <= recordTypeMax) &&
+          (!found || (record->header.type > found->header.type))) {   /* strict >: the first record of a given type wins */
+         found = record;
+      }
+   }
+
+   return found;
+}
+
+
+#endif // _SVGA3D_CAPS_H_
diff --git a/src/gallium/drivers/svga/include/svga3d_reg.h b/src/gallium/drivers/svga/include/svga3d_reg.h
new file mode 100644
index 0000000000..77cb453310
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga3d_reg.h
@@ -0,0 +1,1793 @@
+/**********************************************************
+ * Copyright 1998-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga3d_reg.h --
+ *
+ *       SVGA 3D hardware definitions
+ */
+
+#ifndef _SVGA3D_REG_H_
+#define _SVGA3D_REG_H_
+
+#include "svga_reg.h"
+
+
+/*
+ * 3D Hardware Version
+ *
+ *   The hardware version is stored in the SVGA_FIFO_3D_HWVERSION fifo
+ *   register.  It is set by the host and read by the guest.  This lets
+ *   us make new guest drivers which are backwards-compatible with old
+ *   SVGA hardware revisions.  It does not let us support old guest
+ *   drivers.  Good enough for now.
+ *   Encoding: major number shifted left 16 bits, minor in the low 8 bits.
+ */
+
+#define SVGA3D_MAKE_HWVERSION(major, minor)      (((major) << 16) | ((minor) & 0xFF))
+#define SVGA3D_MAJOR_HWVERSION(version)          ((version) >> 16)
+#define SVGA3D_MINOR_HWVERSION(version)          ((version) & 0xFF)
+
+typedef enum {   /* values built with SVGA3D_MAKE_HWVERSION(major, minor) */
+   SVGA3D_HWVERSION_WS5_RC1   = SVGA3D_MAKE_HWVERSION(0, 1),
+   SVGA3D_HWVERSION_WS5_RC2   = SVGA3D_MAKE_HWVERSION(0, 2),
+   SVGA3D_HWVERSION_WS51_RC1  = SVGA3D_MAKE_HWVERSION(0, 3),
+   SVGA3D_HWVERSION_WS6_B1    = SVGA3D_MAKE_HWVERSION(1, 1),
+   SVGA3D_HWVERSION_FUSION_11 = SVGA3D_MAKE_HWVERSION(1, 4),
+   SVGA3D_HWVERSION_WS65_B1   = SVGA3D_MAKE_HWVERSION(2, 0),
+   SVGA3D_HWVERSION_CURRENT   = SVGA3D_HWVERSION_WS65_B1,   /* alias of the highest version listed */
+} SVGA3dHardwareVersion;
+
+/*
+ * Generic Types
+ */
+
+typedef uint32 SVGA3dBool; /* 32-bit Bool definition */
+#define SVGA3D_NUM_CLIPPLANES                   6   /* matches the six SVGA3D_CLIPPLANE_* bits */
+#define SVGA3D_MAX_SIMULTANEOUS_RENDER_TARGETS  8   /* same count as SVGA3D_RT_COLOR0..7 */
+
+
+/*
+ * Surface formats.
+ *
+ * If you modify this list, be sure to keep GLUtil.c in sync. It
+ * includes the internal format definition of each surface in
+ * GLUtil_ConvertSurfaceFormat, and it contains a table of
+ * human-readable names in GLUtil_GetFormatName.
+ */
+
+typedef enum SVGA3dSurfaceFormat {
+   SVGA3D_FORMAT_INVALID = 0,
+
+   SVGA3D_X8R8G8B8       = 1,
+   SVGA3D_A8R8G8B8       = 2,
+
+   SVGA3D_R5G6B5         = 3,
+   SVGA3D_X1R5G5B5       = 4,
+   SVGA3D_A1R5G5B5       = 5,
+   SVGA3D_A4R4G4B4       = 6,
+
+   SVGA3D_Z_D32          = 7,
+   SVGA3D_Z_D16          = 8,
+   SVGA3D_Z_D24S8        = 9,
+   SVGA3D_Z_D15S1        = 10,
+
+   SVGA3D_LUMINANCE8            = 11,
+   SVGA3D_LUMINANCE4_ALPHA4     = 12,
+   SVGA3D_LUMINANCE16           = 13,
+   SVGA3D_LUMINANCE8_ALPHA8     = 14,
+
+   SVGA3D_DXT1           = 15,
+   SVGA3D_DXT2           = 16,
+   SVGA3D_DXT3           = 17,
+   SVGA3D_DXT4           = 18,
+   SVGA3D_DXT5           = 19,
+
+   SVGA3D_BUMPU8V8       = 20,
+   SVGA3D_BUMPL6V5U5     = 21,
+   SVGA3D_BUMPX8L8V8U8   = 22,
+   SVGA3D_BUMPL8V8U8     = 23,
+
+   SVGA3D_ARGB_S10E5     = 24,   /* 16-bit floating-point ARGB */
+   SVGA3D_ARGB_S23E8     = 25,   /* 32-bit floating-point ARGB */
+
+   SVGA3D_A2R10G10B10    = 26,
+
+   /* signed formats */
+   SVGA3D_V8U8           = 27,
+   SVGA3D_Q8W8V8U8       = 28,
+   SVGA3D_CxV8U8         = 29,
+
+   /* mixed formats */
+   SVGA3D_X8L8V8U8       = 30,
+   SVGA3D_A2W10V10U10    = 31,
+
+   SVGA3D_ALPHA8         = 32,
+
+   /* Single- and dual-component floating point formats */
+   SVGA3D_R_S10E5        = 33,
+   SVGA3D_R_S23E8        = 34,
+   SVGA3D_RG_S10E5       = 35,
+   SVGA3D_RG_S23E8       = 36,
+
+   /*
+    * Any surface can be used as a buffer object, but SVGA3D_BUFFER is
+    * the most efficient format to use when creating new surfaces
+    * expressly for index or vertex data.
+    */
+   SVGA3D_BUFFER         = 37,
+
+   SVGA3D_Z_D24X8        = 38,
+
+   SVGA3D_V16U16         = 39,
+
+   SVGA3D_G16R16         = 40,
+   SVGA3D_A16B16G16R16   = 41,
+
+   /* Packed Video formats */
+   SVGA3D_UYVY           = 42,
+   SVGA3D_YUY2           = 43,
+
+   SVGA3D_FORMAT_MAX             /* 44: one past the last format, not a format itself */
+} SVGA3dSurfaceFormat;
+
+typedef uint32 SVGA3dColor; /* a, r, g, b packed into one 32-bit word */
+
+/*
+ * These match the D3DFORMAT_OP definitions used by Direct3D. We need
+ * them so that we can query the host for what the supported surface
+ * operations are (when we're using the D3D backend, in particular),
+ * and so we can send those operations to the guest.
+ */
+typedef enum {
+   SVGA3DFORMAT_OP_TEXTURE                               = 0x00000001,
+   SVGA3DFORMAT_OP_VOLUMETEXTURE                         = 0x00000002,
+   SVGA3DFORMAT_OP_CUBETEXTURE                           = 0x00000004,
+   SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET                = 0x00000008,
+   SVGA3DFORMAT_OP_SAME_FORMAT_RENDERTARGET              = 0x00000010,
+   SVGA3DFORMAT_OP_ZSTENCIL                              = 0x00000040,
+   SVGA3DFORMAT_OP_ZSTENCIL_WITH_ARBITRARY_COLOR_DEPTH   = 0x00000080,
+
+/*
+ * This format can be used as a render target if the current display mode
+ * is the same depth if the alpha channel is ignored. e.g. if the device
+ * can render to A8R8G8B8 when the display mode is X8R8G8B8, then the
+ * format op list entry for A8R8G8B8 should have this cap.
+ */
+   SVGA3DFORMAT_OP_SAME_FORMAT_UP_TO_ALPHA_RENDERTARGET  = 0x00000100,
+
+/*
+ * This format contains DirectDraw support (including Flip).  This flag
+ * should not be set on alpha formats.
+ */
+   SVGA3DFORMAT_OP_DISPLAYMODE                           = 0x00000400,
+
+/*
+ * The rasterizer can support some level of Direct3D support in this format
+ * and implies that the driver can create a Context in this mode (for some
+ * render target format).  When this flag is set, the SVGA3DFORMAT_OP_DISPLAYMODE
+ * flag must also be set.
+ */
+   SVGA3DFORMAT_OP_3DACCELERATION                        = 0x00000800,
+
+/*
+ * This is set for a private format when the driver has put the bpp in
+ * the structure.
+ */
+   SVGA3DFORMAT_OP_PIXELSIZE                             = 0x00001000,
+
+/*
+ * Indicates that this format can be converted to any RGB format for which
+ * SVGA3DFORMAT_OP_MEMBEROFGROUP_ARGB is specified
+ */
+   SVGA3DFORMAT_OP_CONVERT_TO_ARGB                       = 0x00002000,
+
+/*
+ * Indicates that this format can be used to create offscreen plain surfaces.
+ */
+   SVGA3DFORMAT_OP_OFFSCREENPLAIN                        = 0x00004000,
+
+/*
+ * Indicates that this format can be read as an SRGB texture (meaning that the
+ * sampler will linearize the looked up data)
+ */
+   SVGA3DFORMAT_OP_SRGBREAD                              = 0x00008000,
+
+/*
+ * Indicates that this format can be used in the bumpmap instructions
+ */
+   SVGA3DFORMAT_OP_BUMPMAP                               = 0x00010000,
+
+/*
+ * Indicates that this format can be sampled by the displacement map sampler
+ */
+   SVGA3DFORMAT_OP_DMAP                                  = 0x00020000,
+
+/*
+ * Indicates that this format cannot be used with texture filtering
+ */
+   SVGA3DFORMAT_OP_NOFILTER                              = 0x00040000,
+
+/*
+ * Indicates that format conversions are supported to this RGB format if
+ * SVGA3DFORMAT_OP_CONVERT_TO_ARGB is specified in the source format.
+ */
+   SVGA3DFORMAT_OP_MEMBEROFGROUP_ARGB                    = 0x00080000,
+
+/*
+ * Indicates that this format can be written as an SRGB target (meaning that the
+ * pixel pipe will DE-linearize data on output to format)
+ */
+   SVGA3DFORMAT_OP_SRGBWRITE                             = 0x00100000,
+
+/*
+ * Indicates that this format cannot be used with alpha blending
+ */
+   SVGA3DFORMAT_OP_NOALPHABLEND                          = 0x00200000,
+
+/*
+ * Indicates that the device can auto-generate sublevels for resources
+ * of this format
+ */
+   SVGA3DFORMAT_OP_AUTOGENMIPMAP                         = 0x00400000,
+
+/*
+ * Indicates that this format can be used by vertex texture sampler
+ */
+   SVGA3DFORMAT_OP_VERTEXTEXTURE                         = 0x00800000,
+
+/*
+ * Indicates that this format supports neither texture coordinate wrap
+ * modes, nor mipmapping
+ */
+   SVGA3DFORMAT_OP_NOTEXCOORDWRAPNORMIP                  = 0x01000000
+} SVGA3dFormatOp;
+
+/*
+ * This structure is a conversion of SVGA3DFORMAT_OP_*.
+ * Entries must be located at the same position.
+ */
+typedef union {   /* mask notes below assume LSB-first bit-field allocation -- TODO confirm per compiler */
+   uint32 value;  /* all flags viewed as one SVGA3DFORMAT_OP_* mask */
+   struct {
+      uint32 texture : 1;                 /* 0x00000001 */
+      uint32 volumeTexture : 1;           /* 0x00000002 */
+      uint32 cubeTexture : 1;             /* 0x00000004 */
+      uint32 offscreenRenderTarget : 1;   /* 0x00000008 */
+      uint32 sameFormatRenderTarget : 1;  /* 0x00000010 */
+      uint32 unknown1 : 1;                /* 0x00000020: no SVGA3DFORMAT_OP_* value defined */
+      uint32 zStencil : 1;                /* 0x00000040 */
+      uint32 zStencilArbitraryDepth : 1;  /* 0x00000080 */
+      uint32 sameFormatUpToAlpha : 1;     /* 0x00000100 */
+      uint32 unknown2 : 1;                /* 0x00000200: no SVGA3DFORMAT_OP_* value defined */
+      uint32 displayMode : 1;             /* 0x00000400 */
+      uint32 acceleration3d : 1;          /* 0x00000800 */
+      uint32 pixelSize : 1;               /* 0x00001000 */
+      uint32 convertToARGB : 1;           /* 0x00002000 */
+      uint32 offscreenPlain : 1;          /* 0x00004000 */
+      uint32 sRGBRead : 1;                /* 0x00008000 */
+      uint32 bumpMap : 1;                 /* 0x00010000 */
+      uint32 dmap : 1;                    /* 0x00020000 */
+      uint32 noFilter : 1;                /* 0x00040000 */
+      uint32 memberOfGroupARGB : 1;       /* 0x00080000 */
+      uint32 sRGBWrite : 1;               /* 0x00100000 */
+      uint32 noAlphaBlend : 1;            /* 0x00200000 */
+      uint32 autoGenMipMap : 1;           /* 0x00400000 */
+      uint32 vertexTexture : 1;           /* 0x00800000 */
+      uint32 noTexCoordWrapNorMip : 1;    /* 0x01000000 */
+   };
+} SVGA3dSurfaceFormatCaps;
+
+/*
+ * SVGA_3D_CMD_SETRENDERSTATE Types.  All value types
+ * must fit in a uint32.
+ */
+
+typedef enum {
+   SVGA3D_RS_INVALID                   = 0,
+   SVGA3D_RS_ZENABLE                   = 1,     /* SVGA3dBool */
+   SVGA3D_RS_ZWRITEENABLE              = 2,     /* SVGA3dBool */
+   SVGA3D_RS_ALPHATESTENABLE           = 3,     /* SVGA3dBool */
+   SVGA3D_RS_DITHERENABLE              = 4,     /* SVGA3dBool */
+   SVGA3D_RS_BLENDENABLE               = 5,     /* SVGA3dBool */
+   SVGA3D_RS_FOGENABLE                 = 6,     /* SVGA3dBool */
+   SVGA3D_RS_SPECULARENABLE            = 7,     /* SVGA3dBool */
+   SVGA3D_RS_STENCILENABLE             = 8,     /* SVGA3dBool */
+   SVGA3D_RS_LIGHTINGENABLE            = 9,     /* SVGA3dBool */
+   SVGA3D_RS_NORMALIZENORMALS          = 10,    /* SVGA3dBool */
+   SVGA3D_RS_POINTSPRITEENABLE         = 11,    /* SVGA3dBool */
+   SVGA3D_RS_POINTSCALEENABLE          = 12,    /* SVGA3dBool */
+   SVGA3D_RS_STENCILREF                = 13,    /* uint32 */
+   SVGA3D_RS_STENCILMASK               = 14,    /* uint32 */
+   SVGA3D_RS_STENCILWRITEMASK          = 15,    /* uint32 */
+   SVGA3D_RS_FOGSTART                  = 16,    /* float */
+   SVGA3D_RS_FOGEND                    = 17,    /* float */
+   SVGA3D_RS_FOGDENSITY                = 18,    /* float */
+   SVGA3D_RS_POINTSIZE                 = 19,    /* float */
+   SVGA3D_RS_POINTSIZEMIN              = 20,    /* float */
+   SVGA3D_RS_POINTSIZEMAX              = 21,    /* float */
+   SVGA3D_RS_POINTSCALE_A              = 22,    /* float */
+   SVGA3D_RS_POINTSCALE_B              = 23,    /* float */
+   SVGA3D_RS_POINTSCALE_C              = 24,    /* float */
+   SVGA3D_RS_FOGCOLOR                  = 25,    /* SVGA3dColor */
+   SVGA3D_RS_AMBIENT                   = 26,    /* SVGA3dColor */
+   SVGA3D_RS_CLIPPLANEENABLE           = 27,    /* SVGA3dClipPlanes */
+   SVGA3D_RS_FOGMODE                   = 28,    /* SVGA3dFogMode */
+   SVGA3D_RS_FILLMODE                  = 29,    /* SVGA3dFillMode */
+   SVGA3D_RS_SHADEMODE                 = 30,    /* SVGA3dShadeMode */
+   SVGA3D_RS_LINEPATTERN               = 31,    /* SVGA3dLinePattern */
+   SVGA3D_RS_SRCBLEND                  = 32,    /* SVGA3dBlendOp */
+   SVGA3D_RS_DSTBLEND                  = 33,    /* SVGA3dBlendOp */
+   SVGA3D_RS_BLENDEQUATION             = 34,    /* SVGA3dBlendEquation */
+   SVGA3D_RS_CULLMODE                  = 35,    /* SVGA3dFace */
+   SVGA3D_RS_ZFUNC                     = 36,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_ALPHAFUNC                 = 37,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_STENCILFUNC               = 38,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_STENCILFAIL               = 39,    /* SVGA3dStencilOp */
+   SVGA3D_RS_STENCILZFAIL              = 40,    /* SVGA3dStencilOp */
+   SVGA3D_RS_STENCILPASS               = 41,    /* SVGA3dStencilOp */
+   SVGA3D_RS_ALPHAREF                  = 42,    /* float (0.0 .. 1.0) */
+   SVGA3D_RS_FRONTWINDING              = 43,    /* SVGA3dFrontWinding */
+   SVGA3D_RS_COORDINATETYPE            = 44,    /* SVGA3dCoordinateType */
+   SVGA3D_RS_ZBIAS                     = 45,    /* float */
+   SVGA3D_RS_RANGEFOGENABLE            = 46,    /* SVGA3dBool */
+   SVGA3D_RS_COLORWRITEENABLE          = 47,    /* SVGA3dColorMask */
+   SVGA3D_RS_VERTEXMATERIALENABLE      = 48,    /* SVGA3dBool */
+   SVGA3D_RS_DIFFUSEMATERIALSOURCE     = 49,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_SPECULARMATERIALSOURCE    = 50,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_AMBIENTMATERIALSOURCE     = 51,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_EMISSIVEMATERIALSOURCE    = 52,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_TEXTUREFACTOR             = 53,    /* SVGA3dColor */
+   SVGA3D_RS_LOCALVIEWER               = 54,    /* SVGA3dBool */
+   SVGA3D_RS_SCISSORTESTENABLE         = 55,    /* SVGA3dBool */
+   SVGA3D_RS_BLENDCOLOR                = 56,    /* SVGA3dColor */
+   SVGA3D_RS_STENCILENABLE2SIDED       = 57,    /* SVGA3dBool */
+   SVGA3D_RS_CCWSTENCILFUNC            = 58,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_CCWSTENCILFAIL            = 59,    /* SVGA3dStencilOp */
+   SVGA3D_RS_CCWSTENCILZFAIL           = 60,    /* SVGA3dStencilOp */
+   SVGA3D_RS_CCWSTENCILPASS            = 61,    /* SVGA3dStencilOp */
+   SVGA3D_RS_VERTEXBLEND               = 62,    /* SVGA3dVertexBlendFlags */
+   SVGA3D_RS_SLOPESCALEDEPTHBIAS       = 63,    /* float */
+   SVGA3D_RS_DEPTHBIAS                 = 64,    /* float */
+
+
+   /*
+    * Output Gamma Level
+    *
+    * Output gamma affects the gamma curve of colors that are output from the
+    * rendering pipeline.  A value of 1.0 specifies a linear color space. If the
+    * value is <= 0.0, gamma correction is ignored and linear color space is
+    * used.
+    */
+
+   SVGA3D_RS_OUTPUTGAMMA               = 65,    /* float */
+   SVGA3D_RS_ZVISIBLE                  = 66,    /* SVGA3dBool */
+   SVGA3D_RS_LASTPIXEL                 = 67,    /* SVGA3dBool */
+   SVGA3D_RS_CLIPPING                  = 68,    /* SVGA3dBool */
+   SVGA3D_RS_WRAP0                     = 69,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP1                     = 70,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP2                     = 71,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP3                     = 72,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP4                     = 73,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP5                     = 74,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP6                     = 75,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP7                     = 76,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP8                     = 77,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP9                     = 78,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP10                    = 79,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP11                    = 80,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP12                    = 81,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP13                    = 82,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP14                    = 83,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP15                    = 84,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_MULTISAMPLEANTIALIAS      = 85,    /* SVGA3dBool */
+   SVGA3D_RS_MULTISAMPLEMASK           = 86,    /* uint32 */
+   SVGA3D_RS_INDEXEDVERTEXBLENDENABLE  = 87,    /* SVGA3dBool */
+   SVGA3D_RS_TWEENFACTOR               = 88,    /* float */
+   SVGA3D_RS_ANTIALIASEDLINEENABLE     = 89,    /* SVGA3dBool */
+   SVGA3D_RS_COLORWRITEENABLE1         = 90,    /* SVGA3dColorMask */
+   SVGA3D_RS_COLORWRITEENABLE2         = 91,    /* SVGA3dColorMask */
+   SVGA3D_RS_COLORWRITEENABLE3         = 92,    /* SVGA3dColorMask */
+   SVGA3D_RS_SEPARATEALPHABLENDENABLE  = 93,    /* SVGA3dBool */
+   SVGA3D_RS_SRCBLENDALPHA             = 94,    /* SVGA3dBlendOp */
+   SVGA3D_RS_DSTBLENDALPHA             = 95,    /* SVGA3dBlendOp */
+   SVGA3D_RS_BLENDEQUATIONALPHA        = 96,    /* SVGA3dBlendEquation */
+   SVGA3D_RS_MAX                                /* 97: one past the last render state */
+} SVGA3dRenderStateName;
+
+typedef enum {   /* value type of the SVGA3D_RS_*MATERIALSOURCE render states */
+   SVGA3D_VERTEXMATERIAL_NONE     = 0,    /* Use the value in the current material */
+   SVGA3D_VERTEXMATERIAL_DIFFUSE  = 1,    /* Use the value in the diffuse component */
+   SVGA3D_VERTEXMATERIAL_SPECULAR = 2,    /* Use the value in the specular component */
+} SVGA3dVertexMaterial;
+
+typedef enum {   /* stored in the "mode" field of SVGA3dFillMode */
+   SVGA3D_FILLMODE_INVALID = 0,
+   SVGA3D_FILLMODE_POINT   = 1,
+   SVGA3D_FILLMODE_LINE    = 2,
+   SVGA3D_FILLMODE_FILL    = 3,
+   SVGA3D_FILLMODE_MAX           /* 4: one past the last valid mode */
+} SVGA3dFillModeType;
+
+
+typedef
+union {                    /* value type of SVGA3D_RS_FILLMODE */
+   struct {
+      uint16   mode;       /* SVGA3dFillModeType */
+      uint16   face;       /* SVGA3dFace */
+   };
+   uint32 uintValue;       /* mode and face viewed as a single uint32 */
+} SVGA3dFillMode;
+
+typedef enum {   /* value type of SVGA3D_RS_SHADEMODE */
+   SVGA3D_SHADEMODE_INVALID = 0,
+   SVGA3D_SHADEMODE_FLAT    = 1,
+   SVGA3D_SHADEMODE_SMOOTH  = 2,
+   SVGA3D_SHADEMODE_PHONG   = 3,     /* Not supported */
+   SVGA3D_SHADEMODE_MAX              /* 4: one past the last listed mode */
+} SVGA3dShadeMode;
+
+typedef
+union {                    /* value type of SVGA3D_RS_LINEPATTERN */
+   struct {
+      uint16 repeat;
+      uint16 pattern;
+   };
+   uint32 uintValue;       /* repeat and pattern viewed as a single uint32 */
+} SVGA3dLinePattern;
+
+typedef enum {   /* value type of the SVGA3D_RS_{SRC,DST}BLEND[ALPHA] render states */
+   SVGA3D_BLENDOP_INVALID            = 0,
+   SVGA3D_BLENDOP_ZERO               = 1,
+   SVGA3D_BLENDOP_ONE                = 2,
+   SVGA3D_BLENDOP_SRCCOLOR           = 3,
+   SVGA3D_BLENDOP_INVSRCCOLOR        = 4,
+   SVGA3D_BLENDOP_SRCALPHA           = 5,
+   SVGA3D_BLENDOP_INVSRCALPHA        = 6,
+   SVGA3D_BLENDOP_DESTALPHA          = 7,
+   SVGA3D_BLENDOP_INVDESTALPHA       = 8,
+   SVGA3D_BLENDOP_DESTCOLOR          = 9,
+   SVGA3D_BLENDOP_INVDESTCOLOR       = 10,
+   SVGA3D_BLENDOP_SRCALPHASAT        = 11,
+   SVGA3D_BLENDOP_BLENDFACTOR        = 12,
+   SVGA3D_BLENDOP_INVBLENDFACTOR     = 13,
+   SVGA3D_BLENDOP_MAX                      /* 14: one past the last blend op */
+} SVGA3dBlendOp;
+
+typedef enum {   /* value type of SVGA3D_RS_BLENDEQUATION and SVGA3D_RS_BLENDEQUATIONALPHA */
+   SVGA3D_BLENDEQ_INVALID            = 0,
+   SVGA3D_BLENDEQ_ADD                = 1,
+   SVGA3D_BLENDEQ_SUBTRACT           = 2,
+   SVGA3D_BLENDEQ_REVSUBTRACT        = 3,
+   SVGA3D_BLENDEQ_MINIMUM            = 4,
+   SVGA3D_BLENDEQ_MAXIMUM            = 5,
+   SVGA3D_BLENDEQ_MAX                      /* 6: one past the last blend equation */
+} SVGA3dBlendEquation;
+
+typedef enum {   /* value type of SVGA3D_RS_FRONTWINDING */
+   SVGA3D_FRONTWINDING_INVALID = 0,
+   SVGA3D_FRONTWINDING_CW      = 1,
+   SVGA3D_FRONTWINDING_CCW     = 2,
+   SVGA3D_FRONTWINDING_MAX           /* 3: one past the last winding value */
+} SVGA3dFrontWinding;
+
+typedef enum {   /* value type of SVGA3D_RS_CULLMODE; also the "face" field of SVGA3dFillMode */
+   SVGA3D_FACE_INVALID  = 0,
+   SVGA3D_FACE_NONE     = 1,
+   SVGA3D_FACE_FRONT    = 2,
+   SVGA3D_FACE_BACK     = 3,
+   SVGA3D_FACE_FRONT_BACK = 4,
+   SVGA3D_FACE_MAX            /* 5: one past the last face value */
+} SVGA3dFace;
+
+/*
+ * The order and the values should not be changed
+ */
+
+typedef enum {   /* value type of the Z, alpha and stencil *FUNC render states */
+   SVGA3D_CMP_INVALID              = 0,
+   SVGA3D_CMP_NEVER                = 1,
+   SVGA3D_CMP_LESS                 = 2,
+   SVGA3D_CMP_EQUAL                = 3,
+   SVGA3D_CMP_LESSEQUAL            = 4,
+   SVGA3D_CMP_GREATER              = 5,
+   SVGA3D_CMP_NOTEQUAL             = 6,
+   SVGA3D_CMP_GREATEREQUAL         = 7,
+   SVGA3D_CMP_ALWAYS               = 8,
+   SVGA3D_CMP_MAX                        /* 9: one past the last compare function */
+} SVGA3dCmpFunc;
+
+/*
+ * SVGA3D_FOGFUNC_* specifies the fog equation, or PER_VERTEX which allows
+ * the fog factor to be specified in the alpha component of the specular
+ * (a.k.a. secondary) vertex color.
+ */
+typedef enum {
+   SVGA3D_FOGFUNC_INVALID          = 0,
+   SVGA3D_FOGFUNC_EXP              = 1,
+   SVGA3D_FOGFUNC_EXP2             = 2,
+   SVGA3D_FOGFUNC_LINEAR           = 3,
+   SVGA3D_FOGFUNC_PER_VERTEX       = 4   /* fog factor taken from the specular color's alpha */
+} SVGA3dFogFunction;
+
+/*
+ * SVGA3D_FOGTYPE_* specifies if fog factors are computed on a per-vertex
+ * or per-pixel basis.
+ */
+typedef enum {
+   SVGA3D_FOGTYPE_INVALID          = 0,
+   SVGA3D_FOGTYPE_VERTEX           = 1,   /* fog factors computed per vertex */
+   SVGA3D_FOGTYPE_PIXEL            = 2,   /* fog factors computed per pixel */
+   SVGA3D_FOGTYPE_MAX              = 3    /* one past the last valid type */
+} SVGA3dFogType;
+
+/*
+ * SVGA3D_FOGBASE_* selects depth or range-based fog. Depth-based fog is
+ * computed using the eye Z value of each pixel (or vertex), whereas range-
+ * based fog is computed using the actual distance (range) to the eye.
+ */
+typedef enum {
+   SVGA3D_FOGBASE_INVALID          = 0,
+   SVGA3D_FOGBASE_DEPTHBASED       = 1,   /* uses the eye Z value */
+   SVGA3D_FOGBASE_RANGEBASED       = 2,   /* uses the actual distance to the eye */
+   SVGA3D_FOGBASE_MAX              = 3    /* one past the last valid base */
+} SVGA3dFogBase;
+
+typedef enum {   /* value type of the SVGA3D_RS_[CCW]STENCIL{FAIL,ZFAIL,PASS} render states */
+   SVGA3D_STENCILOP_INVALID        = 0,
+   SVGA3D_STENCILOP_KEEP           = 1,
+   SVGA3D_STENCILOP_ZERO           = 2,
+   SVGA3D_STENCILOP_REPLACE        = 3,
+   SVGA3D_STENCILOP_INCRSAT        = 4,
+   SVGA3D_STENCILOP_DECRSAT        = 5,
+   SVGA3D_STENCILOP_INVERT         = 6,
+   SVGA3D_STENCILOP_INCR           = 7,
+   SVGA3D_STENCILOP_DECR           = 8,
+   SVGA3D_STENCILOP_MAX                  /* 9: one past the last stencil op */
+} SVGA3dStencilOp;
+
+typedef enum {   /* one bit per clip plane; value type of SVGA3D_RS_CLIPPLANEENABLE */
+   SVGA3D_CLIPPLANE_0              = (1 << 0),
+   SVGA3D_CLIPPLANE_1              = (1 << 1),
+   SVGA3D_CLIPPLANE_2              = (1 << 2),
+   SVGA3D_CLIPPLANE_3              = (1 << 3),
+   SVGA3D_CLIPPLANE_4              = (1 << 4),
+   SVGA3D_CLIPPLANE_5              = (1 << 5),
+} SVGA3dClipPlanes;
+
+typedef enum {   /* distinct single-bit values; presumably combinable with OR -- TODO confirm */
+   SVGA3D_CLEAR_COLOR              = 0x1,
+   SVGA3D_CLEAR_DEPTH              = 0x2,
+   SVGA3D_CLEAR_STENCIL            = 0x4
+} SVGA3dClearFlag;
+
+typedef enum {
+   SVGA3D_RT_DEPTH                 = 0,
+   SVGA3D_RT_STENCIL               = 1,
+   SVGA3D_RT_COLOR0                = 2,
+   SVGA3D_RT_COLOR1                = 3,
+   SVGA3D_RT_COLOR2                = 4,
+   SVGA3D_RT_COLOR3                = 5,
+   SVGA3D_RT_COLOR4                = 6,
+   SVGA3D_RT_COLOR5                = 7,
+   SVGA3D_RT_COLOR6                = 8,
+   SVGA3D_RT_COLOR7                = 9,
+   SVGA3D_RT_MAX,                        /* 10: one past SVGA3D_RT_COLOR7 */
+   SVGA3D_RT_INVALID               = ((uint32)-1),   /* NOTE(review): may not fit in int; strict ISO C wants int-range enumerators */
+} SVGA3dRenderTargetType;
+
+#define SVGA3D_MAX_RT_COLOR (SVGA3D_RT_COLOR7 - SVGA3D_RT_COLOR0 + 1)   /* number of color targets: 8 */
+
+typedef
+union {                    /* value type of the SVGA3D_RS_COLORWRITEENABLE* render states */
+   struct {
+      uint32  red   : 1;
+      uint32  green : 1;
+      uint32  blue  : 1;
+      uint32  alpha : 1;
+   };
+   uint32 uintValue;       /* the four channel bits viewed as a single uint32 */
+} SVGA3dColorMask;
+
+typedef enum {   /* value type of SVGA3D_RS_VERTEXBLEND */
+   SVGA3D_VBLEND_DISABLE            = 0,
+   SVGA3D_VBLEND_1WEIGHT            = 1,
+   SVGA3D_VBLEND_2WEIGHT            = 2,
+   SVGA3D_VBLEND_3WEIGHT            = 3,
+} SVGA3dVertexBlendFlags;
+
+typedef enum {   /* value type of the SVGA3D_RS_WRAP0..15 render states */
+   SVGA3D_WRAPCOORD_0   = 1 << 0,
+   SVGA3D_WRAPCOORD_1   = 1 << 1,
+   SVGA3D_WRAPCOORD_2   = 1 << 2,
+   SVGA3D_WRAPCOORD_3   = 1 << 3,
+   SVGA3D_WRAPCOORD_ALL = 0xF,      /* OR of SVGA3D_WRAPCOORD_0..3 */
+} SVGA3dWrapFlags;
+
+/*
+ * SVGA_3D_CMD_TEXTURESTATE Types.  All value types
+ * must fit in a uint32.
+ */
+
+typedef enum {
+   SVGA3D_TS_INVALID                    = 0,
+   SVGA3D_TS_BIND_TEXTURE               = 1,    /* SVGA3dSurfaceId */
+   SVGA3D_TS_COLOROP                    = 2,    /* SVGA3dTextureCombiner */
+   SVGA3D_TS_COLORARG1                  = 3,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_COLORARG2                  = 4,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_ALPHAOP                    = 5,    /* SVGA3dTextureCombiner */
+   SVGA3D_TS_ALPHAARG1                  = 6,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_ALPHAARG2                  = 7,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_ADDRESSU                   = 8,    /* SVGA3dTextureAddress */
+   SVGA3D_TS_ADDRESSV                   = 9,    /* SVGA3dTextureAddress */
+   SVGA3D_TS_MIPFILTER                  = 10,   /* SVGA3dTextureFilter */
+   SVGA3D_TS_MAGFILTER                  = 11,   /* SVGA3dTextureFilter */
+   SVGA3D_TS_MINFILTER                  = 12,   /* SVGA3dTextureFilter */
+   SVGA3D_TS_BORDERCOLOR                = 13,   /* SVGA3dColor */
+   SVGA3D_TS_TEXCOORDINDEX              = 14,   /* uint32 */
+   SVGA3D_TS_TEXTURETRANSFORMFLAGS      = 15,   /* SVGA3dTexTransformFlags */
+   SVGA3D_TS_TEXCOORDGEN                = 16,   /* SVGA3dTextureCoordGen */
+   SVGA3D_TS_BUMPENVMAT00               = 17,   /* float */
+   SVGA3D_TS_BUMPENVMAT01               = 18,   /* float */
+   SVGA3D_TS_BUMPENVMAT10               = 19,   /* float */
+   SVGA3D_TS_BUMPENVMAT11               = 20,   /* float */
+   SVGA3D_TS_TEXTURE_MIPMAP_LEVEL       = 21,   /* uint32 */
+   SVGA3D_TS_TEXTURE_LOD_BIAS           = 22,   /* float */
+   SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL  = 23,   /* uint32 */
+   SVGA3D_TS_ADDRESSW                   = 24,   /* SVGA3dTextureAddress */
+
+
+   /*
+    * Sampler Gamma Level
+    *
+    * Sampler gamma affects the color of samples taken from the sampler.  A
+    * value of 1.0 will produce linear samples.  If the value is <= 0.0 the
+    * gamma value is ignored and a linear space is used.
+    */
+
+   SVGA3D_TS_GAMMA                      = 25,   /* float */
+   SVGA3D_TS_BUMPENVLSCALE              = 26,   /* float */
+   SVGA3D_TS_BUMPENVLOFFSET             = 27,   /* float */
+   SVGA3D_TS_COLORARG0                  = 28,   /* SVGA3dTextureArgData */
+   SVGA3D_TS_ALPHAARG0                  = 29,   /* SVGA3dTextureArgData */
+   SVGA3D_TS_MAX                                /* 30: one past the last texture state */
+} SVGA3dTextureStateName;
+
+typedef enum {   /* value type of SVGA3D_TS_COLOROP and SVGA3D_TS_ALPHAOP */
+   SVGA3D_TC_INVALID                   = 0,
+   SVGA3D_TC_DISABLE                   = 1,
+   SVGA3D_TC_SELECTARG1                = 2,
+   SVGA3D_TC_SELECTARG2                = 3,
+   SVGA3D_TC_MODULATE                  = 4,
+   SVGA3D_TC_ADD                       = 5,
+   SVGA3D_TC_ADDSIGNED                 = 6,
+   SVGA3D_TC_SUBTRACT                  = 7,
+   SVGA3D_TC_BLENDTEXTUREALPHA         = 8,
+   SVGA3D_TC_BLENDDIFFUSEALPHA         = 9,
+   SVGA3D_TC_BLENDCURRENTALPHA         = 10,
+   SVGA3D_TC_BLENDFACTORALPHA          = 11,
+   SVGA3D_TC_MODULATE2X                = 12,
+   SVGA3D_TC_MODULATE4X                = 13,
+   SVGA3D_TC_DSDT                      = 14,
+   SVGA3D_TC_DOTPRODUCT3               = 15,
+   SVGA3D_TC_BLENDTEXTUREALPHAPM       = 16,
+   SVGA3D_TC_ADDSIGNED2X               = 17,
+   SVGA3D_TC_ADDSMOOTH                 = 18,
+   SVGA3D_TC_PREMODULATE               = 19,
+   SVGA3D_TC_MODULATEALPHA_ADDCOLOR    = 20,
+   SVGA3D_TC_MODULATECOLOR_ADDALPHA    = 21,
+   SVGA3D_TC_MODULATEINVALPHA_ADDCOLOR = 22,
+   SVGA3D_TC_MODULATEINVCOLOR_ADDALPHA = 23,
+   SVGA3D_TC_BUMPENVMAPLUMINANCE       = 24,
+   SVGA3D_TC_MULTIPLYADD               = 25,
+   SVGA3D_TC_LERP                      = 26,
+   SVGA3D_TC_MAX                             /* 27: one past the last combiner op */
+} SVGA3dTextureCombiner;
+
+#define SVGA3D_TC_CAP_BIT(op) ((op) ? (1 << ((op) - 1)) : 0)   /* bit (op - 1), or 0 when op is 0; evaluates op twice */
+
+typedef enum {   /* value type of SVGA3D_TS_ADDRESSU / ADDRESSV / ADDRESSW */
+   SVGA3D_TEX_ADDRESS_INVALID    = 0,
+   SVGA3D_TEX_ADDRESS_WRAP       = 1,
+   SVGA3D_TEX_ADDRESS_MIRROR     = 2,
+   SVGA3D_TEX_ADDRESS_CLAMP      = 3,
+   SVGA3D_TEX_ADDRESS_BORDER     = 4,
+   SVGA3D_TEX_ADDRESS_MIRRORONCE = 5,
+   SVGA3D_TEX_ADDRESS_EDGE       = 6,
+   SVGA3D_TEX_ADDRESS_MAX              /* 7: one past the last address mode */
+} SVGA3dTextureAddress;
+
+/*
+ * SVGA3D_TEX_FILTER_NONE as the minification filter means mipmapping is
+ * disabled, and the rasterizer should use the magnification filter instead.
+ */
+typedef enum {   // value type of SVGA3D_TS_MIPFILTER / MAGFILTER / MINFILTER
+   SVGA3D_TEX_FILTER_NONE           = 0,
+   SVGA3D_TEX_FILTER_NEAREST        = 1,
+   SVGA3D_TEX_FILTER_LINEAR         = 2,
+   SVGA3D_TEX_FILTER_ANISOTROPIC    = 3,
+   SVGA3D_TEX_FILTER_FLATCUBIC      = 4, // Deprecated, not implemented
+   SVGA3D_TEX_FILTER_GAUSSIANCUBIC  = 5, // Deprecated, not implemented
+   SVGA3D_TEX_FILTER_PYRAMIDALQUAD  = 6, // Not currently implemented
+   SVGA3D_TEX_FILTER_GAUSSIANQUAD   = 7, // Not currently implemented
+   SVGA3D_TEX_FILTER_MAX                 // 8: one past the last filter value
+} SVGA3dTextureFilter;
+
+typedef enum {   /* value type of SVGA3D_TS_TEXTURETRANSFORMFLAGS */
+   SVGA3D_TEX_TRANSFORM_OFF    = 0,
+   SVGA3D_TEX_TRANSFORM_S      = (1 << 0),
+   SVGA3D_TEX_TRANSFORM_T      = (1 << 1),
+   SVGA3D_TEX_TRANSFORM_R      = (1 << 2),
+   SVGA3D_TEX_TRANSFORM_Q      = (1 << 3),
+   SVGA3D_TEX_PROJECTED        = (1 << 15),   /* 0x8000, separate from the S/T/R/Q bits */
+} SVGA3dTexTransformFlags;
+
+typedef enum {   /* value type of SVGA3D_TS_TEXCOORDGEN */
+   SVGA3D_TEXCOORD_GEN_OFF              = 0,
+   SVGA3D_TEXCOORD_GEN_EYE_POSITION     = 1,
+   SVGA3D_TEXCOORD_GEN_EYE_NORMAL       = 2,
+   SVGA3D_TEXCOORD_GEN_REFLECTIONVECTOR = 3,
+   SVGA3D_TEXCOORD_GEN_SPHERE           = 4,
+   SVGA3D_TEXCOORD_GEN_MAX                    /* 5: one past the last generation mode */
+} SVGA3dTextureCoordGen;
+
+/*
+ * Texture argument constants for texture combiner
+ */
+typedef enum {   /* value type of the SVGA3D_TS_COLORARG* and SVGA3D_TS_ALPHAARG* states */
+   SVGA3D_TA_INVALID    = 0,
+   SVGA3D_TA_CONSTANT   = 1,
+   SVGA3D_TA_PREVIOUS   = 2,
+   SVGA3D_TA_DIFFUSE    = 3,
+   SVGA3D_TA_TEXTURE    = 4,
+   SVGA3D_TA_SPECULAR   = 5,
+   SVGA3D_TA_MAX              /* 6: one past the last argument constant */
+} SVGA3dTextureArgData;
+
+#define SVGA3D_TM_MASK_LEN 4   /* shift applied to the modifier values below */
+
+/* Modifiers for texture argument constants defined above. */
+typedef enum {
+   SVGA3D_TM_NONE       = 0,
+   SVGA3D_TM_ALPHA      = (1 << SVGA3D_TM_MASK_LEN),   /* 0x10 */
+   SVGA3D_TM_ONE_MINUS  = (2 << SVGA3D_TM_MASK_LEN),   /* 0x20 */
+} SVGA3dTextureArgModifier;
+
+#define SVGA3D_INVALID_ID         ((uint32)-1)   /* -1 converted to uint32, i.e. the largest uint32 value */
+#define SVGA3D_MAX_CLIP_PLANES    6              /* same value as SVGA3D_NUM_CLIPPLANES */
+
+/*
+ * This is the limit to the number of fixed-function texture
+ * transforms and texture coordinates we can support. It does *not*
+ * correspond to the number of texture image units (samplers) we
+ * support!
+ */
+#define SVGA3D_MAX_TEXTURE_COORDS 8   /* same count as SVGA3D_TRANSFORM_TEXTURE0..7 */
+
+/*
+ * Vertex declarations
+ *
+ * Notes:
+ *
+ * SVGA3D_DECLUSAGE_POSITIONT is for pre-transformed vertices. If you
+ * draw with any POSITIONT vertex arrays, the programmable vertex
+ * pipeline will be implicitly disabled. Drawing will take place as if
+ * no vertex shader was bound.
+ */
+
+typedef enum {
+   SVGA3D_DECLUSAGE_POSITION     = 0,
+   SVGA3D_DECLUSAGE_BLENDWEIGHT,       //  1
+   SVGA3D_DECLUSAGE_BLENDINDICES,      //  2
+   SVGA3D_DECLUSAGE_NORMAL,            //  3
+   SVGA3D_DECLUSAGE_PSIZE,             //  4
+   SVGA3D_DECLUSAGE_TEXCOORD,          //  5
+   SVGA3D_DECLUSAGE_TANGENT,           //  6
+   SVGA3D_DECLUSAGE_BINORMAL,          //  7
+   SVGA3D_DECLUSAGE_TESSFACTOR,        //  8
+   SVGA3D_DECLUSAGE_POSITIONT,         //  9
+   SVGA3D_DECLUSAGE_COLOR,             // 10
+   SVGA3D_DECLUSAGE_FOG,               // 11
+   SVGA3D_DECLUSAGE_DEPTH,             // 12
+   SVGA3D_DECLUSAGE_SAMPLE,            // 13
+   SVGA3D_DECLUSAGE_MAX                // 14: one past the last usage
+} SVGA3dDeclUsage;
+
+typedef enum {
+   SVGA3D_DECLMETHOD_DEFAULT     = 0,
+   SVGA3D_DECLMETHOD_PARTIALU,         // 1
+   SVGA3D_DECLMETHOD_PARTIALV,         // 2
+   SVGA3D_DECLMETHOD_CROSSUV,          // 3: Normal
+   SVGA3D_DECLMETHOD_UV,               // 4
+   SVGA3D_DECLMETHOD_LOOKUP,           // 5: Lookup a displacement map
+   SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED, // 6: Lookup a pre-sampled displacement map
+} SVGA3dDeclMethod;
+
+typedef enum {
+   SVGA3D_DECLTYPE_FLOAT1        =  0,
+   SVGA3D_DECLTYPE_FLOAT2        =  1,
+   SVGA3D_DECLTYPE_FLOAT3        =  2,
+   SVGA3D_DECLTYPE_FLOAT4        =  3,
+   SVGA3D_DECLTYPE_D3DCOLOR      =  4,
+   SVGA3D_DECLTYPE_UBYTE4        =  5,
+   SVGA3D_DECLTYPE_SHORT2        =  6,
+   SVGA3D_DECLTYPE_SHORT4        =  7,
+   SVGA3D_DECLTYPE_UBYTE4N       =  8,
+   SVGA3D_DECLTYPE_SHORT2N       =  9,
+   SVGA3D_DECLTYPE_SHORT4N       = 10,
+   SVGA3D_DECLTYPE_USHORT2N      = 11,
+   SVGA3D_DECLTYPE_USHORT4N      = 12,
+   SVGA3D_DECLTYPE_UDEC3         = 13,
+   SVGA3D_DECLTYPE_DEC3N         = 14,
+   SVGA3D_DECLTYPE_FLOAT16_2     = 15,
+   SVGA3D_DECLTYPE_FLOAT16_4     = 16,
+   SVGA3D_DECLTYPE_MAX,                /* 17: one past the last declaration type */
+} SVGA3dDeclType;
+
+/*
+ * This structure is used for the divisor for geometry instancing;
+ * it's a direct translation of the Direct3D equivalent.
+ */
+typedef union {
+   struct {
+      /*
+       * For index data, this number represents the number of instances to draw.
+       * For instance data, this number represents the number of
+       * instances/vertex in this stream
+       */
+      uint32 count : 30;
+
+      /*
+       * This is 1 if this is supposed to be the data that is repeated for
+       * every instance.
+       */
+      uint32 indexedData : 1;
+
+      /*
+       * This is 1 if this is supposed to be the per-instance data.
+       */
+      uint32 instanceData : 1;
+   };
+
+   uint32 value;   /* the three bit-fields (30 + 1 + 1 bits) viewed as a single uint32 */
+} SVGA3dVertexDivisor;
+
+typedef enum {
+   SVGA3D_PRIMITIVE_INVALID       = 0,
+   SVGA3D_PRIMITIVE_TRIANGLELIST  = 1,
+   SVGA3D_PRIMITIVE_POINTLIST     = 2,
+   SVGA3D_PRIMITIVE_LINELIST      = 3,
+   SVGA3D_PRIMITIVE_LINESTRIP     = 4,
+   SVGA3D_PRIMITIVE_TRIANGLESTRIP = 5,
+   SVGA3D_PRIMITIVE_TRIANGLEFAN   = 6,
+   SVGA3D_PRIMITIVE_MAX                 /* = 7 */
+} SVGA3dPrimitiveType;
+
+typedef enum {
+   SVGA3D_COORDINATE_INVALID      = 0,
+   SVGA3D_COORDINATE_LEFTHANDED   = 1,
+   SVGA3D_COORDINATE_RIGHTHANDED  = 2,
+   SVGA3D_COORDINATE_MAX                /* = 3 */
+} SVGA3dCoordinateType;
+
+typedef enum {
+   SVGA3D_TRANSFORM_INVALID    =  0,
+   SVGA3D_TRANSFORM_WORLD      =  1,
+   SVGA3D_TRANSFORM_VIEW       =  2,
+   SVGA3D_TRANSFORM_PROJECTION =  3,
+   SVGA3D_TRANSFORM_TEXTURE0   =  4,
+   SVGA3D_TRANSFORM_TEXTURE1   =  5,
+   SVGA3D_TRANSFORM_TEXTURE2   =  6,
+   SVGA3D_TRANSFORM_TEXTURE3   =  7,
+   SVGA3D_TRANSFORM_TEXTURE4   =  8,
+   SVGA3D_TRANSFORM_TEXTURE5   =  9,
+   SVGA3D_TRANSFORM_TEXTURE6   = 10,
+   SVGA3D_TRANSFORM_TEXTURE7   = 11,
+   SVGA3D_TRANSFORM_WORLD1     = 12,
+   SVGA3D_TRANSFORM_WORLD2     = 13,
+   SVGA3D_TRANSFORM_WORLD3     = 14,
+   SVGA3D_TRANSFORM_MAX              /* = 15 */
+} SVGA3dTransformType;
+
+typedef enum {
+   SVGA3D_LIGHTTYPE_INVALID     = 0,
+   SVGA3D_LIGHTTYPE_POINT       = 1,
+   SVGA3D_LIGHTTYPE_SPOT1       = 2,   /* 1-cone, in degrees */
+   SVGA3D_LIGHTTYPE_SPOT2       = 3,   /* 2-cone, in radians */
+   SVGA3D_LIGHTTYPE_DIRECTIONAL = 4,
+   SVGA3D_LIGHTTYPE_MAX                /* = 5 */
+} SVGA3dLightType;
+
+typedef enum {
+   SVGA3D_CUBEFACE_POSX = 0,
+   SVGA3D_CUBEFACE_NEGX = 1,
+   SVGA3D_CUBEFACE_POSY = 2,
+   SVGA3D_CUBEFACE_NEGY = 3,
+   SVGA3D_CUBEFACE_POSZ = 4,
+   SVGA3D_CUBEFACE_NEGZ = 5
+} SVGA3dCubeFace;
+
+typedef enum {
+   SVGA3D_SHADERTYPE_COMPILED_DX8 = 0,
+   SVGA3D_SHADERTYPE_VS           = 1,
+   SVGA3D_SHADERTYPE_PS           = 2,
+   SVGA3D_SHADERTYPE_MAX                /* = 3 */
+} SVGA3dShaderType;
+
+typedef enum {
+   SVGA3D_CONST_TYPE_FLOAT = 0,
+   SVGA3D_CONST_TYPE_INT   = 1,
+   SVGA3D_CONST_TYPE_BOOL  = 2
+} SVGA3dShaderConstType;
+
+#define SVGA3D_MAX_SURFACE_FACES   6    /* Same count as the SVGA3dCubeFace values */
+
+typedef enum {
+   SVGA3D_STRETCH_BLT_POINT    = 0,
+   SVGA3D_STRETCH_BLT_LINEAR   = 1,
+   SVGA3D_STRETCH_BLT_MAX             /* = 2 */
+} SVGA3dStretchBltMode;
+
+typedef enum {
+   SVGA3D_QUERYTYPE_OCCLUSION  = 0,
+   SVGA3D_QUERYTYPE_MAX               /* = 1 */
+} SVGA3dQueryType;
+
+typedef enum {
+   SVGA3D_QUERYSTATE_PENDING   = 0,   /* Waiting on the host (set by guest) */
+   SVGA3D_QUERYSTATE_SUCCEEDED = 1,   /* Completed successfully (set by host) */
+   SVGA3D_QUERYSTATE_FAILED    = 2,   /* Completed unsuccessfully (set by host) */
+   SVGA3D_QUERYSTATE_NEW       = 3    /* Never submitted (For guest use only) */
+} SVGA3dQueryState;
+
+typedef enum {
+   SVGA3D_WRITE_HOST_VRAM      = 1,
+   SVGA3D_READ_HOST_VRAM       = 2
+} SVGA3dTransferType;
+
+/*
+ * The maximum number of vertex arrays we're guaranteed to support in
+ * SVGA_3D_CMD_DRAW_PRIMITIVES.
+ */
+#define SVGA3D_MAX_VERTEX_ARRAYS   32
+
+/*
+ * Identifiers for commands in the command FIFO.
+ *
+ * IDs between 1000 and 1039 (inclusive) were used by obsolete versions of
+ * the SVGA3D protocol and remain reserved; they should not be used in the
+ * future.
+ *
+ * IDs between 1040 and 1999 (inclusive) are available for use by the
+ * current SVGA3D protocol.
+ *
+ * FIFO clients other than SVGA3D should stay below 1000, or at 2000
+ * and up.
+ */
+
+#define SVGA_3D_CMD_LEGACY_BASE            1000   /* First ID of the obsolete, reserved range */
+#define SVGA_3D_CMD_BASE                   1040   /* First ID of the current SVGA3D protocol */
+
+#define SVGA_3D_CMD_SURFACE_DEFINE         (SVGA_3D_CMD_BASE + 0)   /* Parenthesized so each ID */
+#define SVGA_3D_CMD_SURFACE_DESTROY        (SVGA_3D_CMD_BASE + 1)   /* is a single operand when  */
+#define SVGA_3D_CMD_SURFACE_COPY           (SVGA_3D_CMD_BASE + 2)   /* used inside an expression */
+#define SVGA_3D_CMD_SURFACE_STRETCHBLT     (SVGA_3D_CMD_BASE + 3)
+#define SVGA_3D_CMD_SURFACE_DMA            (SVGA_3D_CMD_BASE + 4)
+#define SVGA_3D_CMD_CONTEXT_DEFINE         (SVGA_3D_CMD_BASE + 5)
+#define SVGA_3D_CMD_CONTEXT_DESTROY        (SVGA_3D_CMD_BASE + 6)
+#define SVGA_3D_CMD_SETTRANSFORM           (SVGA_3D_CMD_BASE + 7)
+#define SVGA_3D_CMD_SETZRANGE              (SVGA_3D_CMD_BASE + 8)
+#define SVGA_3D_CMD_SETRENDERSTATE         (SVGA_3D_CMD_BASE + 9)
+#define SVGA_3D_CMD_SETRENDERTARGET        (SVGA_3D_CMD_BASE + 10)
+#define SVGA_3D_CMD_SETTEXTURESTATE        (SVGA_3D_CMD_BASE + 11)
+#define SVGA_3D_CMD_SETMATERIAL            (SVGA_3D_CMD_BASE + 12)
+#define SVGA_3D_CMD_SETLIGHTDATA           (SVGA_3D_CMD_BASE + 13)
+#define SVGA_3D_CMD_SETLIGHTENABLED        (SVGA_3D_CMD_BASE + 14)
+#define SVGA_3D_CMD_SETVIEWPORT            (SVGA_3D_CMD_BASE + 15)
+#define SVGA_3D_CMD_SETCLIPPLANE           (SVGA_3D_CMD_BASE + 16)
+#define SVGA_3D_CMD_CLEAR                  (SVGA_3D_CMD_BASE + 17)
+#define SVGA_3D_CMD_PRESENT                (SVGA_3D_CMD_BASE + 18)  /* Deprecated */
+#define SVGA_3D_CMD_SHADER_DEFINE          (SVGA_3D_CMD_BASE + 19)
+#define SVGA_3D_CMD_SHADER_DESTROY         (SVGA_3D_CMD_BASE + 20)
+#define SVGA_3D_CMD_SET_SHADER             (SVGA_3D_CMD_BASE + 21)
+#define SVGA_3D_CMD_SET_SHADER_CONST       (SVGA_3D_CMD_BASE + 22)
+#define SVGA_3D_CMD_DRAW_PRIMITIVES        (SVGA_3D_CMD_BASE + 23)
+#define SVGA_3D_CMD_SETSCISSORRECT         (SVGA_3D_CMD_BASE + 24)
+#define SVGA_3D_CMD_BEGIN_QUERY            (SVGA_3D_CMD_BASE + 25)
+#define SVGA_3D_CMD_END_QUERY              (SVGA_3D_CMD_BASE + 26)
+#define SVGA_3D_CMD_WAIT_FOR_QUERY         (SVGA_3D_CMD_BASE + 27)
+#define SVGA_3D_CMD_PRESENT_READBACK       (SVGA_3D_CMD_BASE + 28)  /* Deprecated */
+#define SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN (SVGA_3D_CMD_BASE + 29)
+#define SVGA_3D_CMD_MAX                    (SVGA_3D_CMD_BASE + 30)  /* One past the last command ID above */
+
+#define SVGA_3D_CMD_FUTURE_MAX             2000   /* IDs 1040..1999 belong to the current protocol */
+
+/*
+ * Common substructures used in multiple FIFO commands:
+ */
+
+typedef struct {
+   union {
+      struct {
+         uint16  function;       /* Holds an SVGA3dFogFunction value */
+         uint8   type;           /* Holds an SVGA3dFogType value */
+         uint8   base;           /* Holds an SVGA3dFogBase value */
+      };
+      uint32     uintValue;      /* Overlays the three fields above as one uint32 */
+   };
+} SVGA3dFogMode;
+
+/*
+ * Uniquely identify one image (a 1D/2D/3D array) from a surface. This
+ * is a surface ID as well as face/mipmap indices.
+ */
+
+typedef
+struct SVGA3dSurfaceImageId {
+   uint32               sid;      /* Surface ID */
+   uint32               face;     /* Face index within the surface */
+   uint32               mipmap;   /* Mipmap index within the face */
+} SVGA3dSurfaceImageId;
+
+typedef
+struct SVGA3dGuestImage {
+   SVGAGuestPtr         ptr;      /* NOTE(review): presumably where the image lives in guest memory */
+
+   /*
+    * A note on interpretation of pitch: This value of pitch is the
+    * number of bytes between vertically adjacent image
+    * blocks. Normally this is the number of bytes between the first
+    * pixel of two adjacent scanlines. With compressed textures,
+    * however, this may represent the number of bytes between
+    * compression blocks rather than between rows of pixels.
+    *
+    * XXX: Compressed textures currently must be tightly packed in guest memory.
+    *
+    * If the image is 1-dimensional, pitch is ignored.
+    *
+    * If 'pitch' is zero, the SVGA3D device calculates a pitch value
+    * assuming each row of blocks is tightly packed.
+    */
+   uint32 pitch;                  /* In bytes; 0 lets the device compute it */
+} SVGA3dGuestImage;
+
+
+/*
+ * FIFO command format definitions:
+ */
+
+/*
+ * The data size header following cmdNum for every 3d command
+ */
+typedef
+struct {
+   uint32               id;       /* NOTE(review): presumably an SVGA_3D_CMD_* value - confirm */
+   uint32               size;     /* NOTE(review): presumably the size of the command data - confirm units */
+} SVGA3dCmdHeader;
+
+/*
+ * A surface is a hierarchy of host VRAM surfaces: 1D, 2D, or 3D, with
+ * optional mipmaps and cube faces.
+ */
+
+typedef
+struct {
+   uint32               width;
+   uint32               height;
+   uint32               depth;
+} SVGA3dSize;                     /* One per mip level; see SVGA3dCmdDefineSurface */
+
+typedef enum {
+   SVGA3D_SURFACE_CUBEMAP              = 0x001,   /* bit 0 */
+   SVGA3D_SURFACE_HINT_STATIC          = 0x002,   /* bit 1 */
+   SVGA3D_SURFACE_HINT_DYNAMIC         = 0x004,   /* bit 2 */
+   SVGA3D_SURFACE_HINT_INDEXBUFFER     = 0x008,   /* bit 3 */
+   SVGA3D_SURFACE_HINT_VERTEXBUFFER    = 0x010,   /* bit 4 */
+   SVGA3D_SURFACE_HINT_TEXTURE         = 0x020,   /* bit 5 */
+   SVGA3D_SURFACE_HINT_RENDERTARGET    = 0x040,   /* bit 6 */
+   SVGA3D_SURFACE_HINT_DEPTHSTENCIL    = 0x080,   /* bit 7 */
+   SVGA3D_SURFACE_HINT_WRITEONLY       = 0x100    /* bit 8 */
+} SVGA3dSurfaceFlags;
+
+typedef
+struct {
+   uint32               numMipLevels;   /* Number of mip levels in this face */
+} SVGA3dSurfaceFace;
+
+typedef
+struct {
+   uint32                      sid;
+   SVGA3dSurfaceFlags          surfaceFlags;   /* Combination of SVGA3D_SURFACE_* bits */
+   SVGA3dSurfaceFormat         format;
+   SVGA3dSurfaceFace           face[SVGA3D_MAX_SURFACE_FACES];
+   /*
+    * Followed by an SVGA3dSize structure for each mip level in each face.
+    *
+    * A note on surface sizes: Sizes are always specified in pixels,
+    * even if the true surface size is not a multiple of the minimum
+    * block size of the surface's format. For example, a 3x3x1 DXT1
+    * compressed texture would actually be stored as a 4x4x1 image in
+    * memory.
+    */
+} SVGA3dCmdDefineSurface;       /* SVGA_3D_CMD_SURFACE_DEFINE */
+
+typedef
+struct {
+   uint32               sid;       /* Surface to destroy */
+} SVGA3dCmdDestroySurface;      /* SVGA_3D_CMD_SURFACE_DESTROY */
+
+typedef
+struct {
+   uint32               cid;       /* NOTE(review): presumably the ID of the new context */
+} SVGA3dCmdDefineContext;       /* SVGA_3D_CMD_CONTEXT_DEFINE */
+
+typedef
+struct {
+   uint32               cid;       /* NOTE(review): presumably the ID of the context to destroy */
+} SVGA3dCmdDestroyContext;      /* SVGA_3D_CMD_CONTEXT_DESTROY */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dClearFlag      clearFlag;
+   uint32               color;     /* NOTE(review): packing/format not visible here - confirm */
+   float                depth;
+   uint32               stencil;
+   /* Followed by a variable number of SVGA3dRect structures */
+} SVGA3dCmdClear;               /* SVGA_3D_CMD_CLEAR */
+
+typedef
+struct SVGA3dCopyRect {
+   uint32               x;      /* NOTE(review): x/y presumably the destination origin - confirm */
+   uint32               y;
+   uint32               w;
+   uint32               h;
+   uint32               srcx;   /* NOTE(review): srcx/srcy presumably the source origin - confirm */
+   uint32               srcy;
+} SVGA3dCopyRect;
+
+typedef
+struct SVGA3dCopyBox {
+   uint32               x;      /* NOTE(review): x/y/z presumably the destination origin - confirm */
+   uint32               y;
+   uint32               z;
+   uint32               w;
+   uint32               h;
+   uint32               d;
+   uint32               srcx;   /* NOTE(review): srcx/srcy/srcz presumably the source origin - confirm */
+   uint32               srcy;
+   uint32               srcz;
+} SVGA3dCopyBox;
+
+typedef
+struct {
+   uint32               x;
+   uint32               y;
+   uint32               w;
+   uint32               h;
+} SVGA3dRect;                   /* Four unsigned fields, no z/d */
+
+typedef
+struct {
+   uint32               x;
+   uint32               y;
+   uint32               z;
+   uint32               w;
+   uint32               h;
+   uint32               d;
+} SVGA3dBox;                    /* SVGA3dRect plus z and d */
+
+typedef
+struct {
+   uint32               x;
+   uint32               y;
+   uint32               z;
+} SVGA3dPoint;
+
+typedef
+struct {
+   SVGA3dLightType      type;           /* One of SVGA3D_LIGHTTYPE_* */
+   SVGA3dBool           inWorldSpace;
+   float                diffuse[4];
+   float                specular[4];
+   float                ambient[4];
+   float                position[4];
+   float                direction[4];
+   float                range;
+   float                falloff;
+   float                attenuation0;
+   float                attenuation1;
+   float                attenuation2;
+   float                theta;          /* NOTE(review): theta/phi presumably the spot cone angles; */
+   float                phi;            /* units per SVGA3D_LIGHTTYPE_SPOT1/SPOT2 - confirm */
+} SVGA3dLightData;
+
+typedef
+struct {
+   uint32               sid;
+   /* Followed by a variable number of SVGA3dCopyRect structures */
+} SVGA3dCmdPresent;             /* SVGA_3D_CMD_PRESENT (deprecated command ID) */
+
+typedef
+struct {
+   SVGA3dRenderStateName   state;
+   union {                         /* Anonymous union: one 32-bit slot, two views */
+      uint32               uintValue;
+      float                floatValue;
+   };
+} SVGA3dRenderState;
+
+typedef
+struct {
+   uint32               cid;
+   /* Followed by a variable number of SVGA3dRenderState structures */
+} SVGA3dCmdSetRenderState;      /* SVGA_3D_CMD_SETRENDERSTATE */
+
+typedef
+struct {
+   uint32                 cid;
+   SVGA3dRenderTargetType type;
+   SVGA3dSurfaceImageId   target;   /* Surface image (sid/face/mipmap) to bind */
+} SVGA3dCmdSetRenderTarget;     /* SVGA_3D_CMD_SETRENDERTARGET */
+
+typedef
+struct {
+   SVGA3dSurfaceImageId  src;
+   SVGA3dSurfaceImageId  dest;
+   /* Followed by a variable number of SVGA3dCopyBox structures */
+} SVGA3dCmdSurfaceCopy;               /* SVGA_3D_CMD_SURFACE_COPY */
+
+typedef
+struct {
+   SVGA3dSurfaceImageId  src;
+   SVGA3dSurfaceImageId  dest;
+   SVGA3dBox             boxSrc;
+   SVGA3dBox             boxDest;
+   SVGA3dStretchBltMode  mode;      /* SVGA3D_STRETCH_BLT_POINT or _LINEAR */
+} SVGA3dCmdSurfaceStretchBlt;         /* SVGA_3D_CMD_SURFACE_STRETCHBLT */
+
+typedef
+struct {
+   /*
+    * If the discard flag is set in a surface DMA operation, the host may
+    * discard the contents of the current mipmap level and face of the target
+    * surface before applying the surface DMA contents.
+    */
+   uint32 discard : 1;
+
+   /*
+    * If the unsynchronized flag is set, the host may perform this upload
+    * without syncing to pending reads on this surface.
+    */
+   uint32 unsynchronized : 1;
+
+   /*
+    * Guests *MUST* set the reserved bits to 0 before submitting the command
+    * suffix, as future flags may occupy these bits.
+    */
+   uint32 reserved : 30;
+} SVGA3dSurfaceDMAFlags;
+
+typedef
+struct {
+   SVGA3dGuestImage      guest;
+   SVGA3dSurfaceImageId  host;
+   SVGA3dTransferType    transfer;   /* SVGA3D_WRITE_HOST_VRAM or SVGA3D_READ_HOST_VRAM */
+   /*
+    * Followed by a variable number of SVGA3dCopyBox structures. For
+    * consistency in all clipping logic and coordinate translation, we
+    * define the "source" in each copyBox as the guest image and the
+    * "destination" as the host image, regardless of transfer
+    * direction.
+    *
+    * For efficiency, the SVGA3D device is free to copy more data than
+    * specified. For example, it may round copy boxes outwards such
+    * that they lie on particular alignment boundaries.
+    */
+} SVGA3dCmdSurfaceDMA;                /* SVGA_3D_CMD_SURFACE_DMA */
+
+/*
+ * SVGA3dCmdSurfaceDMASuffix --
+ *
+ *    This is a command suffix that will appear after a SurfaceDMA command in
+ *    the FIFO.  It contains some extra information that hosts may use to
+ *    optimize performance or protect the guest.  This suffix exists to preserve
+ *    backwards compatibility while also allowing for new functionality to be
+ *    implemented.
+ */
+
+typedef
+struct {
+   uint32 suffixSize;   /* NOTE(review): presumably the size of this suffix in bytes - confirm */
+
+   /*
+    * The maximum offset is the largest offset from the guestPtr base
+    * address that will be read or written during this
+    * surfaceDMA.  If the suffix is supported, the host will respect this
+    * boundary while performing surface DMAs.
+    *
+    * Defaults to MAX_UINT32
+    */
+   uint32 maximumOffset;
+
+   /*
+    * A set of flags that describes optimizations that the host may perform
+    * during this surface DMA operation.  For correctness, the guest should
+    * never rely on behaviour that differs when these flags are set.
+    *
+    * Defaults to 0
+    */
+   SVGA3dSurfaceDMAFlags flags;
+} SVGA3dCmdSurfaceDMASuffix;
+
+/*
+ * SVGA_3D_CMD_DRAW_PRIMITIVES --
+ *
+ *   This command is the SVGA3D device's generic drawing entry point.
+ *   It can draw multiple ranges of primitives, optionally using an
+ *   index buffer, using an arbitrary collection of vertex buffers.
+ *
+ *   Each SVGA3dVertexDecl defines a distinct vertex array to bind
+ *   during this draw call. The declarations specify which surface
+ *   the vertex data lives in, what that vertex data is used for,
+ *   and how to interpret it.
+ *
+ *   Each SVGA3dPrimitiveRange defines a collection of primitives
+ *   to render using the same vertex arrays. An index buffer is
+ *   optional.
+ */
+
+typedef
+struct {
+   /*
+    * A range hint is an optional specification for the range of indices
+    * in an SVGA3dArray that will be used. If 'last' is zero, it is assumed
+    * that the entire array will be used.
+    *
+    * These are only hints. The SVGA3D device may use them for
+    * performance optimization if possible, but it's also allowed to
+    * ignore these values.
+    */
+   uint32               first;
+   uint32               last;      /* 0 means "the whole array" */
+} SVGA3dArrayRangeHint;
+
+typedef
+struct {
+   /*
+    * Define the origin and shape of a vertex or index array. Both
+    * 'offset' and 'stride' are in bytes. The provided surface will be
+    * reinterpreted as a flat array of bytes in the same format used
+    * by surface DMA operations. To avoid unnecessary conversions, the
+    * surface should be created with the SVGA3D_BUFFER format.
+    *
+    * Index 0 in the array starts 'offset' bytes into the surface.
+    * Index 1 begins at byte 'offset + stride', etc. Array indices may
+    * not be negative.
+    */
+   uint32               surfaceId;
+   uint32               offset;    /* In bytes */
+   uint32               stride;    /* In bytes */
+} SVGA3dArray;
+
+typedef
+struct {
+   /*
+    * Describe a vertex array's data type, and define how it is to be
+    * used by the fixed function pipeline or the vertex shader. It
+    * isn't useful to have two VertexDecls with the same
+    * VertexArrayIdentity in one draw call.
+    */
+   SVGA3dDeclType       type;         /* One of SVGA3D_DECLTYPE_* */
+   SVGA3dDeclMethod     method;       /* One of SVGA3D_DECLMETHOD_* */
+   SVGA3dDeclUsage      usage;        /* One of SVGA3D_DECLUSAGE_* */
+   uint32               usageIndex;
+} SVGA3dVertexArrayIdentity;
+
+typedef
+struct {
+   SVGA3dVertexArrayIdentity  identity;    /* What the array holds and how it is used */
+   SVGA3dArray                array;       /* Where the array lives: surface, offset, stride */
+   SVGA3dArrayRangeHint       rangeHint;   /* Optional hint; the device may ignore it */
+} SVGA3dVertexDecl;
+
+typedef
+struct {
+   /*
+    * Define a group of primitives to render, from sequential indices.
+    *
+    * The values of 'primType' and 'primitiveCount' imply the
+    * total number of vertices that will be rendered.
+    */
+   SVGA3dPrimitiveType  primType;
+   uint32               primitiveCount;
+
+   /*
+    * Optional index buffer. If indexArray.surfaceId is
+    * SVGA3D_INVALID_ID, we render without an index buffer. Rendering
+    * without an index buffer is identical to rendering with an index
+    * buffer containing the sequence [0, 1, 2, 3, ...].
+    *
+    * If an index buffer is in use, indexWidth specifies the width in
+    * bytes of each index value. It must be less than or equal to
+    * indexArray.stride.
+    *
+    * (Currently, the SVGA3D device requires index buffers to be tightly
+    * packed. In other words, indexWidth == indexArray.stride)
+    */
+   SVGA3dArray          indexArray;
+   uint32               indexWidth;
+
+   /*
+    * Optional index bias. This number is added to all indices from
+    * indexArray before they are used as vertex array indices. This
+    * can be used in multiple ways:
+    *
+    *  - When not using an indexArray, this bias can be used to
+    *    specify where in the vertex arrays to begin rendering.
+    *
+    *  - A positive number here is equivalent to increasing the
+    *    offset in each vertex array.
+    *
+    *  - A negative number can be used to render using a small
+    *    vertex array and an index buffer that contains large
+    *    values. This may be used by some applications that
+    *    crop a vertex buffer without modifying their index
+    *    buffer.
+    *
+    * Note that rendering with a negative bias value may be slower and
+    * use more memory than rendering with a positive or zero bias.
+    */
+   int32                indexBias;
+} SVGA3dPrimitiveRange;
+
+typedef
+struct {
+   uint32               cid;
+   uint32               numVertexDecls;
+   uint32               numRanges;
+
+   /*
+    * There are two mandatory variable-size arrays, plus one optional
+    * array, after the SVGA3dCmdDrawPrimitives structure. In order,
+    * they are:
+    *
+    * 1. SVGA3dVertexDecl, quantity 'numVertexDecls'
+    * 2. SVGA3dPrimitiveRange, quantity 'numRanges'
+    * 3. Optionally, SVGA3dVertexDivisor, quantity 'numVertexDecls' (contains
+    *    the frequency divisor for the corresponding vertex decl)
+    */
+} SVGA3dCmdDrawPrimitives;      /* SVGA_3D_CMD_DRAW_PRIMITIVES */
+
+typedef
+struct {
+   uint32                   stage;
+   SVGA3dTextureStateName   name;
+   union {                            /* Anonymous union: one 32-bit slot, two views */
+      uint32                value;
+      float                 floatValue;
+   };
+} SVGA3dTextureState;
+
+typedef
+struct {
+   uint32               cid;
+   /* Followed by a variable number of SVGA3dTextureState structures */
+} SVGA3dCmdSetTextureState;      /* SVGA_3D_CMD_SETTEXTURESTATE */
+
+typedef
+struct {
+   uint32                   cid;
+   SVGA3dTransformType      type;        /* One of SVGA3D_TRANSFORM_* */
+   float                    matrix[16];  /* NOTE(review): element order not visible here - confirm */
+} SVGA3dCmdSetTransform;          /* SVGA_3D_CMD_SETTRANSFORM */
+
+typedef
+struct {
+   float                min;
+   float                max;
+} SVGA3dZRange;
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dZRange         zRange;
+} SVGA3dCmdSetZRange;             /* SVGA_3D_CMD_SETZRANGE */
+
+typedef
+struct {
+   float                diffuse[4];
+   float                ambient[4];
+   float                specular[4];
+   float                emissive[4];
+   float                shininess;
+} SVGA3dMaterial;
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dFace           face;
+   SVGA3dMaterial       material;
+} SVGA3dCmdSetMaterial;           /* SVGA_3D_CMD_SETMATERIAL */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               index;     /* NOTE(review): presumably which light to update - confirm */
+   SVGA3dLightData      data;
+} SVGA3dCmdSetLightData;           /* SVGA_3D_CMD_SETLIGHTDATA */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               index;     /* NOTE(review): presumably which light to toggle - confirm */
+   uint32               enabled;
+} SVGA3dCmdSetLightEnabled;      /* SVGA_3D_CMD_SETLIGHTENABLED */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dRect           rect;
+} SVGA3dCmdSetViewport;           /* SVGA_3D_CMD_SETVIEWPORT */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dRect           rect;
+} SVGA3dCmdSetScissorRect;         /* SVGA_3D_CMD_SETSCISSORRECT */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               index;     /* NOTE(review): presumably which clip plane to set - confirm */
+   float                plane[4];
+} SVGA3dCmdSetClipPlane;           /* SVGA_3D_CMD_SETCLIPPLANE */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               shid;
+   SVGA3dShaderType     type;
+   /* Followed by a variable number of DWORDs of shader bytecode */
+} SVGA3dCmdDefineShader;           /* SVGA_3D_CMD_SHADER_DEFINE */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               shid;
+   SVGA3dShaderType     type;
+} SVGA3dCmdDestroyShader;         /* SVGA_3D_CMD_SHADER_DESTROY */
+
+typedef
+struct {
+   uint32                  cid;
+   uint32                  reg;     /* Register number */
+   SVGA3dShaderType        type;
+   SVGA3dShaderConstType   ctype;   /* SVGA3D_CONST_TYPE_FLOAT, _INT or _BOOL */
+   uint32                  values[4];
+} SVGA3dCmdSetShaderConst;        /* SVGA_3D_CMD_SET_SHADER_CONST */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dShaderType     type;
+   uint32               shid;
+} SVGA3dCmdSetShader;             /* SVGA_3D_CMD_SET_SHADER */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dQueryType      type;
+} SVGA3dCmdBeginQuery;           /* SVGA_3D_CMD_BEGIN_QUERY */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dQueryType      type;
+   SVGAGuestPtr         guestResult;  /* Points to an SVGA3dQueryResult structure */
+} SVGA3dCmdEndQuery;                  /* SVGA_3D_CMD_END_QUERY */
+
+typedef
+struct {
+   uint32               cid;          /* Same parameters as were passed to END_QUERY */
+   SVGA3dQueryType      type;
+   SVGAGuestPtr         guestResult;
+} SVGA3dCmdWaitForQuery;              /* SVGA_3D_CMD_WAIT_FOR_QUERY */
+
+typedef
+struct {
+   uint32               totalSize;    /* Set by guest before query is ended. */
+   SVGA3dQueryState     state;        /* Set by host or guest. See SVGA3dQueryState. */
+   union {                            /* Set by host on exit from PENDING state */
+      uint32            result32;
+   };
+} SVGA3dQueryResult;
+
+/*
+ * SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN --
+ *
+ *    This is a blit from an SVGA3D surface to a Screen Object. Just
+ *    like GMR-to-screen blits, this blit may be directed at a
+ *    specific screen or to the virtual coordinate space.
+ *
+ *    The blit copies from a rectangular region of an SVGA3D surface
+ *    image to a rectangular region of a screen or screens.
+ *
+ *    This command takes an optional variable-length list of clipping
+ *    rectangles after the body of the command. If no rectangles are
+ *    specified, there is no clipping region. The entire destRect is
+ *    drawn to. If one or more rectangles are included, they describe
+ *    a clipping region. The clip rectangle coordinates are measured
+ *    relative to the top-left corner of destRect.
+ *
+ *    This clipping region serves multiple purposes:
+ *
+ *      - It can be used to perform an irregularly shaped blit more
+ *        efficiently than by issuing many separate blit commands.
+ *
+ *      - It is equivalent to allowing blits with non-integer
+ *        source coordinates. You could blit just one half-pixel
+ *        of a source, for example, by specifying a larger
+ *        destination rectangle than you need, then removing
+ *        part of it using a clip rectangle.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ *
+ * Limitations:
+ *
+ *    - Currently, no backend supports blits from a mipmap or face
+ *      other than the first one.
+ */
+
+typedef
+struct {
+   SVGA3dSurfaceImageId srcImage;
+   SVGASignedRect       srcRect;
+   uint32               destScreenId; /* Screen ID or SVGA_ID_INVALID for virt. coords */
+   SVGASignedRect       destRect;     /* Supports scaling if src/dest different size */
+   /* Clipping: zero or more SVGASignedRects follow */
+} SVGA3dCmdBlitSurfaceToScreen;         /* SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN */
+
+
+/*
+ * Capability query index.
+ *
+ * Notes:
+ *
+ *   1. SVGA3D_DEVCAP_MAX_TEXTURES reflects the maximum number of
+ *      fixed-function texture units available. Each of these units
+ *      works in both FFP and Shader modes, and they support texture
+ *      transforms and texture coordinates. The host may have additional
+ *      texture image units that are only usable with shaders.
+ *
+ *   2. The BUFFER_FORMAT capabilities are deprecated, and they always
+ *      return TRUE. Even on physical hardware that does not support
+ *      these formats natively, the SVGA3D device will provide an emulation
+ *      which should be invisible to the guest OS.
+ *
+ *      In general, the SVGA3D device should support any operation on
+ *      any surface format, it just may perform some of these
+ *      operations in software depending on the capabilities of the
+ *      available physical hardware.
+ *
+ *      XXX: In the future, we will add capabilities that describe in
+ *      detail what formats are supported in hardware for what kinds
+ *      of operations.
+ */
+
+typedef enum {
+   SVGA3D_DEVCAP_3D                                = 0,
+   SVGA3D_DEVCAP_MAX_LIGHTS                        = 1,
+   SVGA3D_DEVCAP_MAX_TEXTURES                      = 2,  /* See note (1) */
+   SVGA3D_DEVCAP_MAX_CLIP_PLANES                   = 3,
+   SVGA3D_DEVCAP_VERTEX_SHADER_VERSION             = 4,
+   SVGA3D_DEVCAP_VERTEX_SHADER                     = 5,
+   SVGA3D_DEVCAP_FRAGMENT_SHADER_VERSION           = 6,
+   SVGA3D_DEVCAP_FRAGMENT_SHADER                   = 7,
+   SVGA3D_DEVCAP_MAX_RENDER_TARGETS                = 8,
+   SVGA3D_DEVCAP_S23E8_TEXTURES                    = 9,
+   SVGA3D_DEVCAP_S10E5_TEXTURES                    = 10,
+   SVGA3D_DEVCAP_MAX_FIXED_VERTEXBLEND             = 11,
+   SVGA3D_DEVCAP_D16_BUFFER_FORMAT                 = 12, /* See note (2) */
+   SVGA3D_DEVCAP_D24S8_BUFFER_FORMAT               = 13, /* See note (2) */
+   SVGA3D_DEVCAP_D24X8_BUFFER_FORMAT               = 14, /* See note (2) */
+   SVGA3D_DEVCAP_QUERY_TYPES                       = 15,
+   SVGA3D_DEVCAP_TEXTURE_GRADIENT_SAMPLING         = 16,
+   SVGA3D_DEVCAP_MAX_POINT_SIZE                    = 17,
+   SVGA3D_DEVCAP_MAX_SHADER_TEXTURES               = 18,
+   SVGA3D_DEVCAP_MAX_TEXTURE_WIDTH                 = 19,
+   SVGA3D_DEVCAP_MAX_TEXTURE_HEIGHT                = 20,
+   SVGA3D_DEVCAP_MAX_VOLUME_EXTENT                 = 21,
+   SVGA3D_DEVCAP_MAX_TEXTURE_REPEAT                = 22,
+   SVGA3D_DEVCAP_MAX_TEXTURE_ASPECT_RATIO          = 23,
+   SVGA3D_DEVCAP_MAX_TEXTURE_ANISOTROPY            = 24,
+   SVGA3D_DEVCAP_MAX_PRIMITIVE_COUNT               = 25,
+   SVGA3D_DEVCAP_MAX_VERTEX_INDEX                  = 26,
+   SVGA3D_DEVCAP_MAX_VERTEX_SHADER_INSTRUCTIONS    = 27,
+   SVGA3D_DEVCAP_MAX_FRAGMENT_SHADER_INSTRUCTIONS  = 28,
+   SVGA3D_DEVCAP_MAX_VERTEX_SHADER_TEMPS           = 29,
+   SVGA3D_DEVCAP_MAX_FRAGMENT_SHADER_TEMPS         = 30,
+   SVGA3D_DEVCAP_TEXTURE_OPS                       = 31,
+   SVGA3D_DEVCAP_SURFACEFMT_X8R8G8B8               = 32,
+   SVGA3D_DEVCAP_SURFACEFMT_A8R8G8B8               = 33,
+   SVGA3D_DEVCAP_SURFACEFMT_A2R10G10B10            = 34,
+   SVGA3D_DEVCAP_SURFACEFMT_X1R5G5B5               = 35,
+   SVGA3D_DEVCAP_SURFACEFMT_A1R5G5B5               = 36,
+   SVGA3D_DEVCAP_SURFACEFMT_A4R4G4B4               = 37,
+   SVGA3D_DEVCAP_SURFACEFMT_R5G6B5                 = 38,
+   SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE16            = 39,
+   SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE8_ALPHA8      = 40,
+   SVGA3D_DEVCAP_SURFACEFMT_ALPHA8                 = 41,
+   SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE8             = 42,
+   SVGA3D_DEVCAP_SURFACEFMT_Z_D16                  = 43,
+   SVGA3D_DEVCAP_SURFACEFMT_Z_D24S8                = 44,
+   SVGA3D_DEVCAP_SURFACEFMT_Z_D24X8                = 45,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT1                   = 46,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT2                   = 47,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT3                   = 48,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT4                   = 49,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT5                   = 50,
+   SVGA3D_DEVCAP_SURFACEFMT_BUMPX8L8V8U8           = 51,
+   SVGA3D_DEVCAP_SURFACEFMT_A2W10V10U10            = 52,
+   SVGA3D_DEVCAP_SURFACEFMT_BUMPU8V8               = 53,
+   SVGA3D_DEVCAP_SURFACEFMT_Q8W8V8U8               = 54,
+   SVGA3D_DEVCAP_SURFACEFMT_CxV8U8                 = 55,
+   SVGA3D_DEVCAP_SURFACEFMT_R_S10E5                = 56,
+   SVGA3D_DEVCAP_SURFACEFMT_R_S23E8                = 57,
+   SVGA3D_DEVCAP_SURFACEFMT_RG_S10E5               = 58,
+   SVGA3D_DEVCAP_SURFACEFMT_RG_S23E8               = 59,
+   SVGA3D_DEVCAP_SURFACEFMT_ARGB_S10E5             = 60,
+   SVGA3D_DEVCAP_SURFACEFMT_ARGB_S23E8             = 61,
+   SVGA3D_DEVCAP_MAX_VERTEX_SHADER_TEXTURES        = 63, /* NOTE: index 62 is not assigned in this enum */
+
+   /*
+    * Note that MAX_SIMULTANEOUS_RENDER_TARGETS is a maximum count of color
+    * render targets.  This does not include the depth or stencil targets.
+    */
+   SVGA3D_DEVCAP_MAX_SIMULTANEOUS_RENDER_TARGETS   = 64,
+
+   SVGA3D_DEVCAP_SURFACEFMT_V16U16                 = 65,
+   SVGA3D_DEVCAP_SURFACEFMT_G16R16                 = 66,
+   SVGA3D_DEVCAP_SURFACEFMT_A16B16G16R16           = 67,
+   SVGA3D_DEVCAP_SURFACEFMT_UYVY                   = 68,
+   SVGA3D_DEVCAP_SURFACEFMT_YUY2                   = 69,
+
+   /*
+    * Don't add new caps into the previous section; the values in this
+    * enumeration must not change. You can put new values right before
+    * SVGA3D_DEVCAP_MAX.
+    */
+   SVGA3D_DEVCAP_MAX                                  /* This must be the last index. */
+} SVGA3dDevCapIndex;
+
+typedef union {   /* NOTE(review): presumably the valid member depends on the SVGA3dDevCapIndex queried - confirm */
+   Bool   b;
+   uint32 u;
+   int32  i;
+   float  f;
+} SVGA3dDevCapResult;
+
+#endif /* _SVGA3D_REG_H_ */
diff --git a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
new file mode 100644
index 0000000000..2078c4a8a4
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
@@ -0,0 +1,519 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga3d_shaderdefs.h --
+ *
+ * SVGA3D byte code format and limit definitions.
+ *
+ * The format of the byte code directly corresponds to that defined
+ * by Microsoft DirectX SDK 9.0c (file d3d9types.h). The format can
+ * also be extended so that different shader formats can be supported
+ * for example GLSL, ARB vp/fp, NV/ATI shader formats, etc.
+ *
+ */
+
+#ifndef __SVGA3D_SHADER_DEFS__
+#define __SVGA3D_SHADER_DEFS__
+
+/* SVGA3D shader hardware limits. */
+
+#define SVGA3D_INPUTREG_MAX            16
+#define SVGA3D_OUTPUTREG_MAX           12
+#define SVGA3D_VERTEX_SAMPLERREG_MAX   4
+#define SVGA3D_PIXEL_SAMPLERREG_MAX    16
+#define SVGA3D_SAMPLERREG_MAX          (SVGA3D_PIXEL_SAMPLERREG_MAX+\
+                                        SVGA3D_VERTEX_SAMPLERREG_MAX)  /* 16 + 4 = 20 */
+#define SVGA3D_TEMPREG_MAX             32
+#define SVGA3D_CONSTREG_MAX            256
+#define SVGA3D_CONSTINTREG_MAX         16
+#define SVGA3D_CONSTBOOLREG_MAX        16
+#define SVGA3D_ADDRREG_MAX             1
+#define SVGA3D_PREDREG_MAX             1
+
+/* SVGA3D byte code specific limits */
+
+#define SVGA3D_MAX_SRC_REGS      4
+#define SVGA3D_MAX_NESTING_LEVEL 32
+
+/* SVGA3D version information: type words that SVGA3D_VS_xx / SVGA3D_PS_xx shift left by 16. */
+
+#define SVGA3D_VS_TYPE  0xFFFE
+#define SVGA3D_PS_TYPE  0xFFFF
+
+typedef struct {
+   union {
+      struct {                /* 8 + 8 + 16 = 32 bits, sharing storage with 'value' */
+         uint32 minor : 8;
+         uint32 major : 8;
+         uint32 type : 16;    /* NOTE(review): lands in the top half only with LSB-first bit-fields */
+      };
+
+      uint32 value;           /* the same version read or written as one word */
+   };
+} SVGA3dShaderVersion;
+/* Type word is made unsigned ("+ 0U") before shifting: 0xFFFE << 16 overflows a 32-bit int. */
+#define SVGA3D_VS_10 (((SVGA3D_VS_TYPE + 0U) << 16) | (1U << 8))
+#define SVGA3D_VS_11 (SVGA3D_VS_10 | 1U)
+#define SVGA3D_VS_20 (((SVGA3D_VS_TYPE + 0U) << 16) | (2U << 8))
+#define SVGA3D_VS_30 (((SVGA3D_VS_TYPE + 0U) << 16) | (3U << 8))
+
+#define SVGA3D_PS_10 (((SVGA3D_PS_TYPE + 0U) << 16) | (1U << 8))
+#define SVGA3D_PS_11 (SVGA3D_PS_10 | 1U)
+#define SVGA3D_PS_12 (SVGA3D_PS_10 | 2U)
+#define SVGA3D_PS_13 (SVGA3D_PS_10 | 3U)
+#define SVGA3D_PS_14 (SVGA3D_PS_10 | 4U)
+#define SVGA3D_PS_20 (((SVGA3D_PS_TYPE + 0U) << 16) | (2U << 8))
+#define SVGA3D_PS_30 (((SVGA3D_PS_TYPE + 0U) << 16) | (3U << 8))
+
+/* The *_ENABLED are for backwards compatibility with old drivers */
+typedef enum {
+   SVGA3DPSVERSION_NONE = 0,
+   SVGA3DPSVERSION_ENABLED = 1,
+   SVGA3DPSVERSION_11 = 3,      /* every versioned value is odd, so it also */
+   SVGA3DPSVERSION_12 = 5,      /* has the SVGA3DPSVERSION_ENABLED bit set  */
+   SVGA3DPSVERSION_13 = 7,
+   SVGA3DPSVERSION_14 = 9,
+   SVGA3DPSVERSION_20 = 11,
+   SVGA3DPSVERSION_30 = 13,
+   SVGA3DPSVERSION_40 = 15,
+   SVGA3DPSVERSION_MAX          /* 16: one past SVGA3DPSVERSION_40 */
+} SVGA3dPixelShaderVersion;
+
+typedef enum {
+   SVGA3DVSVERSION_NONE = 0,
+   SVGA3DVSVERSION_ENABLED = 1,
+   SVGA3DVSVERSION_11 = 3,      /* odd, like every versioned value: ENABLED bit is set */
+   SVGA3DVSVERSION_20 = 5,
+   SVGA3DVSVERSION_30 = 7,
+   SVGA3DVSVERSION_40 = 9,
+   SVGA3DVSVERSION_MAX          /* 10: one past SVGA3DVSVERSION_40 */
+} SVGA3dVertexShaderVersion;
+
+/* SVGA3D instruction op codes. */
+
+typedef enum {
+   SVGA3DOP_NOP          = 0,
+   SVGA3DOP_MOV          = 1,
+   SVGA3DOP_ADD          = 2,
+   SVGA3DOP_SUB          = 3,
+   SVGA3DOP_MAD          = 4,
+   SVGA3DOP_MUL          = 5,
+   SVGA3DOP_RCP          = 6,
+   SVGA3DOP_RSQ          = 7,
+   SVGA3DOP_DP3          = 8,
+   SVGA3DOP_DP4          = 9,
+   SVGA3DOP_MIN          = 10,
+   SVGA3DOP_MAX          = 11,
+   SVGA3DOP_SLT          = 12,
+   SVGA3DOP_SGE          = 13,
+   SVGA3DOP_EXP          = 14,
+   SVGA3DOP_LOG          = 15,
+   SVGA3DOP_LIT          = 16,
+   SVGA3DOP_DST          = 17,
+   SVGA3DOP_LRP          = 18,
+   SVGA3DOP_FRC          = 19,
+   SVGA3DOP_M4x4         = 20,
+   SVGA3DOP_M4x3         = 21,
+   SVGA3DOP_M3x4         = 22,
+   SVGA3DOP_M3x3         = 23,
+   SVGA3DOP_M3x2         = 24,
+   SVGA3DOP_CALL         = 25,
+   SVGA3DOP_CALLNZ       = 26,
+   SVGA3DOP_LOOP         = 27,
+   SVGA3DOP_RET          = 28,
+   SVGA3DOP_ENDLOOP      = 29,
+   SVGA3DOP_LABEL        = 30,
+   SVGA3DOP_DCL          = 31,
+   SVGA3DOP_POW          = 32,
+   SVGA3DOP_CRS          = 33,
+   SVGA3DOP_SGN          = 34,
+   SVGA3DOP_ABS          = 35,
+   SVGA3DOP_NRM          = 36,
+   SVGA3DOP_SINCOS       = 37,
+   SVGA3DOP_REP          = 38,
+   SVGA3DOP_ENDREP       = 39,
+   SVGA3DOP_IF           = 40,
+   SVGA3DOP_IFC          = 41,
+   SVGA3DOP_ELSE         = 42,
+   SVGA3DOP_ENDIF        = 43,
+   SVGA3DOP_BREAK        = 44,
+   SVGA3DOP_BREAKC       = 45,
+   SVGA3DOP_MOVA         = 46,
+   SVGA3DOP_DEFB         = 47,
+   SVGA3DOP_DEFI         = 48,   /* values 49 - 63 are not assigned here */
+   SVGA3DOP_TEXCOORD     = 64,
+   SVGA3DOP_TEXKILL      = 65,
+   SVGA3DOP_TEX          = 66,
+   SVGA3DOP_TEXBEM       = 67,
+   SVGA3DOP_TEXBEML      = 68,
+   SVGA3DOP_TEXREG2AR    = 69,
+   SVGA3DOP_TEXREG2GB    = 70,
+   SVGA3DOP_TEXM3x2PAD   = 71,
+   SVGA3DOP_TEXM3x2TEX   = 72,
+   SVGA3DOP_TEXM3x3PAD   = 73,
+   SVGA3DOP_TEXM3x3TEX   = 74,
+   SVGA3DOP_RESERVED0    = 75,
+   SVGA3DOP_TEXM3x3SPEC  = 76,
+   SVGA3DOP_TEXM3x3VSPEC = 77,
+   SVGA3DOP_EXPP         = 78,
+   SVGA3DOP_LOGP         = 79,
+   SVGA3DOP_CND          = 80,
+   SVGA3DOP_DEF          = 81,
+   SVGA3DOP_TEXREG2RGB   = 82,
+   SVGA3DOP_TEXDP3TEX    = 83,
+   SVGA3DOP_TEXM3x2DEPTH = 84,
+   SVGA3DOP_TEXDP3       = 85,
+   SVGA3DOP_TEXM3x3      = 86,
+   SVGA3DOP_TEXDEPTH     = 87,
+   SVGA3DOP_CMP          = 88,
+   SVGA3DOP_BEM          = 89,
+   SVGA3DOP_DP2ADD       = 90,
+   SVGA3DOP_DSX          = 91,
+   SVGA3DOP_DSY          = 92,
+   SVGA3DOP_TEXLDD       = 93,
+   SVGA3DOP_SETP         = 94,
+   SVGA3DOP_TEXLDL       = 95,
+   SVGA3DOP_BREAKP       = 96,
+   SVGA3DOP_LAST_INST,           /* 97: one past the last numbered opcode */
+   SVGA3DOP_PHASE        = 0xFFFD,
+   SVGA3DOP_COMMENT      = 0xFFFE,
+   SVGA3DOP_END          = 0xFFFF,
+} SVGA3dShaderOpCodeType;
+
+/* SVGA3D operation control/comparison function types */
+
+typedef enum {
+   SVGA3DOPCONT_NONE    = 0,
+   SVGA3DOPCONT_PROJECT = 1,   /* Projective texturing */
+   SVGA3DOPCONT_BIAS    = 2,   /* Texturing with a LOD bias */
+} SVGA3dShaderOpCodeControlFnType;
+
+typedef enum {
+   SVGA3DOPCOMP_RESERVED0 = 0,
+   SVGA3DOPCOMP_GT        = 1,
+   SVGA3DOPCOMP_EQ        = 2,
+   SVGA3DOPCOMP_GE        = 3,   /* numerically GT | EQ */
+   SVGA3DOPCOMP_LT        = 4,
+   SVGA3DOPCOMPC_NE       = 5,   /* numerically GT | LT; the "COMPC" spelling is the public name */
+   SVGA3DOPCOMP_LE        = 6,   /* numerically EQ | LT */
+   SVGA3DOPCOMP_RESERVED1 = 7
+} SVGA3dShaderOpCodeCompFnType;
+
+/* SVGA3D register types */
+
+typedef enum {
+    SVGA3DREG_TEMP        = 0,   /* Temporary register file */
+    SVGA3DREG_INPUT       = 1,   /* Input register file */
+    SVGA3DREG_CONST       = 2,   /* Constant register file */
+    SVGA3DREG_ADDR        = 3,   /* Address register for VS */
+    SVGA3DREG_TEXTURE     = 3,   /* Texture register file for PS (same value as ADDR) */
+    SVGA3DREG_RASTOUT     = 4,   /* Rasterizer register file */
+    SVGA3DREG_ATTROUT     = 5,   /* Attribute output register file */
+    SVGA3DREG_TEXCRDOUT   = 6,   /* Texture coordinate output register file */
+    SVGA3DREG_OUTPUT      = 6,   /* Output register file for VS 3.0+ (same value as TEXCRDOUT) */
+    SVGA3DREG_CONSTINT    = 7,   /* Constant integer vector register file */
+    SVGA3DREG_COLOROUT    = 8,   /* Color output register file */
+    SVGA3DREG_DEPTHOUT    = 9,   /* Depth output register file */
+    SVGA3DREG_SAMPLER     = 10,  /* Sampler state register file */
+    SVGA3DREG_CONST2      = 11,  /* Constant register file 2048 - 4095 */
+    SVGA3DREG_CONST3      = 12,  /* Constant register file 4096 - 6143 */
+    SVGA3DREG_CONST4      = 13,  /* Constant register file 6144 - 8191 */
+    SVGA3DREG_CONSTBOOL   = 14,  /* Constant boolean register file */
+    SVGA3DREG_LOOP        = 15,  /* Loop counter register file */
+    SVGA3DREG_TEMPFLOAT16 = 16,  /* 16-bit float temp register file */
+    SVGA3DREG_MISCTYPE    = 17,  /* Miscellaneous (single) registers */
+    SVGA3DREG_LABEL       = 18,  /* Label */
+    SVGA3DREG_PREDICATE   = 19,  /* Predicate register */
+} SVGA3dShaderRegType;
+
+/* SVGA3D rasterizer output register types */
+
+typedef enum {
+   SVGA3DRASTOUT_POSITION = 0,
+   SVGA3DRASTOUT_FOG      = 1,
+   SVGA3DRASTOUT_PSIZE    = 2
+} SVGA3dShaderRastOutRegType;
+
+/* SVGA3D miscellaneous register types */
+
+typedef enum {
+   SVGA3DMISCREG_POSITION = 0,   /* Input position x,y,z,rhw (PS) */
+   SVGA3DMISCREG_FACE     = 1    /* Floating point primitive area (PS) */
+} SVGA3DShaderMiscRegType;
+
+/* SVGA3D sampler types */
+
+typedef enum {
+   SVGA3DSAMP_UNKNOWN = 0, /* Uninitialized value */
+   SVGA3DSAMP_2D      = 2, /* dcl_2d s# (for declaring a 2-D texture); 1 is not assigned */
+   SVGA3DSAMP_CUBE    = 3, /* dcl_cube s# (for declaring a cube texture) */
+   SVGA3DSAMP_VOLUME  = 4, /* dcl_volume s# (for declaring a volume texture) */
+} SVGA3dShaderSamplerType;
+
+/* SVGA3D sampler format classes */
+
+typedef enum {
+   SVGA3DSAMPFORMAT_ARGB        = 0,  /* ARGB formats */
+   SVGA3DSAMPFORMAT_V8U8        = 1,  /* Sign and normalize (SNORM) V & U */
+   SVGA3DSAMPFORMAT_Q8W8V8U8    = 2,  /* SNORM all */
+   SVGA3DSAMPFORMAT_CxV8U8      = 3,  /* SNORM V & U, C=SQRT(1-U^2-V^2) */
+   SVGA3DSAMPFORMAT_X8L8V8U8    = 4,  /* SNORM V & U */
+   SVGA3DSAMPFORMAT_A2W10V10U10 = 5,  /* SNORM W, V & U */
+   SVGA3DSAMPFORMAT_DXT_PMA     = 6,  /* DXT pre-multiplied alpha */
+   SVGA3DSAMPFORMAT_YUV         = 7,  /* YUV video format */
+   SVGA3DSAMPFORMAT_UYVY        = 8,  /* UYVY video format */
+   SVGA3DSAMPFORMAT_Rx          = 9,  /* R16F/32F */
+   SVGA3DSAMPFORMAT_RxGx        = 10, /* R16FG16F, R32FG32F */
+   SVGA3DSAMPFORMAT_V16U16      = 11, /* SNORM all */
+} SVGA3DShaderSamplerFormatClass;
+
+/* SVGA3D write mask */
+
+#define SVGA3DWRITEMASK_0    1 /* Component 0 (X;Red) */
+#define SVGA3DWRITEMASK_1    2 /* Component 1 (Y;Green) */
+#define SVGA3DWRITEMASK_2    4 /* Component 2 (Z;Blue) */
+#define SVGA3DWRITEMASK_3    8 /* Component 3 (W;Alpha) */
+#define SVGA3DWRITEMASK_ALL 15 /* All components (1 | 2 | 4 | 8) */
+
+/* SVGA3D destination modifiers (0, or the single-bit values 1, 2 and 4) */
+
+#define SVGA3DDSTMOD_NONE              0 /* nop */
+#define SVGA3DDSTMOD_SATURATE          1 /* clamp to [0, 1] */
+#define SVGA3DDSTMOD_PARTIALPRECISION  2 /* Partial precision hint */
+
+/*
+ * Relevant to multisampling only:
+ * When the pixel center is not covered, sample
+ * attribute or compute gradients/LOD
+ * using multisample "centroid" location.
+ * "Centroid" is some location within the covered
+ * region of the pixel.
+ */
+
+#define SVGA3DDSTMOD_MSAMPCENTROID     4 /* presumably OR-able with 1 and 2 - confirm */
+
+/* SVGA3D source swizzle */
+
+#define SVGA3DSWIZZLE_REPLICATEX 0x00 /* four 2-bit selectors, all 0 */
+#define SVGA3DSWIZZLE_REPLICATEY 0x55 /* all four selectors are 1 */
+#define SVGA3DSWIZZLE_REPLICATEZ 0xAA /* all four selectors are 2 */
+#define SVGA3DSWIZZLE_REPLICATEW 0xFF /* all four selectors are 3 */
+#define SVGA3DSWIZZLE_NONE       0xE4 /* selectors 0,1,2,3 reading from the low bits up */
+#define SVGA3DSWIZZLE_YZXW       0xC9 /* selectors 1,2,0,3 */
+#define SVGA3DSWIZZLE_ZXYW       0xD2 /* selectors 2,0,1,3 */
+#define SVGA3DSWIZZLE_WXYZ       0x1B /* selectors 3,2,1,0 - NOTE(review): that reads W,Z,Y,X; confirm name vs value */
+
+/* SVGA3D source modifiers */
+
+typedef enum {
+    SVGA3DSRCMOD_NONE    = 0,  /* nop */
+    SVGA3DSRCMOD_NEG     = 1,  /* negate */
+    SVGA3DSRCMOD_BIAS    = 2,  /* bias */
+    SVGA3DSRCMOD_BIASNEG = 3,  /* bias and negate */
+    SVGA3DSRCMOD_SIGN    = 4,  /* sign */
+    SVGA3DSRCMOD_SIGNNEG = 5,  /* sign and negate */
+    SVGA3DSRCMOD_COMP    = 6,  /* complement */
+    SVGA3DSRCMOD_X2      = 7,  /* x2 */
+    SVGA3DSRCMOD_X2NEG   = 8,  /* x2 and negate */
+    SVGA3DSRCMOD_DZ      = 9,  /* divide through by z component */
+    SVGA3DSRCMOD_DW      = 10, /* divide through by w component */
+    SVGA3DSRCMOD_ABS     = 11, /* abs() */
+    SVGA3DSRCMOD_ABSNEG  = 12, /* -abs() */
+    SVGA3DSRCMOD_NOT     = 13, /* ! (for predicate register) */
+} SVGA3dShaderSrcModType;
+
+/* SVGA3D instruction token */
+
+typedef struct {
+   union {
+      struct {                      /* comment_* view: 16 + 16 = 32 bits */
+         uint32 comment_op : 16;
+         uint32 comment_size : 16;
+      };
+
+      struct {                      /* instruction view: 16+3+5+4+1+1+1+1 = 32 bits */
+         uint32 op : 16;
+         uint32 control : 3;
+         uint32 reserved2 : 5;
+         uint32 size : 4;
+         uint32 predicated : 1;
+         uint32 reserved1 : 1;
+         uint32 coissue : 1;
+         uint32 reserved0 : 1;
+      };
+
+      uint32 value;                 /* raw token; shares storage with both views above */
+   };
+} SVGA3dShaderInstToken;
+
+/* SVGA3D destination parameter token */
+
+typedef struct {
+   union {
+      struct {                   /* 11+2+1+2+4+4+4+3+1 = 32 bits */
+         uint32 num : 11;
+         uint32 type_upper : 2;  /* register type is split: this part ... */
+         uint32 relAddr : 1;
+         uint32 reserved1 : 2;
+         uint32 mask : 4;
+         uint32 dstMod : 4;
+         uint32 shfScale : 4;
+         uint32 type_lower : 3;  /* ... and this one, in a separate field */
+         uint32 reserved0 : 1;
+      };
+
+      uint32 value;              /* raw token; shares storage with the fields above */
+   };
+} SVGA3dShaderDestToken;
+
+/* SVGA3D source parameter token */
+
+typedef struct {
+   union {
+      struct {                   /* 11+2+1+2+8+4+3+1 = 32 bits */
+         uint32 num : 11;
+         uint32 type_upper : 2;  /* combined with type_lower by SVGA3dShaderGetRegType() */
+         uint32 relAddr : 1;
+         uint32 reserved1 : 2;
+         uint32 swizzle : 8;
+         uint32 srcMod : 4;
+         uint32 type_lower : 3;
+         uint32 reserved0 : 1;
+      };
+
+      uint32 value;              /* raw token; shares storage with the fields above */
+   };
+} SVGA3dShaderSrcToken;
+
+/* SVGA3DOP_DCL parameter tokens */
+
+typedef struct {
+   union {
+      struct {
+         union {
+            struct {
+               uint32 usage : 5;
+               uint32 reserved1 : 11;
+               uint32 index : 4;
+               uint32 reserved0 : 12;
+            }; /* input / output declaration: 5 + 11 + 4 + 12 = 32 bits */
+
+            struct {
+               uint32 reserved3 : 27;
+               uint32 type : 4;
+               uint32 reserved2 : 1;
+            }; /* sampler declaration: 27 + 4 + 1 = 32 bits */
+         };
+
+         SVGA3dShaderDestToken dst;
+      };
+
+      uint32 values[2];   /* raw view: the declaration word, then dst */
+   };
+} SVGA3DOpDclArgs;
+
+/* SVGA3DOP_DEF parameter tokens */
+
+typedef struct {
+   union {
+      struct {
+         SVGA3dShaderDestToken dst;
+
+         union {                   /* three views of the same constant storage */
+            float constValues[4];
+            int constIValues[4];
+            Bool constBValue;
+         };
+      };
+
+      uint32 values[5];   /* raw view: dst plus four words, assuming 32-bit float and int */
+   };
+} SVGA3DOpDefArgs;
+
+/* SVGA3D shader token */
+
+typedef union {                  /* one token, readable through any of four views */
+   uint32 value;
+   SVGA3dShaderInstToken inst;
+   SVGA3dShaderDestToken dest;
+   SVGA3dShaderSrcToken src;
+} SVGA3dShaderToken;
+
+/* SVGA3D shader program */
+
+typedef struct {
+   SVGA3dShaderVersion version;
+   /* SVGA3dShaderToken stream (not declared as a member; presumably follows in memory - confirm) */
+} SVGA3dShaderProgram;
+
+/* SVGA3D version specific register assignments */
+
+static const uint32 SVGA3D_INPUT_REG_POSITION_VS11 = 0;   /* static: each including file gets its own copy */
+static const uint32 SVGA3D_INPUT_REG_PSIZE_VS11 = 1;
+static const uint32 SVGA3D_INPUT_REG_FOG_VS11 = 3;
+static const uint32 SVGA3D_INPUT_REG_FOG_MASK_VS11 = SVGA3DWRITEMASK_3;   /* 8: component 3 (W;Alpha) */
+static const uint32 SVGA3D_INPUT_REG_COLOR_BASE_VS11 = 2;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_BASE_VS11 = 4;
+
+static const uint32 SVGA3D_INPUT_REG_COLOR_BASE_PS11 = 0;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_BASE_PS11 = 2;
+static const uint32 SVGA3D_OUTPUT_REG_DEPTH_PS11 = 0;
+static const uint32 SVGA3D_OUTPUT_REG_COLOR_PS11 = 1;
+
+static const uint32 SVGA3D_INPUT_REG_COLOR_BASE_PS20 = 0;
+static const uint32 SVGA3D_INPUT_REG_COLOR_NUM_PS20 = 2;      /* BASE 0 + NUM 2 ends where TEXCOORD_BASE starts */
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_BASE_PS20 = 2;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_NUM_PS20 = 8;
+static const uint32 SVGA3D_OUTPUT_REG_COLOR_BASE_PS20 = 1;
+static const uint32 SVGA3D_OUTPUT_REG_COLOR_NUM_PS20 = 4;
+static const uint32 SVGA3D_OUTPUT_REG_DEPTH_BASE_PS20 = 0;
+static const uint32 SVGA3D_OUTPUT_REG_DEPTH_NUM_PS20 = 1;
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3dShaderGetRegType --
+ *
+ *      As the register type is split into two non sequential fields,
+ *      this function provides a useful way of accessing the actual
+ *      register type without having to manually concatenate the
+ *      type_upper and type_lower fields.
+ *
+ * Results:
+ *      Returns the register type.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE SVGA3dShaderRegType
+SVGA3dShaderGetRegType(uint32 token)
+{
+   SVGA3dShaderSrcToken decoded;   /* used only to reach the two type fields */
+   decoded.value = token;
+   return (SVGA3dShaderRegType)((decoded.type_upper << 3) | decoded.type_lower);
+}
+
+#endif /* __SVGA3D_SHADER_DEFS__ */
diff --git a/src/gallium/drivers/svga/include/svga_escape.h b/src/gallium/drivers/svga/include/svga_escape.h
new file mode 100644
index 0000000000..7b85e9b8c8
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_escape.h
@@ -0,0 +1,89 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_escape.h --
+ *
+ *    Definitions for our own (vendor-specific) SVGA Escape commands.
+ */
+
+#ifndef _SVGA_ESCAPE_H_
+#define _SVGA_ESCAPE_H_
+
+
+/*
+ * Namespace IDs for the escape command
+ */
+
+#define SVGA_ESCAPE_NSID_VMWARE 0x00000000
+#define SVGA_ESCAPE_NSID_DEVEL  0xFFFFFFFF
+
+
+/*
+ * Within SVGA_ESCAPE_NSID_VMWARE, we multiplex commands according to
+ * the first DWORD of escape data (after the nsID and size). As a
+ * guideline we're using the high word and low word as a major and
+ * minor command number, respectively.
+ *
+ * Major command number allocation:
+ *
+ *   0000: Reserved
+ *   0001: SVGA_ESCAPE_VMWARE_LOG (svga_binary_logger.h)
+ *   0002: SVGA_ESCAPE_VMWARE_VIDEO (svga_overlay.h)
+ *   0003: SVGA_ESCAPE_VMWARE_HINT (svga_escape.h)
+ */
+
+#define SVGA_ESCAPE_VMWARE_MAJOR_MASK  0xFFFF0000  /* keeps the high word, i.e. the major number */
+
+
+/*
+ * SVGA Hint commands.
+ *
+ * These escapes let the SVGA driver provide optional information to
+ * the host about the state of the guest or guest applications. The
+ * host can use these hints to make user interface or performance
+ * decisions.
+ *
+ * Notes:
+ *
+ *   - SVGA_ESCAPE_VMWARE_HINT_FULLSCREEN is deprecated for guests
+ *     that use the SVGA Screen Object extension. Instead of sending
+ *     this escape, use the SVGA_SCREEN_FULLSCREEN_HINT flag on your
+ *     Screen Object.
+ */
+
+#define SVGA_ESCAPE_VMWARE_HINT               0x00030000  // major 0003, minor 0000
+#define SVGA_ESCAPE_VMWARE_HINT_FULLSCREEN    0x00030001  // Deprecated (major 0003, minor 0001)
+
+typedef
+struct {
+   uint32 command;        /* presumably SVGA_ESCAPE_VMWARE_HINT_FULLSCREEN - confirm */
+   uint32 fullscreen;
+   struct {
+      int32 x, y;         /* signed, unlike the other members */
+   } monitorPosition;
+} SVGAEscapeHintFullscreen;
+
+#endif /* _SVGA_ESCAPE_H_ */
diff --git a/src/gallium/drivers/svga/include/svga_overlay.h b/src/gallium/drivers/svga/include/svga_overlay.h
new file mode 100644
index 0000000000..82c1d3ff3e
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_overlay.h
@@ -0,0 +1,201 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_overlay.h --
+ *
+ *    Definitions for video-overlay support.
+ */
+
+#ifndef _SVGA_OVERLAY_H_
+#define _SVGA_OVERLAY_H_
+
+#include "svga_reg.h"
+
+/*
+ * Video formats we support
+ */
+
+#define VMWARE_FOURCC_YV12 0x32315659 // 'Y' 'V' '1' '2' (first character in the low byte)
+#define VMWARE_FOURCC_YUY2 0x32595559 // 'Y' 'U' 'Y' '2' (first character in the low byte)
+#define VMWARE_FOURCC_UYVY 0x59565955 // 'U' 'Y' 'V' 'Y' (first character in the low byte)
+
+typedef enum {
+   SVGA_OVERLAY_FORMAT_INVALID = 0,                   /* the only value that is not a FOURCC code */
+   SVGA_OVERLAY_FORMAT_YV12 = VMWARE_FOURCC_YV12,
+   SVGA_OVERLAY_FORMAT_YUY2 = VMWARE_FOURCC_YUY2,
+   SVGA_OVERLAY_FORMAT_UYVY = VMWARE_FOURCC_UYVY,
+} SVGAOverlayFormat;
+
+#define SVGA_VIDEO_COLORKEY_MASK             0x00ffffff  /* low 24 bits */
+
+#define SVGA_ESCAPE_VMWARE_VIDEO             0x00020000  /* high word 0002, low word 0000 */
+
+#define SVGA_ESCAPE_VMWARE_VIDEO_SET_REGS    0x00020001
+        /* FIFO escape layout:
+         * Type, Stream Id, (Register Id, Value) pairs */
+
+#define SVGA_ESCAPE_VMWARE_VIDEO_FLUSH       0x00020002
+        /* FIFO escape layout:
+         * Type, Stream Id */
+
+typedef
+struct SVGAEscapeVideoSetRegs {
+   struct {
+      uint32 cmdType;
+      uint32 streamId;
+   } header;
+
+   // May include zero or more items; sizeof() still counts exactly one.
+   struct {
+      uint32 registerId;
+      uint32 value;
+   } items[1];
+} SVGAEscapeVideoSetRegs;
+
+typedef
+struct SVGAEscapeVideoFlush {   /* same two members as SVGAEscapeVideoSetRegs.header */
+   uint32 cmdType;
+   uint32 streamId;
+} SVGAEscapeVideoFlush;
+
+
+/*
+ * Struct definitions for the video overlay commands built on
+ * SVGAFifoCmdEscape.
+ */
+typedef
+struct {                 /* first member of each SVGAFifoEscapeCmdVideo* struct below */
+   uint32 command;
+   uint32 overlay;
+} SVGAFifoEscapeCmdVideoBase;
+
+typedef
+struct {
+   SVGAFifoEscapeCmdVideoBase videoCmd;   /* the base alone; no further payload */
+} SVGAFifoEscapeCmdVideoFlush;
+
+typedef
+struct {
+   SVGAFifoEscapeCmdVideoBase videoCmd;
+   struct {
+      uint32 regId;
+      uint32 value;
+   } items[1];            /* declared with one entry; presumably variable-length in use - confirm */
+} SVGAFifoEscapeCmdVideoSetRegs;
+
+typedef
+struct {
+   SVGAFifoEscapeCmdVideoBase videoCmd;
+   struct {
+      uint32 regId;
+      uint32 value;
+   } items[SVGA_VIDEO_NUM_REGS];   /* fixed length; the constant is presumably from svga_reg.h */
+} SVGAFifoEscapeCmdVideoSetAllRegs;
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * VMwareVideoGetAttributes --
+ *
+ *      Computes the size, pitches and offsets for YUV frames.
+ *
+ * Results:
+ *      TRUE on success; FALSE if 'format' is not YV12, YUY2 or UYVY.
+ *
+ * Side effects:
+ *      '*width' is rounded up to even ('*height' too for YV12); '*size' and
+ *      the optional arrays get the layout. offsets[0] is set even on FALSE.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE Bool
+VMwareVideoGetAttributes(const SVGAOverlayFormat format,    // IN
+                         uint32 *width,                     // IN / OUT
+                         uint32 *height,                    // IN / OUT
+                         uint32 *size,                      // OUT
+                         uint32 *pitches,                   // OUT (optional)
+                         uint32 *offsets)                   // OUT (optional)
+{
+    uint32 tmp;   // half-width pitch, then one half-size plane; unsigned like *size
+
+    *width = (*width + 1) & ~1;           // round up to even, for every format
+
+    if (offsets) {
+        offsets[0] = 0;
+    }
+
+    switch (format) {
+    case VMWARE_FOURCC_YV12:
+       *height = (*height + 1) & ~1;
+       *size = (*width + 3) & ~3;         // first-plane pitch, multiple of 4
+
+       if (pitches) {
+          pitches[0] = *size;
+       }
+
+       *size *= *height;                  // size of the first plane
+
+       if (offsets) {
+          offsets[1] = *size;             // second plane starts where the first ends
+       }
+
+       tmp = ((*width >> 1) + 3) & ~3;    // half-width pitch, multiple of 4
+
+       if (pitches) {
+          pitches[1] = pitches[2] = tmp;
+       }
+
+       tmp *= (*height >> 1);             // size of one half-width, half-height plane
+       *size += tmp;
+
+       if (offsets) {
+          offsets[2] = *size;             // third plane starts after the second
+       }
+
+       *size += tmp;
+       break;
+
+    case VMWARE_FOURCC_YUY2:
+    case VMWARE_FOURCC_UYVY:
+       *size = *width * 2;                // single plane; pitch is twice the width
+
+       if (pitches) {
+          pitches[0] = *size;
+       }
+
+       *size *= *height;
+       break;
+
+    default:
+       return FALSE;                      // *width and offsets[0] were already written
+    }
+
+    return TRUE;
+}
+
+#endif // _SVGA_OVERLAY_H_
diff --git a/src/gallium/drivers/svga/include/svga_reg.h b/src/gallium/drivers/svga/include/svga_reg.h
new file mode 100644
index 0000000000..1b96c2ec07
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_reg.h
@@ -0,0 +1,1346 @@
+/**********************************************************
+ * Copyright 1998-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_reg.h --
+ *
+ *    Virtual hardware definitions for the VMware SVGA II device.
+ */
+
+#ifndef _SVGA_REG_H_
+#define _SVGA_REG_H_
+
+/*
+ * PCI device IDs.
+ */
+#define PCI_VENDOR_ID_VMWARE            0x15AD   /* PCI vendor ID */
+#define PCI_DEVICE_ID_VMWARE_SVGA2      0x0405   /* PCI device ID of the SVGA II device */
+
+/*
+ * Legal values for the SVGA_REG_CURSOR_ON register in old-fashioned
+ * cursor bypass mode. This is still supported, but no new guest
+ * drivers should use it.
+ */
+#define SVGA_CURSOR_ON_HIDE            0x0   /* Must be 0 to maintain backward compatibility */
+#define SVGA_CURSOR_ON_SHOW            0x1   /* Must be 1 to maintain backward compatibility */
+#define SVGA_CURSOR_ON_REMOVE_FROM_FB  0x2   /* Remove the cursor from the framebuffer because we need to see what's under it */
+#define SVGA_CURSOR_ON_RESTORE_TO_FB   0x3   /* Put the cursor back in the framebuffer so the user can see it */
+
+/*
+ * The maximum framebuffer size that can be traced, e.g. for guests in VESA mode.
+ * The changeMap in the monitor is proportional to this number. Therefore, we'd
+ * like to keep it as small as possible to reduce monitor overhead (using
+ * SVGA_VRAM_MAX_SIZE for this increases the size of the shared area by over
+ * 4k!).
+ *
+ * NB: For compatibility reasons, this value must be greater than 0xff0000.
+ *     See bug 335072.
+ */
+#define SVGA_FB_MAX_TRACEABLE_SIZE      0x1000000
+
+#define SVGA_MAX_PSEUDOCOLOR_DEPTH      8
+#define SVGA_MAX_PSEUDOCOLORS           (1 << SVGA_MAX_PSEUDOCOLOR_DEPTH)   /* 256 */
+#define SVGA_NUM_PALETTE_REGS           (3 * SVGA_MAX_PSEUDOCOLORS)         /* 768 registers from SVGA_PALETTE_BASE */
+
+#define SVGA_MAGIC         0x900000UL
+#define SVGA_MAKE_ID(ver)  (SVGA_MAGIC << 8 | (ver))   /* Magic shifted left by 8, OR'ed with the version number */
+
+/* Version 2 let the address of the frame buffer be unsigned on Win32 */
+#define SVGA_VERSION_2     2
+#define SVGA_ID_2          SVGA_MAKE_ID(SVGA_VERSION_2)
+
+/* Version 1 has new registers starting with SVGA_REG_CAPABILITIES so
+   PALETTE_BASE has moved */
+#define SVGA_VERSION_1     1
+#define SVGA_ID_1          SVGA_MAKE_ID(SVGA_VERSION_1)
+
+/* Version 0 is the initial version */
+#define SVGA_VERSION_0     0
+#define SVGA_ID_0          SVGA_MAKE_ID(SVGA_VERSION_0)
+
+/* "Invalid" value for all SVGA IDs. (Version ID, screen object ID, surface ID...) */
+#define SVGA_ID_INVALID    0xFFFFFFFF
+
+/* Port offsets, relative to BAR0 */
+#define SVGA_INDEX_PORT         0x0
+#define SVGA_VALUE_PORT         0x1
+#define SVGA_BIOS_PORT          0x2
+#define SVGA_IRQSTATUS_PORT     0x8   /* Uses the SVGA_IRQFLAG_* bits below */
+
+/*
+ * Interrupt source flags for IRQSTATUS_PORT and IRQMASK.
+ *
+ * Interrupts are only supported when the
+ * SVGA_CAP_IRQMASK capability is present.
+ */
+#define SVGA_IRQFLAG_ANY_FENCE            0x1    /* Any fence was passed */
+#define SVGA_IRQFLAG_FIFO_PROGRESS        0x2    /* Made forward progress in the FIFO */
+#define SVGA_IRQFLAG_FENCE_GOAL           0x4    /* SVGA_FIFO_FENCE_GOAL reached */
+
+/*
+ * Registers
+ */
+
+enum {
+   SVGA_REG_ID = 0,
+   SVGA_REG_ENABLE = 1,
+   SVGA_REG_WIDTH = 2,
+   SVGA_REG_HEIGHT = 3,
+   SVGA_REG_MAX_WIDTH = 4,
+   SVGA_REG_MAX_HEIGHT = 5,
+   SVGA_REG_DEPTH = 6,
+   SVGA_REG_BITS_PER_PIXEL = 7,       /* Current bpp in the guest */
+   SVGA_REG_PSEUDOCOLOR = 8,
+   SVGA_REG_RED_MASK = 9,
+   SVGA_REG_GREEN_MASK = 10,
+   SVGA_REG_BLUE_MASK = 11,
+   SVGA_REG_BYTES_PER_LINE = 12,
+   SVGA_REG_FB_START = 13,            /* (Deprecated) */
+   SVGA_REG_FB_OFFSET = 14,           /* The GFB is at BAR1 + this offset */
+   SVGA_REG_VRAM_SIZE = 15,
+   SVGA_REG_FB_SIZE = 16,
+
+   /* ID 0 implementation only had the above registers, then the palette */
+
+   SVGA_REG_CAPABILITIES = 17,        /* SVGA_CAP_* bits */
+   SVGA_REG_MEM_START = 18,           /* (Deprecated) */
+   SVGA_REG_MEM_SIZE = 19,
+   SVGA_REG_CONFIG_DONE = 20,         /* Set when memory area configured */
+   SVGA_REG_SYNC = 21,                /* See "FIFO Synchronization Registers" */
+   SVGA_REG_BUSY = 22,                /* See "FIFO Synchronization Registers" */
+   SVGA_REG_GUEST_ID = 23,            /* Set guest OS identifier */
+   SVGA_REG_CURSOR_ID = 24,           /* (Deprecated) */
+   SVGA_REG_CURSOR_X = 25,            /* (Deprecated) */
+   SVGA_REG_CURSOR_Y = 26,            /* (Deprecated) */
+   SVGA_REG_CURSOR_ON = 27,           /* (Deprecated) */
+   SVGA_REG_HOST_BITS_PER_PIXEL = 28, /* (Deprecated) */
+   SVGA_REG_SCRATCH_SIZE = 29,        /* Number of scratch registers */
+   SVGA_REG_MEM_REGS = 30,            /* Number of FIFO registers */
+   SVGA_REG_NUM_DISPLAYS = 31,        /* (Deprecated) */
+   SVGA_REG_PITCHLOCK = 32,           /* Fixed pitch for all modes */
+   SVGA_REG_IRQMASK = 33,             /* Interrupt mask */
+
+   /* Legacy multi-monitor support */
+   SVGA_REG_NUM_GUEST_DISPLAYS = 34,/* Number of guest displays in X/Y direction */
+   SVGA_REG_DISPLAY_ID = 35,        /* Display ID for the following display attributes */
+   SVGA_REG_DISPLAY_IS_PRIMARY = 36,/* Whether this is a primary display */
+   SVGA_REG_DISPLAY_POSITION_X = 37,/* The display position x */
+   SVGA_REG_DISPLAY_POSITION_Y = 38,/* The display position y */
+   SVGA_REG_DISPLAY_WIDTH = 39,     /* The display's width */
+   SVGA_REG_DISPLAY_HEIGHT = 40,    /* The display's height */
+
+   /* See "Guest memory regions" below. */
+   SVGA_REG_GMR_ID = 41,                     /* Read/write: ID of the GMR to create, delete, or redefine */
+   SVGA_REG_GMR_DESCRIPTOR = 42,             /* Write-only: PPN of the descriptor; zero deletes the GMR */
+   SVGA_REG_GMR_MAX_IDS = 43,                /* Read-only: number of supported GMR IDs */
+   SVGA_REG_GMR_MAX_DESCRIPTOR_LENGTH = 44,  /* Read-only: limit on descriptors read per GMR */
+
+   SVGA_REG_TRACES = 45,            /* Enable trace-based updates even when FIFO is on */
+   SVGA_REG_TOP = 46,               /* Must be 1 more than the last register */
+
+   SVGA_PALETTE_BASE = 1024,        /* Base of SVGA color map */
+   /* Next 768 (== 256*3) registers exist for colormap */
+
+   SVGA_SCRATCH_BASE = SVGA_PALETTE_BASE + SVGA_NUM_PALETTE_REGS
+                                    /* Base of scratch registers */
+   /* Next reg[SVGA_REG_SCRATCH_SIZE] registers exist for scratch usage:
+      First 4 are reserved for VESA BIOS Extension; any remaining are for
+      the use of the current SVGA driver. */
+};
+
+
+/*
+ * Guest memory regions (GMRs):
+ *
+ * This is a new memory mapping feature available in SVGA devices
+ * which have the SVGA_CAP_GMR bit set. Previously, there were two
+ * fixed memory regions available with which to share data between the
+ * device and the driver: the FIFO ('MEM') and the framebuffer. GMRs
+ * are our name for an extensible way of providing arbitrary DMA
+ * buffers for use between the driver and the SVGA device. They are a
+ * new alternative to framebuffer memory, usable for both 2D and 3D
+ * graphics operations.
+ *
+ * Since GMR mapping must be done synchronously with guest CPU
+ * execution, we use a new pair of SVGA registers:
+ *
+ *   SVGA_REG_GMR_ID --
+ *
+ *     Read/write.
+ *     This register holds the 32-bit ID (a small positive integer)
+ *     of a GMR to create, delete, or redefine. Writing this register
+ *     has no side-effects.
+ *
+ *   SVGA_REG_GMR_DESCRIPTOR --
+ *
+ *     Write-only.
+ *     Writing this register will create, delete, or redefine the GMR
+ *     specified by the above ID register. If this register is zero,
+ *     the GMR is deleted. Any pointers into this GMR (including those
+ *     currently being processed by FIFO commands) will be
+ *     synchronously invalidated.
+ *
+ *     If this register is nonzero, it must be the physical page
+ *     number (PPN) of a data structure which describes the physical
+ *     layout of the memory region this GMR should describe. The
+ *     descriptor structure will be read synchronously by the SVGA
+ *     device when this register is written. The descriptor need not
+ *     remain allocated for the lifetime of the GMR.
+ *
+ *     The guest driver should write SVGA_REG_GMR_ID first, then
+ *     SVGA_REG_GMR_DESCRIPTOR.
+ *
+ *   SVGA_REG_GMR_MAX_IDS --
+ *
+ *     Read-only.
+ *     The SVGA device may choose to support a maximum number of
+ *     user-defined GMR IDs. This register holds the number of supported
+ *     IDs. (The maximum supported ID plus 1)
+ *
+ *   SVGA_REG_GMR_MAX_DESCRIPTOR_LENGTH --
+ *
+ *     Read-only.
+ *     The SVGA device may choose to put a limit on the total number
+ *     of SVGAGuestMemDescriptor structures it will read when defining
+ *     a single GMR.
+ *
+ * The descriptor structure is an array of SVGAGuestMemDescriptor
+ * structures. Each structure may do one of three things:
+ *
+ *   - Terminate the GMR descriptor list.
+ *     (ppn==0, numPages==0)
+ *
+ *   - Add a PPN or range of PPNs to the GMR's virtual address space.
+ *     (ppn != 0, numPages != 0)
+ *
+ *   - Provide the PPN of the next SVGAGuestMemDescriptor, in order to
+ *     support multi-page GMR descriptor tables without forcing the
+ *     driver to allocate physically contiguous memory.
+ *     (ppn != 0, numPages == 0)
+ *
+ * Note that each physical page of SVGAGuestMemDescriptor structures
+ * can describe at least 2MB of guest memory. If the driver needs to
+ * use more than one page of descriptor structures, it must use one of
+ * its SVGAGuestMemDescriptors to point to an additional page.  The
+ * device will never automatically cross a page boundary.
+ *
+ * Once the driver has described a GMR, it is immediately available
+ * for use via any FIFO command that uses an SVGAGuestPtr structure.
+ * These pointers include a GMR identifier plus an offset into that
+ * GMR.
+ *
+ * The driver must check the SVGA_CAP_GMR bit before using the GMR
+ * registers.
+ */
+
+/*
+ * Special GMR IDs, allowing SVGAGuestPtrs to point to framebuffer
+ * memory as well.  In the future, these IDs could even be used to
+ * allow legacy memory regions to be redefined by the guest as GMRs.
+ *
+ * Using the guest framebuffer (GFB) at BAR1 for general purpose DMA
+ * is being phased out. Please try to use user-defined GMRs whenever
+ * possible.
+ */
+#define SVGA_GMR_NULL         ((uint32) -1)
+#define SVGA_GMR_FRAMEBUFFER  ((uint32) -2)  // Guest Framebuffer (GFB)
+
+typedef
+struct SVGAGuestMemDescriptor {
+   uint32 ppn;        // Physical page number; 0 together with numPages == 0 ends the list
+   uint32 numPages;   // Pages in the range; 0 with ppn != 0 means ppn is the next descriptor page
+} SVGAGuestMemDescriptor;
+
+typedef
+struct SVGAGuestPtr {
+   uint32 gmrId;    // GMR identifier, or one of the special SVGA_GMR_* IDs
+   uint32 offset;   // Offset into that GMR
+} SVGAGuestPtr;
+
+
+/*
+ * SVGAGMRImageFormat --
+ *
+ *    This is a packed representation of the source 2D image format
+ *    for a GMR-to-screen blit. Currently it is defined as an encoding
+ *    of the screen's color depth and bits-per-pixel, however, 16 bits
+ *    are reserved for future use to identify other encodings (such as
+ *    RGBA or higher-precision images).
+ *
+ *    Currently supported formats:
+ *
+ *       bpp depth  Format Name
+ *       --- -----  -----------
+ *        32    24  32-bit BGRX
+ *        24    24  24-bit BGR
+ *        16    16  RGB 5-6-5
+ *        16    15  RGB 5-5-5
+ *
+ */
+
+typedef
+struct SVGAGMRImageFormat {
+   union {
+      struct {
+         uint32 bitsPerPixel : 8;   // "bpp" column of the format table above
+         uint32 colorDepth   : 8;   // "depth" column of the format table above
+         uint32 reserved     : 16;  // Must be zero
+      };
+
+      uint32 value;                 // The same 32 bits accessed as a single word
+   };
+} SVGAGMRImageFormat;
+
+/*
+ * SVGAColorBGRX --
+ *
+ *    A 24-bit color format (BGRX), which does not depend on the
+ *    format of the legacy guest framebuffer (GFB) or the current
+ *    GMRFB state.
+ */
+
+typedef
+struct SVGAColorBGRX {
+   union {
+      struct {
+         uint32 b : 8;  // Blue
+         uint32 g : 8;  // Green
+         uint32 r : 8;  // Red
+         uint32 x : 8;  // Unused
+      };
+
+      uint32 value;     // The same 32 bits accessed as a single word
+   };
+} SVGAColorBGRX;
+
+
+/*
+ * SVGASignedRect --
+ * SVGASignedPoint --
+ *
+ *    Signed rectangle and point primitives. These are used by the new
+ *    2D primitives for drawing to Screen Objects, which can occupy a
+ *    signed virtual coordinate space.
+ *
+ *    SVGASignedRect specifies a half-open interval: the (left, top)
+ *    pixel is part of the rectangle, but the (right, bottom) pixel is
+ *    not.
+ */
+
+typedef
+struct SVGASignedRect {
+   int32  left;     // Inclusive
+   int32  top;      // Inclusive
+   int32  right;    // Exclusive
+   int32  bottom;   // Exclusive
+} SVGASignedRect;
+
+typedef
+struct SVGASignedPoint {
+   int32  x;   // Signed coordinate; may be negative
+   int32  y;   // Signed coordinate; may be negative
+} SVGASignedPoint;
+
+
+/*
+ *  Capabilities
+ *
+ *  Note the holes in the bitfield. Missing bits have been deprecated,
+ *  and must not be reused. Those capabilities will never be reported
+ *  by new versions of the SVGA device.
+ */
+
+#define SVGA_CAP_NONE               0x00000000
+#define SVGA_CAP_RECT_COPY          0x00000002
+#define SVGA_CAP_CURSOR             0x00000020
+#define SVGA_CAP_CURSOR_BYPASS      0x00000040   // Legacy (Use Cursor Bypass 3 instead)
+#define SVGA_CAP_CURSOR_BYPASS_2    0x00000080   // Legacy (Use Cursor Bypass 3 instead)
+#define SVGA_CAP_8BIT_EMULATION     0x00000100
+#define SVGA_CAP_ALPHA_CURSOR       0x00000200
+#define SVGA_CAP_3D                 0x00004000
+#define SVGA_CAP_EXTENDED_FIFO      0x00008000   // Extended FIFO registers (block 2) exist
+#define SVGA_CAP_MULTIMON           0x00010000   // Legacy multi-monitor support
+#define SVGA_CAP_PITCHLOCK          0x00020000
+#define SVGA_CAP_IRQMASK            0x00040000   // Interrupts are supported
+#define SVGA_CAP_DISPLAY_TOPOLOGY   0x00080000   // Legacy multi-monitor support
+#define SVGA_CAP_GMR                0x00100000   // Must be set before using the GMR registers
+#define SVGA_CAP_TRACES             0x00200000
+
+
+/*
+ * FIFO register indices.
+ *
+ * The FIFO is a chunk of device memory mapped into guest physmem.  It
+ * is always treated as 32-bit words.
+ *
+ * The guest driver gets to decide how to partition it between
+ * - FIFO registers (there are always at least 4, specifying where the
+ *   following data area is and how much data it contains; there may be
+ *   more registers following these, depending on the FIFO protocol
+ *   version in use)
+ * - FIFO data, written by the guest and slurped out by the VMX.
+ * These indices are 32-bit word offsets into the FIFO.
+ */
+
+enum {
+   /*
+    * Block 1 (basic registers): The originally defined FIFO registers.
+    * These exist and are valid for all versions of the FIFO protocol.
+    */
+
+   SVGA_FIFO_MIN = 0,
+   SVGA_FIFO_MAX,       /* The distance from MIN to MAX must be at least 10K */
+   SVGA_FIFO_NEXT_CMD,
+   SVGA_FIFO_STOP,
+
+   /*
+    * Block 2 (extended registers): Mandatory registers for the extended
+    * FIFO.  These exist if the SVGA caps register includes
+    * SVGA_CAP_EXTENDED_FIFO; some of them are valid only if their
+    * associated capability bit is enabled.
+    *
+    * Note that when originally defined, SVGA_CAP_EXTENDED_FIFO implied
+    * support only for (FIFO registers) CAPABILITIES, FLAGS, and FENCE.
+    * This means that the guest has to test individually (in most cases
+    * using FIFO caps) for the presence of registers after this; the VMX
+    * can define "extended FIFO" to mean whatever it wants, and currently
+    * won't enable it unless there's room for that set and much more.
+    */
+
+   SVGA_FIFO_CAPABILITIES = 4,   // SVGA_FIFO_CAP_* bits
+   SVGA_FIFO_FLAGS,              // SVGA_FIFO_FLAG_* bits
+   // Valid with SVGA_FIFO_CAP_FENCE:
+   SVGA_FIFO_FENCE,
+
+   /*
+    * Block 3a (optional extended registers): Additional registers for the
+    * extended FIFO, whose presence isn't actually implied by
+    * SVGA_CAP_EXTENDED_FIFO; these exist if SVGA_FIFO_MIN is high enough to
+    * leave room for them.
+    *
+    * The VMX currently considers the registers in block 3a mandatory for
+    * the extended FIFO.
+    */
+
+   // Valid if exists (i.e. if extended FIFO enabled):
+   SVGA_FIFO_3D_HWVERSION,       /* See SVGA3dHardwareVersion in svga3d_reg.h */
+   // Valid with SVGA_FIFO_CAP_PITCHLOCK:
+   SVGA_FIFO_PITCHLOCK,
+
+   // Valid with SVGA_FIFO_CAP_CURSOR_BYPASS_3:
+   SVGA_FIFO_CURSOR_ON,          /* Cursor bypass 3 show/hide register */
+   SVGA_FIFO_CURSOR_X,           /* Cursor bypass 3 x register */
+   SVGA_FIFO_CURSOR_Y,           /* Cursor bypass 3 y register */
+   SVGA_FIFO_CURSOR_COUNT,       /* Incremented when any of the other 3 change */
+   SVGA_FIFO_CURSOR_LAST_UPDATED,/* Last time the host updated the cursor */
+
+   // Valid with SVGA_FIFO_CAP_RESERVE:
+   SVGA_FIFO_RESERVED,           /* Bytes past NEXT_CMD with real contents */
+
+   /*
+    * Valid with SVGA_FIFO_CAP_SCREEN_OBJECT:
+    *
+    * By default this is SVGA_ID_INVALID, to indicate that the cursor
+    * coordinates are specified relative to the virtual root. If this
+    * is set to a specific screen ID, cursor position is reinterpreted
+    * as a signed offset relative to that screen's origin. This is the
+    * only way to place the cursor on a non-rooted screen.
+    */
+   SVGA_FIFO_CURSOR_SCREEN_ID,
+
+   /*
+    * XXX: The gap here, up until SVGA_FIFO_3D_CAPS, can be used for new
+    * registers, but this must be done carefully and with judicious use of
+    * capability bits, since comparisons based on SVGA_FIFO_MIN aren't
+    * enough to tell you whether the register exists: we've shipped drivers
+    * and products that used SVGA_FIFO_3D_CAPS but didn't know about some of
+    * the earlier ones.  The actual order of introduction was:
+    * - PITCHLOCK
+    * - 3D_CAPS
+    * - CURSOR_* (cursor bypass 3)
+    * - RESERVED
+    * So, code that wants to know whether it can use any of the
+    * aforementioned registers, or anything else added after PITCHLOCK and
+    * before 3D_CAPS, needs to reason about something other than
+    * SVGA_FIFO_MIN.
+    */
+
+   /*
+    * 3D caps block space; valid with 3D hardware version >=
+    * SVGA3D_HWVERSION_WS6_B1.
+    */
+   SVGA_FIFO_3D_CAPS      = 32,
+   SVGA_FIFO_3D_CAPS_LAST = 32 + 255,   // The caps block spans 256 words (32..287)
+
+   /*
+    * End of VMX's current definition of "extended-FIFO registers".
+    * Registers before here are always enabled/disabled as a block; either
+    * the extended FIFO is enabled and includes all preceding registers, or
+    * it's disabled entirely.
+    *
+    * Block 3b (truly optional extended registers): Additional registers for
+    * the extended FIFO, which the VMX already knows how to enable and
+    * disable with correct granularity.
+    *
+    * Registers after here exist if and only if the guest SVGA driver
+    * sets SVGA_FIFO_MIN high enough to leave room for them.
+    */
+
+   // Valid if register exists:
+   SVGA_FIFO_GUEST_3D_HWVERSION, /* Guest driver's 3D version */
+   SVGA_FIFO_FENCE_GOAL,         /* Matching target for SVGA_IRQFLAG_FENCE_GOAL */
+   SVGA_FIFO_BUSY,               /* See "FIFO Synchronization Registers" */
+
+   /*
+    * Always keep this last.  This defines the maximum number of
+    * registers we know about.  At power-on, this value is placed in
+    * the SVGA_REG_MEM_REGS register, and we expect the guest driver
+    * to allocate this much space in FIFO memory for registers.
+    */
+    SVGA_FIFO_NUM_REGS
+};
+
+
+/*
+ * Definition of registers included in extended FIFO support.
+ *
+ * The guest SVGA driver gets to allocate the FIFO between registers
+ * and data.  It must always allocate at least 4 registers, but old
+ * drivers stopped there.
+ *
+ * The VMX will enable extended FIFO support if and only if the guest
+ * left enough room for all registers defined as part of the mandatory
+ * set for the extended FIFO.
+ *
+ * Note that the guest drivers typically allocate the FIFO only at
+ * initialization time, not at mode switches, so it's likely that the
+ * number of FIFO registers won't change without a reboot.
+ *
+ * All registers less than this value are guaranteed to be present if
+ * svgaUser->fifo.extended is set. Any later registers must be tested
+ * individually for compatibility at each use (in the VMX).
+ *
+ * This value is used only by the VMX, so it can change without
+ * affecting driver compatibility; keep it that way?
+ */
+#define SVGA_FIFO_EXTENDED_MANDATORY_REGS  (SVGA_FIFO_3D_CAPS_LAST + 1)
+
+
+/*
+ * FIFO Synchronization Registers
+ *
+ *  This explains the relationship between the various FIFO
+ *  sync-related registers in IOSpace and in FIFO space.
+ *
+ *  SVGA_REG_SYNC --
+ *
+ *       The SYNC register can be used in two different ways by the guest:
+ *
+ *         1. If the guest wishes to fully sync (drain) the FIFO,
+ *            it will write once to SYNC then poll on the BUSY
+ *            register. The FIFO is sync'ed once BUSY is zero.
+ *
+ *         2. If the guest wants to asynchronously wake up the host,
+ *            it will write once to SYNC without polling on BUSY.
+ *            Ideally it will do this after some new commands have
+ *            been placed in the FIFO, and after reading a zero
+ *            from SVGA_FIFO_BUSY.
+ *
+ *       (1) is the original behaviour that SYNC was designed to
+ *       support.  Originally, a write to SYNC would implicitly
+ *       trigger a read from BUSY. This causes us to synchronously
+ *       process the FIFO.
+ *
+ *       This behaviour has since been changed so that writing SYNC
+ *       will *not* implicitly cause a read from BUSY. Instead, it
+ *       makes a channel call which asynchronously wakes up the MKS
+ *       thread.
+ *
+ *       New guests can use this new behaviour to implement (2)
+ *       efficiently. This lets guests get the host's attention
+ *       without waiting for the MKS to poll, which gives us much
+ *       better CPU utilization on SMP hosts and on UP hosts while
+ *       we're blocked on the host GPU.
+ *
+ *       Old guests shouldn't notice the behaviour change. SYNC was
+ *       never guaranteed to process the entire FIFO, since it was
+ *       bounded to a particular number of CPU cycles. Old guests will
+ *       still loop on the BUSY register until the FIFO is empty.
+ *
+ *       Writing to SYNC currently has the following side-effects:
+ *
+ *         - Sets SVGA_REG_BUSY to TRUE (in the monitor)
+ *         - Asynchronously wakes up the MKS thread for FIFO processing
+ *         - The value written to SYNC is recorded as a "reason", for
+ *           stats purposes.
+ *
+ *       If SVGA_FIFO_BUSY is available, drivers are advised to only
+ *       write to SYNC if SVGA_FIFO_BUSY is FALSE. Drivers should set
+ *       SVGA_FIFO_BUSY to TRUE after writing to SYNC. The MKS will
+ *       eventually set SVGA_FIFO_BUSY on its own, but this approach
+ *       lets the driver avoid sending multiple asynchronous wakeup
+ *       messages to the MKS thread.
+ *
+ *  SVGA_REG_BUSY --
+ *
+ *       This register is set to TRUE when SVGA_REG_SYNC is written,
+ *       and it reads as FALSE when the FIFO has been completely
+ *       drained.
+ *
+ *       Every read from this register causes us to synchronously
+ *       process FIFO commands. There is no guarantee as to how many
+ *       commands each read will process.
+ *
+ *       CPU time spent processing FIFO commands will be billed to
+ *       the guest.
+ *
+ *       New drivers should avoid using this register unless they
+ *       need to guarantee that the FIFO is completely drained. It
+ *       is overkill for performing a sync-to-fence. Older drivers
+ *       will use this register for any type of synchronization.
+ *
+ *  SVGA_FIFO_BUSY --
+ *
+ *       This register is a fast way for the guest driver to check
+ *       whether the FIFO is already being processed. It reads and
+ *       writes at normal RAM speeds, with no monitor intervention.
+ *
+ *       If this register reads as TRUE, the host is guaranteeing that
+ *       any new commands written into the FIFO will be noticed before
+ *       the MKS goes back to sleep.
+ *
+ *       If this register reads as FALSE, no such guarantee can be
+ *       made.
+ *
+ *       The guest should use this register to quickly determine
+ *       whether or not it needs to wake up the host. If the guest
+ *       just wrote a command or group of commands that it would like
+ *       the host to begin processing, it should:
+ *
+ *         1. Read SVGA_FIFO_BUSY. If it reads as TRUE, no further
+ *            action is necessary.
+ *
+ *         2. Write TRUE to SVGA_FIFO_BUSY. This informs future guest
+ *            code that we've already sent a SYNC to the host and we
+ *            don't need to send a duplicate.
+ *
+ *         3. Write a reason to SVGA_REG_SYNC. This will send an
+ *            asynchronous wakeup to the MKS thread.
+ */
+
+
+/*
+ * FIFO Capabilities
+ *
+ *      Fence -- Fence register and command are supported
+ *      Accel Front -- Front buffer only commands are supported
+ *      Pitch Lock -- Pitch lock register is supported
+ *      Video -- SVGA Video overlay units are supported
+ *      Escape -- Escape command is supported
+ *
+ * XXX: Add longer descriptions for each capability, including a list
+ *      of the new features that each capability provides.
+ *
+ * SVGA_FIFO_CAP_SCREEN_OBJECT --
+ *
+ *    Provides dynamic multi-screen rendering, for improved Unity and
+ *    multi-monitor modes. With Screen Object, the guest can
+ *    dynamically create and destroy 'screens', which can represent
+ *    Unity windows or virtual monitors. Screen Object also provides
+ *    strong guarantees that DMA operations happen only when
+ *    guest-initiated. Screen Object deprecates the BAR1 guest
+ *    framebuffer (GFB) and all commands that work only with the GFB.
+ *
+ *    New registers:
+ *       FIFO_CURSOR_SCREEN_ID, VIDEO_DATA_GMRID, VIDEO_DST_SCREEN_ID
+ *
+ *    New 2D commands:
+ *       DEFINE_SCREEN, DESTROY_SCREEN, DEFINE_GMRFB, BLIT_GMRFB_TO_SCREEN,
+ *       BLIT_SCREEN_TO_GMRFB, ANNOTATION_FILL, ANNOTATION_COPY
+ *
+ *    New 3D commands:
+ *       BLIT_SURFACE_TO_SCREEN
+ *
+ *    New guarantees:
+ *
+ *       - The host will not read or write guest memory, including the GFB,
+ *         except when explicitly initiated by a DMA command.
+ *
+ *       - All DMA, including legacy DMA like UPDATE and PRESENT_READBACK,
+ *         is guaranteed to complete before any subsequent FENCEs.
+ *
+ *       - All legacy commands which affect a Screen (UPDATE, PRESENT,
+ *         PRESENT_READBACK) as well as new Screen blit commands will
+ *         all behave consistently as blits, and memory will be read
+ *         or written in FIFO order.
+ *
+ *         For example, if you PRESENT from one SVGA3D surface to multiple
+ *         places on the screen, the data copied will always be from the
+ *         SVGA3D surface at the time the PRESENT was issued in the FIFO.
+ *         This was not necessarily true on devices without Screen Object.
+ *
+ *         This means that on devices that support Screen Object, the
+ *         PRESENT_READBACK command should not be necessary unless you
+ *         actually want to read back the results of 3D rendering into
+ *         system memory. (And for that, the BLIT_SCREEN_TO_GMRFB
+ *         command provides a strict superset of functionality.)
+ *
+ *       - When a screen is resized, either using Screen Object commands or
+ *         legacy multimon registers, its contents are preserved.
+ */
+
+#define SVGA_FIFO_CAP_NONE                  0
+#define SVGA_FIFO_CAP_FENCE             (1<<0)   // SVGA_FIFO_FENCE register and fence command
+#define SVGA_FIFO_CAP_ACCELFRONT        (1<<1)   // Front buffer only commands
+#define SVGA_FIFO_CAP_PITCHLOCK         (1<<2)   // SVGA_FIFO_PITCHLOCK register
+#define SVGA_FIFO_CAP_VIDEO             (1<<3)   // SVGA Video overlay units
+#define SVGA_FIFO_CAP_CURSOR_BYPASS_3   (1<<4)   // SVGA_FIFO_CURSOR_ON .. SVGA_FIFO_CURSOR_LAST_UPDATED
+#define SVGA_FIFO_CAP_ESCAPE            (1<<5)   // Escape command
+#define SVGA_FIFO_CAP_RESERVE           (1<<6)   // SVGA_FIFO_RESERVED register
+#define SVGA_FIFO_CAP_SCREEN_OBJECT     (1<<7)   // Screen Object; described in the comment above
+
+
+/*
+ * FIFO Flags
+ *
+ *      Accel Front -- Driver should use front buffer only commands
+ */
+
+#define SVGA_FIFO_FLAG_NONE                 0
+#define SVGA_FIFO_FLAG_ACCELFRONT       (1<<0)  // Driver should use front buffer only commands
+#define SVGA_FIFO_FLAG_RESERVED        (1u<<31) // Internal use only; unsigned because 1<<31 overflows a signed int
+
+/*
+ * FIFO reservation sentinel value
+ */
+
+#define SVGA_FIFO_RESERVED_UNKNOWN      0xffffffff
+
+
+/*
+ * Video overlay support
+ */
+
+#define SVGA_NUM_OVERLAY_UNITS 32
+
+
+/*
+ * Video capabilities that the guest is currently using
+ */
+
+#define SVGA_VIDEO_FLAG_COLORKEY        0x0001
+
+
+/*
+ * Offsets for the video overlay registers
+ */
+
+enum {
+   SVGA_VIDEO_ENABLED = 0,
+   SVGA_VIDEO_FLAGS,         // SVGA_VIDEO_FLAG_* bits
+   SVGA_VIDEO_DATA_OFFSET,
+   SVGA_VIDEO_FORMAT,
+   SVGA_VIDEO_COLORKEY,
+   SVGA_VIDEO_SIZE,          // Deprecated
+   SVGA_VIDEO_WIDTH,
+   SVGA_VIDEO_HEIGHT,
+   SVGA_VIDEO_SRC_X,
+   SVGA_VIDEO_SRC_Y,
+   SVGA_VIDEO_SRC_WIDTH,
+   SVGA_VIDEO_SRC_HEIGHT,
+   SVGA_VIDEO_DST_X,         // Signed int32
+   SVGA_VIDEO_DST_Y,         // Signed int32
+   SVGA_VIDEO_DST_WIDTH,
+   SVGA_VIDEO_DST_HEIGHT,
+   SVGA_VIDEO_PITCH_1,
+   SVGA_VIDEO_PITCH_2,
+   SVGA_VIDEO_PITCH_3,
+   SVGA_VIDEO_DATA_GMRID,    // Optional, defaults to SVGA_GMR_FRAMEBUFFER
+   SVGA_VIDEO_DST_SCREEN_ID, // Optional, defaults to virtual coords (SVGA_ID_INVALID)
+   SVGA_VIDEO_NUM_REGS       // Count of the registers above (21)
+};
+
+
+/*
+ * SVGA Overlay Units
+ *
+ *      width and height relate to the entire source video frame.
+ *      srcX, srcY, srcWidth and srcHeight represent the subset of the source
+ *      video frame to be displayed.
+ */
+
+typedef struct SVGAOverlayUnit {
+   uint32 enabled;
+   uint32 flags;
+   uint32 dataOffset;
+   uint32 format;
+   uint32 colorKey;
+   uint32 size;          // Same position as SVGA_VIDEO_SIZE, which is marked deprecated
+   uint32 width;         // Width of the entire source video frame
+   uint32 height;        // Height of the entire source video frame
+   uint32 srcX;          // srcX/srcY/srcWidth/srcHeight: displayed subset of the source frame
+   uint32 srcY;
+   uint32 srcWidth;
+   uint32 srcHeight;
+   int32  dstX;          // Signed
+   int32  dstY;          // Signed
+   uint32 dstWidth;
+   uint32 dstHeight;
+   uint32 pitches[3];    // Same positions as SVGA_VIDEO_PITCH_1 .. SVGA_VIDEO_PITCH_3
+   uint32 dataGMRId;
+   uint32 dstScreenId;
+} SVGAOverlayUnit;
+
+
+/*
+ * SVGAScreenObject --
+ *
+ *    This is a new way to represent a guest's multi-monitor screen or
+ *    Unity window. Screen objects are only supported if the
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT capability bit is set.
+ *
+ *    If Screen Objects are supported, they can be used to fully
+ *    replace the functionality provided by the framebuffer registers
+ *    (SVGA_REG_WIDTH, HEIGHT, etc.) and by SVGA_CAP_DISPLAY_TOPOLOGY.
+ *
+ *    The screen object is a struct with guaranteed binary
+ *    compatibility. New flags can be added, and the struct may grow,
+ *    but existing fields must retain their meaning.
+ *
+ */
+
+#define SVGA_SCREEN_HAS_ROOT    (1 << 0)  // Screen is present in the virtual coord space
+#define SVGA_SCREEN_IS_PRIMARY  (1 << 1)  // Guest considers this screen to be 'primary'
+#define SVGA_SCREEN_FULLSCREEN_HINT (1 << 2)   // Guest is running a fullscreen app here
+
+typedef
+struct SVGAScreenObject {
+   uint32 structSize;   // sizeof(SVGAScreenObject)
+   uint32 id;           // Screen ID
+   uint32 flags;        // SVGA_SCREEN_* flags
+   struct {
+      uint32 width;
+      uint32 height;
+   } size;
+   struct {
+      int32 x;
+      int32 y;
+   } root;              // Only used if SVGA_SCREEN_HAS_ROOT is set.
+} SVGAScreenObject;
+
+
+/*
+ *  Commands in the command FIFO:
+ *
+ *  Command IDs defined below are used for the traditional 2D FIFO
+ *  communication (not all commands are available for all versions of the
+ *  SVGA FIFO protocol).
+ *
+ *  Note the holes in the command ID numbers: These commands have been
+ *  deprecated, and the old IDs must not be reused.
+ *
+ *  Command IDs from 1000 to 1999 are reserved for use by the SVGA3D
+ *  protocol.
+ *
+ *  Each command's parameters are described by the comments and
+ *  structs below.
+ */
+
+typedef enum {
+   SVGA_CMD_INVALID_CMD           = 0,
+   SVGA_CMD_UPDATE                = 1,
+   SVGA_CMD_RECT_COPY             = 3,   // Gaps in the numbering are deprecated IDs;
+   SVGA_CMD_DEFINE_CURSOR         = 19,  // they must never be reassigned.
+   SVGA_CMD_DEFINE_ALPHA_CURSOR   = 22,
+   SVGA_CMD_UPDATE_VERBOSE        = 25,
+   SVGA_CMD_FRONT_ROP_FILL        = 29,
+   SVGA_CMD_FENCE                 = 30,
+   SVGA_CMD_ESCAPE                = 33,
+   SVGA_CMD_DEFINE_SCREEN         = 34,
+   SVGA_CMD_DESTROY_SCREEN        = 35,
+   SVGA_CMD_DEFINE_GMRFB          = 36,
+   SVGA_CMD_BLIT_GMRFB_TO_SCREEN  = 37,
+   SVGA_CMD_BLIT_SCREEN_TO_GMRFB  = 38,
+   SVGA_CMD_ANNOTATION_FILL       = 39,
+   SVGA_CMD_ANNOTATION_COPY       = 40,
+   SVGA_CMD_MAX                          // Implicitly 41: one past the last ID above
+} SVGAFifoCmdId;
+
+#define SVGA_CMD_MAX_ARGS           64
+
+
+/*
+ * SVGA_CMD_UPDATE --
+ *
+ *    This is a DMA transfer which copies from the Guest Framebuffer
+ *    (GFB) at BAR1 + SVGA_REG_FB_OFFSET to any screens which
+ *    intersect with the provided virtual rectangle.
+ *
+ *    This command does not support using arbitrary guest memory as a
+ *    data source- it only works with the pre-defined GFB memory.
+ *    This command also does not support signed virtual coordinates.
+ *    If you have defined screens (using SVGA_CMD_DEFINE_SCREEN) with
+ *    negative root x/y coordinates, the negative portion of those
+ *    screens will not be reachable by this command.
+ *
+ *    This command is not necessary when using framebuffer
+ *    traces. Traces are automatically enabled if the SVGA FIFO is
+ *    disabled, and you may explicitly enable/disable traces using
+ *    SVGA_REG_TRACES. With traces enabled, any write to the GFB will
+ *    automatically act as if a subsequent SVGA_CMD_UPDATE was issued.
+ *
+ *    Traces and SVGA_CMD_UPDATE are the only supported ways to render
+ *    pseudocolor screen updates. The newer Screen Object commands
+ *    only support true color formats.
+ *
+ * Availability:
+ *    Always available.
+ */
+
+typedef
+struct {
+   uint32 x;        // Virtual rectangle to update from the GFB. Coordinates
+   uint32 y;        // are unsigned, so negative screen roots are unreachable.
+   uint32 width;
+   uint32 height;
+} SVGAFifoCmdUpdate;
+
+
+/*
+ * SVGA_CMD_RECT_COPY --
+ *
+ *    Perform a rectangular DMA transfer from one area of the GFB to
+ *    another, and copy the result to any screens which intersect it.
+ *
+ * Availability:
+ *    SVGA_CAP_RECT_COPY
+ */
+
+typedef
+struct {
+   uint32 srcX;     // Origin of the source area within the GFB
+   uint32 srcY;
+   uint32 destX;    // Origin of the destination area within the GFB
+   uint32 destY;
+   uint32 width;    // One extent, shared by source and destination
+   uint32 height;
+} SVGAFifoCmdRectCopy;
+
+
+/*
+ * SVGA_CMD_DEFINE_CURSOR --
+ *
+ *    Provide a new cursor image, as an AND/XOR mask.
+ *
+ *    The recommended way to position the cursor overlay is by using
+ *    the SVGA_FIFO_CURSOR_* registers, supported by the
+ *    SVGA_FIFO_CAP_CURSOR_BYPASS_3 capability.
+ *
+ * Availability:
+ *    SVGA_CAP_CURSOR
+ */
+
+typedef
+struct {
+   uint32 id;             // Reserved, must be zero.
+   uint32 hotspotX;
+   uint32 hotspotY;
+   uint32 width;          // NOTE(review): presumably the cursor image size -- confirm
+   uint32 height;
+   uint32 andMaskDepth;   // Value must be 1 or equal to BITS_PER_PIXEL
+   uint32 xorMaskDepth;   // Value must be 1 or equal to BITS_PER_PIXEL
+   /*
+    * Followed by scanline data for AND mask, then XOR mask.
+    * Each scanline is padded to a 32-bit boundary.
+    */
+} SVGAFifoCmdDefineCursor;
+
+
+/*
+ * SVGA_CMD_DEFINE_ALPHA_CURSOR --
+ *
+ *    Provide a new cursor image, in 32-bit BGRA format.
+ *
+ *    The recommended way to position the cursor overlay is by using
+ *    the SVGA_FIFO_CURSOR_* registers, supported by the
+ *    SVGA_FIFO_CAP_CURSOR_BYPASS_3 capability.
+ *
+ * Availability:
+ *    SVGA_CAP_ALPHA_CURSOR
+ */
+
+typedef
+struct {
+   uint32 id;             // Reserved, must be zero.
+   uint32 hotspotX;
+   uint32 hotspotY;
+   uint32 width;          // NOTE(review): presumably the cursor image size -- confirm
+   uint32 height;
+   /* Followed by scanline data (the 32-bit BGRA cursor image) */
+} SVGAFifoCmdDefineAlphaCursor;
+
+
+/*
+ * SVGA_CMD_UPDATE_VERBOSE --
+ *
+ *    Just like SVGA_CMD_UPDATE, but also provide a per-rectangle
+ *    'reason' value, an opaque cookie which is used by internal
+ *    debugging tools. Third party drivers should not use this
+ *    command.
+ *
+ * Availability:
+ *    SVGA_CAP_EXTENDED_FIFO
+ */
+
+typedef
+struct {
+   uint32 x;        // Same rectangle fields as SVGAFifoCmdUpdate
+   uint32 y;
+   uint32 width;
+   uint32 height;
+   uint32 reason;   // Opaque per-rectangle cookie for internal debugging tools
+} SVGAFifoCmdUpdateVerbose;
+
+
+/*
+ * SVGA_CMD_FRONT_ROP_FILL --
+ *
+ *    This is a hint which tells the SVGA device that the driver has
+ *    just filled a rectangular region of the GFB with a solid
+ *    color. Instead of reading these pixels from the GFB, the device
+ *    can assume that they all equal 'color'. This is primarily used
+ *    for remote desktop protocols.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_ACCELFRONT
+ */
+
+#define  SVGA_ROP_COPY                    0x03
+
+typedef
+struct {
+   uint32 color;     // In the same format as the GFB
+   uint32 x;         // Rectangular GFB region the driver has just filled
+   uint32 y;
+   uint32 width;
+   uint32 height;
+   uint32 rop;       // Must be SVGA_ROP_COPY
+} SVGAFifoCmdFrontRopFill;
+
+
+/*
+ * SVGA_CMD_FENCE --
+ *
+ *    Insert a synchronization fence.  When the SVGA device reaches
+ *    this command, it will copy the 'fence' value into the
+ *    SVGA_FIFO_FENCE register. It will also compare the fence against
+ *    SVGA_FIFO_FENCE_GOAL. If the fence matches the goal and the
+ *    SVGA_IRQFLAG_FENCE_GOAL interrupt is enabled, the device will
+ *    raise this interrupt.
+ *
+ * Availability:
+ *    SVGA_FIFO_FENCE for this command,
+ *    SVGA_CAP_IRQMASK for SVGA_FIFO_FENCE_GOAL.
+ */
+
+typedef
+struct {
+   uint32 fence;   // Copied to SVGA_FIFO_FENCE and compared with SVGA_FIFO_FENCE_GOAL
+} SVGAFifoCmdFence;
+
+
+/*
+ * SVGA_CMD_ESCAPE --
+ *
+ *    Send an extended or vendor-specific variable length command.
+ *    This is used for video overlay, third party plugins, and
+ *    internal debugging tools. See svga_escape.h
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_ESCAPE
+ */
+
+typedef
+struct {
+   uint32 nsid;   // NOTE(review): presumably a namespace ID, see svga_escape.h -- confirm
+   uint32 size;   // Byte count of the payload that follows this header
+   /* followed by 'size' bytes of data */
+} SVGAFifoCmdEscape;
+
+
+/*
+ * SVGA_CMD_DEFINE_SCREEN --
+ *
+ *    Define or redefine an SVGAScreenObject. See the description of
+ *    SVGAScreenObject above.  The video driver is responsible for
+ *    generating new screen IDs. They should be small non-negative
+ *    integers. The virtual device will have an implementation
+ *    specific upper limit on the number of screen IDs
+ *    supported. Drivers are responsible for recycling IDs. The first
+ *    valid ID is zero.
+ *
+ *    - Interaction with other registers:
+ *
+ *    For backwards compatibility, when the GFB mode registers (WIDTH,
+ *    HEIGHT, PITCHLOCK, BITS_PER_PIXEL) are modified, the SVGA device
+ *    deletes all screens other than screen #0, and redefines screen
+ *    #0 according to the specified mode. Drivers that use
+ *    SVGA_CMD_DEFINE_SCREEN should destroy or redefine screen #0.
+ *
+ *    If you use screen objects, do not use the legacy multi-mon
+ *    registers (SVGA_REG_NUM_GUEST_DISPLAYS, SVGA_REG_DISPLAY_*).
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {                      // Body of SVGA_CMD_DEFINE_SCREEN
+   SVGAScreenObject screen;   // Variable-length according to version
+} SVGAFifoCmdDefineScreen;
+
+
+/*
+ * SVGA_CMD_DESTROY_SCREEN --
+ *
+ *    Destroy an SVGAScreenObject. Its ID is immediately available for
+ *    re-use.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   uint32 screenId;   // Screen to destroy; the ID is immediately reusable
+} SVGAFifoCmdDestroyScreen;
+
+
+/*
+ * SVGA_CMD_DEFINE_GMRFB --
+ *
+ *    This command sets a piece of SVGA device state called the
+ *    Guest Memory Region Framebuffer, or GMRFB. The GMRFB is a
+ *    piece of light-weight state which identifies the location and
+ *    format of an image in guest memory or in BAR1. The GMRFB has
+ *    an arbitrary size, and it doesn't need to match the geometry
+ *    of the GFB or any screen object.
+ *
+ *    The GMRFB can be redefined as often as you like. You could
+ *    always use the same GMRFB, you could redefine it before
+ *    rendering from a different guest screen, or you could even
+ *    redefine it before every blit.
+ *
+ *    There are multiple ways to use this command. The simplest way is
+ *    to use it to move the framebuffer either to elsewhere in the GFB
+ *    (BAR1) memory region, or to a user-defined GMR. This lets a
+ *    driver use a framebuffer allocated entirely out of normal system
+ *    memory, which we encourage.
+ *
+ *    Another way to use this command is to set up a ring buffer of
+ *    updates in GFB memory. If a driver wants to ensure that no
+ *    frames are skipped by the SVGA device, it is important that the
+ *    driver not modify the source data for a blit until the device is
+ *    done processing the command. One efficient way to accomplish
+ *    this is to use a ring of small DMA buffers. Each buffer is used
+ *    for one blit, then we move on to the next buffer in the
+ *    ring. The FENCE mechanism is used to protect each buffer from
+ *    re-use until the device is finished with that buffer's
+ *    corresponding blit.
+ *
+ *    This command does not affect the meaning of SVGA_CMD_UPDATE.
+ *    UPDATEs always occur from the legacy GFB memory area. This
+ *    command has no support for pseudocolor GMRFBs. Currently only
+ *    true-color 15, 16, and 24-bit depths are supported. Future
+ *    devices may expose capabilities for additional framebuffer
+ *    formats.
+ *
+ *    The default GMRFB value is undefined. Drivers must always send
+ *    this command at least once before performing any blit from the
+ *    GMRFB.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGAGuestPtr        ptr;            // Image location: guest memory or BAR1
+   uint32              bytesPerLine;   // NOTE(review): presumably the image pitch -- confirm
+   SVGAGMRImageFormat  format;         // True-color only; pseudocolor is unsupported
+} SVGAFifoCmdDefineGMRFB;
+
+
+/*
+ * SVGA_CMD_BLIT_GMRFB_TO_SCREEN --
+ *
+ *    This is a guest-to-host blit. It performs a DMA operation to
+ *    copy a rectangular region of pixels from the current GMRFB to
+ *    one or more Screen Objects.
+ *
+ *    The destination coordinate may be specified relative to a
+ *    screen's origin (if a screen ID is specified) or relative to the
+ *    virtual coordinate system's origin (if the screen ID is
+ *    SVGA_ID_INVALID). The actual destination may span zero or more
+ *    screens, in the case of a virtual destination rect or a rect
+ *    which extends off the edge of the specified screen.
+ *
+ *    This command writes to the screen's "base layer": the underlying
+ *    framebuffer which exists below any cursor or video overlays. No
+ *    action is necessary to explicitly hide or update any overlays
+ *    which exist on top of the updated region.
+ *
+ *    The SVGA device is guaranteed to finish reading from the GMRFB
+ *    by the time any subsequent FENCE commands are reached.
+ *
+ *    This command consumes an annotation. See the
+ *    SVGA_CMD_ANNOTATION_* commands for details.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGASignedPoint  srcOrigin;      // Origin of the source region in the current GMRFB
+   SVGASignedRect   destRect;       // Relative to the screen, or to the virtual origin
+   uint32           destScreenId;   // Screen ID, or SVGA_ID_INVALID for virtual coords
+} SVGAFifoCmdBlitGMRFBToScreen;
+
+
+/*
+ * SVGA_CMD_BLIT_SCREEN_TO_GMRFB --
+ *
+ *    This is a host-to-guest blit. It performs a DMA operation to
+ *    copy a rectangular region of pixels from a single Screen Object
+ *    back to the current GMRFB.
+ *
+ *    Usage note: This command should be used rarely. It will
+ *    typically be inefficient, but it is necessary for some types of
+ *    synchronization between 3D (GPU) and 2D (CPU) rendering into
+ *    overlapping areas of a screen.
+ *
+ *    The source coordinate is specified relative to a screen's
+ *    origin. The provided screen ID must be valid. If any parameters
+ *    are invalid, the resulting pixel values are undefined.
+ *
+ *    This command reads the screen's "base layer". Overlays like
+ *    video and cursor are not included, but any data which was sent
+ *    using a blit-to-screen primitive will be available, no matter
+ *    whether the data's original source was the GMRFB or the 3D
+ *    acceleration hardware.
+ *
+ *    Note that our guest-to-host blits and host-to-guest blits aren't
+ *    symmetric in their current implementation. While the parameters
+ *    are identical, host-to-guest blits are a lot less featureful.
+ *    They do not support clipping: If the source parameters don't
+ *    fully fit within a screen, the blit fails. They must originate
+ *    from exactly one screen. Virtual coordinates are not directly
+ *    supported.
+ *
+ *    Host-to-guest blits do support the same set of GMRFB formats
+ *    offered by guest-to-host blits.
+ *
+ *    The SVGA device is guaranteed to finish writing to the GMRFB by
+ *    the time any subsequent FENCE commands are reached.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGASignedPoint  destOrigin;    // Origin of the destination region in the current GMRFB
+   SVGASignedRect   srcRect;       // Relative to the screen's origin; must fit within it
+   uint32           srcScreenId;   // Must be a valid screen ID
+} SVGAFifoCmdBlitScreenToGMRFB;
+
+
+/*
+ * SVGA_CMD_ANNOTATION_FILL --
+ *
+ *    This is a blit annotation. This command stores a small piece of
+ *    device state which is consumed by the next blit-to-screen
+ *    command. The state is only cleared by commands which are
+ *    specifically documented as consuming an annotation. Other
+ *    commands (such as ESCAPEs for debugging) may intervene between
+ *    the annotation and its associated blit.
+ *
+ *    This annotation is a promise about the contents of the next
+ *    blit: The video driver is guaranteeing that all pixels in that
+ *    blit will have the same value, specified here as a color in
+ *    SVGAColorBGRX format.
+ *
+ *    The SVGA device can still render the blit correctly even if it
+ *    ignores this annotation, but the annotation may allow it to
+ *    perform the blit more efficiently, for example by ignoring the
+ *    source data and performing a fill in hardware.
+ *
+ *    This annotation is most important for performance when the
+ *    user's display is being remoted over a network connection.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGAColorBGRX  color;   // Value every pixel of the next blit is promised to have
+} SVGAFifoCmdAnnotationFill;
+
+
+/*
+ * SVGA_CMD_ANNOTATION_COPY --
+ *
+ *    This is a blit annotation. See SVGA_CMD_ANNOTATION_FILL for more
+ *    information about annotations.
+ *
+ *    This annotation is a promise about the contents of the next
+ *    blit: The video driver is guaranteeing that all pixels in that
+ *    blit will have the same value as those which already exist at an
+ *    identically-sized region on the same or a different screen.
+ *
+ *    Note that the source pixels for the COPY in this annotation are
+ *    sampled before applying the annotation's associated blit. They
+ *    are allowed to overlap with the blit's destination pixels.
+ *
+ *    The copy source rectangle is specified the same way as the blit
+ *    destination: it can be a rectangle which spans zero or more
+ *    screens, specified relative to either a screen or to the virtual
+ *    coordinate system's origin. If the source rectangle includes
+ *    pixels which are not from exactly one screen, the results are
+ *    undefined.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGASignedPoint  srcOrigin;     // Origin of the copy source; same size as the next blit
+   uint32           srcScreenId;   // NOTE(review): presumably SVGA_ID_INVALID selects virtual coords -- confirm
+} SVGAFifoCmdAnnotationCopy;
+
+#endif
diff --git a/src/gallium/drivers/svga/include/svga_types.h b/src/gallium/drivers/svga/include/svga_types.h
new file mode 100644
index 0000000000..7fd9bab03a
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_types.h
@@ -0,0 +1,46 @@
+/**********************************************************
+ * Copyright 1998-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef _SVGA_TYPES_H_
+#define _SVGA_TYPES_H_
+
+#include "pipe/p_compiler.h"
+
+typedef int64_t int64;      /* Short aliases for the fixed-width integer types */
+typedef uint64_t uint64;
+
+typedef int32_t int32;
+typedef uint32_t uint32;
+
+typedef int16_t int16;
+typedef uint16_t uint16;
+
+typedef int8_t int8;
+typedef uint8_t uint8;
+
+typedef uint8_t Bool;       /* Boolean stored in a single byte */
+
+#endif /* _SVGA_TYPES_H_ */
+
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
new file mode 100644
index 0000000000..7b2dfe2549
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -0,0 +1,1429 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * svga_cmd.c --
+ *
+ *      Command construction utility for the SVGA3D protocol used by
+ *      the VMware SVGA device, based on the svgautil library.
+ */
+
+#include "svga_winsys.h"
+#include "svga_resource_buffer.h"
+#include "svga_resource_texture.h"
+#include "svga_surface.h"
+#include "svga_cmd.h"
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * surface_to_surfaceid --
+ *
+ *      Utility function for surface ids.
+ *      Can handle null surface. Does a surface_relocation so you need
+ *      to have allocated the fifo space before converting.
+ *
+ * Results:
+ *      id is filled out.
+ *
+ * Side effects:
+ *      One surface relocation is performed for a non-null surface's handle.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE
+void surface_to_surfaceid(struct svga_winsys_context *swc, // IN
+                          struct pipe_surface *surface,    // IN
+                          SVGA3dSurfaceImageId *id,        // OUT
+                          unsigned flags)                  // IN
+{
+   struct svga_surface *surf;
+   if (!surface) {   /* NULL surface: invalid id, no relocation is emitted */
+      id->sid = SVGA3D_INVALID_ID;
+      id->face = 0;
+      id->mipmap = 0;
+      return;
+   }
+   surf = svga_surface(surface);
+   swc->surface_relocation(swc, &id->sid, surf->handle, flags);
+   id->face = surf->real_face;   /* faces have the same order */
+   id->mipmap = surf->real_level;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_FIFOReserve --
+ *
+ *      Reserve space for an SVGA3D FIFO command.
+ *
+ *      The 2D SVGA commands have been around for a while, so they
+ *      have a rather asymmetric structure. The SVGA3D protocol is
+ *      more uniform: each command begins with a header containing the
+ *      command number and the full size.
+ *
+ *      This is a convenience wrapper around SVGA_FIFOReserve. We
+ *      reserve space for the whole command, and write the header.
+ *
+ *      This function must be paired with SVGA_FIFOCommitAll().
+ *
+ * Results:
+ *      Returns a pointer to the 'cmdSize' bytes reserved for
+ *      command-specific data, or NULL if the reservation failed.
+ *
+ * Side effects:
+ *      Begins a FIFO reservation.
+ *
+ *----------------------------------------------------------------------
+ */
+
+void *
+SVGA3D_FIFOReserve(struct svga_winsys_context *swc,
+                   uint32 cmd,       // IN
+                   uint32 cmdSize,   // IN
+                   uint32 nr_relocs) // IN
+{
+   SVGA3dCmdHeader *hdr;
+
+   /* Reserve the header and the command body as one contiguous span. */
+   hdr = swc->reserve(swc, sizeof(SVGA3dCmdHeader) + cmdSize, nr_relocs);
+   if (hdr == NULL)
+      return NULL;
+
+   hdr->id = cmd;
+   hdr->size = cmdSize;   /* the header's own size is not counted */
+   return hdr + 1;        /* command body starts right after the header */
+}
+
+
+void
+SVGA_FIFOCommitAll(struct svga_winsys_context *swc)
+{
+   swc->commit(swc);   /* thin wrapper: forwards to the winsys commit hook */
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DefineContext --
+ *
+ *      Create a new context, to be referred to with the provided ID.
+ *
+ *      Context objects encapsulate all render state, and shader
+ *      objects are per-context.
+ *
+ *      Surfaces are not per-context. The same surface can be shared
+ *      between multiple contexts, and surface operations can occur
+ *      without a context.
+ *
+ *      If the provided context ID already existed, it is redefined.
+ *
+ *      Context IDs are arbitrary small non-negative integers,
+ *      global to the entire SVGA device.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if FIFO space cannot be reserved.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DefineContext(struct svga_winsys_context *swc)  // IN
+{
+   SVGA3dCmdDefineContext *body = SVGA3D_FIFOReserve(swc,
+                                     SVGA_3D_CMD_CONTEXT_DEFINE,
+                                     sizeof *body, 0);
+
+   if (body == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* The context id is taken from the winsys context itself. */
+   body->cid = swc->cid;
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DestroyContext --
+ *
+ *      Delete a context created with SVGA3D_DefineContext.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if FIFO space cannot be reserved.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DestroyContext(struct svga_winsys_context *swc)  // IN
+{
+   SVGA3dCmdDestroyContext *body;
+
+   /* Only the context id is filled in; no relocations are requested. */
+   body = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_CONTEXT_DESTROY,
+                             sizeof *body, 0);
+   if (body == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   body->cid = swc->cid;
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginDefineSurface --
+ *
+ *      Begin a SURFACE_DEFINE command. This reserves space for it in
+ *      the FIFO, and returns pointers to the command's faces and
+ *      mipsizes arrays.
+ *
+ *      This function must be paired with SVGA_FIFOCommitAll().
+ *      The faces and mipSizes arrays are initialized to zero.
+ *
+ *      This creates a "surface" object in the SVGA3D device,
+ *      with the provided surface ID (sid). Surfaces are generic
+ *      containers for host VRAM objects like textures, vertex
+ *      buffers, and depth/stencil buffers.
+ *
+ *      Surfaces are hierarchical:
+ *
+ *        - Surface may have multiple faces (for cube maps)
+ *
+ *          - Each face has a list of mipmap levels
+ *
+ *             - Each mipmap image may have multiple volume
+ *               slices, if the image is three dimensional.
+ *
+ *                - Each slice is a 2D array of 'blocks'
+ *
+ *                   - Each block may be one or more pixels.
+ *                     (Usually 1, more for DXT or YUV formats.)
+ *
+ *      Surfaces are generic host VRAM objects. The SVGA3D device
+ *      may optimize surfaces according to the format they were
+ *      created with, but this format does not limit the ways in
+ *      which the surface may be used. For example, a depth surface
+ *      can be used as a texture, or a floating point image may
+ *      be used as a vertex buffer. Some surface usages may be
+ *      lower performance, due to software emulation, but any
+ *      usage should work with any surface.
+ *
+ *      If 'sid' is already defined, the old surface is deleted
+ *      and this new surface replaces it.
+ *
+ *      Surface IDs are arbitrary small non-negative integers,
+ *      global to the entire SVGA device.
+ *
+ * Results:
+ *      Returns pointers to arrays allocated in the FIFO for 'faces'
+ *      and 'mipSizes'.
+ *
+ * Side effects:
+ *      Begins a FIFO reservation.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginDefineSurface(struct svga_winsys_context *swc,
+                          struct svga_winsys_surface *sid, // IN
+                          SVGA3dSurfaceFlags flags,    // IN
+                          SVGA3dSurfaceFormat format,  // IN
+                          SVGA3dSurfaceFace **faces,   // OUT
+                          SVGA3dSize **mipSizes,       // OUT
+                          uint32 numMipSizes)          // IN
+{
+   SVGA3dCmdDefineSurface *body;
+
+   /* Mip sizes trail the fixed-size command; 'sid' needs one relocation. */
+   body = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_DEFINE,
+                             sizeof *body + sizeof **mipSizes * numMipSizes, 1);
+   if (body == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->surface_relocation(swc, &body->sid, sid, SVGA_RELOC_WRITE);
+   body->surfaceFlags = flags;
+   body->format = format;
+
+   /* Hand back both arrays zeroed; the caller fills them in, then commits. */
+   *faces = &body->face[0];
+   *mipSizes = (SVGA3dSize *) &body[1];
+   memset(*mipSizes, 0, sizeof **mipSizes * numMipSizes);
+   memset(*faces, 0, sizeof **faces * SVGA3D_MAX_SURFACE_FACES);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DefineSurface2D --
+ *
+ *      This is a simplified version of SVGA3D_BeginDefineSurface(),
+ *      which does not support cube maps, mipmaps, or volume textures.
+ *
+ * Results:
+ *      PIPE_OK, or the error returned by SVGA3D_BeginDefineSurface().
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,    // IN
+                       struct svga_winsys_surface *sid, // IN
+                       uint32 width,                // IN
+                       uint32 height,               // IN
+                       SVGA3dSurfaceFormat format)  // IN
+{
+   SVGA3dSize *mipSizes;
+   SVGA3dSurfaceFace *faces;
+   enum pipe_error ret;
+
+   /* No surface flags, and room for exactly one mip size entry. */
+   ret = SVGA3D_BeginDefineSurface(swc,
+                                   sid, 0, format, &faces, &mipSizes, 1);
+   if(ret != PIPE_OK)
+      return ret;   /* nothing was reserved, so there is nothing to commit */
+
+   /* One face with one level; a depth of 1 describes a plain 2D image. */
+   faces[0].numMipLevels = 1;
+   mipSizes[0].width = width;
+   mipSizes[0].height = height;
+   mipSizes[0].depth = 1;
+
+   swc->commit(swc);   /* ends the reservation begun above */
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DestroySurface --
+ *
+ *      Release the host VRAM encapsulated by a particular surface ID.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if FIFO space cannot be reserved.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DestroySurface(struct svga_winsys_context *swc,
+                      struct svga_winsys_surface *sid)  // IN
+{
+   SVGA3dCmdDestroySurface *cmd;
+
+   /* One relocation slot is requested, for the surface id. */
+   cmd = SVGA3D_FIFOReserve(swc,
+                            SVGA_3D_CMD_SURFACE_DESTROY, sizeof *cmd, 1);
+   if(!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->surface_relocation(swc, &cmd->sid, sid, SVGA_RELOC_READ);
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SurfaceDMA--
+ *
+ *      Emit a complete SURFACE_DMA command: reserve space for it in
+ *      the FIFO, copy in the caller's copy boxes, append the DMA
+ *      suffix and commit the reservation.
+ *
+ *      When the SVGA3D device asynchronously processes this FIFO
+ *      command, a DMA operation is performed between host VRAM and
+ *      a generic SVGAGuestPtr. The guest pointer may refer to guest
+ *      VRAM (provided by the SVGA PCI device) or to guest system
+ *      memory that has been set up as a Guest Memory Region (GMR)
+ *      by the SVGA device.
+ *
+ *      The guest's DMA buffer must remain valid (not freed, paged out,
+ *      or overwritten) until the host has finished processing this
+ *      command. The guest can determine that the host has finished
+ *      by using the SVGA device's FIFO Fence mechanism.
+ *
+ *      The guest's image buffer can be an arbitrary size and shape.
+ *      Guest image data is interpreted according to the SVGA3D surface
+ *      format specified when the surface was defined.
+ *
+ *      The caller may optionally define the guest image's pitch.
+ *      guestImage->pitch can either be zero (assume image is tightly
+ *      packed) or it must be the number of bytes between vertically
+ *      adjacent image blocks.
+ *
+ *      The provided copybox list specifies which regions of the source
+ *      image are to be copied, and where they appear on the destination.
+ *
+ *      NOTE: srcx/srcy are always on the guest image and x/y are
+ *      always on the host image, regardless of the actual transfer
+ *      direction!
+ *
+ *      For efficiency, the SVGA3D device is free to copy more data
+ *      than specified. For example, it may round copy boxes outwards
+ *      such that they lie on particular alignment boundaries.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
+                  struct svga_transfer *st,         // IN
+                  SVGA3dTransferType transfer,      // IN
+                  const SVGA3dCopyBox *boxes,       // IN
+                  uint32 numBoxes)                  // IN
+{
+   struct svga_texture *texture = svga_texture(st->base.resource);
+   SVGA3dCmdSurfaceDMA *body;
+   SVGA3dCmdSurfaceDMASuffix *suffix;
+   const uint32 boxBytes = sizeof *boxes * numBoxes;
+   unsigned region_flags, surface_flags;
+
+   /* The guest buffer and the host surface take opposite access modes. */
+   switch (transfer) {
+   case SVGA3D_WRITE_HOST_VRAM:
+      region_flags = SVGA_RELOC_READ;
+      surface_flags = SVGA_RELOC_WRITE;
+      break;
+   case SVGA3D_READ_HOST_VRAM:
+      region_flags = SVGA_RELOC_WRITE;
+      surface_flags = SVGA_RELOC_READ;
+      break;
+   default:
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   /* Layout: fixed command, copy boxes, suffix.  Two relocations follow. */
+   body = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_DMA,
+                             sizeof *body + boxBytes + sizeof *suffix, 2);
+   if (body == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->region_relocation(swc, &body->guest.ptr, st->hwbuf, 0, region_flags);
+   body->guest.pitch = st->base.stride;
+
+   swc->surface_relocation(swc, &body->host.sid, texture->handle, surface_flags);
+   body->host.face = st->base.sr.face; /* PIPE_TEX_FACE_* and SVGA3D_CUBEFACE_* match */
+   body->host.mipmap = st->base.sr.level;
+
+   body->transfer = transfer;
+   memcpy(&body[1], boxes, boxBytes);
+
+   /* The suffix sits immediately after the last copy box. */
+   suffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t *)&body[1] + boxBytes);
+   suffix->suffixSize = sizeof *suffix;
+   suffix->maximumOffset = st->hw_nblocksy*st->base.stride;
+   memset(&suffix->flags, 0, sizeof suffix->flags);
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+enum pipe_error
+SVGA3D_BufferDMA(struct svga_winsys_context *swc,
+                 struct svga_winsys_buffer *guest,
+                 struct svga_winsys_surface *host,
+                 SVGA3dTransferType transfer,      // IN
+                 uint32 size,                      // IN
+                 uint32 guest_offset,              // IN
+                 uint32 host_offset,               // IN
+                 SVGA3dSurfaceDMAFlags flags)      // IN
+{
+   SVGA3dCmdSurfaceDMASuffix *suffix;
+   SVGA3dCmdSurfaceDMA *dma;
+   SVGA3dCopyBox *span;
+   unsigned guestAccess, hostAccess;
+
+   /* Guest and host relocation flags are mirror images of each other. */
+   switch (transfer) {
+   case SVGA3D_WRITE_HOST_VRAM:
+      guestAccess = SVGA_RELOC_READ;
+      hostAccess = SVGA_RELOC_WRITE;
+      break;
+   case SVGA3D_READ_HOST_VRAM:
+      guestAccess = SVGA_RELOC_WRITE;
+      hostAccess = SVGA_RELOC_READ;
+      break;
+   default:
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   /* Layout in the FIFO: header, exactly one copy box, then the suffix. */
+   dma = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_DMA,
+                            sizeof *dma + sizeof *span + sizeof *suffix, 2);
+   if (!dma)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Guest side: a pitch of zero means the data is tightly packed. */
+   swc->region_relocation(swc, &dma->guest.ptr, guest, 0, guestAccess);
+   dma->guest.pitch = 0;
+
+   swc->surface_relocation(swc, &dma->host.sid, host, hostAccess);
+   dma->host.face = 0;
+   dma->host.mipmap = 0;
+   dma->transfer = transfer;
+
+   /* One box, one row high and one slice deep, 'size' wide. */
+   span = (SVGA3dCopyBox *)&dma[1];
+   span->x = host_offset;
+   span->srcx = guest_offset;
+   span->w = size;
+   span->h = 1;
+   span->d = 1;
+   span->y = 0;
+   span->z = 0;
+   span->srcy = 0;
+   span->srcz = 0;
+
+   suffix = (SVGA3dCmdSurfaceDMASuffix *)&span[1];
+   suffix->suffixSize = sizeof *suffix;
+   suffix->maximumOffset = guest_offset + size;
+   suffix->flags = flags;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetRenderTarget --
+ *
+ *      Bind a surface object to a particular render target attachment
+ *      point on the current context. Render target attachment points
+ *      exist for color buffers, a depth buffer, and a stencil buffer.
+ *
+ *      The SVGA3D device is quite lenient about the types of surfaces
+ *      that may be used as render targets. The color buffers must
+ *      all be the same size, but the depth and stencil buffers do not
+ *      have to be the same size as the color buffer. All attachments
+ *      are optional.
+ *
+ *      Some combinations of render target formats may require software
+ *      emulation, depending on the capabilities of the host graphics
+ *      API and graphics hardware.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetRenderTarget(struct svga_winsys_context *swc,
+                       SVGA3dRenderTargetType type,   // IN
+                       struct pipe_surface *surface)  // IN
+{
+   /* nr_relocs is 1: presumably for the target surface patched in below. */
+   SVGA3dCmdSetRenderTarget *rt =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETRENDERTARGET, sizeof *rt, 1);
+
+   if (!rt) {
+      /* No FIFO space could be reserved for the command. */
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   rt->cid = swc->cid;
+   rt->type = type;
+
+   /* The surface id is emitted with write access. */
+   surface_to_surfaceid(swc, surface, &rt->target, SVGA_RELOC_WRITE);
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+
+
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DefineShader --
+ *
+ *      Upload the bytecode for a new shader. The bytecode is "SVGA3D
+ *      format", which is theoretically a binary-compatible superset
+ *      of Microsoft's DirectX shader bytecode. In practice, the
+ *      SVGA3D bytecode doesn't yet have any extensions to DirectX's
+ *      bytecode format.
+ *
+ *      The SVGA3D device supports shader models 1.1 through 2.0.
+ *
+ *      The caller chooses a shader ID (small positive integer) by
+ *      which this shader will be identified in future commands. This
+ *      ID is in a namespace which is per-context and per-shader-type.
+ *
+ *      'bytecodeLen' is specified in bytes. It must be a multiple of 4.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DefineShader(struct svga_winsys_context *swc,
+                    uint32 shid,                  // IN
+                    SVGA3dShaderType type,        // IN
+                    const uint32 *bytecode,       // IN
+                    uint32 bytecodeLen)           // IN
+{
+   SVGA3dCmdDefineShader *def;
+
+   /* bytecodeLen is in bytes and must cover whole 32-bit words. */
+   assert(bytecodeLen % 4 == 0);
+
+   /* The bytecode is appended directly after the fixed-size header. */
+   def = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SHADER_DEFINE,
+                            sizeof *def + bytecodeLen, 0);
+   if (!def)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   def->type = type;
+   def->shid = shid;
+   def->cid = swc->cid;
+   memcpy(&def[1], bytecode, bytecodeLen);
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DestroyShader --
+ *
+ *      Delete a shader that was created by SVGA3D_DefineShader. If
+ *      the shader was the current vertex or pixel shader for its
+ *      context, rendering results are undefined until a new shader is
+ *      bound.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DestroyShader(struct svga_winsys_context *swc,
+                     uint32 shid,            // IN
+                     SVGA3dShaderType type)  // IN
+{
+   /* Fixed-size command; no relocations are requested. */
+   SVGA3dCmdDestroyShader *req =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SHADER_DESTROY, sizeof *req, 0);
+
+   if (!req)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Only the context id, shader id and shader type are sent. */
+   req->type = type;
+   req->shid = shid;
+   req->cid = swc->cid;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetShaderConst --
+ *
+ *      Set the value of a shader constant.
+ *
+ *      Shader constants are analogous to uniform variables in GLSL,
+ *      except that they belong to the render context rather than to
+ *      an individual shader.
+ *
+ *      Constants may have one of three types: A 4-vector of floats,
+ *      a 4-vector of integers, or a single boolean flag. 'value'
+ *      points to the whole vector, or to one uint32 for a boolean.
+ *
+ * Results:
+ *      PIPE_OK on success, PIPE_ERROR_BAD_INPUT for an unknown
+ *      'ctype', PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetShaderConst(struct svga_winsys_context *swc,
+                      uint32 reg,                   // IN
+                      SVGA3dShaderType type,        // IN
+                      SVGA3dShaderConstType ctype,  // IN
+                      const void *value)            // IN
+{
+   SVGA3dCmdSetShaderConst *cmd;
+
+   /* Reject unknown types up front so that a command with uninitialized
+    * 'values' is never committed. */
+   if (ctype != SVGA3D_CONST_TYPE_FLOAT &&
+       ctype != SVGA3D_CONST_TYPE_INT &&
+       ctype != SVGA3D_CONST_TYPE_BOOL) {
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   cmd = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SET_SHADER_CONST, sizeof *cmd, 0);
+   if (!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   cmd->cid = swc->cid;
+   cmd->reg = reg;
+   cmd->type = type;
+   cmd->ctype = ctype;
+
+   if (ctype == SVGA3D_CONST_TYPE_BOOL) {
+      /* A boolean is a single 32-bit flag; the rest of 'values' is zero. */
+      memset(&cmd->values, 0, sizeof cmd->values);
+      cmd->values[0] = *(const uint32 *)value;
+   } else {
+      /* Float and int constants fill the entire 'values' array. */
+      memcpy(&cmd->values, value, sizeof cmd->values);
+   }
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetShader --
+ *
+ *      Switch active shaders. This binds a new vertex or pixel shader
+ *      to the specified context.
+ *
+ *      A shader ID of SVGA3D_INVALID_ID unbinds any shader, switching
+ *      back to the fixed function vertex or pixel pipeline.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetShader(struct svga_winsys_context *swc,
+                 SVGA3dShaderType type,  // IN
+                 uint32 shid)            // IN
+{
+   /* Fixed-size command; no relocations are requested. */
+   SVGA3dCmdSetShader *bind =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SET_SHADER, sizeof *bind, 0);
+
+   if (!bind)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Only the context id, shader type and shader id are sent. */
+   bind->shid = shid;
+   bind->type = type;
+   bind->cid = swc->cid;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginClear --
+ *
+ *      Begin a CLEAR command. This reserves space for it in the FIFO,
+ *      and returns a pointer to the command's rectangle array.  This
+ *      function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      Clear is a rendering operation which fills a list of
+ *      rectangles with constant values on all render target types
+ *      indicated by 'flags'.
+ *
+ *      Clear is not affected by clipping, depth test, or other
+ *      render state which affects the fragment pipeline.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      May write to attached render target surfaces.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginClear(struct svga_winsys_context *swc,
+                  SVGA3dClearFlag flags,  // IN
+                  uint32 color,           // IN
+                  float depth,            // IN
+                  uint32 stencil,         // IN
+                  SVGA3dRect **rects,     // OUT
+                  uint32 numRects)        // IN
+{
+   const uint32 rectBytes = sizeof **rects * numRects;
+   SVGA3dCmdClear *clear;
+
+   clear = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_CLEAR,
+                              sizeof *clear + rectBytes, 0);
+   if (!clear)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   clear->cid = swc->cid;
+   clear->clearFlag = flags;
+   clear->color = color;
+   clear->depth = depth;
+   clear->stencil = stencil;
+
+   /* Rectangles follow the header; committing is left to the caller. */
+   *rects = (SVGA3dRect *)&clear[1];
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_ClearRect --
+ *
+ *      Simplified version of SVGA3D_BeginClear(): emits and commits a
+ *      CLEAR command covering the single rectangle (x, y, w, h).
+ *
+ * Results:
+ *      PIPE_OK on success, otherwise the error from SVGA3D_BeginClear().
+ *
+ * Side effects:
+ *      May write to attached render target surfaces.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_ClearRect(struct svga_winsys_context *swc,
+                 SVGA3dClearFlag flags,  // IN
+                 uint32 color,           // IN
+                 float depth,            // IN
+                 uint32 stencil,         // IN
+                 uint32 x,               // IN
+                 uint32 y,               // IN
+                 uint32 w,               // IN
+                 uint32 h)               // IN
+{
+   SVGA3dRect *rect;
+   enum pipe_error ret;
+
+   ret = SVGA3D_BeginClear(swc, flags, color, depth, stencil, &rect, 1);
+   if (ret != PIPE_OK)
+      return ret;   /* propagate the real failure code, not a fixed one */
+
+   memset(rect, 0, sizeof *rect);
+   rect->x = x;
+   rect->y = y;
+   rect->w = w;
+   rect->h = h;
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginDrawPrimitives --
+ *
+ *      Begin a DRAW_PRIMITIVES command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's arrays.
+ *      This function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      Drawing commands consist of two variable-length arrays:
+ *      SVGA3dVertexDecl elements declare a set of vertex buffers to
+ *      use while rendering, and SVGA3dPrimitiveRange elements specify
+ *      groups of primitives each with an optional index buffer.
+ *
+ *      The decls and ranges arrays are initialized to zero.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      May write to attached render target surfaces.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
+                           SVGA3dVertexDecl **decls,      // OUT
+                           uint32 numVertexDecls,         // IN
+                           SVGA3dPrimitiveRange **ranges, // OUT
+                           uint32 numRanges)              // IN
+{
+   const uint32 declBytes = sizeof **decls * numVertexDecls;
+   const uint32 rangeBytes = sizeof **ranges * numRanges;
+   SVGA3dCmdDrawPrimitives *draw;
+
+   /*
+    * Layout in the FIFO: header, vertex declarations, primitive ranges.
+    * One relocation is requested per declaration and per range.
+    */
+   draw = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_DRAW_PRIMITIVES,
+                             sizeof *draw + declBytes + rangeBytes,
+                             numVertexDecls + numRanges);
+   if (!draw)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   draw->cid = swc->cid;
+   draw->numVertexDecls = numVertexDecls;
+   draw->numRanges = numRanges;
+
+   /* Hand out both arrays zero-filled; the ranges start right after
+    * the last declaration. */
+   *decls = (SVGA3dVertexDecl *)&draw[1];
+   *ranges = (SVGA3dPrimitiveRange *)&(*decls)[numVertexDecls];
+   memset(*decls, 0, declBytes);
+   memset(*ranges, 0, rangeBytes);
+
+   /* Not committed here: the caller fills the arrays, then commits. */
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginSurfaceCopy --
+ *
+ *      Begin a SURFACE_COPY command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's arrays.  This
+ *      function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      The box array is initialized with zeroes.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      Asynchronously copies a list of boxes from surface to surface.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginSurfaceCopy(struct svga_winsys_context *swc,
+                        struct pipe_surface *src,    // IN
+                        struct pipe_surface *dest,   // IN
+                        SVGA3dCopyBox **boxes,       // OUT
+                        uint32 numBoxes)             // IN
+{
+   const uint32 boxBytes = sizeof **boxes * numBoxes;
+   SVGA3dCmdSurfaceCopy *copy;
+   SVGA3dCopyBox *boxArray;
+
+   copy = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_COPY,
+                             sizeof *copy + boxBytes, 2);
+   if (!copy)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   surface_to_surfaceid(swc, src, &copy->src, SVGA_RELOC_READ);
+   surface_to_surfaceid(swc, dest, &copy->dest, SVGA_RELOC_WRITE);
+   /* The zero-filled box array sits directly behind the header. */
+   boxArray = (SVGA3dCopyBox *)&copy[1];
+   memset(boxArray, 0, boxBytes);
+   *boxes = boxArray;
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SurfaceStretchBlt --
+ *
+ *      Issue a SURFACE_STRETCHBLT command: an asynchronous
+ *      surface-to-surface blit, with scaling.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      Asynchronously copies one box from surface to surface.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SurfaceStretchBlt(struct svga_winsys_context *swc,
+                         struct pipe_surface *src,    // IN
+                         struct pipe_surface *dest,   // IN
+                         SVGA3dBox *boxSrc,           // IN
+                         SVGA3dBox *boxDest,          // IN
+                         SVGA3dStretchBltMode mode)   // IN
+{
+   /* Two relocations are requested; 'src' is read and 'dest' written. */
+   SVGA3dCmdSurfaceStretchBlt *blt =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_STRETCHBLT, sizeof *blt, 2);
+
+   if (!blt)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   surface_to_surfaceid(swc, src, &blt->src, SVGA_RELOC_READ);
+   surface_to_surfaceid(swc, dest, &blt->dest, SVGA_RELOC_WRITE);
+
+   blt->mode = mode;
+   blt->boxSrc = *boxSrc;
+   blt->boxDest = *boxDest;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetViewport --
+ *
+ *      Set the current context's viewport rectangle. The viewport
+ *      is clipped to the dimensions of the current render target,
+ *      then all rendering is clipped to the viewport.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetViewport(struct svga_winsys_context *swc,
+                   SVGA3dRect *rect)  // IN
+{
+   /* Fixed-size command; no relocations are requested. */
+   SVGA3dCmdSetViewport *vp =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETVIEWPORT, sizeof *vp, 0);
+
+   if (!vp)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* The rectangle is copied by value into the command. */
+   vp->rect = *rect;
+   vp->cid = swc->cid;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetScissorRect --
+ *
+ *      Set the current context's scissor rectangle. If scissor
+ *      is enabled then all rendering is clipped to the scissor.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetScissorRect(struct svga_winsys_context *swc,
+                      SVGA3dRect *rect)  // IN
+{
+   SVGA3dCmdSetScissorRect *scissor;
+
+   scissor = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETSCISSORRECT,
+                                sizeof *scissor, 0);
+   if (!scissor)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* The rectangle is copied by value into the command. */
+   scissor->rect = *rect;
+   scissor->cid = swc->cid;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetClipPlane --
+ *
+ *      Set one of the current context's clip planes. If the clip
+ *      plane is enabled then all 3d rendering is clipped against
+ *      the plane.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetClipPlane(struct svga_winsys_context *swc,
+                    uint32 index,        // IN
+                    const float *plane)  // IN
+{
+   SVGA3dCmdSetClipPlane *clip;
+   unsigned i;
+
+   clip = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETCLIPPLANE, sizeof *clip, 0);
+   if (!clip)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   clip->cid = swc->cid;
+   clip->index = index;
+   /* Exactly four coefficients are read from 'plane'. */
+   for (i = 0; i < 4; i++)
+      clip->plane[i] = plane[i];
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetZRange --
+ *
+ *      Set the range of the depth buffer to use. 'min' and 'max'
+ *      are values between 0.0 and 1.0.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetZRange(struct svga_winsys_context *swc,
+                 float zMin,  // IN
+                 float zMax)  // IN
+{
+   /* Fixed-size command; no relocations are requested. */
+   SVGA3dCmdSetZRange *range =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETZRANGE, sizeof *range, 0);
+
+   if (!range)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* The bounds are stored as given; no clamping or ordering check. */
+   range->zRange.max = zMax;
+   range->zRange.min = zMin;
+   range->cid = swc->cid;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginSetTextureState --
+ *
+ *      Begin a SETTEXTURESTATE command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's texture state
+ *      array.  This function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      This command sets rendering state which is per-texture-unit.
+ *
+ *      XXX: Individual texture states need documentation. However,
+ *           they are very similar to the texture states defined by
+ *           Direct3D. The D3D documentation is a good starting point
+ *           for understanding SVGA3D texture states.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginSetTextureState(struct svga_winsys_context *swc,
+                            SVGA3dTextureState **states,  // OUT
+                            uint32 numStates)             // IN
+{
+   const uint32 stateBytes = sizeof **states * numStates;
+   SVGA3dCmdSetTextureState *tss;
+
+   /* One relocation is requested for each texture state entry. */
+   tss = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETTEXTURESTATE,
+                            sizeof *tss + stateBytes, numStates);
+   if (!tss)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   tss->cid = swc->cid;
+   /* Entries are NOT zeroed here, and the command is left uncommitted. */
+   *states = (SVGA3dTextureState *)&tss[1];
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginSetRenderState --
+ *
+ *      Begin a SETRENDERSTATE command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's render state
+ *      array.  This function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      This command sets rendering state which is global to the context.
+ *
+ *      XXX: Individual render states need documentation. However,
+ *           they are very similar to the render states defined by
+ *           Direct3D. The D3D documentation is a good starting point
+ *           for understanding SVGA3D render states.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginSetRenderState(struct svga_winsys_context *swc,
+                           SVGA3dRenderState **states,  // OUT
+                           uint32 numStates)            // IN
+{
+   const uint32 stateBytes = sizeof **states * numStates;
+   SVGA3dCmdSetRenderState *rs;
+
+   /* No relocations are requested for render states. */
+   rs = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETRENDERSTATE,
+                           sizeof *rs + stateBytes, 0);
+   if (!rs)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   rs->cid = swc->cid;
+   /* Entries are NOT zeroed here, and the command is left uncommitted. */
+   *states = (SVGA3dRenderState *)&rs[1];
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginQuery--
+ *
+ *      Issues a SVGA_3D_CMD_BEGIN_QUERY command.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginQuery(struct svga_winsys_context *swc,
+                  SVGA3dQueryType type) // IN
+{
+   /* Fixed-size command; no relocations are requested. */
+   SVGA3dCmdBeginQuery *begin =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_BEGIN_QUERY, sizeof *begin, 0);
+
+   if (!begin) {
+      /* No FIFO space could be reserved for the command. */
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   /* Only the context id and the query type are sent. */
+   begin->type = type;
+   begin->cid = swc->cid;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_EndQuery--
+ *
+ *      Issues a SVGA_3D_CMD_END_QUERY command.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_EndQuery(struct svga_winsys_context *swc,
+                SVGA3dQueryType type,              // IN
+                struct svga_winsys_buffer *buffer) // IN/OUT
+{
+   /* One relocation is requested, for the guest result buffer. */
+   SVGA3dCmdEndQuery *end =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_END_QUERY, sizeof *end, 1);
+
+   if (!end) {
+      /* No FIFO space could be reserved for the command. */
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   end->cid = swc->cid;
+   end->type = type;
+
+   /* 'buffer' is relocated at offset 0 with write access; presumably
+    * the host stores the query result there. */
+   swc->region_relocation(swc, &end->guestResult, buffer, 0, SVGA_RELOC_WRITE);
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_WaitForQuery--
+ *
+ *      Issues a SVGA_3D_CMD_WAIT_FOR_QUERY command.  This reserves space
+ *      for it in the FIFO.  This doesn't actually wait for the query to
+ *      finish but instead tells the host to start a wait at the driver
+ *      level.  The caller can wait on the status variable in the
+ *      guestPtr memory or send an insert fence instruction after this
+ *      command and wait on the fence.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reserve fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_WaitForQuery(struct svga_winsys_context *swc,
+                    SVGA3dQueryType type,              // IN
+                    struct svga_winsys_buffer *buffer) // IN/OUT
+{
+   SVGA3dCmdWaitForQuery *wait;
+
+   /* One relocation is requested, for the guest result buffer. */
+   wait = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_WAIT_FOR_QUERY,
+                             sizeof *wait, 1);
+   if (!wait)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   wait->type = type;
+   wait->cid = swc->cid;
+
+   /* 'buffer' is relocated at offset 0 with write access. */
+   swc->region_relocation(swc, &wait->guestResult, buffer, 0,
+                          SVGA_RELOC_WRITE);
+
+   /* Only queues the command; the query is not waited for here. */
+   swc->commit(swc);
+   return PIPE_OK;
+}
diff --git a/src/gallium/drivers/svga/svga_cmd.h b/src/gallium/drivers/svga/svga_cmd.h
new file mode 100644
index 0000000000..0e568d78e6
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_cmd.h
@@ -0,0 +1,235 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_cmd.h --
+ *
+ *      Command construction utility for the SVGA3D protocol used by
+ *      the VMware SVGA device, based on the svgautil library.
+ */
+
+#ifndef __SVGA3D_H__
+#define __SVGA3D_H__
+
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+
+#include "pipe/p_defines.h"
+
+
+struct pipe_surface;
+struct svga_transfer;
+struct svga_winsys_context;
+struct svga_winsys_buffer;
+struct svga_winsys_surface;
+
+
+/*
+ * SVGA Device Interoperability
+ * Callers treat a NULL return from SVGA3D_FIFOReserve() as out-of-memory.
+ */
+void *
+SVGA3D_FIFOReserve(struct svga_winsys_context *swc, uint32 cmd, uint32 cmdSize, uint32 nr_relocs);
+
+void
+SVGA_FIFOCommitAll(struct svga_winsys_context *swc);
+
+
+/*
+ * Context Management
+ * Both calls take only the winsys context and return a pipe_error code.
+ */
+enum pipe_error
+SVGA3D_DefineContext(struct svga_winsys_context *swc);
+
+enum pipe_error
+SVGA3D_DestroyContext(struct svga_winsys_context *swc);
+
+
+/*
+ * Surface Management
+ * Surfaces are named by their winsys surface handle ('sid').
+ */
+enum pipe_error
+SVGA3D_BeginDefineSurface(struct svga_winsys_context *swc,
+                          struct svga_winsys_surface *sid,
+                          SVGA3dSurfaceFlags flags,
+                          SVGA3dSurfaceFormat format,
+                          SVGA3dSurfaceFace **faces,
+                          SVGA3dSize **mipSizes,
+                          uint32 numMipSizes);
+enum pipe_error
+SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,
+                       struct svga_winsys_surface *sid,
+                       uint32 width,
+                       uint32 height,
+                       SVGA3dSurfaceFormat format);
+enum pipe_error
+SVGA3D_DestroySurface(struct svga_winsys_context *swc,
+                      struct svga_winsys_surface *sid);
+
+
+/*
+ * Surface Operations
+ * Both DMA calls return PIPE_ERROR_BAD_INPUT for an unknown 'transfer'.
+ */
+enum pipe_error
+SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
+                  struct svga_transfer *st,
+                  SVGA3dTransferType transfer,
+                  const SVGA3dCopyBox *boxes,
+                  uint32 numBoxes);
+
+enum pipe_error
+SVGA3D_BufferDMA(struct svga_winsys_context *swc,
+                 struct svga_winsys_buffer *guest,
+                 struct svga_winsys_surface *host,
+                 SVGA3dTransferType transfer,
+                 uint32 size,
+                 uint32 guest_offset,
+                 uint32 host_offset,
+                 SVGA3dSurfaceDMAFlags flags);
+
+/*
+ * Drawing Operations
+ * Begin* calls leave the command uncommitted; SVGA3D_ClearRect commits.
+ */
+
+enum pipe_error
+SVGA3D_BeginClear(struct svga_winsys_context *swc,
+                  SVGA3dClearFlag flags,
+                  uint32 color, float depth, uint32 stencil,
+                  SVGA3dRect **rects, uint32 numRects);
+
+enum pipe_error
+SVGA3D_ClearRect(struct svga_winsys_context *swc,
+                 SVGA3dClearFlag flags, uint32 color, float depth,
+                 uint32 stencil, uint32 x, uint32 y, uint32 w, uint32 h);
+
+enum pipe_error
+SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
+                           SVGA3dVertexDecl **decls,
+                           uint32 numVertexDecls,
+                           SVGA3dPrimitiveRange **ranges,
+                           uint32 numRanges);
+
+/*
+ * Blits
+ */
+
+enum pipe_error
+SVGA3D_BeginSurfaceCopy(struct svga_winsys_context *swc,
+                        struct pipe_surface *src,
+                        struct pipe_surface *dest,
+                        SVGA3dCopyBox **boxes, uint32 numBoxes);
+
+/* Unlike SVGA3D_BeginSurfaceCopy, this one commits the command itself. */
+enum pipe_error
+SVGA3D_SurfaceStretchBlt(struct svga_winsys_context *swc,
+                         struct pipe_surface *src,
+                         struct pipe_surface *dest,
+                         SVGA3dBox *boxSrc, SVGA3dBox *boxDest,
+                         SVGA3dStretchBltMode mode);
+
+/*
+ * Shared FFP/Shader Render State
+ * (FFP = fixed-function pipeline; 'plane' points to four floats.)
+ */
+enum pipe_error
+SVGA3D_SetRenderTarget(struct svga_winsys_context *swc,
+                       SVGA3dRenderTargetType type,
+                       struct pipe_surface *surface);
+
+enum pipe_error
+SVGA3D_SetZRange(struct svga_winsys_context *swc,
+                 float zMin, float zMax);
+
+enum pipe_error
+SVGA3D_SetViewport(struct svga_winsys_context *swc,
+                   SVGA3dRect *rect);
+
+enum pipe_error
+SVGA3D_SetScissorRect(struct svga_winsys_context *swc,
+                      SVGA3dRect *rect);
+
+enum pipe_error
+SVGA3D_SetClipPlane(struct svga_winsys_context *swc,
+                    uint32 index, const float *plane);
+
+enum pipe_error
+SVGA3D_BeginSetTextureState(struct svga_winsys_context *swc,
+                            SVGA3dTextureState **states,
+                            uint32 numStates);
+
+enum pipe_error
+SVGA3D_BeginSetRenderState(struct svga_winsys_context *swc,
+                           SVGA3dRenderState **states,
+                           uint32 numStates);
+
+
+/*
+ * Shaders
+ * 'bytecodeLen' is given in bytes and must be a multiple of 4.
+ */
+enum pipe_error
+SVGA3D_DefineShader(struct svga_winsys_context *swc,
+                    uint32 shid, SVGA3dShaderType type,
+                    const uint32 *bytecode, uint32 bytecodeLen);
+
+enum pipe_error
+SVGA3D_DestroyShader(struct svga_winsys_context *swc,
+                     uint32 shid, SVGA3dShaderType type);
+
+enum pipe_error
+SVGA3D_SetShaderConst(struct svga_winsys_context *swc,
+                      uint32 reg, SVGA3dShaderType type,
+                      SVGA3dShaderConstType ctype, const void *value);
+
+enum pipe_error
+SVGA3D_SetShader(struct svga_winsys_context *swc,
+                 SVGA3dShaderType type, uint32 shid);
+
+
+/*
+ * Queries
+ * 'buffer' is relocated with write access as the command's guestResult.
+ */
+enum pipe_error
+SVGA3D_BeginQuery(struct svga_winsys_context *swc,
+                  SVGA3dQueryType type);
+
+enum pipe_error
+SVGA3D_EndQuery(struct svga_winsys_context *swc,
+                SVGA3dQueryType type,
+                struct svga_winsys_buffer *buffer);
+
+enum pipe_error
+SVGA3D_WaitForQuery(struct svga_winsys_context *swc,
+                    SVGA3dQueryType type,
+                    struct svga_winsys_buffer *buffer);
+
+#endif /* __SVGA3D_H__ */
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
new file mode 100644
index 0000000000..3b30b9e341
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -0,0 +1,246 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+#include "util/u_bitmask.h"
+#include "util/u_upload_mgr.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_resource_texture.h"
+#include "svga_resource_buffer.h"
+#include "svga_resource.h"
+#include "svga_winsys.h"
+#include "svga_swtnl.h"
+#include "svga_draw.h"
+#include "svga_debug.h"
+#include "svga_state.h"
+
+
+static void svga_destroy( struct pipe_context *pipe )
+{
+   struct svga_context *svga = svga_context( pipe );
+   unsigned shader;
+   /* Installed as pipe_context::destroy by svga_context_create(). */
+   svga_cleanup_framebuffer( svga );
+   svga_cleanup_tss_binding( svga );
+
+   svga_hwtnl_destroy( svga->hwtnl );
+
+   svga_cleanup_vertex_state(svga);
+   /* NOTE(review): swc dies before swtnl/uploaders, unlike create's unwind. */
+   svga->swc->destroy(svga->swc);
+   
+   svga_destroy_swtnl( svga );
+
+   u_upload_destroy( svga->upload_vb );
+   u_upload_destroy( svga->upload_ib );
+
+   util_bitmask_destroy( svga->vs_bm );
+   util_bitmask_destroy( svga->fs_bm );
+   /* Unreference curr.cb[] for every shader type. */
+   for(shader = 0; shader < PIPE_SHADER_TYPES; ++shader)
+      pipe_resource_reference( &svga->curr.cb[shader], NULL );
+
+   FREE( svga );
+}
+
+
+
+struct pipe_context *svga_context_create( struct pipe_screen *screen,
+					  void *priv )
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_context *svga = NULL;
+   enum pipe_error ret;
+   /* Returns the new context's pipe_context, or NULL on any failure. */
+   svga = CALLOC_STRUCT(svga_context);
+   if (svga == NULL)
+      goto no_svga;
+   /* Base pipe_context fields and the destroy/clear entry points. */
+   svga->pipe.winsys = screen->winsys;
+   svga->pipe.screen = screen;
+   svga->pipe.priv = priv;
+   svga->pipe.destroy = svga_destroy;
+   svga->pipe.clear = svga_clear;
+   /* Per-context winsys context, obtained from the screen's winsys. */
+   svga->swc = svgascreen->sws->context_create(svgascreen->sws);
+   if(!svga->swc)
+      goto no_swc;
+
+   svga_init_resource_functions(svga);
+   svga_init_blend_functions(svga);
+   svga_init_blit_functions(svga);
+   svga_init_depth_stencil_functions(svga);
+   svga_init_draw_functions(svga);
+   svga_init_flush_functions(svga);
+   svga_init_misc_functions(svga);
+   svga_init_rasterizer_functions(svga);
+   svga_init_sampler_functions(svga);
+   svga_init_fs_functions(svga);
+   svga_init_vs_functions(svga);
+   svga_init_vertex_functions(svga);
+   svga_init_constbuffer_functions(svga);
+   svga_init_query_functions(svga);
+
+
+   /* debug */
+   svga->debug.no_swtnl = debug_get_bool_option("SVGA_NO_SWTNL", FALSE);
+   svga->debug.force_swtnl = debug_get_bool_option("SVGA_FORCE_SWTNL", FALSE);
+   svga->debug.use_min_mipmap = debug_get_bool_option("SVGA_USE_MIN_MIPMAP", FALSE);
+   svga->debug.disable_shader = debug_get_num_option("SVGA_DISABLE_SHADER", ~0);
+
+   if (!svga_init_swtnl(svga))
+      goto no_swtnl;
+
+   svga->fs_bm = util_bitmask_create();
+   if (svga->fs_bm == NULL)
+      goto no_fs_bm;
+
+   svga->vs_bm = util_bitmask_create();
+   if (svga->vs_bm == NULL)
+      goto no_vs_bm;
+
+   svga->upload_ib = u_upload_create( &svga->pipe,
+                                      32 * 1024,
+                                      16,
+                                      PIPE_BIND_INDEX_BUFFER );
+   if (svga->upload_ib == NULL)
+      goto no_upload_ib;
+
+   svga->upload_vb = u_upload_create( &svga->pipe,
+                                      128 * 1024,
+                                      16,
+                                      PIPE_BIND_VERTEX_BUFFER );
+   if (svga->upload_vb == NULL)
+      goto no_upload_vb;
+
+   svga->hwtnl = svga_hwtnl_create( svga,
+                                    svga->upload_ib,
+                                    svga->swc );
+   if (svga->hwtnl == NULL)
+      goto no_hwtnl;
+
+   /* From here on a failure must also tear down hwtnl (label no_state). */
+   ret = svga_emit_initial_state( svga );
+   if (ret)
+      goto no_state;
+   
+   /* Poison the hw state shadows (0xcd) so zero-valued initial state is not
+    * short-circuited as "already emitted"; framebuffer/views are zeroed. */
+   memset(&svga->state.hw_clear, 0xcd, sizeof(svga->state.hw_clear));
+   memset(&svga->state.hw_clear.framebuffer, 0x0, 
+          sizeof(svga->state.hw_clear.framebuffer));
+
+   memset(&svga->state.hw_draw, 0xcd, sizeof(svga->state.hw_draw));
+   memset(&svga->state.hw_draw.views, 0x0, sizeof(svga->state.hw_draw.views));
+   svga->state.hw_draw.num_views = 0;
+
+   svga->dirty = ~0;   /* every SVGA_NEW_* bit set */
+
+   LIST_INITHEAD(&svga->dirty_buffers);
+
+   return &svga->pipe;
+   /* Error unwind: labels release resources in reverse order of creation. */
+no_state:
+   svga_hwtnl_destroy( svga->hwtnl );
+no_hwtnl:
+   u_upload_destroy( svga->upload_vb );
+no_upload_vb:
+   u_upload_destroy( svga->upload_ib );
+no_upload_ib:
+   util_bitmask_destroy( svga->vs_bm );
+no_vs_bm:
+   util_bitmask_destroy( svga->fs_bm );
+no_fs_bm:
+   svga_destroy_swtnl(svga);
+no_swtnl:
+   svga->swc->destroy(svga->swc);
+no_swc:
+   FREE(svga);
+no_svga:
+   return NULL;
+}
+
+
+/* Flush queued uploads and commands; optionally return the fence in *pfence. */
+void svga_context_flush( struct svga_context *svga,
+                         struct pipe_fence_handle **pfence )
+{
+   struct pipe_screen *screen = svga->pipe.screen;
+   struct svga_screen *ss = svga_screen(screen);
+   struct pipe_fence_handle *fence = NULL;
+
+   /* Restart the count of render targets used since the last flush. */
+   svga->curr.nr_fbs = 0;
+
+   /* Unmap upload manager buffers. */
+   u_upload_flush(svga->upload_vb);
+   u_upload_flush(svga->upload_ib);
+
+   /* Texture dma uploads must be processed before commands are submitted. */
+   svga_context_flush_buffers(svga);
+
+   /* Flush pending commands to hardware; 'fence' receives the result. */
+   svga->swc->flush(svga->swc, &fence);
+
+   svga_screen_cache_flush(ss, fence);
+
+   /* DEBUG_SYNC: sync after every flush. */
+   if ((SVGA_DEBUG & DEBUG_SYNC) && fence)
+      screen->fence_finish( screen, fence, 0);
+
+   /* Without a caller-supplied slot the local fence reference is dropped,
+    * otherwise the fence is handed to the caller as-is. */
+   if (!pfence)
+      ss->sws->fence_reference(ss->sws, &fence, NULL);
+   else
+      *pfence = fence;
+}
+
+
+void svga_hwtnl_flush_retry( struct svga_context *svga )
+{
+   /* First attempt; on out-of-memory, flush the context and retry once. */
+   enum pipe_error status = svga_hwtnl_flush( svga->hwtnl );
+
+   if (status == PIPE_ERROR_OUT_OF_MEMORY) {
+      svga_context_flush( svga, NULL );
+      status = svga_hwtnl_flush( svga->hwtnl );
+   }
+
+   assert(status == PIPE_OK);
+}
+
+struct svga_winsys_context *svga_winsys_context( struct pipe_context *pipe )
+{
+   struct svga_context *svga = svga_context( pipe );
+   return svga->swc;
+}
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
new file mode 100644
index 0000000000..9a46de643f
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -0,0 +1,450 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_CONTEXT_H
+#define SVGA_CONTEXT_H
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "util/u_double_list.h"
+
+#include "tgsi/tgsi_scan.h"
+
+
+#define SVGA_TEX_UNITS 8
+#define SVGA_MAX_POINTSIZE 80.0
+
+struct draw_vertex_shader;
+struct svga_shader_result;
+struct SVGACmdMemory;
+struct util_bitmask;
+struct u_upload_mgr;
+
+
+struct svga_shader
+{
+   const struct tgsi_token *tokens;
+
+   struct tgsi_shader_info info;
+
+   struct svga_shader_result *results;
+
+   unsigned id;
+
+   boolean use_sm30;
+};
+
+struct svga_fragment_shader
+{
+   struct svga_shader base;
+};
+
+struct svga_vertex_shader
+{
+   struct svga_shader base;
+
+   struct draw_vertex_shader *draw_shader;
+};
+
+
+struct svga_cache_context;
+struct svga_tracked_state;
+
+struct svga_blend_state {
+
+   boolean need_white_fragments;
+
+   /* Should be per-render-target:
+    */
+   struct {
+      uint8_t writemask;
+
+      boolean blend_enable;
+      uint8_t srcblend;
+      uint8_t dstblend;
+      uint8_t blendeq;
+      
+      boolean separate_alpha_blend_enable;
+      uint8_t srcblend_alpha;
+      uint8_t dstblend_alpha;
+      uint8_t blendeq_alpha;
+
+   } rt[1];
+};
+
+struct svga_depth_stencil_state {
+   unsigned zfunc:8;
+   unsigned zenable:1;
+   unsigned zwriteenable:1;
+
+   unsigned alphatestenable:1;
+   unsigned alphafunc:8;
+  
+   struct {
+      unsigned enabled:1;
+      unsigned func:8;
+      unsigned fail:8;
+      unsigned zfail:8;
+      unsigned pass:8;
+   } stencil[2];
+   
+   /* SVGA3D has one ref/mask/writemask triple shared between front &
+    * back face stencil.  We really need two:
+    */
+   unsigned stencil_mask:8;
+   unsigned stencil_writemask:8;
+
+   float    alpharef;
+};
+
+#define SVGA_UNFILLED_DISABLE 0
+#define SVGA_UNFILLED_LINE    1
+#define SVGA_UNFILLED_POINT   2
+
+#define SVGA_PIPELINE_FLAG_POINTS   (1<<PIPE_PRIM_POINTS)
+#define SVGA_PIPELINE_FLAG_LINES    (1<<PIPE_PRIM_LINES)
+#define SVGA_PIPELINE_FLAG_TRIS     (1<<PIPE_PRIM_TRIANGLES)
+
+struct svga_rasterizer_state {
+   struct pipe_rasterizer_state templ; /* needed for draw module */
+
+   unsigned shademode:8;
+   unsigned cullmode:8;
+   unsigned scissortestenable:1;
+   unsigned multisampleantialias:1;
+   unsigned antialiasedlineenable:1;
+   unsigned lastpixel:1;
+
+   unsigned linepattern;
+
+   float slopescaledepthbias;
+   float depthbias;
+   float pointsize;
+   
+   unsigned hw_unfilled:16;         /* PIPE_POLYGON_MODE_x */
+   unsigned need_pipeline:16;    /* which prims do we need help for? */
+};
+
+struct svga_sampler_state {
+   unsigned mipfilter;
+   unsigned magfilter;
+   unsigned minfilter;
+   unsigned aniso_level;
+   float lod_bias;
+   unsigned addressu;
+   unsigned addressv;
+   unsigned addressw;
+   unsigned bordercolor;
+   unsigned normalized_coords:1;
+   unsigned compare_mode:1;
+   unsigned compare_func:3;
+
+   unsigned min_lod;
+   unsigned view_min_lod;
+   unsigned view_max_lod;
+};
+
+struct svga_velems_state {
+   unsigned count;
+   struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS];
+};
+
+/* Used to calculate differences between state emitted to hardware and
+ * current driver-calculated state.
+ */
+struct svga_state 
+{
+   const struct svga_blend_state *blend;
+   const struct svga_depth_stencil_state *depth;
+   const struct svga_rasterizer_state *rast;
+   const struct svga_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   const struct svga_velems_state *velems;
+
+   struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; /* or texture ID's? */
+   struct svga_fragment_shader *fs;
+   struct svga_vertex_shader *vs;
+
+   struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
+   struct pipe_resource *cb[PIPE_SHADER_TYPES];
+
+   struct pipe_framebuffer_state framebuffer;
+   float depthscale;
+
+   /* Hack to limit the number of different render targets between
+    * flushes.  Helps avoid blowing out our surface cache in EXA.
+    */
+   int nr_fbs;
+
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_clip_state clip;
+   struct pipe_viewport_state viewport;
+
+   unsigned num_samplers;
+   unsigned num_sampler_views;
+   unsigned num_vertex_buffers;
+   unsigned reduced_prim;
+
+   struct {
+      unsigned flag_1d;
+      unsigned flag_srgb;
+   } tex_flags;
+
+   boolean any_user_vertex_buffers;
+
+   unsigned zero_stride_vertex_elements;
+   unsigned num_zero_stride_vertex_elements;
+   /* ### maybe dynamically allocate this */
+   float zero_stride_constants[PIPE_MAX_ATTRIBS*4];
+};
+
+#define RS_MAX 97
+#define TS_MAX 30
+#define CB_MAX 256
+
+struct svga_prescale {
+   float translate[4];
+   float scale[4];
+   boolean enabled;
+};
+
+
+/* Updated by calling svga_update_state( SVGA_STATE_HW_VIEWPORT )
+ */
+struct svga_hw_clear_state
+{
+   struct {
+      unsigned x,y,w,h;
+   } viewport;
+
+   struct {
+      float zmin, zmax;
+   } depthrange;
+   
+   struct pipe_framebuffer_state framebuffer;
+   struct svga_prescale prescale;
+};
+
+struct svga_hw_view_state
+{
+   struct pipe_resource *texture;
+   struct svga_sampler_view *v;
+   unsigned min_lod;
+   unsigned max_lod;
+   int dirty;
+};
+
+/* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW )
+ */
+struct svga_hw_draw_state
+{
+   unsigned rs[RS_MAX];
+   unsigned ts[16][TS_MAX];
+   float cb[PIPE_SHADER_TYPES][CB_MAX][4];
+
+   struct svga_shader_result *fs;
+   struct svga_shader_result *vs;
+   struct svga_hw_view_state views[PIPE_MAX_SAMPLERS];
+
+   unsigned num_views;
+};
+
+
+/* Updated by calling svga_update_state( SVGA_STATE_NEED_SWTNL )
+ */
+struct svga_sw_state
+{
+   unsigned ve_format[PIPE_MAX_ATTRIBS]; /* NEW_VELEMENT */
+
+   /* which parts we need */
+   boolean need_swvfetch;
+   boolean need_pipeline;
+   boolean need_swtnl;
+};
+
+
+/* Queue some state updates (like rss) and submit them to hardware in
+ * a single packet.
+ */
+struct svga_hw_queue;
+
+struct svga_query;
+
+struct svga_context
+{
+   struct pipe_context pipe;   /* must stay first: svga_context() casts to it */
+   struct svga_winsys_context *swc;   /* from sws->context_create() */
+
+   struct {
+      boolean no_swtnl;         /* option SVGA_NO_SWTNL */
+      boolean force_swtnl;      /* option SVGA_FORCE_SWTNL */
+      boolean use_min_mipmap;   /* option SVGA_USE_MIN_MIPMAP */
+
+      /* incremented for each shader */
+      unsigned shader_id;
+
+      unsigned disable_shader;  /* option SVGA_DISABLE_SHADER, default ~0 */
+   } debug;
+
+   struct {
+      struct draw_context *draw;
+      struct vbuf_render *backend;
+      unsigned hw_prim;
+      boolean new_vbuf;
+      boolean new_vdecl;
+   } swtnl;
+
+   /* Bitmask of used shader IDs */
+   struct util_bitmask *fs_bm;
+   struct util_bitmask *vs_bm;
+
+   struct {
+      unsigned dirty[4];
+
+      unsigned texture_timestamp;
+
+      /* See svga_sw_state, svga_hw_draw_state and svga_hw_clear_state.
+       */
+      struct svga_sw_state          sw;
+      struct svga_hw_draw_state     hw_draw;
+      struct svga_hw_clear_state    hw_clear;
+   } state;
+
+   struct svga_state curr;      /* state from the state tracker */
+   unsigned dirty;              /* statechanges since last update_state() */
+
+   struct u_upload_mgr *upload_ib;   /* created with PIPE_BIND_INDEX_BUFFER */
+   struct u_upload_mgr *upload_vb;   /* created with PIPE_BIND_VERTEX_BUFFER */
+   struct svga_hwtnl *hwtnl;         /* created from upload_ib and swc */
+
+   /** The occlusion query currently in progress */
+   struct svga_query *sq;
+
+   /** List of buffers with queued transfers */
+   struct list_head dirty_buffers;
+};
+
+/* A flag for each state_tracker state object:
+ */
+#define SVGA_NEW_BLEND               0x1
+#define SVGA_NEW_DEPTH_STENCIL       0x2
+#define SVGA_NEW_RAST                0x4
+#define SVGA_NEW_SAMPLER             0x8
+#define SVGA_NEW_TEXTURE             0x10
+#define SVGA_NEW_VBUFFER             0x20
+#define SVGA_NEW_VELEMENT            0x40
+#define SVGA_NEW_FS                  0x80
+#define SVGA_NEW_VS                  0x100
+#define SVGA_NEW_FS_CONST_BUFFER     0x200
+#define SVGA_NEW_VS_CONST_BUFFER     0x400
+#define SVGA_NEW_FRAME_BUFFER        0x800
+#define SVGA_NEW_STIPPLE             0x1000
+#define SVGA_NEW_SCISSOR             0x2000
+#define SVGA_NEW_BLEND_COLOR         0x4000
+#define SVGA_NEW_CLIP                0x8000
+#define SVGA_NEW_VIEWPORT            0x10000
+#define SVGA_NEW_PRESCALE            0x20000
+#define SVGA_NEW_REDUCED_PRIMITIVE   0x40000
+#define SVGA_NEW_TEXTURE_BINDING     0x80000
+#define SVGA_NEW_NEED_PIPELINE       0x100000
+#define SVGA_NEW_NEED_SWVFETCH       0x200000
+#define SVGA_NEW_NEED_SWTNL          0x400000
+#define SVGA_NEW_FS_RESULT           0x800000
+#define SVGA_NEW_VS_RESULT           0x1000000
+#define SVGA_NEW_ZERO_STRIDE         0x2000000
+#define SVGA_NEW_TEXTURE_FLAGS       0x4000000
+#define SVGA_NEW_STENCIL_REF         0x8000000
+
+
+
+
+
+/***********************************************************************
+ * svga_clear.c: 
+ */
+void svga_clear(struct pipe_context *pipe, 
+                unsigned buffers,
+                const float *rgba,
+                double depth,
+                unsigned stencil);
+
+
+/***********************************************************************
+ * svga_screen_texture.c: 
+ */
+void svga_mark_surfaces_dirty(struct svga_context *svga);
+
+
+
+
+void svga_init_state_functions( struct svga_context *svga );
+void svga_init_flush_functions( struct svga_context *svga );
+void svga_init_string_functions( struct svga_context *svga );
+void svga_init_blit_functions(struct svga_context *svga);
+
+void svga_init_blend_functions( struct svga_context *svga );
+void svga_init_depth_stencil_functions( struct svga_context *svga );
+void svga_init_misc_functions( struct svga_context *svga );
+void svga_init_rasterizer_functions( struct svga_context *svga );
+void svga_init_sampler_functions( struct svga_context *svga );
+void svga_init_fs_functions( struct svga_context *svga );
+void svga_init_vs_functions( struct svga_context *svga );
+void svga_init_vertex_functions( struct svga_context *svga );
+void svga_init_constbuffer_functions( struct svga_context *svga );
+void svga_init_draw_functions( struct svga_context *svga );
+void svga_init_query_functions( struct svga_context *svga );
+
+void svga_cleanup_vertex_state( struct svga_context *svga );
+void svga_cleanup_tss_binding( struct svga_context *svga );
+void svga_cleanup_framebuffer( struct svga_context *svga );
+
+void svga_context_flush( struct svga_context *svga,
+                         struct pipe_fence_handle **pfence );
+
+void svga_hwtnl_flush_retry( struct svga_context *svga );
+
+struct pipe_context *
+svga_context_create(struct pipe_screen *screen,
+		    void *priv);
+
+
+/***********************************************************************
+ * Inline conversion functions.  These are better-typed than the
+ * macros used previously:
+ */
+static INLINE struct svga_context *
+svga_context( struct pipe_context *pipe )
+{
+   return (struct svga_context *)pipe;   /* 'pipe' is svga_context's first member */
+}
+
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h
new file mode 100644
index 0000000000..3a3fcd8fae
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_debug.h
@@ -0,0 +1,75 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DEBUG_H
+#define SVGA_DEBUG_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+
+#define DEBUG_DMA      0x1
+#define DEBUG_TGSI     0x4
+#define DEBUG_PIPE     0x8
+#define DEBUG_STATE    0x10
+#define DEBUG_SCREEN   0x20
+#define DEBUG_TEX      0x40
+#define DEBUG_SWTNL    0x80
+#define DEBUG_CONSTS   0x100
+#define DEBUG_VIEWPORT 0x200
+#define DEBUG_VIEWS    0x400
+#define DEBUG_PERF     0x800    /* print something when we hit any slow path operation */
+#define DEBUG_FLUSH    0x1000   /* flush after every draw */
+#define DEBUG_SYNC     0x2000   /* sync after every flush */
+#define DEBUG_QUERY    0x4000
+#define DEBUG_CACHE    0x8000
+
+#ifdef DEBUG
+extern int SVGA_DEBUG;
+#define DBSTR(x) x
+#else
+#define SVGA_DEBUG 0
+#define DBSTR(x) ""
+#endif
+
+static INLINE void
+SVGA_DBG( unsigned flag, const char *fmt, ... )
+{
+#ifdef DEBUG
+    va_list ap;
+
+    /* Print only when one of the requested bits is set in SVGA_DEBUG. */
+    if (!(SVGA_DEBUG & flag))
+        return;
+    va_start( ap, fmt );
+    debug_vprintf( fmt, ap );
+    va_end( ap );
+#else
+    (void)flag;   /* non-DEBUG build: arguments are deliberately unused */
+    (void)fmt;
+#endif
+}
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
new file mode 100644
index 0000000000..81dd4778d0
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -0,0 +1,382 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_compiler.h"
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_draw.h"
+#include "svga_draw_private.h"
+#include "svga_debug.h"
+#include "svga_screen.h"
+#include "svga_resource_buffer.h"
+#include "svga_resource_texture.h"
+#include "svga_surface.h"
+#include "svga_winsys.h"
+#include "svga_cmd.h"
+
+
+/**
+ * Allocate a hwtnl object and record its owning context, index upload
+ * manager and winsys context.  Returns NULL if the allocation fails.
+ */
+struct svga_hwtnl *svga_hwtnl_create( struct svga_context *svga,
+                                      struct u_upload_mgr *upload_ib,
+                                      struct svga_winsys_context *swc )
+{
+   struct svga_hwtnl *tnl = CALLOC_STRUCT(svga_hwtnl);
+
+   if (!tnl)
+      return NULL;
+
+   tnl->svga = svga;
+   tnl->upload_ib = upload_ib;
+   tnl->cmd.swc = swc;
+   return tnl;
+}
+
+/**
+ * Release all buffer references held by the hwtnl, then free it.
+ */
+void svga_hwtnl_destroy( struct svga_hwtnl *hwtnl )
+{
+   unsigned i, j;   /* unsigned, like the counter in svga_hwtnl_reset_vdecl() */
+
+   for (i = 0; i < PIPE_PRIM_MAX; i++) {
+      for (j = 0; j < IDX_CACHE_MAX; j++)
+         pipe_resource_reference( &hwtnl->index_cache[i][j].buffer, NULL );
+   }
+
+   for (i = 0; i < hwtnl->cmd.vdecl_count; i++)
+      pipe_resource_reference(&hwtnl->cmd.vdecl_vb[i], NULL);
+
+   for (i = 0; i < hwtnl->cmd.prim_count; i++)
+      pipe_resource_reference(&hwtnl->cmd.prim_ib[i], NULL);
+
+   FREE(hwtnl);
+}
+
+
+void svga_hwtnl_set_flatshade( struct svga_hwtnl *hwtnl, boolean flatshade,
+                               boolean flatshade_first )
+{
+   /* api_pv is PV_LAST only for flat shading without flatshade_first. */
+   hwtnl->api_pv = (!flatshade || flatshade_first) ? PV_FIRST : PV_LAST;
+   hwtnl->hw_pv = PV_FIRST;
+}
+
+void svga_hwtnl_set_unfilled( struct svga_hwtnl *hwtnl, unsigned mode )
+{
+   /* Only recorded here; nothing is emitted by this call. */
+   hwtnl->api_fillmode = mode;
+}
+
+/**
+ * Set the vdecl count to 'count', unreferencing the VBs of slots >= count.
+ */
+void svga_hwtnl_reset_vdecl( struct svga_hwtnl *hwtnl, unsigned count )
+{
+   unsigned slot = count;
+
+   assert(hwtnl->cmd.prim_count == 0);
+
+   for (; slot < hwtnl->cmd.vdecl_count; ++slot)
+      pipe_resource_reference(&hwtnl->cmd.vdecl_vb[slot], NULL);
+
+   hwtnl->cmd.vdecl_count = count;
+}
+
+
+/**
+ * Store declaration 'decl' and a reference to buffer 'vb' in vdecl slot 'i'.
+ */
+void svga_hwtnl_vdecl( struct svga_hwtnl *hwtnl,
+                       unsigned i,
+                       const SVGA3dVertexDecl *decl,
+                       struct pipe_resource *vb)
+{
+   assert(hwtnl->cmd.prim_count == 0);
+   assert(i < hwtnl->cmd.vdecl_count);
+   hwtnl->cmd.vdecl[i] = *decl;
+   pipe_resource_reference(&hwtnl->cmd.vdecl_vb[i], vb);
+}
+
+
+
+enum pipe_error
+svga_hwtnl_flush( struct svga_hwtnl *hwtnl )
+{
+   struct svga_winsys_context *swc = hwtnl->cmd.swc;
+   struct svga_context *svga = hwtnl->svga;
+   enum pipe_error ret;
+   /* With no queued primitives this is a no-op that returns PIPE_OK. */
+   if (hwtnl->cmd.prim_count) {
+      struct svga_winsys_surface *vb_handle[SVGA3D_INPUTREG_MAX];
+      struct svga_winsys_surface *ib_handle[QSZ];
+      struct svga_winsys_surface *handle;
+      SVGA3dVertexDecl *vdecl;
+      SVGA3dPrimitiveRange *prim;
+      unsigned i;
+      /* Look up the winsys surface handle of every queued vertex buffer. */
+      for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
+         handle = svga_buffer_handle(svga, hwtnl->cmd.vdecl_vb[i]);
+         if (handle == NULL)
+            return PIPE_ERROR_OUT_OF_MEMORY;
+
+         vb_handle[i] = handle;
+      }
+      /* Same for index buffers; a primitive without one gets a NULL handle. */
+      for (i = 0; i < hwtnl->cmd.prim_count; i++) {
+         if (hwtnl->cmd.prim_ib[i]) {
+            handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i]);
+            if (handle == NULL)
+               return PIPE_ERROR_OUT_OF_MEMORY;
+         }
+         else
+            handle = NULL;
+
+         ib_handle[i] = handle;
+      }
+
+      SVGA_DBG(DEBUG_DMA, "draw to sid %p, %d prims\n",
+               svga->curr.framebuffer.cbufs[0] ?
+               svga_surface(svga->curr.framebuffer.cbufs[0])->handle : NULL,
+               hwtnl->cmd.prim_count);
+
+      ret = SVGA3D_BeginDrawPrimitives(swc, 
+                                       &vdecl, 
+                                       hwtnl->cmd.vdecl_count, 
+                                       &prim, 
+                                       hwtnl->cmd.prim_count);
+      if (ret != PIPE_OK) 
+         return ret;
+
+      /* Copy the queued vertex declarations into the command. */
+      memcpy( vdecl,
+              hwtnl->cmd.vdecl,
+              hwtnl->cmd.vdecl_count * sizeof hwtnl->cmd.vdecl[0]);
+
+      for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
+         /* rangeHint is relative to indexBias, which varies per primitive, so
+          * a rangeHint is only supplied when exactly one primitive is queued.
+          * NOTE(review): if max_index is ~0, "last" wraps to 0 -- confirm OK.
+          */
+         if (hwtnl->cmd.prim_count == 1) {
+            vdecl[i].rangeHint.first = hwtnl->cmd.min_index[0];
+            vdecl[i].rangeHint.last = hwtnl->cmd.max_index[0] + 1;
+         }
+         else {
+            vdecl[i].rangeHint.first = 0;
+            vdecl[i].rangeHint.last = 0;
+         }
+
+         swc->surface_relocation(swc,
+                                 &vdecl[i].array.surfaceId,
+                                 vb_handle[i],
+                                 SVGA_RELOC_READ);
+      }
+      /* Likewise copy the queued primitive ranges and relocate their IBs. */
+      memcpy( prim,
+              hwtnl->cmd.prim,
+              hwtnl->cmd.prim_count * sizeof hwtnl->cmd.prim[0]);
+
+      for (i = 0; i < hwtnl->cmd.prim_count; i++) {
+         swc->surface_relocation(swc,
+                                 &prim[i].indexArray.surfaceId,
+                                 ib_handle[i],
+                                 SVGA_RELOC_READ);
+         pipe_resource_reference(&hwtnl->cmd.prim_ib[i], NULL);
+      }
+      /* Index buffer references were dropped above; the queue is now empty. */
+      SVGA_FIFOCommitAll( swc );
+      hwtnl->cmd.prim_count = 0;
+   }
+
+   return PIPE_OK;
+}
+
+
+
+
+
+/***********************************************************************
+ * Internal functions:
+ */
+
+/**
+ * Queue one primitive range (plus optional index buffer) for the next draw
+ * command; flushes first if the queue is nearly full and returns that error.
+ */
+enum pipe_error svga_hwtnl_prim( struct svga_hwtnl *hwtnl,
+                                 const SVGA3dPrimitiveRange *range,
+                                 unsigned min_index,
+                                 unsigned max_index,
+                                 struct pipe_resource *ib )
+{
+   enum pipe_error ret = PIPE_OK;
+
+#ifdef DEBUG
+   {
+      unsigned i;
+      for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
+         struct pipe_resource *vb = hwtnl->cmd.vdecl_vb[i];
+         unsigned size = vb ? vb->width0 : 0;
+         unsigned offset = hwtnl->cmd.vdecl[i].array.offset;
+         unsigned stride = hwtnl->cmd.vdecl[i].array.stride;
+         /* signed, so that the >= 0 check below can actually fail */
+         int index_bias = range->indexBias;
+         unsigned width;
+
+         assert(vb);
+         assert(size);
+         assert(offset < size);
+         assert(index_bias >= 0);
+         assert(min_index <= max_index);
+         assert(offset + index_bias*stride < size);
+         if (min_index != ~0)
+            assert(offset + (index_bias + min_index) * stride < size);
+
+         switch (hwtnl->cmd.vdecl[i].identity.type) {
+         case SVGA3D_DECLTYPE_FLOAT1:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT2:
+            width = 4*2;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT3:
+            width = 4*3;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT4:
+            width = 4*4;
+            break;
+         case SVGA3D_DECLTYPE_D3DCOLOR:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_UBYTE4:
+            width = 1*4;
+            break;
+         case SVGA3D_DECLTYPE_SHORT2:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_SHORT4:
+            width = 2*4;
+            break;
+         case SVGA3D_DECLTYPE_UBYTE4N:
+            width = 1*4;
+            break;
+         case SVGA3D_DECLTYPE_SHORT2N:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_SHORT4N:
+            width = 2*4;
+            break;
+         case SVGA3D_DECLTYPE_USHORT2N:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_USHORT4N:
+            width = 2*4;
+            break;
+         case SVGA3D_DECLTYPE_UDEC3:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_DEC3N:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT16_2:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT16_4:
+            width = 2*4;
+            break;
+         default:
+            assert(0);
+            width = 0;
+            break;
+         }
+
+         assert(!stride || width <= stride);
+         if (max_index != ~0)
+            assert(offset + (index_bias + max_index) * stride + width <= size);
+      }
+
+      assert(range->indexWidth == range->indexArray.stride);
+
+      if(ib) {
+         unsigned size = ib->width0;
+         unsigned offset = range->indexArray.offset;
+         unsigned stride = range->indexArray.stride;
+         unsigned count;
+
+         assert(size);
+         assert(offset < size);
+         assert(stride);
+         switch (range->primType) {
+         case SVGA3D_PRIMITIVE_POINTLIST:
+            count = range->primitiveCount;
+            break;
+         case SVGA3D_PRIMITIVE_LINELIST:
+            count = range->primitiveCount * 2;
+            break;
+         case SVGA3D_PRIMITIVE_LINESTRIP:
+            count = range->primitiveCount + 1;
+            break;
+         case SVGA3D_PRIMITIVE_TRIANGLELIST:
+            count = range->primitiveCount * 3;
+            break;
+         case SVGA3D_PRIMITIVE_TRIANGLESTRIP:
+            count = range->primitiveCount + 2;
+            break;
+         case SVGA3D_PRIMITIVE_TRIANGLEFAN:
+            count = range->primitiveCount + 2;
+            break;
+         default:
+            assert(0);
+            count = 0;
+            break;
+         }
+         assert(offset + count*stride <= size);
+      }
+   }
+#endif
+
+   /* Flush once adding this primitive would bring the queue to QSZ entries. */
+   if (hwtnl->cmd.prim_count+1 >= QSZ) {
+      ret = svga_hwtnl_flush( hwtnl );
+      if (ret != PIPE_OK)
+         return ret;
+   }
+
+   /* min/max indices are relative to bias */
+   hwtnl->cmd.min_index[hwtnl->cmd.prim_count] = min_index;
+   hwtnl->cmd.max_index[hwtnl->cmd.prim_count] = max_index;
+   hwtnl->cmd.prim[hwtnl->cmd.prim_count] = *range;
+   pipe_resource_reference(&hwtnl->cmd.prim_ib[hwtnl->cmd.prim_count], ib);
+   hwtnl->cmd.prim_count++;
+
+   return ret;
+}
diff --git a/src/gallium/drivers/svga/svga_draw.h b/src/gallium/drivers/svga/svga_draw.h
new file mode 100644
index 0000000000..a2403d802b
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw.h
@@ -0,0 +1,83 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DRAW_H
+#define SVGA_DRAW_H
+
+#include "pipe/p_compiler.h"
+
+#include "svga_hw_reg.h"
+
+struct svga_hwtnl;
+struct svga_winsys_context;
+struct svga_screen;
+struct svga_context;
+struct pipe_resource;
+struct u_upload_mgr;
+
+struct svga_hwtnl *svga_hwtnl_create( struct svga_context *svga,
+                                      struct u_upload_mgr *upload_ib,
+                                      struct svga_winsys_context *swc );
+
+void svga_hwtnl_destroy( struct svga_hwtnl *hwtnl );
+
+void svga_hwtnl_set_flatshade( struct svga_hwtnl *hwtnl,
+                               boolean flatshade,
+                               boolean flatshade_first );
+
+void svga_hwtnl_set_unfilled( struct svga_hwtnl *hwtnl,
+                              unsigned mode );
+
+void svga_hwtnl_vdecl( struct svga_hwtnl *hwtnl,
+                       unsigned i,
+                       const SVGA3dVertexDecl *decl,
+                       struct pipe_resource *vb);
+
+void svga_hwtnl_reset_vdecl( struct svga_hwtnl *hwtnl,
+                             unsigned count );
+
+/* Draw count vertices, beginning at vertex start, without an index buffer. */
+enum pipe_error 
+svga_hwtnl_draw_arrays( struct svga_hwtnl *hwtnl,
+                        unsigned prim, 
+                        unsigned start, 
+                        unsigned count);
+
+enum pipe_error
+svga_hwtnl_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                struct pipe_resource *indexBuffer,
+                                unsigned index_size,   /* bytes per index */
+                                int index_bias,
+                                unsigned min_index,    /* relative to index_bias */
+                                unsigned max_index,    /* relative to index_bias */
+                                unsigned prim, 
+                                unsigned start,        /* first index to draw */
+                                unsigned count );      /* number of indices */
+
+enum pipe_error
+svga_hwtnl_flush( struct svga_hwtnl *hwtnl );
+
+
+#endif /* SVGA_DRAW_H */
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
new file mode 100644
index 0000000000..da33fae62f
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -0,0 +1,297 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "util/u_inlines.h"
+#include "indices/u_indices.h"
+
+#include "svga_hw_reg.h"
+#include "svga_draw.h"
+#include "svga_draw_private.h"
+#include "svga_context.h"
+
+
+#define DBG 0
+
+
+
+
+/**
+ * Build a new index buffer by running an index generator.
+ *
+ * A buffer of nr * index_size bytes is created, mapped for writing and
+ * handed to generate(nr, map).  On success the new buffer is stored in
+ * *out_buf and PIPE_OK is returned.  If creation or mapping fails,
+ * *out_buf is left untouched and PIPE_ERROR_OUT_OF_MEMORY is
+ * returned.
+ */
+static enum pipe_error generate_indices( struct svga_hwtnl *hwtnl,
+                                         unsigned nr,
+                                         unsigned index_size,
+                                         u_generate_func generate,
+                                         struct pipe_resource **out_buf )
+{
+   struct pipe_context *pipe = &hwtnl->svga->pipe;
+   struct pipe_transfer *xfer;
+   struct pipe_resource *buf;
+   void *map;
+
+   buf = pipe_buffer_create( pipe->screen,
+                             PIPE_BIND_INDEX_BUFFER,
+                             index_size * nr );
+   if (!buf)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   map = pipe_buffer_map( pipe, buf, PIPE_TRANSFER_WRITE, &xfer );
+   if (!map) {
+      /* Nothing was mapped, so only the buffer has to be released. */
+      pipe->screen->resource_destroy( pipe->screen, buf );
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   generate( nr, map );
+
+   pipe_buffer_unmap( pipe, buf, xfer );
+
+   *out_buf = buf;
+   return PIPE_OK;
+}
+
+/* Decide whether a cached index set of cached_nr entries can satisfy a
+ * request for nr entries: U_GENERATE_REUSABLE sets only need to be at
+ * least as large, any other type must match the count exactly.
+ */
+static boolean compare( unsigned cached_nr, unsigned nr, unsigned type )
+{
+   const boolean reusable = (type == U_GENERATE_REUSABLE);
+   return reusable ? (cached_nr >= nr) : (cached_nr == nr);
+}
+
+/**
+ * Fetch an index buffer for a non-indexed draw from the per-primitive
+ * cache hwtnl->index_cache[prim], generating and caching a new one on a
+ * miss.  A new buffer goes into an empty slot or, if the cache is full,
+ * replaces the entry holding the fewest indices.
+ * On success *out_buf carries a reference the caller must release.
+ * Returns PIPE_OK, or the error reported by generate_indices().
+ */
+static enum pipe_error retrieve_or_generate_indices( struct svga_hwtnl *hwtnl,
+                                                     unsigned prim,
+                                                     unsigned gen_type,
+                                                     unsigned gen_nr,
+                                                     unsigned gen_size,
+                                                     u_generate_func generate,
+                                                     struct pipe_resource **out_buf )
+{
+   struct index_cache *cache = hwtnl->index_cache[prim];
+   enum pipe_error ret = PIPE_OK;
+   int i;
+
+   for (i = 0; i < IDX_CACHE_MAX; i++) {
+      if (cache[i].buffer != NULL &&
+          cache[i].generate == generate)
+      {
+         if (compare(cache[i].gen_nr, gen_nr, gen_type))
+         {
+            pipe_resource_reference( out_buf, cache[i].buffer );
+            if (DBG)
+               debug_printf("%s retrieve %d/%d\n", __FUNCTION__, i, gen_nr);
+
+            return PIPE_OK;
+         }
+         else if (gen_type == U_GENERATE_REUSABLE)
+         {
+            /* Cached reusable buffer is too small: regenerate into its slot. */
+            pipe_resource_reference( &cache[i].buffer, NULL );
+
+            if (DBG)
+               debug_printf("%s discard %d/%d\n", __FUNCTION__,
+                            i, cache[i].gen_nr);
+
+            break;
+         }
+      }
+   }
+
+   if (i == IDX_CACHE_MAX)
+   {
+      /* No usable entry: pick a victim.  An empty slot ends the search
+       * (smallest_size == 0); else take the entry with fewest indices. */
+      unsigned smallest = 0;
+      unsigned smallest_size = ~0;
+
+      for (i = 0; i < IDX_CACHE_MAX && smallest_size; i++) {
+         if (cache[i].buffer == NULL)
+         {
+            smallest = i;
+            smallest_size = 0;
+         }
+         else if (cache[i].gen_nr < smallest_size)
+         {
+            smallest = i;
+            smallest_size = cache[i].gen_nr;
+         }
+      }
+
+      assert (smallest < IDX_CACHE_MAX);
+
+      pipe_resource_reference( &cache[smallest].buffer, NULL );
+
+      if (DBG)
+         debug_printf("%s discard smallest %d/%d\n", __FUNCTION__,
+                      smallest, smallest_size);
+
+      i = smallest;
+   }
+
+   ret = generate_indices( hwtnl, gen_nr, gen_size, generate, out_buf );
+   if (ret != PIPE_OK)
+      return ret;
+
+   cache[i].generate = generate;
+   cache[i].gen_nr = gen_nr;
+   pipe_resource_reference( &cache[i].buffer, *out_buf );
+
+   if (DBG)
+      debug_printf("%s cache %d/%d\n", __FUNCTION__, i, cache[i].gen_nr);
+
+   return PIPE_OK;
+}
+
+
+
+/* Queue a non-indexed draw of count vertices, the first being vertex
+ * number start.  Fails with PIPE_ERROR_BAD_INPUT when count yields no
+ * complete primitive. */
+static enum pipe_error
+simple_draw_arrays( struct svga_hwtnl *hwtnl,
+                    unsigned prim, unsigned start, unsigned count )
+{
+   SVGA3dPrimitiveRange r;
+   unsigned nr_prims;
+   const unsigned type = svga_translate_prim(prim, count, &nr_prims);
+
+   if (nr_prims == 0)
+      return PIPE_ERROR_BAD_INPUT;
+
+   /* No index array is attached; start is conveyed through indexBias. */
+   r.indexBias = start;
+   r.indexWidth = 0;
+   r.indexArray.stride = 0;
+   r.indexArray.offset = 0;
+   r.indexArray.surfaceId = SVGA3D_INVALID_ID;
+   r.primitiveCount = nr_prims;
+   r.primType = type;
+
+   /* min/max index are given before the bias is applied, i.e. as
+    * 0 .. count - 1; consumers add r.indexBias themselves. */
+   return svga_hwtnl_prim( hwtnl, &r, 0, count - 1, NULL );
+}
+
+
+
+
+
+
+
+
+
+
+/**
+ * Draw count vertices, beginning at vertex start, as primitive prim.
+ *
+ * An index generator is selected first: the unfilled one when the API
+ * fill mode is not PIPE_POLYGON_MODE_FILL and prim >= PIPE_PRIM_TRIANGLES,
+ * the regular one otherwise.  If it reports U_GENERATE_LINEAR the draw
+ * is issued without indices; else an index buffer is fetched from the
+ * cache (or generated) and drawn with start as the index bias.
+ * Returns PIPE_OK or the first error encountered.
+ */
+enum pipe_error
+svga_hwtnl_draw_arrays( struct svga_hwtnl *hwtnl,
+                        unsigned prim,
+                        unsigned start,
+                        unsigned count)
+{
+   const boolean unfilled = (hwtnl->api_fillmode != PIPE_POLYGON_MODE_FILL &&
+                             prim >= PIPE_PRIM_TRIANGLES);
+   struct pipe_resource *indices = NULL;
+   unsigned gen_prim, gen_size, gen_nr, gen_type;
+   u_generate_func gen_func;
+   enum pipe_error ret;
+
+   if (unfilled) {
+      gen_type = u_unfilled_generator( prim,
+                                       start,
+                                       count,
+                                       hwtnl->api_fillmode,
+                                       &gen_prim,
+                                       &gen_size,
+                                       &gen_nr,
+                                       &gen_func );
+   }
+   else {
+      gen_type = u_index_generator( svga_hw_prims,
+                                    prim,
+                                    start,
+                                    count,
+                                    hwtnl->api_pv,
+                                    hwtnl->hw_pv,
+                                    &gen_prim,
+                                    &gen_size,
+                                    &gen_nr,
+                                    &gen_func );
+   }
+
+   /* U_GENERATE_LINEAR: draw from the vertex arrays, no index buffer. */
+   if (gen_type == U_GENERATE_LINEAR)
+      return simple_draw_arrays( hwtnl, gen_prim, start, count );
+
+   /* Otherwise draw as an indexed primitive, running the generator
+    * only if the cache holds no suitable buffer.
+    */
+   ret = retrieve_or_generate_indices( hwtnl, prim, gen_type,
+                                       gen_nr, gen_size, gen_func,
+                                       &indices );
+   if (ret == PIPE_OK) {
+      ret = svga_hwtnl_simple_draw_range_elements( hwtnl,
+                                                   indices,
+                                                   gen_size,
+                                                   start,       /* index_bias */
+                                                   0,           /* min_index */
+                                                   count - 1,   /* max_index */
+                                                   gen_prim,
+                                                   0,           /* start */
+                                                   gen_nr );
+   }
+
+   /* Drop the local reference whether or not the draw was queued. */
+   if (indices)
+      pipe_resource_reference( &indices, NULL );
+
+   return ret;
+}
+
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
new file mode 100644
index 0000000000..c4579177b7
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -0,0 +1,255 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "indices/u_indices.h"
+
+#include "svga_cmd.h"
+#include "svga_draw.h"
+#include "svga_draw_private.h"
+#include "svga_resource_buffer.h"
+#include "svga_winsys.h"
+#include "svga_context.h"
+
+#include "svga_hw_reg.h"
+
+
+/**
+ * Produce a translated copy of part of an index buffer.
+ *
+ * A new buffer of nr * index_size bytes is created, then src (mapped
+ * for reading) and the new buffer (mapped for writing) are handed to
+ * translate(), which is given the source mapping advanced by offset
+ * bytes.  On success the new buffer is stored in *out_buf and PIPE_OK is
+ * returned; any failure yields PIPE_ERROR_OUT_OF_MEMORY.
+ */
+static enum pipe_error
+translate_indices( struct svga_hwtnl *hwtnl,
+                   struct pipe_resource *src,
+                   unsigned offset,
+                   unsigned nr,
+                   unsigned index_size,
+                   u_translate_func translate,
+                   struct pipe_resource **out_buf )
+{
+   struct pipe_context *pipe = &hwtnl->svga->pipe;
+   struct pipe_transfer *rd_xfer = NULL;
+   struct pipe_transfer *wr_xfer = NULL;
+   struct pipe_resource *out;
+   const char *in_bytes;
+   void *out_bytes;
+
+   out = pipe_buffer_create( pipe->screen,
+                             PIPE_BIND_INDEX_BUFFER,
+                             index_size * nr );
+   if (!out)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   in_bytes = pipe_buffer_map( pipe, src, PIPE_TRANSFER_READ, &rd_xfer );
+   if (!in_bytes)
+      goto destroy_out;
+
+   out_bytes = pipe_buffer_map( pipe, out, PIPE_TRANSFER_WRITE, &wr_xfer );
+   if (!out_bytes)
+      goto unmap_src;
+
+   translate( in_bytes + offset, nr, out_bytes );
+
+   pipe_buffer_unmap( pipe, src, rd_xfer );
+   pipe_buffer_unmap( pipe, out, wr_xfer );
+
+   *out_buf = out;
+   return PIPE_OK;
+
+unmap_src:
+   pipe_buffer_unmap( pipe, src, rd_xfer );
+destroy_out:
+   pipe->screen->resource_destroy( pipe->screen, out );
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+
+
+
+
+/**
+ * Queue an indexed draw of count indices, index_size bytes each, read
+ * from index_buffer beginning at index start.  A user index buffer is
+ * uploaded through hwtnl->upload_ib first.  Returns PIPE_OK (nothing is
+ * queued if count forms no primitive) or the upload/queueing error.
+ */
+enum pipe_error
+svga_hwtnl_simple_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                       struct pipe_resource *index_buffer,
+                                       unsigned index_size,
+                                       int index_bias,
+                                       unsigned min_index,
+                                       unsigned max_index,
+                                       unsigned prim,
+                                       unsigned start,
+                                       unsigned count )
+{
+   struct pipe_resource *upload_buffer = NULL;
+   SVGA3dPrimitiveRange range;
+   unsigned hw_prim;
+   unsigned hw_count;
+   unsigned index_offset = start * index_size;
+   enum pipe_error ret = PIPE_OK;
+
+   hw_prim = svga_translate_prim(prim, count, &hw_count);
+   if (hw_count == 0)
+      goto done;
+
+   if (index_buffer &&
+       svga_buffer_is_user_buffer(index_buffer))
+   {
+      assert( index_buffer->width0 >= index_offset + count * index_size );
+      ret = u_upload_buffer( hwtnl->upload_ib,
+                             index_offset, count * index_size, index_buffer,
+                             &index_offset, &upload_buffer );
+      if (ret)
+         goto done;
+
+      /* No refcounting needed for index_buffer: it is a stack variable
+       * without a counted reference; the caller holds the reference. */
+      index_buffer = upload_buffer;
+   }
+
+   range.primType = hw_prim;
+   range.primitiveCount = hw_count;
+   /* Define surfaceId too: svga_hwtnl_prim() copies the whole struct. */
+   range.indexArray.surfaceId = SVGA3D_INVALID_ID;
+   range.indexArray.offset = index_offset;
+   range.indexArray.stride = index_size;
+   range.indexWidth = index_size;
+   range.indexBias = index_bias;
+
+   ret = svga_hwtnl_prim( hwtnl, &range, min_index, max_index, index_buffer );
+
+done:
+   if (upload_buffer)
+      pipe_resource_reference( &upload_buffer, NULL );
+
+   return ret;
+}
+
+
+
+
+/**
+ * Draw count indices of index_buffer, beginning at index start.
+ *
+ * An index translator is selected first (the unfilled one when the API
+ * fill mode is not PIPE_POLYGON_MODE_FILL and prim >= PIPE_PRIM_TRIANGLES).
+ * If it reports U_TRANSLATE_MEMCPY the caller's indices are drawn as they
+ * are; otherwise they are translated into a temporary index buffer,
+ * which is then drawn from its first index.
+ *
+ * Returns PIPE_OK or the first error encountered.
+ */
+enum pipe_error
+svga_hwtnl_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                struct pipe_resource *index_buffer,
+                                unsigned index_size,
+                                int index_bias,
+                                unsigned min_index,
+                                unsigned max_index,
+                                unsigned prim, unsigned start, unsigned count)
+{
+   const boolean unfilled = (hwtnl->api_fillmode != PIPE_POLYGON_MODE_FILL &&
+                             prim >= PIPE_PRIM_TRIANGLES);
+   struct pipe_resource *translated = NULL;
+   unsigned gen_prim, gen_size, gen_nr, gen_type;
+   u_translate_func gen_func;
+   enum pipe_error ret;
+
+   if (unfilled) {
+      gen_type = u_unfilled_translator( prim,
+                                        index_size,
+                                        count,
+                                        hwtnl->api_fillmode,
+                                        &gen_prim,
+                                        &gen_size,
+                                        &gen_nr,
+                                        &gen_func );
+   }
+   else {
+      gen_type = u_index_translator( svga_hw_prims,
+                                     prim,
+                                     index_size,
+                                     count,
+                                     hwtnl->api_pv,
+                                     hwtnl->hw_pv,
+                                     &gen_prim,
+                                     &gen_size,
+                                     &gen_nr,
+                                     &gen_func );
+   }
+
+   /* No need for translation, just pass through to hardware. */
+   if (gen_type == U_TRANSLATE_MEMCPY) {
+      return svga_hwtnl_simple_draw_range_elements( hwtnl, index_buffer,
+                                                    index_size, index_bias,
+                                                    min_index, max_index,
+                                                    gen_prim, start, count );
+   }
+
+   /* Allocate a new index buffer and run the translate func to
+    * populate it.  The result could be cached alongside the original
+    * to avoid re-translation, but there is little point while only GL
+    * is accelerated, as index buffers are typically used just once
+    * there.
+    */
+   ret = translate_indices( hwtnl,
+                            index_buffer,
+                            start * index_size,
+                            gen_nr,
+                            gen_size,
+                            gen_func,
+                            &translated );
+   if (ret == PIPE_OK) {
+      /* The translated buffer was filled starting from the first index
+       * to draw, hence start is 0 and the count is gen_nr here.
+       */
+      ret = svga_hwtnl_simple_draw_range_elements( hwtnl,
+                                                   translated,
+                                                   gen_size,
+                                                   index_bias,
+                                                   min_index,
+                                                   max_index,
+                                                   gen_prim,
+                                                   0,
+                                                   gen_nr );
+   }
+
+   if (translated)
+      pipe_resource_reference( &translated, NULL );
+
+   return ret;
+}
+
+
+
+
+
diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h
new file mode 100644
index 0000000000..11afb59875
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw_private.h
@@ -0,0 +1,158 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DRAW_H_
+#define SVGA_DRAW_H_
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "indices/u_indices.h"
+#include "svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+struct svga_context;
+struct u_upload_mgr;
+
+/* Bit (1 << PIPE_PRIM_x) set for each primitive type presumably drawn
+ * natively; passed to the u_indices helpers.  Should include polygon? */
+static const unsigned svga_hw_prims = 
+   ((1 << PIPE_PRIM_POINTS) |
+    (1 << PIPE_PRIM_LINES) |
+    (1 << PIPE_PRIM_LINE_STRIP) |
+    (1 << PIPE_PRIM_TRIANGLES) |
+    (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+    (1 << PIPE_PRIM_TRIANGLE_FAN));
+
+
+/**
+ * Map a PIPE_PRIM_x mode to its SVGA3D_PRIMITIVE_x type and store in
+ * *out_count the number of whole primitives that count vertices form.
+ * Strips and fans with too few vertices give 0 instead of a wrapped
+ * unsigned difference.  Unknown modes assert, store 0 and return 0.
+ */
+static INLINE unsigned svga_translate_prim(unsigned mode,
+                                           unsigned count,
+                                           unsigned *out_count)
+{
+   switch (mode) {
+   case PIPE_PRIM_POINTS:
+      *out_count = count;
+      return SVGA3D_PRIMITIVE_POINTLIST;
+   case PIPE_PRIM_LINES:
+      *out_count = count / 2;
+      return SVGA3D_PRIMITIVE_LINELIST;
+   case PIPE_PRIM_LINE_STRIP:
+      *out_count = count > 1 ? count - 1 : 0;
+      return SVGA3D_PRIMITIVE_LINESTRIP;
+   case PIPE_PRIM_TRIANGLES:
+      *out_count = count / 3;
+      return SVGA3D_PRIMITIVE_TRIANGLELIST;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      *out_count = count > 2 ? count - 2 : 0;
+      return SVGA3D_PRIMITIVE_TRIANGLESTRIP;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      *out_count = count > 2 ? count - 2 : 0;
+      return SVGA3D_PRIMITIVE_TRIANGLEFAN;
+   default:
+      assert(0);
+      *out_count = 0;
+      return 0;
+   }
+}
+
+
+/* One entry of the generated-index cache kept per primitive type. */
+struct index_cache {
+   u_generate_func generate;   /* generator that filled buffer */
+   unsigned gen_nr;            /* number of indices generated into buffer */
+   /* If non-null, this buffer is filled by calling 
+    *   generate(nr, map(buffer))
+    */
+   struct pipe_resource *buffer;
+};
+
+#define QSZ 32   /* number of primitive slots in struct draw_cmd */
+/* Draw command being assembled: vertex declarations and queued primitives. */
+struct draw_cmd {
+   struct svga_winsys_context *swc;
+
+   SVGA3dVertexDecl vdecl[SVGA3D_INPUTREG_MAX];
+   struct pipe_resource *vdecl_vb[SVGA3D_INPUTREG_MAX];
+   unsigned vdecl_count;
+
+   SVGA3dPrimitiveRange prim[QSZ];
+   struct pipe_resource *prim_ib[QSZ];   /* index buffer per primitive, may be NULL */
+   unsigned prim_count;                  /* slots of prim[] currently in use */
+   unsigned min_index[QSZ];              /* relative to the primitive's indexBias */
+   unsigned max_index[QSZ];              /* relative to the primitive's indexBias */
+};
+
+#define IDX_CACHE_MAX  8   /* cached index buffers per primitive type */
+
+struct svga_hwtnl {
+   struct svga_context *svga;
+   struct u_upload_mgr *upload_ib;   /* used to upload user index buffers */
+   
+   /* Flatshade information: api_pv and hw_pv are passed to the index
+    * generators/translators; api_fillmode selects the unfilled ones. */
+   unsigned api_pv;
+   unsigned hw_pv;
+   unsigned api_fillmode;
+
+   /* Cache the results of running a particular generate func on each
+    * primitive type; up to IDX_CACHE_MAX buffers per type.
+    */
+   struct index_cache index_cache[PIPE_PRIM_MAX][IDX_CACHE_MAX];
+
+   /* Try to build the maximal draw command packet before emitting:
+    */
+   struct draw_cmd cmd;
+};
+
+
+
+/***********************************************************************
+ * Internal functions (called from svga_draw_arrays.c / svga_draw_elements.c)
+ */
+enum pipe_error 
+svga_hwtnl_prim( struct svga_hwtnl *hwtnl,
+                 const SVGA3dPrimitiveRange *range,   /* copied into the queue */
+                 unsigned min_index,                  /* relative to range->indexBias */
+                 unsigned max_index,                  /* relative to range->indexBias */
+                 struct pipe_resource *ib );          /* index buffer, or NULL */
+
+enum pipe_error
+svga_hwtnl_simple_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                       struct pipe_resource *indexBuffer,
+                                       unsigned index_size,   /* bytes per index */
+                                       int index_bias,
+                                       unsigned min_index,
+                                       unsigned max_index,
+                                       unsigned prim, 
+                                       unsigned start,        /* first index to draw */
+                                       unsigned count );      /* number of indices */
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_hw_reg.h b/src/gallium/drivers/svga/svga_hw_reg.h
new file mode 100644
index 0000000000..183f4b918e
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_hw_reg.h
@@ -0,0 +1,42 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_HW_REG_H
+#define SVGA_HW_REG_H
+
+#include "pipe/p_compiler.h"
+
+#if defined(PIPE_CC_GCC)
+#ifndef HAVE_STDINT_H
+#define HAVE_STDINT_H
+#endif
+#endif
+
+#include "svga_types.h"
+
+#include "svga3d_reg.h"
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
new file mode 100644
index 0000000000..594eec7166
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -0,0 +1,240 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+
+#include "svga_hw_reg.h"
+
+
+/* Map a gallium PIPE_BLENDFACTOR_* to its SVGA3D_BLENDOP_*; unknown -> ZERO. */
+static INLINE unsigned svga_translate_blend_factor(unsigned factor)
+{
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ONE:                return SVGA3D_BLENDOP_ONE;
+   case PIPE_BLENDFACTOR_ZERO:               return SVGA3D_BLENDOP_ZERO;
+   case PIPE_BLENDFACTOR_SRC_COLOR:          return SVGA3D_BLENDOP_SRCCOLOR;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      return SVGA3D_BLENDOP_INVSRCCOLOR;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:          return SVGA3D_BLENDOP_SRCALPHA;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      return SVGA3D_BLENDOP_INVSRCALPHA;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return SVGA3D_BLENDOP_SRCALPHASAT;
+   case PIPE_BLENDFACTOR_DST_COLOR:          return SVGA3D_BLENDOP_DESTCOLOR;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:      return SVGA3D_BLENDOP_INVDESTCOLOR;
+   case PIPE_BLENDFACTOR_DST_ALPHA:          return SVGA3D_BLENDOP_DESTALPHA;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      return SVGA3D_BLENDOP_INVDESTALPHA;
+   case PIPE_BLENDFACTOR_CONST_COLOR:        /* fall through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:        return SVGA3D_BLENDOP_BLENDFACTOR; /* alpha variant unverified ("?") */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    return SVGA3D_BLENDOP_INVBLENDFACTOR; /* alpha variant unverified ("?") */
+   default:
+      assert(0);
+      return SVGA3D_BLENDOP_ZERO;
+   }
+}
+
+/* Map a gallium PIPE_BLEND_* equation to SVGA3D_BLENDEQ_*; unknown -> ADD. */
+static INLINE unsigned svga_translate_blend_func(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_BLEND_MIN:              return SVGA3D_BLENDEQ_MINIMUM;
+   case PIPE_BLEND_MAX:              return SVGA3D_BLENDEQ_MAXIMUM;
+   case PIPE_BLEND_ADD:              return SVGA3D_BLENDEQ_ADD;
+   case PIPE_BLEND_SUBTRACT:         return SVGA3D_BLENDEQ_SUBTRACT;
+   case PIPE_BLEND_REVERSE_SUBTRACT: return SVGA3D_BLENDEQ_REVSUBTRACT;
+   default:
+      assert(0);
+      return SVGA3D_BLENDEQ_ADD;
+   }
+}
+
+
+/**
+ * Create the driver's blend state object from a gallium blend template.
+ * Only templ->rt[0] is consulted.  Logic ops are approximated with blending.
+ * Returns the new svga_blend_state, or NULL if the allocation fails.
+ */
+static void *
+svga_create_blend_state(struct pipe_context *pipe,
+                        const struct pipe_blend_state *templ)
+{
+   struct svga_blend_state *blend = CALLOC_STRUCT( svga_blend_state );
+   unsigned i;
+
+   if (!blend)
+      return NULL;
+
+   /* Fill in the per-rendertarget blend state.  We currently only have
+    * one rendertarget. */
+   for (i = 0; i < 1; i++) {
+      /* SVGA3D cannot express logic ops and D3D9 cannot implement them
+       * correctly, so simulate each one with a blend mode instead. */
+      if (templ->logicop_enable) {
+         switch (templ->logicop_func) {
+         case PIPE_LOGICOP_XOR:
+         case PIPE_LOGICOP_INVERT:
+            blend->need_white_fragments = TRUE;
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_SUBTRACT;
+            break;
+         case PIPE_LOGICOP_CLEAR:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_COPY:
+            blend->rt[i].blend_enable = FALSE;
+            break;
+         case PIPE_LOGICOP_COPY_INVERTED:
+            blend->rt[i].blend_enable   = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_ADD;
+            break;
+         case PIPE_LOGICOP_NOOP:
+            blend->rt[i].blend_enable   = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_ADD;
+            break;
+         case PIPE_LOGICOP_SET:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_AND:
+            /* Approximate with minimum - works for the 0 & anything case: */
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_AND_REVERSE:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_INVDESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_AND_INVERTED:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_OR:
+            /* Approximate with maximum - works for the 1 | anything case: */
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_OR_REVERSE:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_INVDESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_OR_INVERTED:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_NAND:
+         case PIPE_LOGICOP_NOR:
+         case PIPE_LOGICOP_EQUIV:
+            /* TODO: no approximation yet; blending is simply left disabled */
+            blend->rt[i].blend_enable = FALSE;
+            break;
+         default:
+            assert(0);
+            break;
+         }
+      } else {
+         blend->rt[i].blend_enable   = templ->rt[0].blend_enable;
+         if (templ->rt[0].blend_enable) {
+            blend->rt[i].srcblend       = svga_translate_blend_factor(templ->rt[0].rgb_src_factor);
+            blend->rt[i].dstblend       = svga_translate_blend_factor(templ->rt[0].rgb_dst_factor);
+            blend->rt[i].blendeq        = svga_translate_blend_func(templ->rt[0].rgb_func);
+            blend->rt[i].srcblend_alpha = svga_translate_blend_factor(templ->rt[0].alpha_src_factor);
+            blend->rt[i].dstblend_alpha = svga_translate_blend_factor(templ->rt[0].alpha_dst_factor);
+            blend->rt[i].blendeq_alpha  = svga_translate_blend_func(templ->rt[0].alpha_func);
+            if (blend->rt[i].srcblend_alpha != blend->rt[i].srcblend ||
+                blend->rt[i].dstblend_alpha != blend->rt[i].dstblend ||
+                blend->rt[i].blendeq_alpha  != blend->rt[i].blendeq) {
+               blend->rt[i].separate_alpha_blend_enable = TRUE;
+            }
+         }
+      }
+      blend->rt[i].writemask = templ->rt[0].colormask;
+   }
+
+   return blend;
+}
+
+/* Make `blend` the current blend state and mark SVGA_NEW_BLEND dirty. */
+static void
+svga_bind_blend_state(struct pipe_context *pipe, void *blend)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->dirty |= SVGA_NEW_BLEND;
+   ctx->curr.blend = (struct svga_blend_state*)blend;
+}
+
+
+static void svga_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend);   /* releases the blend state object; `pipe` is not used */
+}
+
+/* Copy the blend color into the context and mark SVGA_NEW_BLEND_COLOR dirty. */
+static void
+svga_set_blend_color(struct pipe_context *pipe,
+                     const struct pipe_blend_color *blend_color)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->dirty |= SVGA_NEW_BLEND_COLOR;
+   ctx->curr.blend_color = *blend_color;
+}
+
+
+/* Install this file's blend callbacks on svga->pipe. */
+void svga_init_blend_functions(struct svga_context *svga)
+{
+   svga->pipe.set_blend_color    = svga_set_blend_color;
+   svga->pipe.delete_blend_state = svga_delete_blend_state;
+   svga->pipe.bind_blend_state   = svga_bind_blend_state;
+   svga->pipe.create_blend_state = svga_create_blend_state;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_blit.c b/src/gallium/drivers/svga/svga_pipe_blit.c
new file mode 100644
index 0000000000..ca036a6463
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_blit.c
@@ -0,0 +1,110 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_resource_texture.h"
+#include "svga_context.h"
+#include "svga_debug.h"
+#include "svga_cmd.h"
+#include "svga_surface.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+
+/**
+ * resource_copy_region hook: copy a width x height region from (srcx, srcy)
+ * of src_tex to (dstx, dsty) of dst_tex.  The face/level/z slice of each
+ * side is selected through screen->get_tex_surface().
+ * XXX I got my doubts about this, should maybe use svga_texture_copy_handle
+ * directly?
+ */
+static void svga_surface_copy(struct pipe_context *pipe,
+                              struct pipe_resource* dst_tex,
+                              struct pipe_subresource subdst,
+                              unsigned dstx, unsigned dsty, unsigned dstz,
+                              struct pipe_resource* src_tex,
+                              struct pipe_subresource subsrc,
+                              unsigned srcx, unsigned srcy, unsigned srcz,
+                              unsigned width, unsigned height)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct pipe_screen *screen = pipe->screen;
+   SVGA3dCopyBox *box;
+   enum pipe_error ret;
+   struct pipe_surface *srcsurf, *dstsurf;
+
+   svga_hwtnl_flush_retry( svga );
+
+   srcsurf = screen->get_tex_surface(screen, src_tex,
+                                     subsrc.face, subsrc.level, srcz,
+                                     PIPE_BIND_SAMPLER_VIEW);
+   dstsurf = screen->get_tex_surface(screen, dst_tex,
+                                     subdst.face, subdst.level, dstz,
+                                     PIPE_BIND_RENDER_TARGET);
+   /* Surface creation can fail; never dereference a NULL surface. */
+   if (!srcsurf || !dstsurf)
+      goto done;
+
+   SVGA_DBG(DEBUG_DMA, "blit to sid %p (%d,%d), from sid %p (%d,%d) sz %dx%d\n",
+            svga_surface(dstsurf)->handle, dstx, dsty,
+            svga_surface(srcsurf)->handle, srcx, srcy,
+            width, height);
+
+   ret = SVGA3D_BeginSurfaceCopy(svga->swc, srcsurf, dstsurf, &box, 1);
+   if(ret != PIPE_OK) {
+      /* Flush the command buffer and retry once. */
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_BeginSurfaceCopy(svga->swc, srcsurf, dstsurf, &box, 1);
+      assert(ret == PIPE_OK);
+   }
+   /* With asserts compiled out, `box` is not valid after a failed retry. */
+   if (ret != PIPE_OK)
+      goto done;
+
+   box->x = dstx;
+   box->y = dsty;
+   box->z = 0;
+   box->w = width;
+   box->h = height;
+   box->d = 1;
+   box->srcx = srcx;
+   box->srcy = srcy;
+   box->srcz = 0;
+
+   SVGA_FIFOCommitAll(svga->swc);
+
+   svga_surface(dstsurf)->dirty = TRUE;
+   svga_propagate_surface(pipe, dstsurf);
+
+done:
+   pipe_surface_reference(&srcsurf, NULL);
+   pipe_surface_reference(&dstsurf, NULL);
+}
+
+
+/* Hook up the region-copy entry point implemented in this file. */
+void svga_init_blit_functions(struct svga_context *svga)
+{
+   svga->pipe.resource_copy_region = svga_surface_copy;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_clear.c b/src/gallium/drivers/svga/svga_pipe_clear.c
new file mode 100644
index 0000000000..41f239c1a8
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_clear.c
@@ -0,0 +1,127 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+#include "pipe/p_defines.h"
+#include "util/u_pack_color.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_surface.h"
+
+
+/* Emit one clear of the bound framebuffer, temporarily resizing the viewport
+ * to cover the cleared surfaces.  Returns PIPE_OK or the first error hit. */
+static enum pipe_error
+try_clear(struct svga_context *svga, unsigned buffers, const float *rgba,
+          double depth, unsigned stencil)
+{
+   int ret = PIPE_OK;
+   SVGA3dRect rect = { 0, 0, 0, 0 };
+   boolean restore_viewport = FALSE;
+   SVGA3dClearFlag flags = 0;
+   struct pipe_framebuffer_state *fb = &svga->curr.framebuffer;
+   union util_color uc;
+
+   /* uc.ui is passed to ClearRect even when no color clear is performed. */
+   memset(&uc, 0, sizeof uc);
+
+   ret = svga_update_state(svga, SVGA_STATE_HW_CLEAR);
+   if (ret)
+      return ret;
+
+   if ((buffers & PIPE_CLEAR_COLOR) && fb->cbufs[0]) {
+      flags |= SVGA3D_CLEAR_COLOR;
+      util_pack_color(rgba, PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+      rect.w = fb->cbufs[0]->width;
+      rect.h = fb->cbufs[0]->height;
+   }
+
+   if ((buffers & PIPE_CLEAR_DEPTHSTENCIL) && fb->zsbuf) {
+      if (buffers & PIPE_CLEAR_DEPTH)
+         flags |= SVGA3D_CLEAR_DEPTH;
+      /* Stencil is only cleared when the zsbuf format is S8_USCALED_Z24. */
+      if ((fb->zsbuf->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM) &&
+          (buffers & PIPE_CLEAR_STENCIL))
+         flags |= SVGA3D_CLEAR_STENCIL;
+      rect.w = MAX2(rect.w, fb->zsbuf->width);
+      rect.h = MAX2(rect.h, fb->zsbuf->height);
+   }
+
+   if (memcmp(&rect, &svga->state.hw_clear.viewport, sizeof(rect)) != 0) {
+      restore_viewport = TRUE;
+      ret = SVGA3D_SetViewport(svga->swc, &rect);
+      if (ret)
+         return ret;
+   }
+
+   ret = SVGA3D_ClearRect(svga->swc, flags, uc.ui, depth, stencil,
+                          rect.x, rect.y, rect.w, rect.h);
+   if (ret != PIPE_OK)
+      return ret;
+
+   if (restore_viewport) {
+      memcpy(&rect, &svga->state.hw_clear.viewport, sizeof rect);
+      ret = SVGA3D_SetViewport(svga->swc, &rect);
+   }
+
+   return ret;
+}
+
+/**
+ * Clear the given surface to the specified value.
+ * No masking, no scissor (clear entire buffer).
+ * Retried once after a flush if try_clear() reports PIPE_ERROR_OUT_OF_MEMORY.
+ */
+void
+svga_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+	   double depth, unsigned stencil)
+{
+   struct svga_context *svga = svga_context( pipe );
+   int ret;
+
+   /* cbufs[0] may be NULL (try_clear checks for that too): don't deref it. */
+   if ((buffers & PIPE_CLEAR_COLOR) && svga->curr.framebuffer.cbufs[0])
+      SVGA_DBG(DEBUG_DMA, "clear sid %p\n",
+               svga_surface(svga->curr.framebuffer.cbufs[0])->handle);
+
+   ret = try_clear( svga, buffers, rgba, depth, stencil );
+   if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+      /* Flush command buffer and retry:
+       */
+      svga_context_flush( svga, NULL );
+      ret = try_clear( svga, buffers, rgba, depth, stencil );
+   }
+
+   /*
+    * Mark target surfaces as dirty
+    * TODO Mark only cleared surfaces.
+    */
+   svga_mark_surfaces_dirty(svga);
+
+   assert (ret == PIPE_OK);
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_constants.c b/src/gallium/drivers/svga/svga_pipe_constants.c
new file mode 100644
index 0000000000..2fa2142d07
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_constants.c
@@ -0,0 +1,70 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_context.h"
+
+/***********************************************************************
+ * Constant buffers 
+ */
+
+struct svga_constbuf   /* NOTE(review): not referenced anywhere else in this file */
+{
+   unsigned type;
+   float (*data)[4];   /* pointer to rows of four floats */
+   unsigned count;     /* presumably the number of float[4] rows in data - TODO confirm */
+};
+
+
+
+/* Reference `buf` as the constant buffer of the given shader stage and mark
+ * the FS (fragment) or VS (any other stage) constant-buffer state dirty. */
+static void
+svga_set_constant_buffer(struct pipe_context *pipe,
+                         uint shader, uint index,
+                         struct pipe_resource *buf)
+{
+   struct svga_context *ctx = svga_context(pipe);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   pipe_resource_reference(&ctx->curr.cb[shader], buf);
+
+   ctx->dirty |= (shader == PIPE_SHADER_FRAGMENT) ? SVGA_NEW_FS_CONST_BUFFER
+                                                  : SVGA_NEW_VS_CONST_BUFFER;
+}
+
+
+
+void svga_init_constbuffer_functions( struct svga_context *svga )
+{  /* Install this file's constant-buffer setter on svga->pipe. */
+   svga->pipe.set_constant_buffer = svga_set_constant_buffer;
+}
+
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
new file mode 100644
index 0000000000..c84615a1f3
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -0,0 +1,169 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_hw_reg.h"
+
+
+/* Map a gallium PIPE_FUNC_* comparison to SVGA3D_CMP_*; unknown -> ALWAYS. */
+static INLINE unsigned svga_translate_compare_func(unsigned func)
+{
+   switch (func) {
+   case PIPE_FUNC_ALWAYS:    return SVGA3D_CMP_ALWAYS;
+   case PIPE_FUNC_NEVER:     return SVGA3D_CMP_NEVER;
+   case PIPE_FUNC_EQUAL:     return SVGA3D_CMP_EQUAL;
+   case PIPE_FUNC_NOTEQUAL:  return SVGA3D_CMP_NOTEQUAL;
+   case PIPE_FUNC_LESS:      return SVGA3D_CMP_LESS;
+   case PIPE_FUNC_LEQUAL:    return SVGA3D_CMP_LESSEQUAL;
+   case PIPE_FUNC_GREATER:   return SVGA3D_CMP_GREATER;
+   case PIPE_FUNC_GEQUAL:    return SVGA3D_CMP_GREATEREQUAL;
+   default:
+      assert(0);
+      return SVGA3D_CMP_ALWAYS;
+   }
+}
+
+/* Map PIPE_STENCIL_OP_* to SVGA3D_STENCILOP_*; unknown ops assert, then KEEP. */
+static INLINE unsigned svga_translate_stencil_op(unsigned op)
+{
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:      return SVGA3D_STENCILOP_KEEP;
+   case PIPE_STENCIL_OP_ZERO:      return SVGA3D_STENCILOP_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:   return SVGA3D_STENCILOP_REPLACE;
+   case PIPE_STENCIL_OP_INCR:      return SVGA3D_STENCILOP_INCRSAT; /* gallium INCR clamps */
+   case PIPE_STENCIL_OP_DECR:      return SVGA3D_STENCILOP_DECRSAT; /* gallium DECR clamps */
+   case PIPE_STENCIL_OP_INCR_WRAP: return SVGA3D_STENCILOP_INCR;    /* SVGA3D INCR wraps */
+   case PIPE_STENCIL_OP_DECR_WRAP: return SVGA3D_STENCILOP_DECR;    /* SVGA3D DECR wraps */
+   case PIPE_STENCIL_OP_INVERT:    return SVGA3D_STENCILOP_INVERT;
+   default:
+      assert(0);
+      return SVGA3D_STENCILOP_KEEP;
+   }
+}
+
+
+/* Create the driver's depth/stencil/alpha state object from a gallium
+ * template.  Returns the new object, or NULL if the allocation fails. */
+static void *
+svga_create_depth_stencil_state(struct pipe_context *pipe,
+				const struct pipe_depth_stencil_alpha_state *templ)
+{
+   struct svga_depth_stencil_state *ds = CALLOC_STRUCT( svga_depth_stencil_state );
+
+   if (!ds)
+      return NULL;
+
+   /* Don't try to figure out CW/CCW correspondence with stencil[0]/[1]
+    * at this point.  Presumably this can change as back/front face are
+    * modified. */
+   ds->stencil[0].enabled = templ->stencil[0].enabled;
+   if (ds->stencil[0].enabled) {
+      ds->stencil[0].func  = svga_translate_compare_func(templ->stencil[0].func);
+      ds->stencil[0].fail  = svga_translate_stencil_op(templ->stencil[0].fail_op);
+      ds->stencil[0].zfail = svga_translate_stencil_op(templ->stencil[0].zfail_op);
+      ds->stencil[0].pass  = svga_translate_stencil_op(templ->stencil[0].zpass_op);
+      /* SVGA3D has one ref/mask/writemask triple shared between front &
+       * back face stencil.  We really need two: */
+      ds->stencil_mask      = templ->stencil[0].valuemask & 0xff;
+      ds->stencil_writemask = templ->stencil[0].writemask & 0xff;
+   }
+
+   ds->stencil[1].enabled = templ->stencil[1].enabled;
+   if (templ->stencil[1].enabled) {
+      ds->stencil[1].func   = svga_translate_compare_func(templ->stencil[1].func);
+      ds->stencil[1].fail   = svga_translate_stencil_op(templ->stencil[1].fail_op);
+      ds->stencil[1].zfail  = svga_translate_stencil_op(templ->stencil[1].zfail_op);
+      ds->stencil[1].pass   = svga_translate_stencil_op(templ->stencil[1].zpass_op);
+      /* Overwrites the shared masks: stencil[1]'s values win when enabled. */
+      ds->stencil_mask      = templ->stencil[1].valuemask & 0xff;
+      ds->stencil_writemask = templ->stencil[1].writemask & 0xff;
+   }
+
+   ds->zenable = templ->depth.enabled;
+   if (ds->zenable) {
+      ds->zfunc = svga_translate_compare_func(templ->depth.func);
+      ds->zwriteenable = templ->depth.writemask;
+   }
+
+   ds->alphatestenable = templ->alpha.enabled;
+   if (ds->alphatestenable) {
+      ds->alphafunc = svga_translate_compare_func(templ->alpha.func);
+      ds->alpharef = templ->alpha.ref_value;
+   }
+
+   return ds;
+}
+
+/* Make `depth_stencil` current and mark SVGA_NEW_DEPTH_STENCIL dirty. */
+static void
+svga_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->dirty |= SVGA_NEW_DEPTH_STENCIL;
+   ctx->curr.depth = (const struct svga_depth_stencil_state *)depth_stencil;
+}
+
+static void
+svga_delete_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   FREE(depth_stencil);   /* `pipe` is unused: the object is simply freed */
+}
+
+
+/* Store the stencil reference values and mark SVGA_NEW_STENCIL_REF dirty. */
+static void
+svga_set_stencil_ref(struct pipe_context *pipe,
+                     const struct pipe_stencil_ref *stencil_ref)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->dirty |= SVGA_NEW_STENCIL_REF;
+   ctx->curr.stencil_ref = *stencil_ref;
+}
+
+static void
+svga_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+   /* No-op: the sample mask is ignored here. */
+}
+
+
+/* Install this file's depth/stencil/alpha callbacks and setters on svga->pipe. */
+void svga_init_depth_stencil_functions(struct svga_context *svga)
+{
+   svga->pipe.set_sample_mask = svga_set_sample_mask;
+   svga->pipe.set_stencil_ref = svga_set_stencil_ref;
+   svga->pipe.delete_depth_stencil_alpha_state = svga_delete_depth_stencil_state;
+   svga->pipe.bind_depth_stencil_alpha_state = svga_bind_depth_stencil_state;
+   svga->pipe.create_depth_stencil_alpha_state = svga_create_depth_stencil_state;
+}
+
+
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
new file mode 100644
index 0000000000..58e930d983
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -0,0 +1,257 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "util/u_inlines.h"
+#include "util/u_prim.h"
+#include "util/u_time.h"
+#include "indices/u_indices.h"
+
+#include "svga_hw_reg.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_draw.h"
+#include "svga_state.h"
+#include "svga_swtnl.h"
+#include "svga_debug.h"
+
+
+
+/**
+ * Emit an indexed draw through the hardware TNL path.
+ *
+ * The rasterizer-derived hwtnl settings are refreshed, SVGA_STATE_HW_DRAW
+ * state is updated and the draw is handed to the hwtnl module; when user
+ * vertex buffers are bound the hwtnl queue is flushed right away.
+ *
+ * If any step reports an error the context is flushed and, when do_retry
+ * is set, the whole sequence is attempted exactly one more time.
+ *
+ * Returns PIPE_OK on success, otherwise the error of the last attempt.
+ */
+static enum pipe_error
+retry_draw_range_elements( struct svga_context *svga,
+                           struct pipe_resource *index_buffer,
+                           unsigned index_size,
+                           int index_bias,
+                           unsigned min_index,
+                           unsigned max_index,
+                           unsigned prim,
+                           unsigned start,
+                           unsigned count,
+                           boolean do_retry )
+{
+   enum pipe_error ret;
+
+   for (;;) {
+      /* Settings the hwtnl module takes from the rasterizer state. */
+      svga_hwtnl_set_unfilled( svga->hwtnl,
+                               svga->curr.rast->hw_unfilled );
+      svga_hwtnl_set_flatshade( svga->hwtnl,
+                                svga->curr.rast->templ.flatshade,
+                                svga->curr.rast->templ.flatshade_first );
+
+      /* Each step only runs if the previous one succeeded. */
+      ret = svga_update_state( svga, SVGA_STATE_HW_DRAW );
+      if (!ret)
+         ret = svga_hwtnl_draw_range_elements( svga->hwtnl, index_buffer,
+                                               index_size, index_bias,
+                                               min_index, max_index,
+                                               prim, start, count );
+      if (!ret && svga->curr.any_user_vertex_buffers)
+         ret = svga_hwtnl_flush( svga->hwtnl );
+
+      if (!ret)
+         return PIPE_OK;
+
+      /* Failure: flush the context, then retry once if still allowed. */
+      svga_context_flush( svga, NULL );
+      if (!do_retry)
+         return ret;
+
+      do_retry = FALSE;
+   }
+}
+
+
+/**
+ * Emit a non-indexed draw through the hardware TNL path.
+ *
+ * The hwtnl rasterizer settings are refreshed, SVGA_STATE_HW_DRAW state is
+ * updated and the draw is handed to the hwtnl module; when user vertex
+ * buffers are bound the hwtnl queue is flushed right away.
+ *
+ * Only when a step fails with PIPE_ERROR_OUT_OF_MEMORY and do_retry is set
+ * is the context flushed and the sequence run one more time.  Any other
+ * failure is returned as is, without flushing.
+ *
+ * Returns 0 on success, otherwise the error of the last attempt.
+ */
+static enum pipe_error
+retry_draw_arrays( struct svga_context *svga,
+                   unsigned prim,
+                   unsigned start,
+                   unsigned count,
+                   boolean do_retry )
+{
+   enum pipe_error ret;
+
+   for (;;) {
+      svga_hwtnl_set_unfilled( svga->hwtnl,
+                               svga->curr.rast->hw_unfilled );
+      svga_hwtnl_set_flatshade( svga->hwtnl,
+                                svga->curr.rast->templ.flatshade,
+                                svga->curr.rast->templ.flatshade_first );
+
+      ret = svga_update_state( svga, SVGA_STATE_HW_DRAW );
+      if (!ret)
+         ret = svga_hwtnl_draw_arrays( svga->hwtnl, prim, start, count );
+      if (!ret && svga->curr.any_user_vertex_buffers)
+         ret = svga_hwtnl_flush( svga->hwtnl );
+
+      if (!ret)
+         return 0;
+
+      /* Give up unless this is an out-of-memory error on the first try. */
+      if (ret != PIPE_ERROR_OUT_OF_MEMORY || !do_retry)
+         return ret;
+
+      /* Flush the context, then go around exactly once more. */
+      svga_context_flush( svga, NULL );
+      do_retry = FALSE;
+   }
+}
+
+
+
+
+
+/**
+ * Common draw entry point; the other draw hooks of this file forward here.
+ *
+ * index_buffer may be NULL, in which case the non-indexed hardware helper
+ * is used.  When the NEED_SWTNL state says so, the draw is routed to the
+ * software TNL path instead of the retrying hardware helpers.
+ *
+ * index_size, index_bias, min_index and max_index are only forwarded to
+ * the selected path; they are never inspected here.
+ *
+ * The function returns void, so the error code of the chosen path is
+ * stored in ret but not reported to the caller.
+ */
+static void
+svga_draw_range_elements( struct pipe_context *pipe,
+                          struct pipe_resource *index_buffer,
+                          unsigned index_size,
+                          int index_bias,
+                          unsigned min_index,
+                          unsigned max_index,
+                          unsigned prim, unsigned start, unsigned count)
+{
+   struct svga_context *svga = svga_context( pipe );
+   const unsigned reduced = u_reduced_prim(prim);
+   enum pipe_error ret = 0;
+
+   /* Nothing is drawn when u_trim_pipe_prim() rejects the vertex count. */
+   if (!u_trim_pipe_prim( prim, &count ))
+      return;
+
+   /*
+    * Mark currently bound target surfaces as dirty
+    * doesn't really matter if it is done before drawing.
+    *
+    * TODO If we ever normally return something other than
+    * true we should not mark it as dirty then.
+    */
+   svga_mark_surfaces_dirty(svga);
+
+   /* Track changes of the reduced primitive type. */
+   if (svga->curr.reduced_prim != reduced) {
+      svga->curr.reduced_prim = reduced;
+      svga->dirty |= SVGA_NEW_REDUCED_PRIMITIVE;
+   }
+
+   /* Refresh the NEED_SWTNL state that is consulted below. */
+   svga_update_state_retry( svga, SVGA_STATE_NEED_SWTNL );
+
+#ifdef DEBUG
+   /* Debug aid: skip draws using the shader id in debug.disable_shader. */
+   if (svga->curr.vs->base.id == svga->debug.disable_shader ||
+       svga->curr.fs->base.id == svga->debug.disable_shader)
+      return;
+#endif
+
+   if (svga->state.sw.need_swtnl) {
+      ret = svga_swtnl_draw_range_elements( svga, index_buffer, index_size,
+                                            index_bias, min_index, max_index,
+                                            prim, start, count );
+   }
+   else if (index_buffer) {
+      ret = retry_draw_range_elements( svga, index_buffer, index_size,
+                                       index_bias, min_index, max_index,
+                                       prim, start, count, TRUE );
+   }
+   else {
+      ret = retry_draw_arrays( svga, prim, start, count, TRUE );
+   }
+
+   /* With DEBUG_FLUSH set, everything is flushed after each draw. */
+   if (SVGA_DEBUG & DEBUG_FLUSH) {
+      svga_hwtnl_flush_retry( svga );
+      svga_context_flush(svga, NULL);
+   }
+}
+
+
+/* Indexed draw without a known index range: the full 32-bit range is used. */
+static void
+svga_draw_elements( struct pipe_context *pipe,
+                    struct pipe_resource *index_buffer,
+                    unsigned index_size, int index_bias,
+                    unsigned prim, unsigned start, unsigned count)
+{
+   const unsigned min_index = 0, max_index = 0xffffffff;
+   svga_draw_range_elements( pipe, index_buffer, index_size, index_bias,
+                             min_index, max_index, prim, start, count );
+}
+
+/* Non-indexed draw, forwarded with a NULL index buffer. */
+static void
+svga_draw_arrays( struct pipe_context *pipe,
+                  unsigned prim, unsigned start, unsigned count)
+{
+   const unsigned first = start;
+   const unsigned last = start + count - 1;   /* inclusive upper bound */
+   svga_draw_range_elements(pipe, NULL, 0, 0, first, last, prim, start, count);
+}
+
+void svga_init_draw_functions( struct svga_context *svga )
+{
+   struct pipe_context *pipe = &svga->pipe;   /* hooks are installed here */
+   pipe->draw_arrays = svga_draw_arrays;
+   pipe->draw_elements = svga_draw_elements;
+   pipe->draw_range_elements = svga_draw_range_elements;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_flush.c b/src/gallium/drivers/svga/svga_pipe_flush.c
new file mode 100644
index 0000000000..ab243aa6ec
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_flush.c
@@ -0,0 +1,65 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_defines.h"
+#include "svga_screen.h"
+#include "svga_surface.h"
+#include "svga_context.h"
+#include "svga_debug.h"
+
+
+/* Flush hook: emit all pending work of this context. */
+static void svga_flush( struct pipe_context *pipe,
+                        unsigned flags,
+                        struct pipe_fence_handle **fence )
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct pipe_framebuffer_state *fb = &svga->curr.framebuffer;
+   int buf;
+
+   /* Push out the drawing commands that are still buffered. */
+   svga_hwtnl_flush_retry( svga );
+
+   /* Back-copy each bound render target view to its texture. */
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      if (!fb->cbufs[buf])
+         continue;
+      svga_propagate_surface(pipe, fb->cbufs[buf]);
+   }
+   if (fb->zsbuf)
+      svga_propagate_surface(pipe, fb->zsbuf);
+
+   /* Submit the command queue; fence may be NULL. */
+   svga_context_flush(svga, fence);
+
+   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s flags %x fence_ptr %p\n",
+            __FUNCTION__, flags, fence ? *fence : 0x0);
+}
+
+void svga_init_flush_functions( struct svga_context *svga )
+{
+   struct pipe_context *pipe = &svga->pipe;   /* hooks are installed here */
+   pipe->flush = svga_flush;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_fs.c b/src/gallium/drivers/svga/svga_pipe_fs.c
new file mode 100644
index 0000000000..b71bc66552
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_fs.c
@@ -0,0 +1,131 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_bitmask.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_tgsi.h"
+#include "svga_hw_reg.h"
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+
+/***********************************************************************
+ * Fragment shaders 
+ */
+/* Create a fragment shader CSO; returns NULL when an allocation fails. */
+static void *
+svga_create_fs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_fragment_shader *fs = CALLOC_STRUCT(svga_fragment_shader);
+
+   if (!fs)
+      return NULL;
+
+   /* The token copy can fail; scanning a NULL stream would crash. */
+   fs->base.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!fs->base.tokens) {
+      FREE(fs);
+      return NULL;
+   }
+   /* Collect basic info that we'll need later. */
+   tgsi_scan_shader(fs->base.tokens, &fs->base.info);
+   fs->base.id = svga->debug.shader_id++;
+   fs->base.use_sm30 = svgascreen->use_ps30;
+
+   if (SVGA_DEBUG & DEBUG_TGSI || 0) {
+      debug_printf("%s id: %u, inputs: %u, outputs: %u\n",
+                   __FUNCTION__, fs->base.id,
+                   fs->base.info.num_inputs, fs->base.info.num_outputs);
+   }
+   return fs;
+}
+
+/* Make the given fragment shader CSO current and flag the change. */
+static void
+svga_bind_fs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *ctx = svga_context(pipe);
+
+   ctx->curr.fs = (struct svga_fragment_shader *) shader;
+   ctx->dirty |= SVGA_NEW_FS;
+}
+
+/* Delete a fragment shader CSO together with all of its shader results. */
+static
+void svga_delete_fs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_fragment_shader *fs = (struct svga_fragment_shader *) shader;
+   struct svga_shader_result *result, *tmp;
+   enum pipe_error ret;
+
+   /* Flush buffered drawing before the shader results go away. */
+   svga_hwtnl_flush_retry( svga );
+
+   for (result = fs->base.results; result; result = tmp ) {
+      tmp = result->next;   /* fetch the link before result is destroyed */
+
+      ret = SVGA3D_DestroyShader(svga->swc, result->id, SVGA3D_SHADERTYPE_PS);
+      if(ret != PIPE_OK) {
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_DestroyShader(svga->swc, result->id,
+                                    SVGA3D_SHADERTYPE_PS);
+         assert(ret == PIPE_OK);
+      }
+
+      util_bitmask_clear( svga->fs_bm, result->id );
+
+      /*
+       * Remove stale references to this result, while the pointer is still
+       * valid, to ensure a new result on the same address will be detected
+       * as a change.
+       */
+      if(result == svga->state.hw_draw.fs)
+         svga->state.hw_draw.fs = NULL;
+
+      svga_destroy_shader_result( result );
+   }
+
+   FREE((void *)fs->base.tokens);
+   FREE(fs);
+}
+
+void svga_init_fs_functions( struct svga_context *svga )
+{
+   struct pipe_context *pipe = &svga->pipe;   /* hooks are installed here */
+   pipe->create_fs_state = svga_create_fs_state;
+   pipe->bind_fs_state = svga_bind_fs_state;
+   pipe->delete_fs_state = svga_delete_fs_state;
+}
+
diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
new file mode 100644
index 0000000000..8c24fb302f
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -0,0 +1,183 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "util/u_inlines.h"
+
+#include "svga_context.h"
+#include "svga_surface.h"
+
+
+/* Copy the new scissor state into the context and flag it dirty. */
+static void svga_set_scissor_state( struct pipe_context *pipe,
+                                    const struct pipe_scissor_state *scissor )
+{
+   struct svga_context *ctx = svga_context(pipe);
+   memcpy( &ctx->curr.scissor, scissor, sizeof *scissor );
+   ctx->dirty |= SVGA_NEW_SCISSOR;
+}
+
+/* Polygon stipple hook; left empty because the draw module overrides it. */
+static void
+svga_set_polygon_stipple( struct pipe_context *pipe,
+                          const struct pipe_poly_stipple *stipple )
+{
+}
+
+
+/* Release all surface references of the current and hw_clear framebuffers. */
+void svga_cleanup_framebuffer(struct svga_context *svga)
+{
+   struct pipe_framebuffer_state *fbs[2];
+   int buf, s;
+
+   fbs[0] = &svga->curr.framebuffer;
+   fbs[1] = &svga->state.hw_clear.framebuffer;
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++)
+      for (s = 0; s < 2; s++)
+         pipe_surface_reference(&fbs[s]->cbufs[buf], NULL);
+   for (s = 0; s < 2; s++)
+      pipe_surface_reference(&fbs[s]->zsbuf, NULL);
+}
+
+
+#define DEPTH_BIAS_SCALE_FACTOR_D16    ((float)(1<<15))   /* 2^15 */
+#define DEPTH_BIAS_SCALE_FACTOR_D24S8  ((float)(1<<23))   /* 2^23 */
+#define DEPTH_BIAS_SCALE_FACTOR_D32    ((float)(1u<<31))  /* 2^31; 1<<31 overflows int */
+
+
+/* Bind new render targets, propagating outgoing colour buffers first. */
+static void svga_set_framebuffer_state(struct pipe_context *pipe,
+                                       const struct pipe_framebuffer_state *fb)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct pipe_framebuffer_state *dst = &svga->curr.framebuffer;
+   boolean propagate = FALSE;
+   float depthscale = 0.0f;
+   int i;
+
+   dst->width = fb->width;
+   dst->height = fb->height;
+   dst->nr_cbufs = fb->nr_cbufs;
+
+   /* Does any colour buffer that is being replaced need propagating? */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      if (!dst->cbufs[i] || dst->cbufs[i] == fb->cbufs[i])
+         continue;
+      if (svga_surface_needs_propagation(dst->cbufs[i]))
+         propagate = TRUE;
+   }
+
+   if (propagate) {
+      /* Drawing commands must be emitted ahead of the propagation. */
+      svga_hwtnl_flush_retry( svga );
+
+      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+         if (dst->cbufs[i] && dst->cbufs[i] != fb->cbufs[i])
+            svga_propagate_surface(pipe, dst->cbufs[i]);
+      }
+   }
+
+   /* XXX: Actually the virtual hardware may support rendertargets with
+    * different size, depending on the host API and driver, but since we cannot
+    * know that make no such assumption here. */
+   for (i = 0; i < fb->nr_cbufs; ++i) {
+      if (!fb->zsbuf || !fb->cbufs[i])
+         continue;
+      assert(fb->zsbuf->width == fb->cbufs[i]->width);
+      assert(fb->zsbuf->height == fb->cbufs[i]->height);
+   }
+
+   /* Take references on the new surfaces, releasing the old ones. */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+      pipe_surface_reference(&dst->cbufs[i], fb->cbufs[i]);
+   pipe_surface_reference(&dst->zsbuf, fb->zsbuf);
+
+   /* Formats not listed below, and a missing zsbuf, give a scale of zero. */
+   if (dst->zsbuf) {
+      switch (dst->zsbuf->format) {
+      case PIPE_FORMAT_Z16_UNORM:
+         depthscale = 1.0f / DEPTH_BIAS_SCALE_FACTOR_D16;
+         break;
+      case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+      case PIPE_FORMAT_Z24X8_UNORM:
+      case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      case PIPE_FORMAT_X8Z24_UNORM:
+         depthscale = 1.0f / DEPTH_BIAS_SCALE_FACTOR_D24S8;
+         break;
+      case PIPE_FORMAT_Z32_UNORM:
+         depthscale = 1.0f / DEPTH_BIAS_SCALE_FACTOR_D32;
+         break;
+      case PIPE_FORMAT_Z32_FLOAT:
+         depthscale = 1.0f / ((float)(1<<23));
+         break;
+      default:
+         break;
+      }
+   }
+   svga->curr.depthscale = depthscale;
+   svga->dirty |= SVGA_NEW_FRAME_BUFFER;
+}
+
+
+
+/* Keep a private copy of the clip state and flag it as changed. */
+static void
+svga_set_clip_state( struct pipe_context *pipe,
+                     const struct pipe_clip_state *clip )
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->curr.clip = *clip;   /* whole-struct copy */
+   ctx->dirty |= SVGA_NEW_CLIP;
+}
+
+
+
+/* Called when the state tracker notices a change to the viewport
+ * matrix: the new state is copied into the context and flagged dirty.
+ */
+static void
+svga_set_viewport_state( struct pipe_context *pipe,
+                         const struct pipe_viewport_state *viewport )
+{
+   struct svga_context *ctx = svga_context(pipe);
+
+   ctx->curr.viewport = *viewport;   /* whole-struct copy */
+   ctx->dirty |= SVGA_NEW_VIEWPORT;
+}
+
+
+void svga_init_misc_functions( struct svga_context *svga )
+{
+   struct pipe_context *pipe = &svga->pipe;   /* hooks are installed here */
+   pipe->set_scissor_state = svga_set_scissor_state;
+   pipe->set_polygon_stipple = svga_set_polygon_stipple;
+   pipe->set_framebuffer_state = svga_set_framebuffer_state;
+   pipe->set_clip_state = svga_set_clip_state;
+   pipe->set_viewport_state = svga_set_viewport_state;
+}
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
new file mode 100644
index 0000000000..579f8034c7
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -0,0 +1,268 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+
+#include "svga_cmd.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_resource_buffer.h"
+#include "svga_winsys.h"
+#include "svga_debug.h"
+
+
+/* FIXME: want a public base class for all pipe structs, even if there
+ * isn't much in them.  Until then pipe_query is defined locally here.
+ */
+struct pipe_query {
+   int dummy;   /* placeholder member; not referenced anywhere in this file */
+};
+
+struct svga_query {
+   struct pipe_query base;   /* first member: svga_query() casts from it */
+   SVGA3dQueryType type;   /* set to SVGA3D_QUERYTYPE_OCCLUSION at creation */
+   struct svga_winsys_buffer *hwbuf;   /* buffer holding the query result */
+   volatile SVGA3dQueryResult *queryResult;   /* pointer from mapping hwbuf */
+   struct pipe_fence_handle *fence;   /* from the flush after WaitForQuery */
+};
+
+/***********************************************************************
+ * Inline conversion functions.  These are better-typed than the
+ * macros used previously:
+ */
+static INLINE struct svga_query *svga_query( struct pipe_query *q )
+{
+   struct svga_query *sq = (struct svga_query *)q;   /* q is sq->base */
+   return sq;
+}
+
+/* Defined further down; svga_begin_query() uses it to drain a pending query. */
+static boolean svga_get_query_result(struct pipe_context *pipe,
+                                     struct pipe_query *q,
+                                     boolean wait, void *result);
+
+/* Create an occlusion query whose result lives in a pinned winsys buffer.
+ * query_type is not inspected: the type is always SVGA3D_QUERYTYPE_OCCLUSION.
+ * Returns NULL if any allocation or the buffer mapping fails.
+ */
+static struct pipe_query *svga_create_query( struct pipe_context *pipe,
+                                             unsigned query_type )
+{
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_query *sq;
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+
+   sq = CALLOC_STRUCT(svga_query);
+   if (!sq)
+      return NULL;
+
+   sq->type = SVGA3D_QUERYTYPE_OCCLUSION;
+
+   sq->hwbuf = svga_winsys_buffer_create(svga, 1, SVGA_BUFFER_USAGE_PINNED,
+                                         sizeof *sq->queryResult);
+   if (!sq->hwbuf) {
+      FREE(sq);
+      return NULL;
+   }
+
+   sq->queryResult = (SVGA3dQueryResult *)
+      sws->buffer_map(sws, sq->hwbuf, PIPE_TRANSFER_WRITE);
+   if (!sq->queryResult) {
+      sws->buffer_destroy(sws, sq->hwbuf);
+      FREE(sq);
+      return NULL;
+   }
+
+   /* Initialise the result record stored in the buffer. */
+   sq->queryResult->totalSize = sizeof *sq->queryResult;
+   sq->queryResult->state = SVGA3D_QUERYSTATE_NEW;
+
+   /*
+    * We request the buffer to be pinned and assume it is always mapped.
+    *
+    * The reason is that we don't want to wait for fences when checking the
+    * query status.
+    */
+   sws->buffer_unmap(sws, sq->hwbuf);
+
+   return &sq->base;
+}
+
+/* Destroy a query: release its result buffer and fence, then the object. */
+static void
+svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
+{
+   struct svga_winsys_screen *sws = svga_screen(pipe->screen)->sws;
+   struct svga_query *sq = svga_query( q );
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+   sws->buffer_destroy(sws, sq->hwbuf);
+   sws->fence_reference(sws, &sq->fence, NULL);   /* drop our reference */
+   FREE(sq);
+}
+
+/* Start query q; the context tracks a single active query in svga->sq. */
+static void svga_begin_query(struct pipe_context *pipe,
+                             struct pipe_query *q)
+{
+   struct svga_winsys_screen *sws = svga_screen(pipe->screen)->sws;
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_query *sq = svga_query( q );
+   enum pipe_error ret;
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+
+   assert(!svga->sq);
+
+   /* Need to flush out buffered drawing commands so that they don't
+    * get counted in the query results.
+    */
+   svga_hwtnl_flush_retry(svga);
+
+   if (sq->queryResult->state == SVGA3D_QUERYSTATE_PENDING) {
+      /* The application doesn't care for the pending query result. We cannot
+       * let go the existing buffer and just get a new one because its storage
+       * may be reused for other purposes and clobbered by the host when it
+       * determines the query result. So the only option here is to wait for
+       * the existing query's result -- not a big deal, given that no sane
+       * application would do this.
+       */
+      uint64_t discarded;
+
+      svga_get_query_result(pipe, q, TRUE, &discarded);
+      assert(sq->queryResult->state != SVGA3D_QUERYSTATE_PENDING);
+   }
+
+   /* Reset the result state and drop the fence of the previous run. */
+   sq->queryResult->state = SVGA3D_QUERYSTATE_NEW;
+   sws->fence_reference(sws, &sq->fence, NULL);
+
+   ret = SVGA3D_BeginQuery(svga->swc, sq->type);
+   if (ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_BeginQuery(svga->swc, sq->type);
+      assert(ret == PIPE_OK);
+   }
+
+   svga->sq = sq;
+}
+
+/* Finish query q: mark the result pending, emit EndQuery and flush. */
+static void svga_end_query(struct pipe_context *pipe,
+                           struct pipe_query *q)
+{
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_query *sq = svga_query( q );
+   enum pipe_error ret;
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+   assert(svga->sq == sq);
+
+   svga_hwtnl_flush_retry(svga);
+   /* Set to PENDING before sending EndQuery. */
+   sq->queryResult->state = SVGA3D_QUERYSTATE_PENDING;
+
+   ret = SVGA3D_EndQuery( svga->swc, sq->type, sq->hwbuf);
+   if (ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_EndQuery( svga->swc, sq->type, sq->hwbuf);
+      assert(ret == PIPE_OK);
+   }
+
+   /* TODO: Delay flushing. We don't really need to flush here, just ensure
+    * that there is one flush before svga_get_query_result attempts to get the
+    * result */
+   svga_context_flush(svga, NULL);
+
+   svga->sq = NULL;
+}
+
+/* Fetch the result of query q into *vresult (written as a uint64_t).
+ * Returns FALSE if the result is still pending and wait is FALSE. */
+static boolean svga_get_query_result(struct pipe_context *pipe,
+                                     struct pipe_query *q,
+                                     boolean wait,
+                                     void *vresult)
+{
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_screen *svgascreen = svga_screen( pipe->screen );
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_query *sq = svga_query( q );
+   SVGA3dQueryState state;
+   uint64_t *result = (uint64_t*)vresult;
+
+   SVGA_DBG(DEBUG_QUERY, "%s wait: %d\n", __FUNCTION__, wait);
+
+   /* The query status won't be updated by the host unless
+    * SVGA_3D_CMD_WAIT_FOR_QUERY is emitted. Unfortunately this will cause a
+    * synchronous wait on the host */
+   if(!sq->fence) {
+      enum pipe_error ret;
+
+      ret = SVGA3D_WaitForQuery( svga->swc, sq->type, sq->hwbuf);
+      if(ret != PIPE_OK) {
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_WaitForQuery( svga->swc, sq->type, sq->hwbuf);
+         assert(ret == PIPE_OK);
+      }
+      svga_context_flush(svga, &sq->fence);
+      assert(sq->fence);
+   }
+
+   state = sq->queryResult->state;
+   if(state == SVGA3D_QUERYSTATE_PENDING) {
+      if(!wait)
+         return FALSE;
+
+      sws->fence_finish(sws, sq->fence, 0);
+
+      state = sq->queryResult->state;
+   }
+
+   assert(state == SVGA3D_QUERYSTATE_SUCCEEDED ||
+          state == SVGA3D_QUERYSTATE_FAILED);
+
+   *result = (uint64_t)sq->queryResult->result32;
+
+   SVGA_DBG(DEBUG_QUERY, "%s result %d\n", __FUNCTION__, (unsigned)*result);
+
+   return TRUE;
+}
+
+
+void svga_init_query_functions( struct svga_context *svga )
+{
+   struct pipe_context *pipe = &svga->pipe;   /* hooks are installed here */
+   pipe->create_query = svga_create_query;
+   pipe->destroy_query = svga_destroy_query;
+   pipe->begin_query = svga_begin_query;
+   pipe->end_query = svga_end_query;
+   pipe->get_query_result = svga_get_query_result;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
new file mode 100644
index 0000000000..660eb0757a
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -0,0 +1,250 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+
+#include "svga_hw_reg.h"
+
+/**
+ * Translate a gallium cull face selection into the SVGA3D face enum.
+ *
+ * Hardware frontwinding is always set up as SVGA3D_FRONTWINDING_CW, so
+ * FRONT and BACK are swapped whenever the gallium state declares
+ * counter-clockwise triangles to be front facing.
+ */
+static SVGA3dFace svga_translate_cullmode( unsigned mode,
+                                           unsigned front_ccw )
+{
+   /* True when the gallium front winding matches the (CW) hardware one. */
+   const boolean front_is_cw = (front_ccw == 0);
+
+   if (mode == PIPE_FACE_NONE)
+      return SVGA3D_FACE_NONE;
+
+   if (mode == PIPE_FACE_FRONT_AND_BACK)
+      return SVGA3D_FACE_FRONT_BACK;
+
+   if (mode == PIPE_FACE_FRONT)
+      return front_is_cw ? SVGA3D_FACE_FRONT : SVGA3D_FACE_BACK;
+
+   if (mode == PIPE_FACE_BACK)
+      return front_is_cw ? SVGA3D_FACE_BACK : SVGA3D_FACE_FRONT;
+
+   /* Unknown cull mode. */
+   assert(0);
+   return SVGA3D_FACE_NONE;
+}
+
+/* Map the gallium flatshade flag onto the SVGA3D shade mode. */
+static SVGA3dShadeMode svga_translate_flatshade( unsigned mode )
+{
+   if (mode)
+      return SVGA3D_SHADEMODE_FLAT;
+
+   return SVGA3D_SHADEMODE_SMOOTH;
+}
+
+
+/**
+ * Create a rasterizer CSO from a gallium rasterizer template.
+ *
+ * State the device can consume directly is translated into the CSO.
+ * Anything else is recorded in need_pipeline as the set of primitive
+ * classes (tris / lines / points) that must go through the draw module.
+ *
+ * \param pipe   context the state is created for (not used here)
+ * \param templ  gallium rasterizer state; a copy is kept in the CSO
+ * \return the new CSO, or NULL if the allocation fails
+ */
+static void *
+svga_create_rasterizer_state(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *templ)
+{
+   struct svga_rasterizer_state *rast = CALLOC_STRUCT( svga_rasterizer_state );
+
+   /* The allocation can fail; bail out instead of writing through NULL. */
+   if (!rast)
+      return NULL;
+
+   /* need this for draw module. */
+   rast->templ = *templ;
+
+   /* light_twoside          - XXX: need fragment shader variant */
+   /* poly_smooth            - XXX: no fallback available */
+   /* poly_stipple_enable    - draw module */
+   /* sprite_coord_enable    - ? */
+   /* point_quad_rasterization - ? */
+   /* point_size_per_vertex  - ? */
+   /* sprite_coord_mode      - ??? */
+   /* bypass_vs_viewport_and_clip        - handled by viewport setup */
+   /* flatshade_first        - handled by index translation */
+   /* gl_rasterization_rules - XXX - viewport code */
+   /* line_width             - draw module */
+   /* fill_cw, fill_ccw      - draw module or index translation */
+
+   rast->shademode = svga_translate_flatshade( templ->flatshade );
+   rast->cullmode = svga_translate_cullmode( templ->cull_face,
+                                             templ->front_ccw );
+   rast->scissortestenable = templ->scissor;
+   rast->multisampleantialias = templ->multisample;
+   rast->antialiasedlineenable = templ->line_smooth;
+   rast->lastpixel = templ->line_last_pixel;
+   rast->pointsize = templ->point_size;
+   rast->hw_unfilled = PIPE_POLYGON_MODE_FILL;
+
+   /* Use swtnl + decomposition to implement these:
+    */
+   if (templ->poly_stipple_enable)
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+
+   /* Only line widths of exactly 0 or 1 are left to the device. */
+   if (templ->line_width != 1.0 &&
+       templ->line_width != 0.0)
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+
+   if (templ->line_stipple_enable) {
+      /* LinePattern not implemented on all backends, so the hardware
+       * path below is deliberately compiled out.
+       */
+      if (0) {
+         SVGA3dLinePattern lp;
+         lp.repeat = templ->line_stipple_factor + 1;
+         lp.pattern = templ->line_stipple_pattern;
+         rast->linepattern = lp.uintValue;
+      }
+      else {
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+      }
+   }
+
+   if (templ->point_smooth)
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_POINTS;
+
+   {
+      int fill_front = templ->fill_front;
+      int fill_back = templ->fill_back;
+      int fill = PIPE_POLYGON_MODE_FILL;
+      boolean offset_front = util_get_offset(templ, fill_front);
+      boolean offset_back = util_get_offset(templ, fill_back);
+      boolean offset  = 0;
+
+      /* Pick the single fill mode / offset enable to apply, based on
+       * the cull_face selection.
+       */
+      switch (templ->cull_face) {
+      case PIPE_FACE_FRONT_AND_BACK:
+         offset = 0;
+         fill = PIPE_POLYGON_MODE_FILL;
+         break;
+
+      case PIPE_FACE_FRONT:
+         offset = offset_front;
+         fill = fill_front;
+         break;
+
+      case PIPE_FACE_BACK:
+         offset = offset_back;
+         fill = fill_back;
+         break;
+
+      case PIPE_FACE_NONE:
+         if (fill_front != fill_back || offset_front != offset_back)
+         {
+            /* Always need the draw module to work out different
+             * front/back fill modes:
+             */
+            rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+         }
+         else {
+            offset = offset_front;
+            fill = fill_front;
+         }
+         break;
+
+      default:
+         assert(0);
+         break;
+      }
+
+      /* Unfilled primitive modes aren't implemented on all virtual
+       * hardware.  We can do some unfilled processing with index
+       * translation, but otherwise need the draw module:
+       */
+      if (fill != PIPE_POLYGON_MODE_FILL &&
+          (templ->flatshade ||
+           templ->light_twoside ||
+           offset ||
+           templ->cull_face != PIPE_FACE_NONE))
+      {
+         fill = PIPE_POLYGON_MODE_FILL;
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      }
+
+      /* If we are decomposing to lines, and lines need the pipeline,
+       * then we also need the pipeline for tris.
+       */
+      if (fill == PIPE_POLYGON_MODE_LINE &&
+          (rast->need_pipeline & SVGA_PIPELINE_FLAG_LINES))
+      {
+         fill = PIPE_POLYGON_MODE_FILL;
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      }
+
+      /* Similarly for points:
+       */
+      if (fill == PIPE_POLYGON_MODE_POINT &&
+          (rast->need_pipeline & SVGA_PIPELINE_FLAG_POINTS))
+      {
+         fill = PIPE_POLYGON_MODE_FILL;
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      }
+
+      if (offset) {
+         rast->slopescaledepthbias = templ->offset_scale;
+         rast->depthbias = templ->offset_units;
+      }
+
+      rast->hw_unfilled = fill;
+   }
+
+   if (rast->need_pipeline & SVGA_PIPELINE_FLAG_TRIS) {
+      /* Turn off stuff which will get done in the draw module:
+       */
+      rast->hw_unfilled = PIPE_POLYGON_MODE_FILL;
+      rast->slopescaledepthbias = 0;
+      rast->depthbias = 0;
+   }
+
+   return rast;
+}
+
+/**
+ * Make a rasterizer CSO current (state may be NULL) and pass the
+ * matching gallium template on to the draw module.
+ */
+static void
+svga_bind_rasterizer_state( struct pipe_context *pipe, void *state )
+{
+   struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;
+   struct svga_context *svga = svga_context(pipe);
+   struct pipe_rasterizer_state *draw_templ = NULL;
+
+   /* The draw module gets the saved template, or NULL when nothing is bound. */
+   if (raster)
+      draw_templ = &raster->templ;
+
+   svga->curr.rast = raster;
+
+   draw_set_rasterizer_state(svga->swtnl.draw, draw_templ, state);
+
+   svga->dirty |= SVGA_NEW_RAST;
+}
+
+/* Free a rasterizer CSO; the context argument is not consulted. */
+static void
+svga_delete_rasterizer_state(struct pipe_context *pipe, void *raster)
+{
+   (void) pipe;
+
+   FREE(raster);
+}
+
+
+/**
+ * Install the rasterizer CSO entry points on the context's pipe
+ * function table.
+ */
+void
+svga_init_rasterizer_functions( struct svga_context *svga )
+{
+   svga->pipe.delete_rasterizer_state = svga_delete_rasterizer_state;
+   svga->pipe.bind_rasterizer_state = svga_bind_rasterizer_state;
+   svga->pipe.create_rasterizer_state = svga_create_rasterizer_state;
+}
+
+
+/***********************************************************************
+ * Hardware state update
+ */
+
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
new file mode 100644
index 0000000000..f44a0e1325
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -0,0 +1,267 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_context.h"
+#include "svga_resource_texture.h"
+
+#include "svga_debug.h"
+
+/**
+ * Convert a PIPE_TEX_WRAP_x mode into the SVGA3D texture address mode.
+ * Unknown modes assert and fall back to SVGA3D_TEX_ADDRESS_WRAP.
+ */
+static INLINE unsigned
+translate_wrap_mode(unsigned wrap)
+{
+   unsigned address;
+
+   switch (wrap) {
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      /* CLAMP_TO_EDGE shares the CLAMP mapping: unfortunately
+       * SVGA3D_TEX_ADDRESS_EDGE is not respected by hardware.
+       */
+      address = SVGA3D_TEX_ADDRESS_CLAMP;
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      address = SVGA3D_TEX_ADDRESS_BORDER;
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      address = SVGA3D_TEX_ADDRESS_MIRROR;
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      address = SVGA3D_TEX_ADDRESS_MIRRORONCE;
+      break;
+
+   case PIPE_TEX_WRAP_REPEAT:
+      address = SVGA3D_TEX_ADDRESS_WRAP;
+      break;
+
+   default:
+      assert(0);
+      address = SVGA3D_TEX_ADDRESS_WRAP;
+      break;
+   }
+
+   return address;
+}
+
+/**
+ * Convert a PIPE_TEX_FILTER_x min/mag filter into the SVGA3D filter.
+ * Unknown values assert and fall back to nearest filtering.
+ */
+static INLINE unsigned
+translate_img_filter( unsigned filter )
+{
+   if (filter == PIPE_TEX_FILTER_LINEAR)
+      return SVGA3D_TEX_FILTER_LINEAR;
+
+   if (filter == PIPE_TEX_FILTER_NEAREST)
+      return SVGA3D_TEX_FILTER_NEAREST;
+
+   assert(0);
+   return SVGA3D_TEX_FILTER_NEAREST;
+}
+
+/**
+ * Convert a PIPE_TEX_MIPFILTER_x mode into the SVGA3D filter.
+ * Unknown values assert and fall back to no mip filtering.
+ */
+static INLINE unsigned
+translate_mip_filter( unsigned filter )
+{
+   if (filter == PIPE_TEX_MIPFILTER_LINEAR)
+      return SVGA3D_TEX_FILTER_LINEAR;
+
+   if (filter == PIPE_TEX_MIPFILTER_NEAREST)
+      return SVGA3D_TEX_FILTER_NEAREST;
+
+   if (filter == PIPE_TEX_MIPFILTER_NONE)
+      return SVGA3D_TEX_FILTER_NONE;
+
+   assert(0);
+   return SVGA3D_TEX_FILTER_NONE;
+}
+
+/**
+ * Create a sampler CSO from a gallium sampler template.
+ *
+ * \param pipe     context; its debug.use_min_mipmap flag is consulted
+ * \param sampler  gallium sampler state to translate
+ * \return the new CSO, or NULL if the allocation fails
+ */
+static void *
+svga_create_sampler_state(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_sampler_state *cso = CALLOC_STRUCT( svga_sampler_state );
+
+   /* The allocation can fail; bail out instead of writing through NULL. */
+   if (!cso)
+      return NULL;
+
+   cso->mipfilter = translate_mip_filter(sampler->min_mip_filter);
+   cso->magfilter = translate_img_filter( sampler->mag_img_filter );
+   cso->minfilter = translate_img_filter( sampler->min_img_filter );
+   cso->aniso_level = MAX2( sampler->max_anisotropy, 1 );
+   /* Any non-zero max_anisotropy overrides both image filters. */
+   if(sampler->max_anisotropy)
+      cso->magfilter = cso->minfilter = SVGA3D_TEX_FILTER_ANISOTROPIC;
+   cso->lod_bias = sampler->lod_bias;
+   cso->addressu = translate_wrap_mode(sampler->wrap_s);
+   cso->addressv = translate_wrap_mode(sampler->wrap_t);
+   cso->addressw = translate_wrap_mode(sampler->wrap_r);
+   cso->normalized_coords = sampler->normalized_coords;
+   cso->compare_mode = sampler->compare_mode;
+   cso->compare_func = sampler->compare_func;
+
+   {
+      /* Pack the float border colour into one 32-bit word, with alpha
+       * in the top byte followed by red, green and blue.
+       */
+      uint32 r = float_to_ubyte(sampler->border_color[0]);
+      uint32 g = float_to_ubyte(sampler->border_color[1]);
+      uint32 b = float_to_ubyte(sampler->border_color[2]);
+      uint32 a = float_to_ubyte(sampler->border_color[3]);
+
+      cso->bordercolor = (a << 24) | (r << 16) | (g << 8) | b;
+   }
+
+   /* No SVGA3D support for:
+    *    - min/max LOD clamping
+    */
+   cso->min_lod = 0;
+   cso->view_min_lod = MAX2(sampler->min_lod, 0);
+   cso->view_max_lod = MAX2(sampler->max_lod, 0);
+
+   /* Use min_mipmap: when the LOD range is a single level, select that
+    * level through min_lod and disable mip filtering.
+    */
+   if (svga->debug.use_min_mipmap) {
+      if (cso->view_min_lod == cso->view_max_lod) {
+         cso->min_lod = cso->view_min_lod;
+         cso->view_min_lod = 0;
+         cso->view_max_lod = 1000; /* Just a high number */
+         cso->mipfilter = SVGA3D_TEX_FILTER_NONE;
+      }
+   }
+
+   SVGA_DBG(DEBUG_VIEWS, "min %u, view(min %u, max %u) lod, mipfilter %s\n",
+            cso->min_lod, cso->view_min_lod, cso->view_max_lod,
+            cso->mipfilter == SVGA3D_TEX_FILTER_NONE ? "SVGA3D_TEX_FILTER_NONE" : "SOMETHING");
+
+   return cso;
+}
+
+/**
+ * Bind the first num fragment sampler CSOs and clear any previously
+ * bound slots at index num and above.  Re-binding an identical set is a
+ * no-op and does not dirty the context.
+ */
+static void
+svga_bind_sampler_states(struct pipe_context *pipe,
+                         unsigned num, void **sampler)
+{
+   struct svga_context *svga = svga_context(pipe);
+   unsigned slot;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Nothing to do when the very same set is bound again. */
+   if (num == svga->curr.num_samplers &&
+       memcmp(svga->curr.sampler, sampler, num * sizeof(void *)) == 0) {
+      if (0) debug_printf("sampler noop\n");
+      return;
+   }
+
+   /* Take over the new bindings ... */
+   slot = 0;
+   while (slot < num) {
+      svga->curr.sampler[slot] = sampler[slot];
+      slot++;
+   }
+
+   /* ... and clear the slots that were in use before but no longer are. */
+   while (slot < svga->curr.num_samplers) {
+      svga->curr.sampler[slot] = NULL;
+      slot++;
+   }
+
+   svga->curr.num_samplers = num;
+   svga->dirty |= SVGA_NEW_SAMPLER;
+}
+
+/* Free a sampler CSO; the context argument is not consulted. */
+static void
+svga_delete_sampler_state(struct pipe_context *pipe, void *sampler)
+{
+   (void) pipe;
+
+   FREE(sampler);
+}
+
+
+/**
+ * Create a sampler view: a copy of the template that holds its own
+ * reference on the texture and starts with a reference count of one.
+ * Returns NULL if the allocation fails.
+ */
+static struct pipe_sampler_view *
+svga_create_sampler_view(struct pipe_context *pipe,
+                         struct pipe_resource *texture,
+                         const struct pipe_sampler_view *templ)
+{
+   struct pipe_sampler_view *sv = CALLOC_STRUCT(pipe_sampler_view);
+
+   if (!sv)
+      return NULL;
+
+   *sv = *templ;
+   sv->reference.count = 1;
+
+   /* Clear the pointer copied from the template before taking our own
+    * reference through it.
+    */
+   sv->texture = NULL;
+   pipe_resource_reference(&sv->texture, texture);
+
+   sv->context = pipe;
+
+   return sv;
+}
+
+
+/**
+ * Destroy a sampler view: drop its texture reference, then free it.
+ */
+static void
+svga_sampler_view_destroy(struct pipe_context *pipe,
+                          struct pipe_sampler_view *view)
+{
+   (void) pipe;
+
+   pipe_resource_reference(&view->texture, NULL);
+
+   FREE(view);
+}
+
+/**
+ * Bind the first num fragment sampler views, releasing any previously
+ * bound views at index num and above.  Also recomputes the per-unit
+ * bitmasks of sRGB and 1D textures and flags them dirty when they change.
+ */
+static void
+svga_set_sampler_views(struct pipe_context *pipe,
+                       unsigned num,
+                       struct pipe_sampler_view **views)
+{
+   struct svga_context *svga = svga_context(pipe);
+   unsigned srgb_mask = 0;
+   unsigned one_d_mask = 0;
+   uint unit;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Nothing to do when the very same set is bound again. */
+   if (num == svga->curr.num_sampler_views &&
+       memcmp(svga->curr.sampler_views, views,
+              num * sizeof(struct pipe_sampler_view *)) == 0) {
+      if (0) debug_printf("texture noop\n");
+      return;
+   }
+
+   for (unit = 0; unit < num; unit++) {
+      pipe_sampler_view_reference(&svga->curr.sampler_views[unit],
+                                  views[unit]);
+
+      /* Empty slots contribute nothing to the flag masks. */
+      if (views[unit]) {
+         if (views[unit]->texture->format == PIPE_FORMAT_B8G8R8A8_SRGB)
+            srgb_mask |= 1 << unit;
+
+         if (views[unit]->texture->target == PIPE_TEXTURE_1D)
+            one_d_mask |= 1 << unit;
+      }
+   }
+
+   /* Release views left over from a larger previous binding. */
+   for (unit = num; unit < svga->curr.num_sampler_views; unit++)
+      pipe_sampler_view_reference(&svga->curr.sampler_views[unit], NULL);
+
+   svga->curr.num_sampler_views = num;
+   svga->dirty |= SVGA_NEW_TEXTURE_BINDING;
+
+   if (srgb_mask != svga->curr.tex_flags.flag_srgb ||
+       one_d_mask != svga->curr.tex_flags.flag_1d)
+   {
+      svga->dirty |= SVGA_NEW_TEXTURE_FLAGS;
+      svga->curr.tex_flags.flag_1d = one_d_mask;
+      svga->curr.tex_flags.flag_srgb = srgb_mask;
+   }
+}
+
+
+
+/**
+ * Install the sampler CSO and sampler view entry points on the
+ * context's pipe function table.
+ */
+void
+svga_init_sampler_functions( struct svga_context *svga )
+{
+   /* Sampler state objects */
+   svga->pipe.create_sampler_state = svga_create_sampler_state;
+   svga->pipe.delete_sampler_state = svga_delete_sampler_state;
+   svga->pipe.bind_fragment_sampler_states = svga_bind_sampler_states;
+
+   /* Sampler views */
+   svga->pipe.create_sampler_view = svga_create_sampler_view;
+   svga->pipe.sampler_view_destroy = svga_sampler_view_destroy;
+   svga->pipe.set_fragment_sampler_views = svga_set_sampler_views;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
new file mode 100644
index 0000000000..23808ad08e
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -0,0 +1,117 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_screen.h"
+#include "svga_resource_buffer.h"
+#include "svga_context.h"
+
+
+/**
+ * Replace the current vertex buffer bindings with the given array,
+ * adjusting resource reference counts, and note whether any of the new
+ * buffers is a user buffer.  Re-binding an identical set is a no-op.
+ */
+static void
+svga_set_vertex_buffers(struct pipe_context *pipe,
+                        unsigned count,
+                        const struct pipe_vertex_buffer *buffers)
+{
+   struct svga_context *svga = svga_context(pipe);
+   boolean saw_user_buffer = FALSE;
+   unsigned slot;
+
+   /* Check for no change */
+   if (count == svga->curr.num_vertex_buffers &&
+       !memcmp(svga->curr.vb, buffers, count * sizeof buffers[0]))
+      return;
+
+   /* Take references on the incoming buffers ... */
+   for (slot = 0; slot < count; slot++) {
+      pipe_resource_reference(&svga->curr.vb[slot].buffer,
+                              buffers[slot].buffer);
+
+      if (svga_buffer_is_user_buffer(buffers[slot].buffer))
+         saw_user_buffer = TRUE;
+   }
+
+   /* ... and drop the ones held by slots that are no longer used. */
+   for (slot = count; slot < svga->curr.num_vertex_buffers; slot++)
+      pipe_resource_reference(&svga->curr.vb[slot].buffer, NULL);
+
+   /* Copy the remaining (non-refcounted) fields of each binding. */
+   memcpy(svga->curr.vb, buffers, count * sizeof buffers[0]);
+
+   svga->curr.any_user_vertex_buffers = saw_user_buffer;
+   svga->curr.num_vertex_buffers = count;
+
+   svga->dirty |= SVGA_NEW_VBUFFER;
+}
+
+
+/**
+ * Create a vertex elements CSO holding a copy of count attribute
+ * descriptions.  Returns NULL if the allocation fails.
+ */
+static void *
+svga_create_vertex_elements_state(struct pipe_context *pipe,
+                                  unsigned count,
+                                  const struct pipe_vertex_element *attribs)
+{
+   struct svga_velems_state *velems;
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   velems = (struct svga_velems_state *) MALLOC(sizeof *velems);
+   if (!velems)
+      return NULL;
+
+   memcpy(velems->velem, attribs, count * sizeof(*attribs));
+   velems->count = count;
+
+   return velems;
+}
+
+/* Make a vertex elements CSO current and mark the state dirty. */
+static void
+svga_bind_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   svga->curr.velems = (struct svga_velems_state *) velems;
+   svga->dirty |= SVGA_NEW_VELEMENT;
+}
+
+/* Free a vertex elements CSO; the context argument is not consulted. */
+static void
+svga_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   (void) pipe;
+
+   FREE(velems);
+}
+
+/**
+ * Drop the references held on every currently bound vertex buffer.
+ * The binding count itself is left unchanged.
+ */
+void
+svga_cleanup_vertex_state( struct svga_context *svga )
+{
+   unsigned slot = 0;
+
+   while (slot < svga->curr.num_vertex_buffers) {
+      pipe_resource_reference(&svga->curr.vb[slot].buffer, NULL);
+      slot++;
+   }
+}
+
+
+/**
+ * Install the vertex buffer and vertex elements entry points on the
+ * context's pipe function table.
+ */
+void
+svga_init_vertex_functions( struct svga_context *svga )
+{
+   /* Vertex elements CSO */
+   svga->pipe.create_vertex_elements_state = svga_create_vertex_elements_state;
+   svga->pipe.delete_vertex_elements_state = svga_delete_vertex_elements_state;
+   svga->pipe.bind_vertex_elements_state = svga_bind_vertex_elements_state;
+
+   /* Vertex buffers */
+   svga->pipe.set_vertex_buffers = svga_set_vertex_buffers;
+}
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
new file mode 100644
index 0000000000..de8c919e12
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -0,0 +1,198 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_bitmask.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_text.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_tgsi.h"
+#include "svga_hw_reg.h"
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+
+/**
+ * Debug hook for replacing a vertex shader's tokens.
+ *
+ * As compiled, the substitution body is disabled with "#if 0", so the
+ * function simply returns old_tokens unchanged.  When enabled, shader id
+ * 12 is replaced by tokens translated from the hard-coded TGSI text below;
+ * those tokens live in a function-local static array.
+ */
+static const struct tgsi_token *substitute_vs( 
+   unsigned shader_id,
+   const struct tgsi_token *old_tokens )
+{
+   /* Disabled debug code: flip to "#if 1" to activate the substitution. */
+#if 0
+   if (shader_id == 12) {
+   static struct tgsi_token tokens[300];
+
+   const char *text = 
+      "VERT\n"
+      "DCL IN[0]\n"
+      "DCL IN[1]\n"
+      "DCL IN[2]\n"
+      "DCL OUT[0], POSITION\n"
+      "DCL TEMP[0..4]\n"
+      "IMM FLT32 {     1.0000,     1.0000,     1.0000,     1.0000 }\n"
+      "IMM FLT32 {     0.45,     1.0000,     1.0000,     1.0000 }\n"
+      "IMM FLT32 { 1.297863, 0.039245, 0.035993, 0.035976}\n"
+      "IMM FLT32 { -0.019398, 1.696131, -0.202151, -0.202050  }\n"
+      "IMM FLT32 { 0.051711, -0.348713, -0.979204, -0.978714  }\n"
+      "IMM FLT32 { 0.000000, 0.000003, 139.491577, 141.421356 }\n"
+      "DCL CONST[0..7]\n"
+      "DCL CONST[9..16]\n"
+      "  MOV TEMP[2], IMM[0]\n"
+
+      "  MOV TEMP[2].xyz, IN[2]\n"
+      "  MOV TEMP[2].xyz, IN[0]\n"
+      "  MOV TEMP[2].xyz, IN[1]\n"
+
+      "  MUL TEMP[1], IMM[3], TEMP[2].yyyy\n"
+      "  MAD TEMP[3], IMM[2],  TEMP[2].xxxx, TEMP[1]\n"
+      "  MAD TEMP[1], IMM[4], TEMP[2].zzzz, TEMP[3]\n"
+      "  MAD TEMP[4], IMM[5], TEMP[2].wwww, TEMP[1]\n"
+
+      "  MOV OUT[0], TEMP[4]\n"
+      "  END\n";
+
+   /* Parse the text into the static token array. */
+   if (!tgsi_text_translate( text,
+                             tokens,
+                             Elements(tokens) ))
+   {
+      assert(0);
+      return NULL;
+   }
+
+   return tokens;
+   }
+#endif
+
+   /* No substitution applies: hand back the caller's tokens. */
+   return old_tokens;
+}
+
+
+/***********************************************************************
+ * Vertex shaders 
+ */
+
+/**
+ * Create a vertex shader object from a gallium shader template.
+ *
+ * The shader keeps its own duplicate of the tokens, and a matching draw
+ * module shader is created from that same duplicate.
+ *
+ * \param pipe   context the shader is created for
+ * \param templ  gallium shader state holding the TGSI tokens
+ * \return the new shader, or NULL if an allocation fails
+ */
+static void *
+svga_create_vs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_vertex_shader *vs = CALLOC_STRUCT(svga_vertex_shader);
+   if (!vs)
+      return NULL;
+
+   /* substitute a debug shader?
+    */
+   vs->base.tokens = tgsi_dup_tokens(substitute_vs(svga->debug.shader_id,
+                                                   templ->tokens));
+   if (!vs->base.tokens) {
+      /* Without tokens there is nothing to scan or compile; release the
+       * half-built shader instead of leaking it and report failure.
+       */
+      FREE(vs);
+      return NULL;
+   }
+
+   /* Collect basic info that we'll need later:
+    */
+   tgsi_scan_shader(vs->base.tokens, &vs->base.info);
+
+   {
+      /* Need to construct a new template in case we substituted a
+       * debug shader.
+       */
+      struct pipe_shader_state tmp2 = *templ;
+      tmp2.tokens = vs->base.tokens;
+      vs->draw_shader = draw_create_vertex_shader(svga->swtnl.draw, &tmp2);
+   }
+
+   /* Each shader created on this context gets the next debug id. */
+   vs->base.id = svga->debug.shader_id++;
+   vs->base.use_sm30 = svgascreen->use_vs30;
+
+   if (SVGA_DEBUG & DEBUG_TGSI || 0) {
+      debug_printf("%s id: %u, inputs: %u, outputs: %u\n",
+                   __FUNCTION__, vs->base.id,
+                   vs->base.info.num_inputs, vs->base.info.num_outputs);
+   }
+
+   return vs;
+}
+
+/* Make a vertex shader current and mark the state dirty. */
+static void
+svga_bind_vs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   svga->curr.vs = (struct svga_vertex_shader *)shader;
+   svga->dirty |= SVGA_NEW_VS;
+}
+
+
+/**
+ * Destroy a vertex shader: its draw module counterpart, every compiled
+ * result on its result list (destroyed on the device, with the id
+ * returned to the vs_bm bitmask), and finally its tokens and the shader
+ * object itself.
+ */
+static void svga_delete_vs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_vertex_shader *vs = (struct svga_vertex_shader *)shader;
+   struct svga_shader_result *result, *tmp;
+   enum pipe_error ret;
+
+   svga_hwtnl_flush_retry( svga );
+
+   draw_delete_vertex_shader(svga->swtnl.draw, vs->draw_shader);
+
+   for (result = vs->base.results; result; result = tmp ) {
+      /* Fetch the successor first: result is destroyed below. */
+      tmp = result->next;
+
+      ret = SVGA3D_DestroyShader(svga->swc,
+                                 result->id,
+                                 SVGA3D_SHADERTYPE_VS );
+      if(ret != PIPE_OK) {
+         /* Flush the context and retry the command once. */
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_DestroyShader(svga->swc,
+                                    result->id,
+                                    SVGA3D_SHADERTYPE_VS );
+         assert(ret == PIPE_OK);
+      }
+
+      util_bitmask_clear( svga->vs_bm, result->id );
+
+      /*
+       * Remove stale references to this result to ensure a new result on the
+       * same address will be detected as a change.  This is done before the
+       * result is destroyed so the pointer is never inspected after it has
+       * been handed to svga_destroy_shader_result().
+       */
+      if(result == svga->state.hw_draw.vs)
+         svga->state.hw_draw.vs = NULL;
+
+      svga_destroy_shader_result( result );
+   }
+
+   FREE((void *)vs->base.tokens);
+   FREE(vs);
+}
+
+
+/**
+ * Install the vertex shader entry points on the context's pipe
+ * function table.
+ */
+void
+svga_init_vs_functions( struct svga_context *svga )
+{
+   svga->pipe.delete_vs_state = svga_delete_vs_state;
+   svga->pipe.bind_vs_state = svga_bind_vs_state;
+   svga->pipe.create_vs_state = svga_create_vs_state;
+}
+
diff --git a/src/gallium/drivers/svga/svga_resource.c b/src/gallium/drivers/svga/svga_resource.c
new file mode 100644
index 0000000000..ef2a0c40f0
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource.c
@@ -0,0 +1,56 @@
+#include "util/u_debug.h"
+
+#include "svga_resource.h"
+#include "svga_resource_buffer.h"
+#include "svga_resource_texture.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+
+
+/**
+ * Create a resource, dispatching on the template's target: PIPE_BUFFER
+ * goes to the buffer code, everything else to the texture code.
+ */
+static struct pipe_resource *
+svga_resource_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template)
+{
+   if (template->target != PIPE_BUFFER)
+      return svga_texture_create(screen, template);
+
+   return svga_buffer_create(screen, template);
+}
+
+/**
+ * Wrap a winsys handle in a resource.  Buffers cannot be created from a
+ * handle here (NULL is returned); other targets go to the texture code.
+ */
+static struct pipe_resource *
+svga_resource_from_handle(struct pipe_screen * screen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle)
+{
+   if (template->target != PIPE_BUFFER)
+      return svga_texture_from_handle(screen, template, whandle);
+
+   return NULL;
+}
+
+
+/**
+ * Point the context's resource and transfer entry points at the shared
+ * u_*_vtbl dispatch helpers.
+ */
+void
+svga_init_resource_functions(struct svga_context *svga)
+{
+   svga->pipe.is_resource_referenced = u_is_resource_referenced_vtbl;
+
+   /* Transfer lifetime */
+   svga->pipe.get_transfer = u_get_transfer_vtbl;
+   svga->pipe.transfer_destroy = u_transfer_destroy_vtbl;
+
+   /* Transfer mapping */
+   svga->pipe.transfer_map = u_transfer_map_vtbl;
+   svga->pipe.transfer_unmap = u_transfer_unmap_vtbl;
+   svga->pipe.transfer_flush_region = u_transfer_flush_region_vtbl;
+
+   svga->pipe.transfer_inline_write = u_transfer_inline_write_vtbl;
+}
+
+/**
+ * Install the resource entry points on the screen's function table.
+ */
+void
+svga_init_screen_resource_functions(struct svga_screen *is)
+{
+   /* Creation */
+   is->screen.resource_create = svga_resource_create;
+   is->screen.user_buffer_create = svga_user_buffer_create;
+   is->screen.resource_from_handle = svga_resource_from_handle;
+
+   /* Handle export and destruction use the shared vtbl helpers */
+   is->screen.resource_get_handle = u_resource_get_handle_vtbl;
+   is->screen.resource_destroy = u_resource_destroy_vtbl;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_resource.h b/src/gallium/drivers/svga/svga_resource.h
new file mode 100644
index 0000000000..851e3b50ce
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SVGA_RESOURCE_H
+#define SVGA_RESOURCE_H
+
+struct svga_screen;
+
+#include "util/u_debug.h"
+
+struct svga_context;
+struct svga_screen;
+
+
+void svga_init_screen_resource_functions(struct svga_screen *is);
+void svga_init_resource_functions(struct svga_context *svga );
+
+
+#endif /* SVGA_RESOURCE_H */
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
new file mode 100644
index 0000000000..198d401332
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -0,0 +1,351 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "os/os_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_resource_buffer.h"
+#include "svga_resource_buffer_upload.h"
+#include "svga_winsys.h"
+#include "svga_debug.h"
+
+/**
+ * Only vertex and index buffers require hardware backing; constant
+ * buffers do not.  No other buffer types are supported at the moment.
+ */
+static INLINE boolean
+svga_buffer_needs_hw_storage(unsigned usage)
+{
+   const unsigned hw_binds = PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
+   return usage & hw_binds;
+}
+
+
+static unsigned int
+svga_buffer_is_referenced( struct pipe_context *pipe,
+                           struct pipe_resource *buf,
+                           unsigned face, unsigned level)
+{
+   struct svga_winsys_screen *sws = svga_screen(pipe->screen)->sws;
+   struct svga_buffer *sbuf = svga_buffer(buf);
+   unsigned int refs;
+
+   /**
+    * XXX: Check this.
+    * Buffer writes may be cached by the screen, but maps are served out
+    * of those cached writes, so cached buffers do not need a
+    * PIPE_REFERENCED_FOR_WRITE flag.
+    */
+
+   if (sbuf->handle == NULL || sws->surface_is_flushed(sws, sbuf->handle))
+      return PIPE_UNREFERENCED;
+
+   /**
+    * surface_is_flushed() cannot tell read references from write
+    * references, so a reference normally counts as both.  Index and vertex
+    * buffers are the exception, which avoids a flush in
+    * st_bufferobj_get_subdata during display list replay.
+    */
+   refs = PIPE_REFERENCED_FOR_READ;
+   if (!(sbuf->b.b.bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER)))
+      refs |= PIPE_REFERENCED_FOR_WRITE;
+   return refs;
+}
+
+
+
+
+
+
+static void *
+svga_buffer_map_range( struct pipe_screen *screen,
+                       struct pipe_resource *buf,
+                       unsigned offset,
+                       unsigned length,
+                       unsigned usage )
+{
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   const boolean has_storage = sbuf->swbuf || sbuf->hwbuf;
+   void *map = NULL;
+
+   /*
+    * Map the whole buffer and return a pointer to its first byte; offset
+    * and length are not consulted here.  Backing storage is allocated
+    * lazily on the first map.
+    */
+   if (!has_storage &&
+       svga_buffer_create_hw_storage(ss, sbuf) != PIPE_OK) {
+      /*
+       * We can't create a hardware buffer big enough, so create a malloc
+       * buffer instead.
+       */
+      debug_printf("%s: failed to allocate %u KB of DMA, splitting DMA transfers\n",
+                   __FUNCTION__,
+                   (sbuf->b.b.width0 + 1023)/1024);
+      sbuf->swbuf = align_malloc(sbuf->b.b.width0, 16);
+   }
+
+   /* Malloc/user memory is preferred over the DMA buffer when both exist. */
+   if (sbuf->swbuf)
+      map = sbuf->swbuf;
+   else if (sbuf->hwbuf)
+      map = sws->buffer_map(sws, sbuf->hwbuf, usage);
+
+   if (!map)
+      return NULL;
+
+   /* Book-keeping for the new mapping. */
+   ++sbuf->map.count;
+   if (usage & PIPE_TRANSFER_WRITE) {
+      assert(sbuf->map.count <= 1);
+      sbuf->map.writing = TRUE;
+      if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT)
+         sbuf->map.flush_explicit = TRUE;
+   }
+
+   return map;
+}
+
+/* Record [offset, offset + length) of a write-mapped buffer as dirty. */
+static void 
+svga_buffer_flush_mapped_range( struct pipe_screen *screen,
+                                struct pipe_resource *buf,
+                                unsigned offset, unsigned length)
+{
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   const unsigned end = offset + length;
+
+   pipe_mutex_lock(ss->swc_mutex);
+   assert(sbuf->map.writing);
+   if(sbuf->map.writing) {
+      assert(sbuf->map.flush_explicit);
+      svga_buffer_add_range(sbuf, offset, end);
+   }
+   pipe_mutex_unlock(ss->swc_mutex);
+}
+
+static void 
+svga_buffer_unmap( struct pipe_screen *screen,
+                   struct pipe_resource *buf)
+{
+   struct svga_screen *ss = svga_screen(screen); 
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   /* Undo one svga_buffer_map_range(); all of it runs under swc_mutex. */
+   pipe_mutex_lock(ss->swc_mutex);
+   /* Drop one mapping; an unbalanced unmap is only caught by the assert. */
+   assert(sbuf->map.count);
+   if(sbuf->map.count)
+      --sbuf->map.count;
+
+   if(sbuf->hwbuf)
+      sws->buffer_unmap(sws, sbuf->hwbuf);
+
+   if(sbuf->map.writing) {
+      if(!sbuf->map.flush_explicit) {
+         /* No mapped range was flushed -- flush the whole buffer */
+         SVGA_DBG(DEBUG_DMA, "flushing the whole buffer\n");
+   
+         svga_buffer_add_range(sbuf, 0, sbuf->b.b.width0);
+      }
+      /* The write mapping has ended: reset the per-map state. */
+      sbuf->map.writing = FALSE;
+      sbuf->map.flush_explicit = FALSE;
+   }
+
+   pipe_mutex_unlock(ss->swc_mutex);
+}
+
+
+/* Free a buffer and the storage it owns; expects a zero reference count. */
+static void
+svga_buffer_destroy( struct pipe_screen *screen,
+		     struct pipe_resource *buf )
+{
+   struct svga_screen *ss = svga_screen(screen); 
+   struct svga_buffer *sbuf = svga_buffer( buf );
+
+   assert(!p_atomic_read(&buf->reference.count));
+   
+   assert(!sbuf->dma.pending);
+
+   if(sbuf->handle)
+      svga_buffer_destroy_host_surface(ss, sbuf);
+   
+   if(sbuf->uploaded.buffer)
+      pipe_resource_reference(&sbuf->uploaded.buffer, NULL);
+
+   if(sbuf->hwbuf)
+      svga_buffer_destroy_hw_storage(ss, sbuf);
+   /* User memory belongs to the caller; only free what was malloc'ed here. */
+   if(sbuf->swbuf && !sbuf->user)
+      align_free(sbuf->swbuf);
+   
+   FREE(sbuf);
+}
+
+
+/* Transfers are implemented on top of the older map/unmap entry points,
+ * which are kept more or less intact.
+ */
+static void *
+svga_buffer_transfer_map( struct pipe_context *pipe,
+                          struct pipe_transfer *transfer )
+{
+   const struct pipe_box *box = &transfer->box;
+   uint8_t *base;
+
+   base = svga_buffer_map_range( pipe->screen,
+                                 transfer->resource,
+                                 box->x,
+                                 box->width,
+                                 transfer->usage );
+
+   /* map_range() yields the start of the whole buffer, whereas a transfer
+    * must point at the first byte of the region named by its box.
+    */
+   return base ? base + box->x : NULL;
+}
+
+
+
+static void svga_buffer_transfer_flush_region( struct pipe_context *pipe,
+                                               struct pipe_transfer *transfer,
+                                               const struct pipe_box *box)
+{
+   /* box is relative to the transfer; rebase it onto the buffer. */
+   const unsigned start = transfer->box.x + box->x;
+
+   assert(box->x + box->width <= transfer->box.width);
+   svga_buffer_flush_mapped_range(pipe->screen, transfer->resource,
+                                  start, box->width);
+}
+
+/* Transfer flavour of unmap: simply forwards to svga_buffer_unmap(). */
+static void svga_buffer_transfer_unmap( struct pipe_context *pipe,
+                                        struct pipe_transfer *transfer )
+{
+   svga_buffer_unmap(pipe->screen, transfer->resource);
+}
+
+
+
+
+
+
+/* Function table installed as b.vtbl on every buffer created in this file. */
+struct u_resource_vtbl svga_buffer_vtbl = 
+{
+   u_default_resource_get_handle,      /* get_handle */
+   svga_buffer_destroy,		     /* resource_destroy */
+   svga_buffer_is_referenced,	     /* is_resource_referenced */
+   u_default_get_transfer,	     /* get_transfer */
+   u_default_transfer_destroy,	     /* transfer_destroy */
+   svga_buffer_transfer_map,	     /* transfer_map */
+   svga_buffer_transfer_flush_region,  /* transfer_flush_region */
+   svga_buffer_transfer_unmap,	     /* transfer_unmap */
+   u_default_transfer_inline_write   /* transfer_inline_write */
+};
+
+
+
+struct pipe_resource *
+svga_buffer_create(struct pipe_screen *screen,
+                   const struct pipe_resource *template)
+{
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_buffer *sbuf = CALLOC_STRUCT(svga_buffer);
+   boolean ok;
+
+   if(!sbuf)
+      return NULL;
+
+   /* Start from the caller's template, then claim the resource as ours. */
+   sbuf->b.b = *template;
+   sbuf->b.vtbl = &svga_buffer_vtbl;
+   pipe_reference_init(&sbuf->b.b.reference, 1);
+   sbuf->b.b.screen = screen;
+
+   /* Vertex/index buffers get a host surface right away; all other
+    * buffers are backed by 64-byte aligned malloc memory. */
+   if(svga_buffer_needs_hw_storage(template->bind)) {
+      ok = svga_buffer_create_host_surface(ss, sbuf) == PIPE_OK;
+   }
+   else {
+      sbuf->swbuf = align_malloc(template->width0, 64);
+      ok = sbuf->swbuf != NULL;
+   }
+
+   if(!ok) {
+      FREE(sbuf);
+      return NULL;
+   }
+   return &sbuf->b.b;
+}
+
+struct pipe_resource *
+svga_user_buffer_create(struct pipe_screen *screen,
+                        void *ptr,
+                        unsigned bytes,
+                        unsigned bind)
+{
+   struct svga_buffer *sbuf = CALLOC_STRUCT(svga_buffer);
+   struct pipe_resource *res;
+
+   if(!sbuf)
+      return NULL;
+
+   /* Describe the wrapped memory as a one-dimensional byte buffer. */
+   res = &sbuf->b.b;
+   pipe_reference_init(&res->reference, 1);
+   res->screen = screen;
+   res->format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   res->usage = PIPE_USAGE_IMMUTABLE;
+   res->bind = bind;
+   res->width0 = bytes;
+   res->height0 = 1;
+   res->depth0 = 1;
+   sbuf->b.vtbl = &svga_buffer_vtbl;
+
+   /* Borrow the caller's memory; the user flag keeps us from freeing it. */
+   sbuf->swbuf = ptr;
+   sbuf->user = TRUE;
+
+   return res;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h
new file mode 100644
index 0000000000..d3ec11bfd5
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource_buffer.h
@@ -0,0 +1,246 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_BUFFER_H
+#define SVGA_BUFFER_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "util/u_transfer.h"
+
+#include "util/u_double_list.h"
+
+#include "svga_screen_cache.h"
+
+
+/**
+ * Maximum number of discontiguous ranges
+ */
+#define SVGA_BUFFER_MAX_RANGES 32
+
+
+struct svga_screen;
+struct svga_context;
+struct svga_winsys_buffer;
+struct svga_winsys_surface;
+
+
+extern struct u_resource_vtbl svga_buffer_vtbl;
+
+struct svga_buffer_range
+{
+   unsigned start;   /* offset of the first dirty byte */
+   unsigned end;     /* offset one past the last dirty byte */
+};
+
+
+/**
+ * SVGA pipe buffer.
+ */
+struct svga_buffer 
+{
+   struct u_resource b;
+
+   /**
+    * Regular (non DMA'able) memory.
+    * 
+    * Used for user buffers or for buffers which we know beforehand can
+    * never be used by the virtual hardware directly, such as constant buffers.
+    */
+   void *swbuf;
+   
+   /** 
+    * Whether swbuf was created by the user or not.
+    */
+   boolean user;
+   
+   /**
+    * Creation key for the host surface handle.
+    * 
+    * This structure describes all the host surface characteristics so that it 
+    * can be looked up in cache, since creating a host surface is often a slow
+    * operation.
+    */
+   struct svga_host_surface_cache_key key;
+   
+   /**
+    * Host surface handle.
+    * 
+    * This is a platform independent abstraction for host SID. It is created
+    * when trying to bind.
+    */
+   struct svga_winsys_surface *handle;
+
+   /**
+    * Information about ongoing and past map operations.
+    */
+   struct {
+      /**
+       * Number of concurrent mappings.
+       *
+       * XXX: It is impossible to guarantee concurrent maps work in all
+       * circumstances -- pipe_buffers really need transfer objects too.
+       */
+      unsigned count;
+
+      /**
+       * Whether this buffer is currently mapped for writing.
+       */
+      boolean writing;
+
+      /**
+       * Whether the application will tell us explicitly which ranges it
+       * touched or not.
+       */
+      boolean flush_explicit;
+
+      /**
+       * Dirty ranges.
+       *
+       * Ranges that were touched by the application and need to be uploaded to
+       * the host.
+       *
+       * This information will be copied into dma.boxes, when emitting the
+       * SVGA3dCmdSurfaceDMA command.
+       */
+      struct svga_buffer_range ranges[SVGA_BUFFER_MAX_RANGES];
+      unsigned num_ranges;
+   } map;
+
+   /**
+    * Information about uploaded version of user buffers.
+    */
+   struct {
+      struct pipe_resource *buffer;
+
+      /**
+       * We combine multiple user buffers into the same hardware buffer. This
+       * is the relative offset within that buffer.
+       */
+      unsigned offset;
+   } uploaded;
+
+   /**
+    * DMA'ble memory.
+    *
+    * A piece of GMR memory, with the same size as the buffer. It is created
+    * when mapping the buffer, and will be used to upload vertex data to the
+    * host.
+    */
+   struct svga_winsys_buffer *hwbuf;
+
+   /**
+    * Information about pending DMA uploads.
+    *
+    */
+   struct {
+      /**
+       * Whether this buffer has an unfinished DMA upload command.
+       *
+       * If not set then the rest of the information is null.
+       */
+      boolean pending;
+
+      SVGA3dSurfaceDMAFlags flags;
+
+      /**
+       * Pointer to the DMA copy box *inside* the command buffer.
+       */
+      SVGA3dCopyBox *boxes;
+
+      /**
+       * Context that has the pending DMA to this buffer.
+       */
+      struct svga_context *svga;
+   } dma;
+
+   /**
+    * Linked list head, used to gather all buffers with pending dma uploads on
+    * a context. It is only valid if the dma.pending is set above.
+    */
+   struct list_head head;
+};
+
+/* Downcast a pipe_resource to svga_buffer; NULL stays NULL, vtbl asserted. */
+static INLINE struct svga_buffer *
+svga_buffer(struct pipe_resource *buffer)
+{
+   struct svga_buffer *sbuf = (struct svga_buffer *)buffer;
+   if (!sbuf)
+      return NULL;
+   assert(sbuf->b.vtbl == &svga_buffer_vtbl);
+   return sbuf;
+}
+
+/**
+ * Report whether the buffer wraps user memory.  We may decide to use
+ * an alternate upload path for such buffers.
+ */
+static INLINE boolean 
+svga_buffer_is_user_buffer( struct pipe_resource *buffer )
+{
+   const struct svga_buffer *sbuf = svga_buffer(buffer);
+   return sbuf->user;
+}
+
+
+
+/* Wrap caller-owned memory in a buffer; bind is stored as the bind flags. */
+struct pipe_resource *
+svga_user_buffer_create(struct pipe_screen *screen,
+                        void *ptr,
+                        unsigned bytes,
+			unsigned bind);
+
+struct pipe_resource *
+svga_buffer_create(struct pipe_screen *screen,
+		   const struct pipe_resource *template);
+
+
+
+/**
+ * Get the host surface handle for this buffer.
+ *
+ * This will ensure the host surface is updated, issuing DMAs as needed.
+ *
+ * NOTE: This may insert new commands in the context, so it *must* be called
+ * before reserving command buffer space. And, in order to insert commands
+ * it may need to call svga_context_flush().
+ */
+struct svga_winsys_surface *
+svga_buffer_handle(struct svga_context *svga,
+                   struct pipe_resource *buf);
+
+void
+svga_context_flush_buffers(struct svga_context *svga);
+
+struct svga_winsys_buffer *
+svga_winsys_buffer_create(struct svga_context *svga,
+                          unsigned alignment, 
+                          unsigned usage,
+                          unsigned size);
+
+#endif /* SVGA_BUFFER_H */
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
new file mode 100644
index 0000000000..3de5216a94
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -0,0 +1,640 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "os/os_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_resource_buffer.h"
+#include "svga_resource_buffer_upload.h"
+#include "svga_winsys.h"
+#include "svga_debug.h"
+
+
+/**
+ * Allocate a winsys_buffer (ie. DMA, aka GMR memory).
+ *
+ * When the first attempt fails, the context is flushed and the allocation
+ * is retried once.  Because of that flush this must not be called from any
+ * function involved in flushing, or it would recurse.
+ *
+ * Returns whatever the last attempt returned, i.e. possibly NULL.
+ */
+struct svga_winsys_buffer *
+svga_winsys_buffer_create( struct svga_context *svga,
+                           unsigned alignment, 
+                           unsigned usage,
+                           unsigned size )
+{
+   struct svga_screen *svgascreen = svga_screen(svga->pipe.screen);
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_winsys_buffer *first_try;
+
+   /* Just try */
+   first_try = sws->buffer_create(sws, alignment, usage, size);
+   if(first_try)
+      return first_try;
+
+   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "flushing screen to find %d bytes GMR\n", 
+            size); 
+
+   /* Flush all pending DMAs, then try once more */
+   svga_context_flush(svga, NULL);
+   return sws->buffer_create(sws, alignment, usage, size);
+}
+
+/* Release the DMA buffer behind sbuf; the buffer must not be mapped. */
+void
+svga_buffer_destroy_hw_storage(struct svga_screen *ss, struct svga_buffer *sbuf)
+{
+   struct svga_winsys_screen *sws = ss->sws;
+
+   assert(!sbuf->map.count);
+   assert(sbuf->hwbuf);
+   if(!sbuf->hwbuf)
+      return;
+   sws->buffer_destroy(sws, sbuf->hwbuf);
+   sbuf->hwbuf = NULL;
+}
+
+
+
+/**
+ * Make sure the buffer has DMA'ble storage, allocating it if needed.
+ *
+ * Called before mapping a buffer.  Not valid for user buffers.
+ */
+enum pipe_error
+svga_buffer_create_hw_storage(struct svga_screen *ss,
+                              struct svga_buffer *sbuf)
+{
+   struct svga_winsys_screen *sws = ss->sws;
+   const unsigned alignment = 16;
+   const unsigned usage = 0;
+
+   assert(!sbuf->user);
+
+   /* Nothing to do when the storage is already there. */
+   if(sbuf->hwbuf)
+      return PIPE_OK;
+
+   sbuf->hwbuf = sws->buffer_create(sws, alignment, usage, sbuf->b.b.width0);
+   if(!sbuf->hwbuf)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   assert(!sbuf->dma.pending);
+   return PIPE_OK;
+}
+
+
+
+enum pipe_error
+svga_buffer_create_host_surface(struct svga_screen *ss,
+                                struct svga_buffer *sbuf)
+{
+   const unsigned bind = sbuf->b.b.bind;
+
+   /* The host surface is created at most once per buffer. */
+   if(sbuf->handle)
+      return PIPE_OK;
+
+   /* Fill in the cache key describing a one-dimensional buffer surface. */
+   sbuf->key.format = SVGA3D_BUFFER;
+   sbuf->key.flags = 0;
+   if(bind & PIPE_BIND_VERTEX_BUFFER)
+      sbuf->key.flags |= SVGA3D_SURFACE_HINT_VERTEXBUFFER;
+   if(bind & PIPE_BIND_INDEX_BUFFER)
+      sbuf->key.flags |= SVGA3D_SURFACE_HINT_INDEXBUFFER;
+
+   sbuf->key.size.width = sbuf->b.b.width0;
+   sbuf->key.size.height = 1;
+   sbuf->key.size.depth = 1;
+   sbuf->key.numFaces = 1;
+   sbuf->key.numMipLevels = 1;
+   sbuf->key.cachable = 1;
+
+   SVGA_DBG(DEBUG_DMA, "surface_create for buffer sz %d\n", sbuf->b.b.width0);
+
+   sbuf->handle = svga_screen_surface_create(ss, &sbuf->key);
+   if(!sbuf->handle)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* svga_screen_surface_create might have passed a recycled host buffer,
+    * so the very first write to it must always carry the discard flag. */
+   sbuf->dma.flags.discard = TRUE;
+
+   SVGA_DBG(DEBUG_DMA, "   --> got sid %p sz %d (buffer)\n", sbuf->handle, sbuf->b.b.width0);
+   return PIPE_OK;
+}
+
+/* Hand the host surface, if there is one, to svga_screen_surface_destroy(). */
+void
+svga_buffer_destroy_host_surface(struct svga_screen *ss,
+                                 struct svga_buffer *sbuf)
+{
+   if(!sbuf->handle)
+      return;
+   SVGA_DBG(DEBUG_DMA, " ungrab sid %p sz %d\n", sbuf->handle, sbuf->b.b.width0);
+   svga_screen_surface_destroy(ss, &sbuf->key, &sbuf->handle);
+}
+
+
+/**
+ * Variant of SVGA3D_BufferDMA which leaves the copy boxes temporarily blank.
+ */
+static enum pipe_error
+svga_buffer_upload_command(struct svga_context *svga,
+                           struct svga_buffer *sbuf)
+{
+   struct svga_winsys_context *swc = svga->swc;
+   struct svga_winsys_buffer *guest = sbuf->hwbuf;
+   struct svga_winsys_surface *host = sbuf->handle;
+   SVGA3dTransferType transfer = SVGA3D_WRITE_HOST_VRAM;
+   SVGA3dCmdSurfaceDMA *cmd;
+   uint32 numBoxes = sbuf->map.num_ranges;
+   SVGA3dCopyBox *boxes;
+   SVGA3dCmdSurfaceDMASuffix *pSuffix;
+   unsigned region_flags;
+   unsigned surface_flags;
+   struct pipe_resource *dummy;
+
+   if(transfer == SVGA3D_WRITE_HOST_VRAM) {
+      region_flags = SVGA_RELOC_READ;
+      surface_flags = SVGA_RELOC_WRITE;
+   }
+   else if(transfer == SVGA3D_READ_HOST_VRAM) {
+      region_flags = SVGA_RELOC_WRITE;
+      surface_flags = SVGA_RELOC_READ;
+   }
+   else {
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   assert(numBoxes);
+
+   cmd = SVGA3D_FIFOReserve(swc,
+                            SVGA_3D_CMD_SURFACE_DMA,
+                            sizeof *cmd + numBoxes * sizeof *boxes + sizeof *pSuffix,
+                            2);
+   if(!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->region_relocation(swc, &cmd->guest.ptr, guest, 0, region_flags);
+   cmd->guest.pitch = 0;
+
+   swc->surface_relocation(swc, &cmd->host.sid, host, surface_flags);
+   cmd->host.face = 0;
+   cmd->host.mipmap = 0;
+
+   cmd->transfer = transfer;
+   /* The copy boxes follow the fixed header; they are filled in at flush time. */
+   sbuf->dma.boxes = (SVGA3dCopyBox *)&cmd[1];
+   sbuf->dma.svga = svga;
+
+   /* Increment reference count; svga_buffer_upload_flush() drops it again */
+   dummy = NULL;
+   pipe_resource_reference(&dummy, &sbuf->b.b);
+   /* The suffix sits right after the (still blank) copy boxes. */
+   pSuffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t*)cmd + sizeof *cmd + numBoxes * sizeof *boxes);
+   pSuffix->suffixSize = sizeof *pSuffix;
+   pSuffix->maximumOffset = sbuf->b.b.width0;
+   pSuffix->flags = sbuf->dma.flags;
+
+   SVGA_FIFOCommitAll(swc);
+
+   sbuf->dma.flags.discard = FALSE;
+
+   return PIPE_OK;
+}
+
+
+/**
+ * Patch up the upload DMA command reserved by svga_buffer_upload_command
+ * with the final ranges, then retire the pending upload.  The reference
+ * taken by svga_buffer_upload_command is dropped; if it was the last one
+ * the buffer is destroyed, so sbuf may be dangling when this returns.
+ */
+static void
+svga_buffer_upload_flush(struct svga_context *svga,
+                         struct svga_buffer *sbuf)
+{
+   SVGA3dCopyBox *boxes;
+   struct pipe_resource *dummy;
+   unsigned i;
+
+   assert(sbuf->handle); 
+   assert(sbuf->hwbuf);
+   assert(sbuf->map.num_ranges);
+   assert(sbuf->dma.svga == svga);
+   assert(sbuf->dma.boxes);
+
+   /* Patch the DMA command with the final copy boxes, one per dirty range. */
+   SVGA_DBG(DEBUG_DMA, "dma to sid %p\n", sbuf->handle);
+
+   boxes = sbuf->dma.boxes;
+   for(i = 0; i < sbuf->map.num_ranges; ++i) {
+      SVGA_DBG(DEBUG_DMA, "  bytes %u - %u\n",
+               sbuf->map.ranges[i].start, sbuf->map.ranges[i].end);
+
+      boxes[i].x = sbuf->map.ranges[i].start;
+      boxes[i].y = 0;
+      boxes[i].z = 0;
+      boxes[i].w = sbuf->map.ranges[i].end - sbuf->map.ranges[i].start;
+      boxes[i].h = 1;
+      boxes[i].d = 1;
+      boxes[i].srcx = sbuf->map.ranges[i].start;
+      boxes[i].srcy = 0;
+      boxes[i].srcz = 0;
+   }
+
+   sbuf->map.num_ranges = 0;
+
+   /* Unlink from the context's list of buffers with pending uploads. */
+   assert(sbuf->head.prev && sbuf->head.next);
+   LIST_DEL(&sbuf->head);
+#ifdef DEBUG
+   sbuf->head.next = sbuf->head.prev = NULL; 
+#endif
+   sbuf->dma.pending = FALSE;
+   sbuf->dma.svga = NULL;
+   sbuf->dma.boxes = NULL;
+
+   /* Drop our reference; this destroys the buffer if it was the last one. */
+   dummy = &sbuf->b.b;
+   pipe_resource_reference(&dummy, NULL);
+}
+
+
+
+/**
+ * Note a dirty range.
+ *
+ * This function only notes the range down. It doesn't actually emit a DMA
+ * upload command. That only happens when a context tries to refer to this
+ * buffer, and the DMA upload command is added to that context's command buffer.
+ * 
+ * We try to lump as many contiguous DMA transfers together as possible.
+ */
+void
+svga_buffer_add_range(struct svga_buffer *sbuf,
+                      unsigned start,
+                      unsigned end)
+{
+   unsigned i;
+   unsigned nearest_range;
+   unsigned nearest_dist;
+
+   assert(end > start);
+   
+   /*
+    * Seed the search with "no candidate yet" so that the scan below really
+    * finds the nearest range, also when all range slots are in use.  The
+    * seed index is never used as-is: a full table always yields a candidate.
+    */
+   nearest_dist = ~0;
+   nearest_range = MIN2(sbuf->map.num_ranges, SVGA_BUFFER_MAX_RANGES - 1);
+
+   /*
+    * Try to grow one of the ranges.
+    *
+    * Note that it is not this function's task to care about overlapping
+    * ranges, as the GMR was already given so it is too late to do anything.
+    * Situations where overlapping ranges may pose a problem should be
+    * detected via pipe_context::is_resource_referenced and the context that
+    * refers to the buffer should be flushed.
+    */
+
+   for(i = 0; i < sbuf->map.num_ranges; ++i) {
+      int left_dist;
+      int right_dist;
+      int dist;
+
+      left_dist = start - sbuf->map.ranges[i].end;
+      right_dist = sbuf->map.ranges[i].start - end;
+      dist = MAX2(left_dist, right_dist);
+
+      if (dist <= 0) {
+         /*
+          * Ranges are contiguous or overlapping -- extend this one and return.
+          */
+
+         sbuf->map.ranges[i].start = MIN2(sbuf->map.ranges[i].start, start);
+         sbuf->map.ranges[i].end   = MAX2(sbuf->map.ranges[i].end,   end);
+         return;
+      }
+      else {
+         /*
+          * Discontiguous ranges -- keep track of the nearest range.
+          */
+
+         if (dist < nearest_dist) {
+            nearest_range = i;
+            nearest_dist = dist;
+         }
+      }
+   }
+
+   /*
+    * We cannot add a new range to an existing DMA command, so patch-up the
+    * pending DMA upload and start clean.
+    */
+
+   if(sbuf->dma.pending)
+      svga_buffer_upload_flush(sbuf->dma.svga, sbuf);
+
+   assert(!sbuf->dma.pending);
+   assert(!sbuf->dma.svga);
+   assert(!sbuf->dma.boxes);
+
+   if (sbuf->map.num_ranges < SVGA_BUFFER_MAX_RANGES) {
+      /*
+       * Add a new range.
+       */
+
+      sbuf->map.ranges[sbuf->map.num_ranges].start = start;
+      sbuf->map.ranges[sbuf->map.num_ranges].end = end;
+      ++sbuf->map.num_ranges;
+   } else {
+      /*
+       * Everything else failed, so just extend the nearest range.
+       *
+       * It is OK to do this because we always keep a local copy of the
+       * host buffer data, for SW TNL, and the host never modifies the buffer.
+       */
+
+      assert(nearest_range < SVGA_BUFFER_MAX_RANGES);
+      assert(nearest_range < sbuf->map.num_ranges);
+      sbuf->map.ranges[nearest_range].start = MIN2(sbuf->map.ranges[nearest_range].start, start);
+      sbuf->map.ranges[nearest_range].end   = MAX2(sbuf->map.ranges[nearest_range].end,   end);
+   }
+}
+
+
+
+/**
+ * Copy the contents of the malloc buffer to a newly created hardware buffer.
+ */
+static INLINE enum pipe_error
+svga_buffer_update_hw(struct svga_screen *ss, struct svga_buffer *sbuf)
+{
+   assert(!sbuf->user);
+   if(!sbuf->hwbuf) {
+      enum pipe_error ret;
+      void *map;
+      
+      assert(sbuf->swbuf);
+      if(!sbuf->swbuf)
+         return PIPE_ERROR;
+      
+      ret = svga_buffer_create_hw_storage(ss, sbuf);
+      if(ret != PIPE_OK)
+         return ret;
+
+      pipe_mutex_lock(ss->swc_mutex);
+      map = ss->sws->buffer_map(ss->sws, sbuf->hwbuf, PIPE_TRANSFER_WRITE);
+      assert(map);
+      if(!map) {
+	 pipe_mutex_unlock(ss->swc_mutex);
+         svga_buffer_destroy_hw_storage(ss, sbuf);
+         return PIPE_ERROR;
+      }
+
+      memcpy(map, sbuf->swbuf, sbuf->b.b.width0);
+      ss->sws->buffer_unmap(ss->sws, sbuf->hwbuf);
+
+      /* The malloc copy is dropped, but only while nobody has it mapped */
+      assert(!sbuf->map.count);
+      if(!sbuf->map.count) {
+         if(sbuf->user) /* NOTE(review): contradicts assert(!sbuf->user) above */
+            sbuf->user = FALSE;
+         else
+            align_free(sbuf->swbuf);
+         sbuf->swbuf = NULL;
+      }
+      
+      pipe_mutex_unlock(ss->swc_mutex);
+   }
+   
+   return PIPE_OK;
+}
+
+
+/**
+ * Upload the dirty ranges to the host piece by piece, staging each piece
+ * of swbuf (taken at its own offset) in a temporary DMA buffer.
+ * Used when the buffer is too big to fit in the GMR aperture.
+ */
+static INLINE enum pipe_error
+svga_buffer_upload_piecewise(struct svga_screen *ss,
+                             struct svga_context *svga,
+                             struct svga_buffer *sbuf)
+{
+   struct svga_winsys_screen *sws = ss->sws;
+   const unsigned alignment = sizeof(void *);
+   const unsigned usage = 0;
+   unsigned i;
+
+   assert(sbuf->map.num_ranges);
+   assert(!sbuf->dma.pending);
+
+   SVGA_DBG(DEBUG_DMA, "dma to sid %p\n", sbuf->handle);
+
+   for (i = 0; i < sbuf->map.num_ranges; ++i) {
+      struct svga_buffer_range *range = &sbuf->map.ranges[i];
+      unsigned offset = range->start;
+      unsigned size = range->end - range->start;
+
+      while (offset < range->end) {
+         struct svga_winsys_buffer *hwbuf;
+         uint8_t *map;
+         enum pipe_error ret;
+
+         if (offset + size > range->end)
+            size = range->end - offset;
+
+         hwbuf = sws->buffer_create(sws, alignment, usage, size);
+         while (!hwbuf) {
+            size /= 2;
+            if (!size)
+               return PIPE_ERROR_OUT_OF_MEMORY;
+            hwbuf = sws->buffer_create(sws, alignment, usage, size);
+         }
+
+         SVGA_DBG(DEBUG_DMA, "  bytes %u - %u\n",
+                  offset, offset + size);
+
+         map = sws->buffer_map(sws, hwbuf,
+                               PIPE_TRANSFER_WRITE |
+                               PIPE_TRANSFER_DISCARD);
+         assert(map);
+         if (map) {
+            memcpy(map, (const uint8_t *) sbuf->swbuf + offset, size);
+            sws->buffer_unmap(sws, hwbuf);
+         }
+
+         ret = SVGA3D_BufferDMA(svga->swc,
+                                hwbuf, sbuf->handle,
+                                SVGA3D_WRITE_HOST_VRAM,
+                                size, 0, offset, sbuf->dma.flags);
+         if(ret != PIPE_OK) {
+            svga_context_flush(svga, NULL);
+            ret =  SVGA3D_BufferDMA(svga->swc,
+                                    hwbuf, sbuf->handle,
+                                    SVGA3D_WRITE_HOST_VRAM,
+                                    size, 0, offset, sbuf->dma.flags);
+            assert(ret == PIPE_OK);
+         }
+
+         sbuf->dma.flags.discard = FALSE;
+
+         sws->buffer_destroy(sws, hwbuf);
+
+         offset += size;
+      }
+   }
+
+   sbuf->map.num_ranges = 0;
+
+   return PIPE_OK;
+}
+
+
+
+
+/* Get (or create/upload) the winsys surface handle so that we can refer to
+ * this buffer in fifo commands.  NULL if buf is NULL or creation fails.
+ */
+struct svga_winsys_surface *
+svga_buffer_handle(struct svga_context *svga,
+                   struct pipe_resource *buf)
+{
+   struct pipe_screen *screen = svga->pipe.screen;
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_buffer *sbuf;
+   enum pipe_error ret;
+
+   if(!buf)
+      return NULL;
+
+   sbuf = svga_buffer(buf);
+   
+   assert(!sbuf->map.count);
+   assert(!sbuf->user);
+   
+   if(!sbuf->handle) {
+      ret = svga_buffer_create_host_surface(ss, sbuf);
+      if(ret != PIPE_OK)
+	 return NULL;
+   }
+
+   assert(sbuf->handle);
+
+   if (sbuf->map.num_ranges) {
+      if (!sbuf->dma.pending) {
+         /*
+          * No pending DMA upload yet, so insert a DMA upload command now.
+          */
+
+         /*
+          * Migrate the data from swbuf -> hwbuf if necessary.
+          */
+         ret = svga_buffer_update_hw(ss, sbuf);
+         if (ret == PIPE_OK) {
+            /*
+             * Queue a dma command.
+             */
+
+            ret = svga_buffer_upload_command(svga, sbuf);
+            if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+               svga_context_flush(svga, NULL);
+               ret = svga_buffer_upload_command(svga, sbuf);
+               assert(ret == PIPE_OK);
+            }
+            if (ret == PIPE_OK) {
+               sbuf->dma.pending = TRUE;
+               assert(!sbuf->head.prev && !sbuf->head.next);
+               LIST_ADDTAIL(&sbuf->head, &svga->dirty_buffers);
+            }
+         }
+         else if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+            /*
+             * The buffer is too big to fit in the GMR aperture, so break it in
+             * smaller pieces.
+             */
+            ret = svga_buffer_upload_piecewise(ss, svga, sbuf);
+         }
+
+         if (ret != PIPE_OK) {
+            /*
+             * Something unexpected happened above. There is very little that
+             * we can do other than proceeding while ignoring the dirty ranges.
+             */
+            assert(0);
+            sbuf->map.num_ranges = 0;
+         }
+      }
+      else {
+         /*
+          * There is a pending dma already. Make sure it is from this context.
+          */
+         assert(sbuf->dma.svga == svga);
+      }
+   }
+
+   assert(!sbuf->map.num_ranges || sbuf->dma.pending);
+
+   return sbuf->handle;
+}
+
+
+
+void
+svga_context_flush_buffers(struct svga_context *svga)
+{
+   struct list_head *const dirty = &svga->dirty_buffers;
+   struct list_head *node, *following;
+
+   /*
+    * Flushing unlinks the buffer from the list, so the successor is
+    * remembered before each buffer is handled.
+    */
+   for(node = dirty->next; node != dirty; node = following) {
+      struct svga_buffer *sbuf = LIST_ENTRY(struct svga_buffer, node, head);
+
+      following = node->next;
+      assert(p_atomic_read(&sbuf->b.b.reference.count) != 0);
+      assert(sbuf->dma.pending);
+
+      svga_buffer_upload_flush(svga, sbuf);
+   }
+}
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.h b/src/gallium/drivers/svga/svga_resource_buffer_upload.h
new file mode 100644
index 0000000000..11df306526
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.h
@@ -0,0 +1,54 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_BUFFER_UPLOAD_H
+#define SVGA_BUFFER_UPLOAD_H
+
+
+void
+svga_buffer_add_range(struct svga_buffer *sbuf,
+                      unsigned start,
+                      unsigned end);
+
+enum pipe_error
+svga_buffer_create_hw_storage(struct svga_screen *ss,
+                              struct svga_buffer *sbuf);
+
+void
+svga_buffer_destroy_hw_storage(struct svga_screen *ss,
+			       struct svga_buffer *sbuf);
+
+enum pipe_error
+svga_buffer_create_host_surface(struct svga_screen *ss,
+                                struct svga_buffer *sbuf);
+
+void
+svga_buffer_destroy_host_surface(struct svga_screen *ss,
+                                 struct svga_buffer *sbuf);
+
+
+
+
+#endif /* SVGA_BUFFER_UPLOAD_H */
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
new file mode 100644
index 0000000000..ff83c750aa
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -0,0 +1,634 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "os/os_thread.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_resource_texture.h"
+#include "svga_resource_buffer.h"
+#include "svga_sampler_view.h"
+#include "svga_winsys.h"
+#include "svga_debug.h"
+
+
+/* XXX: This isn't a real hardware flag, but just a hack for kernel to
+ * know about primary surfaces. Find a better way to accomplish this.
+ */
+#define SVGA3D_SURFACE_HINT_SCANOUT (1 << 9)
+
+
+/**
+ * Report whether a texture may still be referenced by unflushed commands.
+ *
+ * The answer comes from the winsys: a texture with no host surface, or whose
+ * surface the winsys reports as flushed, is unreferenced. The face and level
+ * arguments are not consulted.
+ */
+static unsigned int
+svga_texture_is_referenced( struct pipe_context *pipe,
+			    struct pipe_resource *texture,
+			    unsigned face, unsigned level)
+{
+   struct svga_texture *stex = svga_texture(texture);
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+
+   /* The screen does not cache texture writes. */
+   if (!stex->handle)
+      return PIPE_UNREFERENCED;
+
+   if (svgascreen->sws->surface_is_flushed(svgascreen->sws, stex->handle))
+      return PIPE_UNREFERENCED;
+
+   /*
+    * sws->surface_is_flushed() cannot tell read references from write
+    * references, so a pending reference is reported as both.
+    */
+   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+}
+
+
+
+/*
+ * Helper function and arrays
+ */
+
+/**
+ * Map a gallium pipe_format onto the SVGA3D surface format used to store it.
+ *
+ * \param format  gallium format to translate
+ * \return the matching SVGA3D format, or SVGA3D_FORMAT_INVALID when the
+ *         format has no case in the switch below
+ */
+SVGA3dSurfaceFormat
+svga_translate_format(enum pipe_format format)
+{
+   switch(format) {
+   
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return SVGA3D_A8R8G8B8;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return SVGA3D_X8R8G8B8;
+
+      /* Required for GL2.1:
+       * (the sRGB variant shares the surface format of B8G8R8A8_UNORM)
+       */
+   case PIPE_FORMAT_B8G8R8A8_SRGB:
+      return SVGA3D_A8R8G8B8;
+
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      return SVGA3D_R5G6B5;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      return SVGA3D_A1R5G5B5;
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      return SVGA3D_A4R4G4B4;
+
+      
+   /* XXX: Doesn't seem to work properly.
+   case PIPE_FORMAT_Z32_UNORM:
+      return SVGA3D_Z_D32;
+    */
+   case PIPE_FORMAT_Z16_UNORM:
+      return SVGA3D_Z_D16;
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      return SVGA3D_Z_D24S8;
+   case PIPE_FORMAT_X8Z24_UNORM:
+      return SVGA3D_Z_D24X8;
+
+   case PIPE_FORMAT_A8_UNORM:
+      return SVGA3D_ALPHA8;
+   case PIPE_FORMAT_L8_UNORM:
+      return SVGA3D_LUMINANCE8;
+
+   /* Both DXT1 variants are stored as the single SVGA3D_DXT1 format. */
+   case PIPE_FORMAT_DXT1_RGB:
+   case PIPE_FORMAT_DXT1_RGBA:
+      return SVGA3D_DXT1;
+   case PIPE_FORMAT_DXT3_RGBA:
+      return SVGA3D_DXT3;
+   case PIPE_FORMAT_DXT5_RGBA:
+      return SVGA3D_DXT5;
+
+   default:
+      return SVGA3D_FORMAT_INVALID;
+   }
+}
+
+
+/**
+ * Pick the SVGA3D surface format used when a pipe_format has to be held in
+ * a renderable surface.
+ *
+ * \param format  gallium format to translate
+ * \return the SVGA3D format, or SVGA3D_FORMAT_INVALID when the format is
+ *         not in either list below
+ */
+SVGA3dSurfaceFormat
+svga_translate_format_render(enum pipe_format format)
+{
+   switch(format) { 
+   /*
+    * These use the same surface format as plain storage.
+    * NOTE(review): PIPE_FORMAT_Z32_UNORM is listed here, but its case in
+    * svga_translate_format() is commented out, so it still comes back as
+    * SVGA3D_FORMAT_INVALID.
+    */
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z32_UNORM:
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_L8_UNORM:
+      return svga_translate_format(format);
+
+#if 1
+   /* For on host conversion */
+   /* Compressed formats map to an uncompressed 32-bit format: X8R8G8B8 for
+    * DXT1_RGB, A8R8G8B8 for the variants that carry alpha. */
+   case PIPE_FORMAT_DXT1_RGB:
+      return SVGA3D_X8R8G8B8;
+   case PIPE_FORMAT_DXT1_RGBA:
+   case PIPE_FORMAT_DXT3_RGBA:
+   case PIPE_FORMAT_DXT5_RGBA:
+      return SVGA3D_A8R8G8B8;
+#endif
+
+   default:
+      return SVGA3D_FORMAT_INVALID;
+   }
+}
+
+
+/**
+ * Emit one SurfaceDMA command covering a horizontal band of a transfer.
+ *
+ * \param svga      context whose command buffer (svga->swc) gets the command
+ * \param st        transfer; its box supplies the x/z origin and the width
+ * \param transfer  DMA direction
+ * \param y         value stored in box.y (first row of the band)
+ * \param h         value stored in box.h (height of the band)
+ * \param srcy      value stored in box.srcy
+ *
+ * If the command cannot be emitted, the command buffer is flushed and the
+ * command is retried once. A second failure is only caught by the assert.
+ */
+static INLINE void
+svga_transfer_dma_band(struct svga_context *svga,
+                       struct svga_transfer *st,
+                       SVGA3dTransferType transfer,
+                       unsigned y, unsigned h, unsigned srcy)
+{
+   struct svga_texture *texture = svga_texture(st->base.resource); 
+   SVGA3dCopyBox box;
+   enum pipe_error ret;
+   
+   /* The last argument is bits per pixel: block size in bits divided by the
+    * number of pixels in a block. */
+   SVGA_DBG(DEBUG_DMA, "dma %s sid %p, face %u, (%u, %u, %u) - (%u, %u, %u), %ubpp\n",
+                transfer == SVGA3D_WRITE_HOST_VRAM ? "to" : "from", 
+                texture->handle,
+                st->base.sr.face,
+                st->base.box.x,
+                y,
+                st->base.box.z,
+                st->base.box.x + st->base.box.width,
+                y + h,
+                st->base.box.z + 1,
+                util_format_get_blocksize(texture->b.b.format) * 8 /
+                (util_format_get_blockwidth(texture->b.b.format)*util_format_get_blockheight(texture->b.b.format)));
+   
+   /* One slice deep; srcx and srcz are always zero. */
+   box.x = st->base.box.x;
+   box.y = y;
+   box.z = st->base.box.z;
+   box.w = st->base.box.width;
+   box.h = h;
+   box.d = 1;
+   box.srcx = 0;
+   box.srcy = srcy;
+   box.srcz = 0;
+
+   ret = SVGA3D_SurfaceDMA(svga->swc, st, transfer, &box, 1);
+   if(ret != PIPE_OK) {
+      /* Flush the command buffer and try exactly once more. */
+      svga->swc->flush(svga->swc, NULL);
+      ret = SVGA3D_SurfaceDMA(svga->swc, st, transfer, &box, 1);
+      assert(ret == PIPE_OK);
+   }
+}
+
+
+/**
+ * Move the texels covered by a transfer between its staging storage and the
+ * host surface.
+ *
+ * \param svga      context whose command buffer carries the DMA commands
+ * \param st        transfer describing the box and the staging buffers
+ * \param transfer  SVGA3D_WRITE_HOST_VRAM to upload the staging data,
+ *                  SVGA3D_READ_HOST_VRAM to read the surface back
+ *
+ * When st->swbuf is NULL the hardware buffer covers the whole box and a
+ * single DMA is issued. Otherwise the box is processed in bands of
+ * st->hw_nblocksy block rows (fewer for the last band), with st->hwbuf
+ * reused as a bounce buffer for every band and st->swbuf holding the whole
+ * image.
+ *
+ * Read-backs are synchronous: the context is flushed and the resulting
+ * fence is waited on before the data is used. Every fence obtained from
+ * svga_context_flush() is released again before it can be overwritten.
+ */
+static INLINE void
+svga_transfer_dma(struct svga_context *svga,
+                  struct svga_transfer *st,
+                  SVGA3dTransferType transfer)
+{
+   struct svga_texture *texture = svga_texture(st->base.resource); 
+   struct svga_screen *screen = svga_screen(texture->b.b.screen);
+   struct svga_winsys_screen *sws = screen->sws;
+   struct pipe_fence_handle *fence = NULL;
+   
+   if (transfer == SVGA3D_READ_HOST_VRAM) {
+      SVGA_DBG(DEBUG_PERF, "%s: readback transfer\n", __FUNCTION__);
+   }
+
+
+   if(!st->swbuf) {
+      /* Do the DMA transfer in a single go */
+      
+      svga_transfer_dma_band(svga, st, transfer, st->base.box.y, st->base.box.height, 0);
+
+      if(transfer == SVGA3D_READ_HOST_VRAM) {
+         svga_context_flush(svga, &fence);
+         sws->fence_finish(sws, fence, 0);
+         sws->fence_reference(sws, &fence, NULL);
+      }
+   }
+   else {
+      unsigned y, h, srcy;
+      unsigned blockheight = util_format_get_blockheight(st->base.resource->format);
+      h = st->hw_nblocksy * blockheight;
+      /* The bounce buffer is reused from its start for every band. */
+      srcy = 0;
+      for(y = 0; y < st->base.box.height; y += h) {
+         unsigned offset, length;
+         void *hw, *sw;
+
+         /* Clamp the last band to what is left of the box. */
+         if (y + h > st->base.box.height)
+            h = st->base.box.height - y;
+
+         /* Transfer band must be aligned to pixel block boundaries */
+         assert(y % blockheight == 0);
+         assert(h % blockheight == 0);
+         
+         /* Byte offset/size of this band within the malloc'ed image. */
+         offset = y * st->base.stride / blockheight;
+         length = h * st->base.stride / blockheight;
+
+         sw = (uint8_t *)st->swbuf + offset;
+         
+         if(transfer == SVGA3D_WRITE_HOST_VRAM) {
+            /* Wait for the previous DMAs to complete */
+            /* TODO: keep one DMA (at half the size) in the background */
+            if(y) {
+               svga_context_flush(svga, &fence);
+               sws->fence_finish(sws, fence, 0);
+               sws->fence_reference(sws, &fence, NULL);
+            }
+
+            hw = sws->buffer_map(sws, st->hwbuf, PIPE_TRANSFER_WRITE);
+            assert(hw);
+            if(hw) {
+               memcpy(hw, sw, length);
+               sws->buffer_unmap(sws, st->hwbuf);
+            }
+         }
+         
+         svga_transfer_dma_band(svga, st, transfer, y, h, srcy);
+         
+         if(transfer == SVGA3D_READ_HOST_VRAM) {
+            svga_context_flush(svga, &fence);
+            sws->fence_finish(sws, fence, 0);
+            /* Release the fence as the other flush sites do; otherwise each
+             * band would leave one fence reference behind. */
+            sws->fence_reference(sws, &fence, NULL);
+
+            hw = sws->buffer_map(sws, st->hwbuf, PIPE_TRANSFER_READ);
+            assert(hw);
+            if(hw) {
+               memcpy(sw, hw, length);
+               sws->buffer_unmap(sws, st->hwbuf);
+            }
+         }
+      }
+   }
+}
+
+
+
+
+
+/**
+ * Export the host surface behind a texture as a winsys handle.
+ *
+ * The stride handed to the winsys is the byte size of one row of blocks at
+ * the texture's base width. The screen argument is not used; the winsys is
+ * looked up through the texture's own screen pointer.
+ */
+static boolean 
+svga_texture_get_handle(struct pipe_screen *screen,
+                               struct pipe_resource *texture,
+                               struct winsys_handle *whandle)
+{
+   struct svga_winsys_screen *sws = svga_winsys_screen(texture->screen);
+   struct svga_texture *stex = svga_texture(texture);
+   unsigned blocks_per_row;
+   unsigned bytes_per_block;
+
+   /* The surface is expected to be non-cachable already; the flag is
+    * cleared regardless. */
+   assert(stex->key.cachable == 0);
+   stex->key.cachable = 0;
+
+   blocks_per_row = util_format_get_nblocksx(texture->format, texture->width0);
+   bytes_per_block = util_format_get_blocksize(texture->format);
+
+   return sws->surface_get_handle(sws, stex->handle,
+                                  blocks_per_row * bytes_per_block,
+                                  whandle);
+}
+
+
+/**
+ * Destroy a texture: bump the screen's texture timestamp, drop the cached
+ * sampler view, hand the host surface back to the screen and free the
+ * texture structure.
+ */
+static void
+svga_texture_destroy(struct pipe_screen *screen,
+		     struct pipe_resource *pt)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_texture *stex = (struct svga_texture *)pt;
+
+   svgascreen->texture_timestamp++;
+
+   /* Let go of the view cached on this texture, if any. */
+   svga_sampler_view_reference(&stex->cached_view, NULL);
+
+   SVGA_DBG(DEBUG_DMA, "unref sid %p (texture)\n", stex->handle);
+   svga_screen_surface_destroy(svgascreen, &stex->key, &stex->handle);
+
+   FREE(stex);
+}
+
+
+
+
+
+
+
+/**
+ * Create a transfer object for a box of one texture subresource.
+ *
+ * XXX: Still implementing this as if it was a screen function, but
+ * can now modify it to queue transfers on the context.
+ *
+ * \param pipe     context the transfer is created on
+ * \param texture  texture to transfer to/from; a reference is held by the
+ *                 returned transfer
+ * \param sr       face/level being accessed
+ * \param usage    PIPE_TRANSFER_x flags; PIPE_TRANSFER_MAP_DIRECTLY is
+ *                 rejected, PIPE_TRANSFER_READ triggers an immediate readback
+ * \param box      region of the subresource being accessed
+ * \return the new transfer, or NULL on failure
+ *
+ * A hardware buffer big enough for the whole box is tried first. If that
+ * fails, the number of block rows is halved until an allocation succeeds,
+ * and a malloc'ed buffer holds the full image while the hardware buffer is
+ * used band by band.
+ *
+ * On every failure path nothing is leaked: the hardware buffer, the texture
+ * reference and the transfer structure are all released.
+ */
+static struct pipe_transfer *
+svga_texture_get_transfer(struct pipe_context *pipe,
+			  struct pipe_resource *texture,
+			  struct pipe_subresource sr,
+			  unsigned usage,
+			  const struct pipe_box *box)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_screen *ss = svga_screen(pipe->screen);
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_transfer *st;
+   unsigned nblocksx = util_format_get_nblocksx(texture->format, box->width);
+   unsigned nblocksy = util_format_get_nblocksy(texture->format, box->height);
+
+   /* We can't map texture storage directly */
+   if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
+      return NULL;
+
+   st = CALLOC_STRUCT(svga_transfer);
+   if (!st)
+      return NULL;
+   
+   pipe_resource_reference(&st->base.resource, texture);
+   st->base.sr = sr;
+   st->base.usage = usage;
+   st->base.box = *box;
+   st->base.stride = nblocksx*util_format_get_blocksize(texture->format);
+   st->base.slice_stride = 0;
+
+   st->hw_nblocksy = nblocksy;
+   
+   st->hwbuf = svga_winsys_buffer_create(svga,
+                                         1, 
+                                         0,
+                                         st->hw_nblocksy*st->base.stride);
+   /* Keep halving the band height until an allocation succeeds or the
+    * height reaches zero. */
+   while(!st->hwbuf && (st->hw_nblocksy /= 2)) {
+      st->hwbuf = svga_winsys_buffer_create(svga,
+                                            1, 
+                                            0,
+                                            st->hw_nblocksy*st->base.stride);
+   }
+
+   if(!st->hwbuf)
+      goto no_hwbuf;
+
+   if(st->hw_nblocksy < nblocksy) {
+      /* We couldn't allocate a hardware buffer big enough for the transfer, 
+       * so allocate regular malloc memory instead */
+      debug_printf("%s: failed to allocate %u KB of DMA, splitting into %u x %u KB DMA transfers\n",
+                   __FUNCTION__,
+                   (nblocksy*st->base.stride + 1023)/1024,
+                   (nblocksy + st->hw_nblocksy - 1)/st->hw_nblocksy,
+                   (st->hw_nblocksy*st->base.stride + 1023)/1024);
+      st->swbuf = MALLOC(nblocksy*st->base.stride);
+      if(!st->swbuf)
+         goto no_swbuf;
+   }
+   
+   if (usage & PIPE_TRANSFER_READ)
+      svga_transfer_dma(svga, st, SVGA3D_READ_HOST_VRAM);
+
+   return &st->base;
+
+no_swbuf:
+   sws->buffer_destroy(sws, st->hwbuf);
+no_hwbuf:
+   /* Give back the texture reference taken above before freeing st. */
+   pipe_resource_reference(&st->base.resource, NULL);
+   FREE(st);
+   return NULL;
+}
+
+
+/**
+ * Return a CPU pointer to the staging memory of a texture transfer.
+ *
+ * XXX: Still written as though it were a screen function; it could now
+ * queue transfers on the context instead.
+ */
+static void *
+svga_texture_transfer_map( struct pipe_context *pipe,
+			   struct pipe_transfer *transfer )
+{
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_winsys_screen *winsys = svgascreen->sws;
+   struct svga_transfer *xfer = svga_transfer(transfer);
+
+   /* Banded transfers expose the malloc'ed copy of the image. */
+   if (xfer->swbuf)
+      return xfer->swbuf;
+
+   /* Otherwise map the hardware buffer. Read transfers were already waited
+    * for when svga_transfer_dma was called. */
+   return winsys->buffer_map(winsys, xfer->hwbuf, transfer->usage);
+}
+
+
+/**
+ * Undo svga_texture_transfer_map().
+ *
+ * XXX: Still written as though it were a screen function; it could now
+ * queue transfers on the context instead.
+ */
+static void
+svga_texture_transfer_unmap(struct pipe_context *pipe,
+			    struct pipe_transfer *transfer)
+{
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_winsys_screen *winsys = svgascreen->sws;
+   struct svga_transfer *xfer = svga_transfer(transfer);
+
+   /* The malloc'ed buffer was handed out directly; nothing to unmap. */
+   if (xfer->swbuf)
+      return;
+
+   winsys->buffer_unmap(winsys, xfer->hwbuf);
+}
+
+
+/**
+ * Finish and free a texture transfer.
+ *
+ * For write transfers the staging data is DMA'ed to the host surface and
+ * the texture's age bookkeeping is advanced, so that views of the written
+ * level are seen as stale. The staging buffers, the texture reference and
+ * the transfer itself are then released.
+ */
+static void
+svga_texture_transfer_destroy(struct pipe_context *pipe,
+			      struct pipe_transfer *transfer)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_texture *stex = svga_texture(transfer->resource);
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_winsys_screen *winsys = svgascreen->sws;
+   struct svga_transfer *xfer = svga_transfer(transfer);
+
+   if (xfer->base.usage & PIPE_TRANSFER_WRITE) {
+      svga_transfer_dma(svga, xfer, SVGA3D_WRITE_HOST_VRAM);
+      svgascreen->texture_timestamp++;
+
+      /* Stamp the written level with a fresh texture age. */
+      stex->age++;
+      stex->view_age[transfer->sr.level] = stex->age;
+
+      stex->defined[transfer->sr.face][transfer->sr.level] = TRUE;
+   }
+
+   pipe_resource_reference(&xfer->base.resource, NULL);
+   FREE(xfer->swbuf);
+   winsys->buffer_destroy(winsys, xfer->hwbuf);
+   FREE(xfer);
+}
+
+
+
+
+
+/**
+ * Resource vtable installed on every svga texture (see tex->b.vtbl in
+ * svga_texture_create and svga_texture_from_handle).
+ *
+ * The initializer is positional, so the entries must stay in the member
+ * order of struct u_resource_vtbl; the trailing comments name each slot.
+ * transfer_flush_region and transfer_inline_write use the u_default_*
+ * helpers rather than svga-specific code.
+ */
+struct u_resource_vtbl svga_texture_vtbl = 
+{
+   svga_texture_get_handle,	      /* get_handle */
+   svga_texture_destroy,	      /* resource_destroy */
+   svga_texture_is_referenced,	      /* is_resource_referenced */
+   svga_texture_get_transfer,	      /* get_transfer */
+   svga_texture_transfer_destroy,     /* transfer_destroy */
+   svga_texture_transfer_map,	      /* transfer_map */
+   u_default_transfer_flush_region,   /* transfer_flush_region */
+   svga_texture_transfer_unmap,	      /* transfer_unmap */
+   u_default_transfer_inline_write    /* transfer_inline_write */
+};
+
+
+
+
+/**
+ * Create a texture resource and its host surface from a template.
+ *
+ * \param screen    screen the texture belongs to
+ * \param template  description of the texture (copied into the new object)
+ * \return the new resource, or NULL if allocation fails, the template has
+ *         too many mip levels, the format has no SVGA3D equivalent, or the
+ *         host surface cannot be created
+ *
+ * The surface cache key is filled in from the template: size, number of
+ * faces and mip levels, format and surface hint flags. Display-target,
+ * shared and scanout textures are marked non-cachable.
+ */
+struct pipe_resource *
+svga_texture_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_texture *tex = CALLOC_STRUCT(svga_texture);
+   
+   if (!tex)
+      goto error1;
+
+   tex->b.b = *template;
+   tex->b.vtbl = &svga_texture_vtbl;
+   pipe_reference_init(&tex->b.b.reference, 1);
+   tex->b.b.screen = screen;
+
+   /* The per-level arrays in svga_texture have SVGA_MAX_TEXTURE_LEVELS
+    * entries; the runtime check keeps release builds safe too. */
+   assert(template->last_level < SVGA_MAX_TEXTURE_LEVELS);
+   if(template->last_level >= SVGA_MAX_TEXTURE_LEVELS)
+      goto error2;
+   
+   tex->key.flags = 0;
+   tex->key.size.width = template->width0;
+   tex->key.size.height = template->height0;
+   tex->key.size.depth = template->depth0;
+   
+   if(template->target == PIPE_TEXTURE_CUBE) {
+      tex->key.flags |= SVGA3D_SURFACE_CUBEMAP;
+      tex->key.numFaces = 6;
+   }
+   else {
+      tex->key.numFaces = 1;
+   }
+
+   /* Cachable by default; the bind flags below may turn that off. */
+   tex->key.cachable = 1;
+
+   if (template->bind & PIPE_BIND_SAMPLER_VIEW)
+      tex->key.flags |= SVGA3D_SURFACE_HINT_TEXTURE;
+
+   if (template->bind & PIPE_BIND_DISPLAY_TARGET) {
+      tex->key.cachable = 0;
+   }
+
+   if (template->bind & PIPE_BIND_SHARED) {
+      tex->key.cachable = 0;
+   }
+
+   if (template->bind & PIPE_BIND_SCANOUT) {
+      tex->key.flags |= SVGA3D_SURFACE_HINT_SCANOUT;
+      tex->key.cachable = 0;
+   }
+   
+   /* 
+    * XXX: Never pass the SVGA3D_SURFACE_HINT_RENDERTARGET hint. Mesa cannot
+    * know beforehand whether a texture will be used as a rendertarget or not
+    * and it always requests PIPE_BIND_RENDER_TARGET, therefore
+    * passing the SVGA3D_SURFACE_HINT_RENDERTARGET here defeats its purpose.
+    */
+#if 0
+   if((template->bind & PIPE_BIND_RENDER_TARGET) &&
+      !util_format_is_s3tc(template->format))
+      tex->key.flags |= SVGA3D_SURFACE_HINT_RENDERTARGET;
+#endif
+   
+   if(template->bind & PIPE_BIND_DEPTH_STENCIL)
+      tex->key.flags |= SVGA3D_SURFACE_HINT_DEPTHSTENCIL;
+   
+   tex->key.numMipLevels = template->last_level + 1;
+   
+   tex->key.format = svga_translate_format(template->format);
+   if(tex->key.format == SVGA3D_FORMAT_INVALID)
+      goto error2;
+
+   SVGA_DBG(DEBUG_DMA, "surface_create for texture\n");
+   tex->handle = svga_screen_surface_create(svgascreen, &tex->key);
+   if (!tex->handle) {
+      /* Without a host surface the texture is unusable; report the failure
+       * instead of returning an object with a NULL handle. */
+      goto error2;
+   }
+   SVGA_DBG(DEBUG_DMA, "  --> got sid %p (texture)\n", tex->handle);
+
+   return &tex->b.b;
+
+error2:
+   FREE(tex);
+error1:
+   return NULL;
+}
+
+
+
+
+/**
+ * Wrap an existing host surface, identified by a winsys handle, in a new
+ * texture resource.
+ *
+ * \param screen    screen the texture belongs to
+ * \param template  description of the texture; only non-mipmapped 2D
+ *                  textures with depth 1 are accepted
+ * \param whandle   winsys handle naming the surface
+ * \return the new resource, or NULL if the template is unsupported, the
+ *         winsys cannot resolve the handle, the formats are incompatible,
+ *         or allocation fails
+ *
+ * NOTE(review): the early returns after surface_from_handle() succeeds do
+ * not release srf; whether that leaks a surface reference depends on the
+ * winsys contract -- confirm.
+ */
+struct pipe_resource *
+svga_texture_from_handle(struct pipe_screen *screen,
+			 const struct pipe_resource *template,
+			 struct winsys_handle *whandle)
+{
+   struct svga_winsys_screen *sws = svga_winsys_screen(screen);
+   struct svga_winsys_surface *srf;
+   struct svga_texture *tex;
+   enum SVGA3dSurfaceFormat format = 0;
+   /* NOTE(review): this assert runs after screen was already passed to
+    * svga_winsys_screen() in the initializer above. */
+   assert(screen);
+
+   /* Only supports one type */
+   if (template->target != PIPE_TEXTURE_2D ||
+       template->last_level != 0 ||
+       template->depth0 != 1) {
+      return NULL;
+   }
+
+   /* The winsys reports the surface's actual format through 'format'. */
+   srf = sws->surface_from_handle(sws, whandle, &format);
+
+   if (!srf)
+      return NULL;
+
+   if (svga_translate_format(template->format) != format) {
+      unsigned f1 = svga_translate_format(template->format);
+      unsigned f2 = format;
+
+      /* It's okay for XRGB and ARGB or depth with/out stencil to get mixed up */
+      /* Only a requested D24X8 over an actual D24S8 is accepted for depth;
+       * the reverse pairing is rejected. */
+      if ( !( (f1 == SVGA3D_X8R8G8B8 && f2 == SVGA3D_A8R8G8B8) ||
+              (f1 == SVGA3D_A8R8G8B8 && f2 == SVGA3D_X8R8G8B8) ||
+              (f1 == SVGA3D_Z_D24X8 && f2 == SVGA3D_Z_D24S8) ) ) {
+         debug_printf("%s wrong format %u != %u\n", __FUNCTION__, f1, f2);
+         return NULL;
+      }
+   }
+
+   tex = CALLOC_STRUCT(svga_texture);
+   if (!tex)
+      return NULL;
+
+   tex->b.b = *template;
+   tex->b.vtbl = &svga_texture_vtbl;
+   pipe_reference_init(&tex->b.b.reference, 1);
+   tex->b.b.screen = screen;
+
+   /* For the two 32-bit color formats the surface's real format overrides
+    * the one copied from the template; other formats keep the template's. */
+   if (format == SVGA3D_X8R8G8B8)
+      tex->b.b.format = PIPE_FORMAT_B8G8R8X8_UNORM;
+   else if (format == SVGA3D_A8R8G8B8)
+      tex->b.b.format = PIPE_FORMAT_B8G8R8A8_UNORM;
+   else {
+      /* ?? */
+   }
+
+   SVGA_DBG(DEBUG_DMA, "wrap surface sid %p\n", srf);
+
+   /* Wrapped surfaces are marked non-cachable. */
+   tex->key.cachable = 0;
+   tex->handle = srf;
+
+   return &tex->b.b;
+}
+
diff --git a/src/gallium/drivers/svga/svga_resource_texture.h b/src/gallium/drivers/svga/svga_resource_texture.h
new file mode 100644
index 0000000000..631937f2eb
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_resource_texture.h
@@ -0,0 +1,134 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_TEXTURE_H
+#define SVGA_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "svga_screen_cache.h"
+
+struct pipe_context;
+struct pipe_screen;
+struct svga_context;
+struct svga_winsys_surface;
+enum SVGA3dSurfaceFormat;
+
+
+#define SVGA_MAX_TEXTURE_LEVELS 16
+
+
+extern struct u_resource_vtbl svga_texture_vtbl;
+
+
+/**
+ * Driver-side texture object; the pipe_resource seen by the state tracker
+ * is b.b.
+ */
+struct svga_texture 
+{
+   /* Base resource (b.b) plus the vtable pointer (b.vtbl), which is set to
+    * svga_texture_vtbl when the texture is created. */
+   struct u_resource b;
+
+   /* Indexed [face][level]; set to TRUE once a write transfer to that
+    * subresource has been uploaded. Six faces covers cube maps. */
+   boolean defined[6][SVGA_MAX_TEXTURE_LEVELS];
+   
+   /* The one sampler view cached on the texture; released when the texture
+    * is destroyed. */
+   struct svga_sampler_view *cached_view;
+
+   /* 'age' is incremented on every write transfer, and the new value is
+    * stored in view_age[] for the written level. Sampler views compare their
+    * own age against view_age[] to decide which levels to re-copy. */
+   unsigned view_age[SVGA_MAX_TEXTURE_LEVELS];
+   unsigned age;
+
+   boolean views_modified;
+
+   /**
+    * Creation key for the host surface handle.
+    * 
+    * This structure describes all the host surface characteristics so that it 
+    * can be looked up in cache, since creating a host surface is often a slow
+    * operation.
+    */
+   struct svga_host_surface_cache_key key;
+
+   /**
+    * Handle for the host side surface.
+    *
+    * This handle is owned by this texture. Views should hold on to a reference
+    * to this texture and never destroy this handle directly.
+    */
+   struct svga_winsys_surface *handle;
+};
+
+
+
+/* Note this is only used for texture (not buffer) transfers:
+ */
+struct svga_transfer
+{
+   /* Generic transfer state; must stay the first member because
+    * svga_transfer() casts a pipe_transfer pointer to this type. */
+   struct pipe_transfer base;
+
+   /* Winsys buffer used as the DMA staging area. */
+   struct svga_winsys_buffer *hwbuf;
+
+   /* Height of the hardware buffer in pixel blocks */
+   unsigned hw_nblocksy;
+
+   /* Temporary malloc buffer when we can't allocate a hardware buffer
+    * big enough */
+   /* NULL when hwbuf covers the whole transfer box. */
+   void *swbuf;
+};
+
+
+/**
+ * Downcast a pipe_resource to the svga texture that contains it.
+ *
+ * NULL is passed through. In debug builds a non-NULL resource is checked
+ * to carry the texture vtable.
+ */
+static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource )
+{
+   assert(resource == NULL ||
+          ((struct svga_texture *)resource)->b.vtbl == &svga_texture_vtbl);
+
+   return (struct svga_texture *)resource;
+}
+
+
+/**
+ * Downcast a pipe_transfer to the svga_transfer that contains it.
+ * The argument must not be NULL (asserted in debug builds).
+ */
+static INLINE struct svga_transfer *
+svga_transfer(struct pipe_transfer *transfer)
+{
+   struct svga_transfer *st = (struct svga_transfer *)transfer;
+
+   assert(transfer);
+
+   return st;
+}
+
+
+
+struct pipe_resource *
+svga_texture_create(struct pipe_screen *screen,
+                    const struct pipe_resource *template);
+
+struct pipe_resource *
+svga_texture_from_handle(struct pipe_screen * screen,
+			const struct pipe_resource *template,
+			struct winsys_handle *whandle);
+
+
+
+enum SVGA3dSurfaceFormat
+svga_translate_format(enum pipe_format format);
+
+enum SVGA3dSurfaceFormat
+svga_translate_format_render(enum pipe_format format);
+
+
+#endif /* SVGA_TEXTURE_H */
diff --git a/src/gallium/drivers/svga/svga_sampler_view.c b/src/gallium/drivers/svga/svga_sampler_view.c
new file mode 100644
index 0000000000..6911f13f77
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_sampler_view.c
@@ -0,0 +1,196 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "os/os_thread.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_resource_texture.h"
+#include "svga_sampler_view.h"
+#include "svga_debug.h"
+#include "svga_surface.h"
+
+
+/**
+ * Get a sampler view covering mip levels [min_lod, max_lod] of a texture.
+ *
+ * \param pipe     context used to validate or create the view surface
+ * \param pt       texture to view; must not be NULL
+ * \param min_lod  first mip level of the view
+ * \param max_lod  last mip level of the view (<= pt->last_level)
+ * \return a referenced sampler view, or NULL if the view structure cannot
+ *         be allocated
+ *
+ * When the requested range covers the whole texture no separate surface is
+ * needed and the returned view simply aliases the texture's own handle.
+ * Otherwise the view cached on the texture is reused if its range matches,
+ * or a new view surface is created and installed as the cached view. If
+ * creating the view surface fails, the view falls back to aliasing the
+ * texture's handle.
+ */
+struct svga_sampler_view *
+svga_get_tex_sampler_view(struct pipe_context *pipe,
+			  struct pipe_resource *pt,
+                          unsigned min_lod, unsigned max_lod)
+{
+   struct svga_screen *ss;
+   struct svga_texture *tex;
+   struct svga_sampler_view *sv = NULL;
+   SVGA3dSurfaceFormat format;
+   boolean view = TRUE;
+
+   /* Check the arguments before pt is dereferenced. min_lod is unsigned,
+    * so only its relation to max_lod needs checking. */
+   assert(pt);
+   assert(min_lod <= max_lod);
+   assert(max_lod <= pt->last_level);
+
+   ss = svga_screen(pt->screen);
+   tex = svga_texture(pt);
+   format = svga_translate_format(pt->format);
+
+   /* Is a view needed */
+   {
+      /*
+       * Can't control max lod. For first level views and when we only
+       * look at one level we disable mip filtering to achieve the same
+       * results as a view.
+       */
+      if (min_lod == 0 && max_lod >= pt->last_level)
+         view = FALSE;
+
+      /* Views of compressed textures use the render (uncompressed) format. */
+      if (util_format_is_s3tc(pt->format) && view) {
+         format = svga_translate_format_render(pt->format);
+      }
+
+      /* Debug overrides; force_sampler_view wins when both are set. */
+      if (ss->debug.no_sampler_view)
+         view = FALSE;
+
+      if (ss->debug.force_sampler_view)
+         view = TRUE;
+   }
+
+   /* First try the cache */
+   if (view) {
+      pipe_mutex_lock(ss->tex_mutex);
+      if (tex->cached_view &&
+          tex->cached_view->min_lod == min_lod &&
+          tex->cached_view->max_lod == max_lod) {
+         svga_sampler_view_reference(&sv, tex->cached_view);
+         pipe_mutex_unlock(ss->tex_mutex);
+         SVGA_DBG(DEBUG_VIEWS, "svga: Sampler view: reuse %p, %u %u, last %u\n",
+                              pt, min_lod, max_lod, pt->last_level);
+         svga_validate_sampler_view(svga_context(pipe), sv);
+         return sv;
+      }
+      pipe_mutex_unlock(ss->tex_mutex);
+   }
+
+   sv = CALLOC_STRUCT(svga_sampler_view);
+   if (!sv) {
+      /* Out of memory: fail cleanly rather than dereference NULL below. */
+      return NULL;
+   }
+
+   pipe_reference_init(&sv->reference, 1);
+   pipe_resource_reference(&sv->texture, pt);
+   sv->min_lod = min_lod;
+   sv->max_lod = max_lod;
+
+   /* No view needed just use the whole texture */
+   if (!view) {
+      SVGA_DBG(DEBUG_VIEWS,
+               "svga: Sampler view: no %p, mips %u..%u, nr %u, size (%ux%ux%u), last %u\n",
+               pt, min_lod, max_lod,
+               max_lod - min_lod + 1,
+               pt->width0,
+               pt->height0,
+               pt->depth0,
+               pt->last_level);
+      sv->key.cachable = 0;
+      sv->handle = tex->handle;
+      return sv;
+   }
+
+   SVGA_DBG(DEBUG_VIEWS,
+            "svga: Sampler view: yes %p, mips %u..%u, nr %u, size (%ux%ux%u), last %u\n",
+            pt, min_lod, max_lod,
+            max_lod - min_lod + 1,
+            pt->width0,
+            pt->height0,
+            pt->depth0,
+            pt->last_level);
+
+   sv->age = tex->age;
+   sv->handle = svga_texture_view_surface(pipe, tex, format,
+                                          min_lod,
+                                          max_lod - min_lod + 1,
+                                          -1, -1,
+                                          &sv->key);
+
+   if (!sv->handle) {
+      /* Could not create the view surface: alias the texture instead. */
+      assert(0);
+      sv->key.cachable = 0;
+      sv->handle = tex->handle;
+      return sv;
+   }
+
+   /* Remember this view on the texture so later requests can reuse it. */
+   pipe_mutex_lock(ss->tex_mutex);
+   svga_sampler_view_reference(&tex->cached_view, sv);
+   pipe_mutex_unlock(ss->tex_mutex);
+
+   return sv;
+}
+
+/**
+ * Bring a sampler view's private surface up to date with its texture.
+ *
+ * Views that alias the texture's own handle need no work. Otherwise every
+ * face of every level in the view whose texture-side age is newer than the
+ * view's age is copied from the texture surface into the view surface, and
+ * the view takes on the texture's age as sampled before the copies.
+ */
+void
+svga_validate_sampler_view(struct svga_context *svga, struct svga_sampler_view *v)
+{
+   struct svga_texture *stex = svga_texture(v->texture);
+   unsigned face_count;
+   unsigned tex_age = 0;
+   int level, face;
+
+   assert(svga);
+
+   /* The view is the texture itself: nothing to synchronize. */
+   if (v->handle == stex->handle)
+      return;
+
+   tex_age = stex->age;
+
+   face_count = (stex->b.b.target == PIPE_TEXTURE_CUBE) ? 6 : 1;
+
+   for (level = v->min_lod; level <= v->max_lod; level++) {
+      for (face = 0; face < face_count; face++) {
+         if (v->age < stex->view_age[level]) {
+            /* Destination levels are numbered relative to min_lod. */
+            svga_texture_copy_handle(svga,
+                                     stex->handle, 0, 0, 0, level, face,
+                                     v->handle, 0, 0, 0, level - v->min_lod, face,
+                                     u_minify(stex->b.b.width0, level),
+                                     u_minify(stex->b.b.height0, level),
+                                     u_minify(stex->b.b.depth0, level));
+         }
+      }
+   }
+
+   v->age = tex_age;
+}
+
+/**
+ * Free a sampler view.
+ *
+ * The view's surface is handed back to the screen only when it is distinct
+ * from the texture's own handle. The texture reference held by the view is
+ * dropped afterwards.
+ */
+void
+svga_destroy_sampler_view_priv(struct svga_sampler_view *v)
+{
+   struct svga_texture *owner = svga_texture(v->texture);
+   const boolean has_own_surface = (v->handle != owner->handle);
+
+   if (has_own_surface) {
+      struct svga_screen *svgascreen = svga_screen(v->texture->screen);
+
+      SVGA_DBG(DEBUG_DMA, "unref sid %p (sampler view)\n", v->handle);
+      svga_screen_surface_destroy(svgascreen, &v->key, &v->handle);
+   }
+
+   pipe_resource_reference(&v->texture, NULL);
+   FREE(v);
+}
diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h
new file mode 100644
index 0000000000..e64665f2e5
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_sampler_view.h
@@ -0,0 +1,97 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SAMPLER_VIEW_H
+#define SVGA_SAMPLER_VIEW_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "svga_screen_cache.h"
+
+struct pipe_context;
+struct pipe_screen;
+struct svga_context;
+struct svga_winsys_surface;
+enum SVGA3dSurfaceFormat;
+
+
+/**
+ * A sampler's view into a texture
+ *
+ * We currently cache one sampler view on
+ * the texture, by holding a reference
+ * from the texture to the sampler view.
+ *
+ * Because of this we cannot hold a reference to the
+ * texture from the sampler view. So the user
+ * of the sampler views must make sure that the
+ * texture has a reference taken for as long as
+ * the sampler view is referenced.
+ *
+ * Just unreferencing the sampler_view before the
+ * texture is enough.
+ */
+struct svga_sampler_view
+{
+   struct pipe_reference reference; /* count managed by svga_sampler_view_reference() */
+
+   struct pipe_resource *texture; /* texture this view samples from */
+
+   int min_lod; /* first texture mip level covered by the view */
+   int max_lod; /* last texture mip level covered by the view (inclusive) */
+
+   unsigned age; /* compared with the texture's view_age[] when validating */
+
+   struct svga_host_surface_cache_key key; /* key handed to svga_screen_surface_destroy() */
+   struct svga_winsys_surface *handle; /* surface sampled; may be the texture's own handle */
+};
+
+
+
+extern struct svga_sampler_view *
+svga_get_tex_sampler_view(struct pipe_context *pipe,
+                          struct pipe_resource *pt,
+                          unsigned min_lod, unsigned max_lod);
+
+void
+svga_validate_sampler_view(struct svga_context *svga, struct svga_sampler_view *v);
+
+void
+svga_destroy_sampler_view_priv(struct svga_sampler_view *v);
+
+static INLINE void
+svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v)
+{
+   struct svga_sampler_view *old = *ptr;
+   /* Point *ptr at v; the old view is destroyed when pipe_reference() says so. */
+   if (pipe_reference(&(*ptr)->reference, &v->reference))
+      svga_destroy_sampler_view_priv(old);
+   *ptr = v;
+}
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
new file mode 100644
index 0000000000..54d9faeb72
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -0,0 +1,505 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_string.h"
+#include "util/u_math.h"
+
+#include "svga_winsys.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_resource_texture.h"
+#include "svga_resource.h"
+#include "svga_debug.h"
+#include "svga_surface.h"
+
+#include "svga3d_shaderdefs.h"
+
+
+#ifdef DEBUG
+int SVGA_DEBUG = 0;
+/* Names accepted in the SVGA_DEBUG option and the debug bit each one selects. */
+static const struct debug_named_value svga_debug_flags[] = {
+   { "dma",      DEBUG_DMA, NULL },
+   { "tgsi",     DEBUG_TGSI, NULL },
+   { "pipe",     DEBUG_PIPE, NULL },
+   { "state",    DEBUG_STATE, NULL },
+   { "screen",   DEBUG_SCREEN, NULL },
+   { "tex",      DEBUG_TEX, NULL },
+   { "swtnl",    DEBUG_SWTNL, NULL },
+   { "const",    DEBUG_CONSTS, NULL },
+   { "viewport", DEBUG_VIEWPORT, NULL },
+   { "views",    DEBUG_VIEWS, NULL },
+   { "perf",     DEBUG_PERF, NULL },
+   { "flush",    DEBUG_FLUSH, NULL },
+   { "sync",     DEBUG_SYNC, NULL },
+   { "cache",    DEBUG_CACHE, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+#endif
+/* pipe_screen::get_vendor hook: returns a constant vendor string. */
+static const char *
+svga_get_vendor( struct pipe_screen *pscreen )
+{
+   return "VMware, Inc.";
+}
+
+/* pipe_screen::get_name hook: constant name string selected by the DEBUG define. */
+static const char *
+svga_get_name( struct pipe_screen *pscreen )
+{
+#ifdef DEBUG
+   /* Only return internal details in the DEBUG version:
+    */
+   return "SVGA3D; build: DEBUG; mutex: " PIPE_ATOMIC;
+#else
+   return "SVGA3D; build: RELEASE; ";
+#endif
+}
+
+/**
+ * pipe_screen::get_paramf hook: report a capability; unknown caps yield 0.
+ */
+static float
+svga_get_paramf(struct pipe_screen *screen, enum pipe_cap param)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   SVGA3dDevCapResult result;
+
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      return 7.0;
+
+   case PIPE_CAP_MAX_POINT_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      /* Keep this to a reasonable size to avoid failures in
+       * conform/pntaa.c:
+       */
+      return SVGA_MAX_POINTSIZE;
+
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      if(!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_TEXTURE_ANISOTROPY, &result))
+         return 4.0;
+      return result.u;
+
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      return 16.0;
+
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return 16;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return 16;
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_GLSL:
+      return svgascreen->use_ps30 && svgascreen->use_vs30;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 1;
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      if(!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_RENDER_TARGETS, &result))
+         return 1;
+      if(!result.u)
+         return 1;
+      return MIN2(result.u, PIPE_MAX_COLOR_BUFS);
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 1;
+   case PIPE_CAP_TIMER_QUERY:
+      return 0;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 1;
+
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      {
+         unsigned levels = SVGA_MAX_TEXTURE_LEVELS;
+         if (sws->get_cap(sws, SVGA3D_DEVCAP_MAX_TEXTURE_WIDTH, &result))
+            levels = MIN2(util_logbase2(result.u) + 1, levels);
+         else /* clamp, so a limit found earlier is never raised again */
+            levels = MIN2(12 /* 2048x2048 */, levels);
+         if (sws->get_cap(sws, SVGA3D_DEVCAP_MAX_TEXTURE_HEIGHT, &result))
+            levels = MIN2(util_logbase2(result.u) + 1, levels);
+         else /* clamp, so the width-derived limit above is kept */
+            levels = MIN2(12 /* 2048x2048 */, levels);
+         return levels;
+      }
+
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      if (!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_VOLUME_EXTENT, &result))
+         return 8;  /* max 128x128x128 */
+      return MIN2(util_logbase2(result.u) + 1, SVGA_MAX_TEXTURE_LEVELS);
+
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      /*
+       * No mechanism to query the host, and at least limited to 2048x2048 on
+       * certain hardware.
+       */
+      return MIN2(screen->get_paramf(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS),
+                  12.0 /* 2048x2048 */);
+
+   case PIPE_CAP_TEXTURE_MIRROR_REPEAT: /* req. for GL 1.4 */
+      return 1;
+
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE: /* req. for GL 1.5 */
+      return 1;
+
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 0;
+
+   /*
+    * Fragment shader limits
+    */
+
+   case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+   case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS:
+      return svgascreen->use_ps30 ? 512 : 96;
+   case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH:
+      return SVGA3D_MAX_NESTING_LEVEL;
+   case PIPE_CAP_MAX_FS_INPUTS:
+      return 10;
+   case PIPE_CAP_MAX_FS_CONSTS: /* a fragment limit, so keyed on use_ps30 */
+      return svgascreen->use_ps30 ? 224 : 16;
+   case PIPE_CAP_MAX_FS_TEMPS:
+      if (!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_FRAGMENT_SHADER_TEMPS, &result))
+         return svgascreen->use_ps30 ? 32 : 12;
+      return result.u;
+   case PIPE_CAP_MAX_FS_ADDRS:
+      return svgascreen->use_ps30 ? 1 : 0;
+   case PIPE_CAP_MAX_FS_PREDS:
+      return svgascreen->use_ps30 ? 1 : 0;
+
+   /*
+    * Vertex shader limits
+    */
+   case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+      if (!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_VERTEX_SHADER_INSTRUCTIONS, &result))
+         return svgascreen->use_vs30 ? 512 : 256;
+      return result.u;
+   case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+   case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+      /* XXX: until we have vertex texture support */
+      return 0;
+   case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+      return SVGA3D_MAX_NESTING_LEVEL;
+   case PIPE_CAP_MAX_VS_INPUTS:
+      return 16;
+   case PIPE_CAP_MAX_VS_CONSTS:
+      return 256;
+   case PIPE_CAP_MAX_VS_TEMPS:
+      if (!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_VERTEX_SHADER_TEMPS, &result))
+         return svgascreen->use_vs30 ? 32 : 12;
+      return result.u;
+   case PIPE_CAP_MAX_VS_ADDRS:
+      return svgascreen->use_vs30 ? 1 : 0;
+   case PIPE_CAP_MAX_VS_PREDS:
+      return svgascreen->use_vs30 ? 1 : 0;
+
+   case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+      return 1;
+
+   default:
+      return 0;
+   }
+}
+
+
+/* This is a fairly pointless interface: svga_get_paramf() cast to int.
+ */
+static int
+svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   return (int) svga_get_paramf( screen, param );
+}
+
+/* Map a pipe format to its SVGA3D surface-format devcap; SVGA3D_DEVCAP_MAX if none. */
+static INLINE SVGA3dDevCapIndex
+svga_translate_format_cap(enum pipe_format format)
+{
+   switch(format) {
+   
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_A8R8G8B8;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_X8R8G8B8;
+
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_R5G6B5;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_A1R5G5B5;
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_A4R4G4B4;
+
+   case PIPE_FORMAT_Z16_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_Z_D16;
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_Z_D24S8;
+   case PIPE_FORMAT_X8Z24_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_Z_D24X8;
+
+   case PIPE_FORMAT_A8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_ALPHA8;
+   case PIPE_FORMAT_L8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE8;
+
+   case PIPE_FORMAT_DXT1_RGB: /* both DXT1 variants share one devcap */
+   case PIPE_FORMAT_DXT1_RGBA:
+      return SVGA3D_DEVCAP_SURFACEFMT_DXT1;
+   case PIPE_FORMAT_DXT3_RGBA:
+      return SVGA3D_DEVCAP_SURFACEFMT_DXT3;
+   case PIPE_FORMAT_DXT5_RGBA:
+      return SVGA3D_DEVCAP_SURFACEFMT_DXT5;
+
+   default: /* callers test for "index < SVGA3D_DEVCAP_MAX" */
+      return SVGA3D_DEVCAP_MAX;
+   }
+}
+
+
+static boolean
+svga_is_format_supported( struct pipe_screen *screen,
+                          enum pipe_format format,
+                          enum pipe_texture_target target,
+                          unsigned sample_count,
+                          unsigned tex_usage,
+                          unsigned geom_flags )
+{
+   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
+   const boolean render_target = (tex_usage & PIPE_BIND_RENDER_TARGET) != 0;
+   SVGA3dDevCapResult host_caps;
+   SVGA3dDevCapIndex cap;
+
+   /* Note: 'target' and 'geom_flags' are not consulted here. */
+   assert(tex_usage);
+
+   /* Multisampled surfaces are never reported as supported. */
+   if (sample_count > 1)
+      return FALSE;
+
+   /* For render targets, a few formats are decided without asking the host. */
+   if (render_target) {
+      switch(format) {
+      case PIPE_FORMAT_B4G4R4A4_UNORM:
+      case PIPE_FORMAT_B5G5R5A1_UNORM:
+         /* Often unsupported/problematic. Rejecting them gives the same
+          * visuals for all virtual hardware implementations.
+          */
+         return FALSE;
+
+      case PIPE_FORMAT_DXT1_RGB:
+      case PIPE_FORMAT_DXT1_RGBA:
+      case PIPE_FORMAT_DXT3_RGBA:
+      case PIPE_FORMAT_DXT5_RGBA:
+         /* Simulate ability to render into compressed textures */
+         return TRUE;
+
+      default:
+         break;
+      }
+   }
+
+   /* Prefer the host's answer when the format maps to a device cap. */
+   cap = svga_translate_format_cap(format);
+   if (cap < SVGA3D_DEVCAP_MAX &&
+       sws->get_cap(sws, cap, &host_caps)) {
+      SVGA3dSurfaceFormatCaps wanted;
+
+      /* Build the set of capability bits this usage requires. */
+      wanted.value = 0;
+      if (render_target)
+         wanted.offscreenRenderTarget = 1;
+      if (tex_usage & PIPE_BIND_DEPTH_STENCIL)
+         wanted.zStencil = 1;
+      if (tex_usage & PIPE_BIND_SAMPLER_VIEW)
+         wanted.texture = 1;
+
+      /* Supported only if the host reports every required bit. */
+      return ((host_caps.u & wanted.value) == wanted.value) ? TRUE : FALSE;
+   }
+
+   /* No host answer: use our translate functions directly rather than
+    * relying on a duplicated list of supported formats which is prone to
+    * getting out of sync.
+    */
+   if (render_target || (tex_usage & PIPE_BIND_DEPTH_STENCIL))
+      return svga_translate_format_render(format) != SVGA3D_FORMAT_INVALID;
+
+   return svga_translate_format(format) != SVGA3D_FORMAT_INVALID;
+}
+
+/* pipe_screen::fence_reference hook: forwards to the winsys screen. */
+static void
+svga_fence_reference(struct pipe_screen *screen,
+                     struct pipe_fence_handle **ptr,
+                     struct pipe_fence_handle *fence)
+{
+   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
+   sws->fence_reference(sws, ptr, fence);
+}
+
+/* pipe_screen::fence_signalled hook: returns the winsys screen's result unchanged. */
+static int
+svga_fence_signalled(struct pipe_screen *screen,
+                     struct pipe_fence_handle *fence,
+                     unsigned flag)
+{
+   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
+   return sws->fence_signalled(sws, fence, flag);
+}
+
+/* pipe_screen::fence_finish hook: logs the fence, then forwards to the winsys screen. */
+static int
+svga_fence_finish(struct pipe_screen *screen,
+                  struct pipe_fence_handle *fence,
+                  unsigned flag)
+{
+   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
+
+   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
+            __FUNCTION__, fence);
+
+   return sws->fence_finish(sws, fence, flag);
+}
+
+/* pipe_screen::destroy hook: tears down cache, mutexes and winsys, then frees. */
+static void
+svga_destroy_screen( struct pipe_screen *screen )
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   
+   svga_screen_cache_cleanup(svgascreen);
+
+   pipe_mutex_destroy(svgascreen->swc_mutex);
+   pipe_mutex_destroy(svgascreen->tex_mutex);
+
+   svgascreen->sws->destroy(svgascreen->sws); /* the screen destroys its winsys too */
+   
+   FREE(svgascreen);
+}
+
+
+/**
+ * Create a new svga_screen object; returns NULL on failure or without SM 3.0.
+ */
+struct pipe_screen *
+svga_screen_create(struct svga_winsys_screen *sws)
+{
+   struct svga_screen *svgascreen;
+   struct pipe_screen *screen;
+   SVGA3dDevCapResult result;
+
+#ifdef DEBUG
+   SVGA_DEBUG = debug_get_flags_option("SVGA_DEBUG", svga_debug_flags, 0 );
+#endif
+
+   svgascreen = CALLOC_STRUCT(svga_screen);
+   if (!svgascreen)
+      goto error1;
+   /* Debug overrides, each read with debug_get_bool_option() (default FALSE). */
+   svgascreen->debug.force_level_surface_view =
+      debug_get_bool_option("SVGA_FORCE_LEVEL_SURFACE_VIEW", FALSE);
+   svgascreen->debug.force_surface_view =
+      debug_get_bool_option("SVGA_FORCE_SURFACE_VIEW", FALSE);
+   svgascreen->debug.force_sampler_view =
+      debug_get_bool_option("SVGA_FORCE_SAMPLER_VIEW", FALSE);
+   svgascreen->debug.no_surface_view =
+      debug_get_bool_option("SVGA_NO_SURFACE_VIEW", FALSE);
+   svgascreen->debug.no_sampler_view =
+      debug_get_bool_option("SVGA_NO_SAMPLER_VIEW", FALSE);
+
+   screen = &svgascreen->screen;
+   /* Install the pipe_screen entry points implemented in this file. */
+   screen->destroy = svga_destroy_screen;
+   screen->get_name = svga_get_name;
+   screen->get_vendor = svga_get_vendor;
+   screen->get_param = svga_get_param;
+   screen->get_paramf = svga_get_paramf;
+   screen->is_format_supported = svga_is_format_supported;
+   screen->context_create = svga_context_create;
+   screen->fence_reference = svga_fence_reference;
+   screen->fence_signalled = svga_fence_signalled;
+   screen->fence_finish = svga_fence_finish;
+   svgascreen->sws = sws;
+
+   svga_screen_init_surface_functions(screen);
+   svga_init_screen_resource_functions(svgascreen);
+   /* Each flag is TRUE only if the cap query succeeds and reports >= 3.0. */
+   svgascreen->use_ps30 =
+      sws->get_cap(sws, SVGA3D_DEVCAP_FRAGMENT_SHADER_VERSION, &result) &&
+      result.u >= SVGA3DPSVERSION_30 ? TRUE : FALSE;
+
+   svgascreen->use_vs30 =
+      sws->get_cap(sws, SVGA3D_DEVCAP_VERTEX_SHADER_VERSION, &result) &&
+      result.u >= SVGA3DVSVERSION_30 ? TRUE : FALSE;
+
+#if 1
+   /* Shader model 2.0 is unsupported at the moment. */
+   if(!svgascreen->use_ps30 || !svgascreen->use_vs30)
+      goto error2;
+#else
+   if(debug_get_bool_option("SVGA_NO_SM30", FALSE))
+      svgascreen->use_vs30 = svgascreen->use_ps30 = FALSE;
+#endif
+   /* Mutexes and cache are set up only after the last failure exit above. */
+   pipe_mutex_init(svgascreen->tex_mutex);
+   pipe_mutex_init(svgascreen->swc_mutex);
+
+   svga_screen_cache_init(svgascreen);
+
+   return screen;
+error2:
+   FREE(svgascreen); /* note: 'sws' itself is not destroyed on this path */
+error1:
+   return NULL;
+}
+/* Return the winsys screen stored in the svga_screen behind 'screen'. */
+struct svga_winsys_screen *
+svga_winsys_screen(struct pipe_screen *screen)
+{
+   return svga_screen(screen)->sws;
+}
+/* DEBUG build of the cast wrapper: asserts the screen is one of ours first. */
+#ifdef DEBUG
+struct svga_screen *
+svga_screen(struct pipe_screen *screen)
+{
+   assert(screen);
+   assert(screen->destroy == svga_destroy_screen);
+   return (struct svga_screen *)screen;
+}
+#endif
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
new file mode 100644
index 0000000000..86ec89d88c
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -0,0 +1,83 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SCREEN_H
+#define SVGA_SCREEN_H
+
+
+#include "pipe/p_screen.h"
+#include "os/os_thread.h"
+
+#include "util/u_double_list.h"
+
+#include "svga_screen_cache.h"
+
+
+struct svga_winsys_screen;
+struct svga_winsys_context;
+struct SVGACmdMemory;
+
+#define SVGA_COMBINE_USERBUFFERS 1
+
+/**
+ * Subclass of pipe_screen
+ */
+struct svga_screen
+{
+   struct pipe_screen screen; /* base object; svga_screen() casts a pipe_screen* to this struct */
+   struct svga_winsys_screen *sws; /* winsys screen given to svga_screen_create() */
+
+   unsigned use_ps30; /* host reported fragment shader version >= 3.0 */
+   unsigned use_vs30; /* host reported vertex shader version >= 3.0 */
+   
+   struct {
+      boolean force_level_surface_view;
+      boolean force_surface_view;
+      boolean no_surface_view;
+      boolean force_sampler_view;
+      boolean no_sampler_view;
+   } debug; /* overrides read with debug_get_bool_option() at screen creation */
+
+   unsigned texture_timestamp;
+   pipe_mutex tex_mutex; /* held while a texture's cached_view is updated */
+
+   pipe_mutex swc_mutex; /* Used for buffer uploads */
+
+   struct svga_host_surface_cache cache; /* host surface cache, see svga_screen_cache.h */
+};
+
+#ifndef DEBUG
+/** cast wrapper; DEBUG builds use an out-of-line definition instead */
+static INLINE struct svga_screen *
+svga_screen(struct pipe_screen *pscreen)
+{
+   return (struct svga_screen *) pscreen;
+}
+#else
+struct svga_screen *
+svga_screen(struct pipe_screen *screen);
+#endif
+
+#endif /* SVGA_SCREEN_H */
diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c
new file mode 100644
index 0000000000..eff36e0bcc
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -0,0 +1,347 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_memory.h"
+#include "util/u_hash.h"
+
+#include "svga_debug.h"
+#include "svga_winsys.h"
+#include "svga_screen.h"
+#include "svga_screen_cache.h"
+
+
+#define SVGA_SURFACE_CACHE_ENABLED 1
+
+
+/** 
+ * Compute the bucket for this key: CRC32 over the raw key bytes, modulo bucket count.
+ */
+static INLINE unsigned
+svga_screen_cache_bucket(const struct svga_host_surface_cache_key *key)
+{
+   return util_hash_crc32( key, sizeof *key ) % SVGA_HOST_SURFACE_CACHE_BUCKETS;
+}
+
+/* Take a surface with a matching key out of the cache; returns NULL on a miss. */
+static INLINE struct svga_winsys_surface *
+svga_screen_cache_lookup(struct svga_screen *svgascreen,
+                         const struct svga_host_surface_cache_key *key)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_host_surface_cache_entry *entry;
+   struct svga_winsys_surface *handle = NULL;
+   struct list_head *curr, *next;
+   unsigned bucket;
+   unsigned tries = 0; /* entries examined; only reported in the debug message */
+
+   assert(key->cachable);
+
+   bucket = svga_screen_cache_bucket(key);
+
+   pipe_mutex_lock(cache->mutex);
+
+   curr = cache->bucket[bucket].next;
+   next = curr->next;
+   while(curr != &cache->bucket[bucket]) {
+      ++tries;
+      
+      entry = LIST_ENTRY(struct svga_host_surface_cache_entry, curr, bucket_head);
+
+      assert(entry->handle);
+      /* NOTE(review): 0 from fence_signalled() presumably means "signalled" - confirm */
+      if(memcmp(&entry->key, key, sizeof *key) == 0 &&
+         sws->fence_signalled( sws, entry->fence, 0 ) == 0) {
+         assert(sws->surface_is_flushed(sws, entry->handle));
+         
+         handle = entry->handle; // Reference is transferred here.
+         entry->handle = NULL;
+         /* Unlink from the bucket and from its list; park the entry on 'empty'. */
+         LIST_DEL(&entry->bucket_head);
+
+         LIST_DEL(&entry->head);
+         
+         LIST_ADD(&entry->head, &cache->empty);
+
+         break;
+      }
+
+      curr = next; 
+      next = curr->next;
+   }
+
+   pipe_mutex_unlock(cache->mutex);
+   
+   if (SVGA_DEBUG & DEBUG_DMA)
+      debug_printf("%s: cache %s after %u tries (bucket %d)\n", __FUNCTION__, 
+                   handle ? "hit" : "miss", tries, bucket);
+   
+   return handle;
+}
+
+
+/*
+ * Hand *p_handle over to the cache (clearing it). Transfers a handle reference.
+ */
+                           
+static INLINE void
+svga_screen_cache_add(struct svga_screen *svgascreen,
+                      const struct svga_host_surface_cache_key *key, 
+                      struct svga_winsys_surface **p_handle)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_host_surface_cache_entry *entry = NULL;
+   struct svga_winsys_surface *handle = *p_handle;
+   
+   assert(key->cachable);
+
+   assert(handle);
+   if(!handle)
+      return;
+   
+   *p_handle = NULL;
+   pipe_mutex_lock(cache->mutex);
+   
+   if(!LIST_IS_EMPTY(&cache->empty)) {
+      /* use the first empty entry */
+      entry = LIST_ENTRY(struct svga_host_surface_cache_entry, cache->empty.next, head);
+        
+      LIST_DEL(&entry->head);
+   }
+   else if(!LIST_IS_EMPTY(&cache->unused)) {
+      /* free the last used buffer and reuse its entry */
+      entry = LIST_ENTRY(struct svga_host_surface_cache_entry, cache->unused.prev, head);
+      SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+               "unref sid %p (make space)\n", entry->handle);
+      sws->surface_reference(sws, &entry->handle, NULL);
+
+      LIST_DEL(&entry->bucket_head);
+
+      LIST_DEL(&entry->head);
+   }
+   /* With an entry: park the surface on 'validated' until the next cache flush. */
+   if(entry) {
+      entry->handle = handle;
+      memcpy(&entry->key, key, sizeof entry->key);
+   
+      SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+               "cache sid %p\n", entry->handle);
+      LIST_ADD(&entry->head, &cache->validated);
+   }
+   else {
+      /* Couldn't cache the buffer -- this really shouldn't happen */
+      SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+               "unref sid %p (couldn't find space)\n", handle);
+      sws->surface_reference(sws, &handle, NULL);
+   }
+   
+   pipe_mutex_unlock(cache->mutex);
+}
+
+
+/**
+ * Screen-flush hook for the surface cache: every entry on the validated list
+ * whose surface is flushed is tagged with 'fence' and becomes reusable.
+ */
+void
+svga_screen_cache_flush(struct svga_screen *svgascreen,
+                        struct pipe_fence_handle *fence)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct list_head *const sentinel = &cache->validated;
+   struct list_head *node, *following;
+
+   pipe_mutex_lock(cache->mutex);
+
+   /* 'following' is fetched before the body runs, since the body may
+    * unlink 'node' from the list being walked. */
+   for (node = sentinel->next, following = node->next;
+        node != sentinel;
+        node = following, following = node->next) {
+      struct svga_host_surface_cache_entry *entry =
+         LIST_ENTRY(struct svga_host_surface_cache_entry, node, head);
+      unsigned bucket;
+
+      assert(entry->handle);
+
+      if (!sws->surface_is_flushed(sws, entry->handle))
+         continue;
+      /* validated -> unused, remembering the fence of this flush */
+      LIST_DEL(&entry->head);
+      sws->fence_reference(sws, &entry->fence, fence);
+      LIST_ADD(&entry->head, &cache->unused);
+
+      /* make the entry findable by svga_screen_cache_lookup() */
+      bucket = svga_screen_cache_bucket(&entry->key);
+      LIST_ADD(&entry->bucket_head, &cache->bucket[bucket]);
+   }
+
+   pipe_mutex_unlock(cache->mutex);
+}
+
+
+void
+svga_screen_cache_cleanup(struct svga_screen *svgascreen)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   unsigned slot;
+
+   /* Drop whatever surface and fence references the entries still hold. */
+   for (slot = 0; slot < SVGA_HOST_SURFACE_CACHE_SIZE; ++slot) {
+      struct svga_host_surface_cache_entry *entry = &cache->entries[slot];
+      if (entry->handle) {
+         SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                  "unref sid %p (shutdown)\n", entry->handle);
+         sws->surface_reference(sws, &entry->handle, NULL);
+      }
+      if (entry->fence)
+         sws->fence_reference(sws, &entry->fence, NULL);
+   }
+   pipe_mutex_destroy(cache->mutex);
+}
+
+/* Initialise the cache mutex and lists; every entry starts on the 'empty' list. */
+enum pipe_error
+svga_screen_cache_init(struct svga_screen *svgascreen)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   unsigned i;
+
+   pipe_mutex_init(cache->mutex);
+   
+   for(i = 0; i < SVGA_HOST_SURFACE_CACHE_BUCKETS; ++i)
+      LIST_INITHEAD(&cache->bucket[i]);
+
+   LIST_INITHEAD(&cache->unused);
+   
+   LIST_INITHEAD(&cache->validated);
+   
+   LIST_INITHEAD(&cache->empty);
+   for(i = 0; i < SVGA_HOST_SURFACE_CACHE_SIZE; ++i)
+      LIST_ADDTAIL(&cache->entries[i].head, &cache->empty);
+
+   return PIPE_OK; /* no failure path in this function */
+}
+
+/* Return a host surface for 'key' (cached if possible); may adjust *key. */
+struct svga_winsys_surface *
+svga_screen_surface_create(struct svga_screen *svgascreen,
+                           struct svga_host_surface_cache_key *key)
+{
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_winsys_surface *handle = NULL;
+   boolean cachable = SVGA_SURFACE_CACHE_ENABLED && key->cachable;
+
+   SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+            "%s sz %dx%dx%d mips %d faces %d cachable %d\n", 
+            __FUNCTION__,
+            key->size.width,
+            key->size.height,
+            key->size.depth,
+            key->numMipLevels,
+            key->numFaces,
+            key->cachable);
+
+   if (cachable) {
+      if (key->format == SVGA3D_BUFFER) {
+         /* For buffers, round the buffer size up to the nearest power
+          * of two to increase the probability of cache hits.  Keep
+          * texture surface dimensions unchanged.  The loop stops at 2^31
+          * so that the shift cannot wrap to zero and spin forever.
+          */
+         uint32_t size = 1;
+         while(size < key->size.width && size < 0x80000000u)
+            size <<= 1;
+         if (size >= key->size.width)
+            key->size.width = size;
+         /* Since we're reusing buffers we're effectively transforming all
+          * of them into dynamic buffers.  It would be nice to not cache long
+          * lived static buffers, but there is no way to tell them from short
+          * lived ones yet.  A good heuristic would be buffer size.
+          */
+         key->flags &= ~SVGA3D_SURFACE_HINT_STATIC;
+         key->flags |= SVGA3D_SURFACE_HINT_DYNAMIC;
+      }
+
+      handle = svga_screen_cache_lookup(svgascreen, key);
+      if (handle) {
+         if (key->format == SVGA3D_BUFFER)
+            SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                     "reuse sid %p sz %d (buffer)\n", handle, 
+                     key->size.width);
+         else
+            SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                     "reuse sid %p sz %dx%dx%d mips %d faces %d\n", handle, 
+                     key->size.width,
+                     key->size.height,
+                     key->size.depth,
+                     key->numMipLevels,
+                     key->numFaces);
+      }
+   }
+
+   /* Cache miss, or caching not applicable: create a fresh host surface. */
+   if (!handle) {
+      handle = sws->surface_create(sws,
+                                   key->flags,
+                                   key->format,
+                                   key->size, 
+                                   key->numFaces, 
+                                   key->numMipLevels);
+      if (handle)
+         SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                  "  CREATE sid %p sz %dx%dx%d\n", 
+                  handle, 
+                  key->size.width,
+                  key->size.height,
+                  key->size.depth);
+   }
+
+   return handle;
+}
+
+/* Release *p_handle: cachable surfaces go to the cache, others are unreferenced. */
+void
+svga_screen_surface_destroy(struct svga_screen *svgascreen,
+                            const struct svga_host_surface_cache_key *key,
+                            struct svga_winsys_surface **p_handle)
+{
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   
+   /* We only set the cachable flag for surfaces of which we are the
+    * exclusive owner.  So just hold onto our existing reference in
+    * that case.
+    */
+   if(SVGA_SURFACE_CACHE_ENABLED && key->cachable) {
+      svga_screen_cache_add(svgascreen, key, p_handle);
+   }
+   else {
+      SVGA_DBG(DEBUG_DMA,
+               "unref sid %p (uncachable)\n", *p_handle);
+      sws->surface_reference(sws, p_handle, NULL);
+   }
+}
diff --git a/src/gallium/drivers/svga/svga_screen_cache.h b/src/gallium/drivers/svga/svga_screen_cache.h
new file mode 100644
index 0000000000..62156e3f52
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_cache.h
@@ -0,0 +1,144 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SCREEN_CACHE_H_
+#define SVGA_SCREEN_CACHE_H_
+
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+
+#include "os/os_thread.h"
+
+#include "util/u_double_list.h"
+
+
+/* Guess the storage size of cached surfaces and try and keep it under
+ * this amount (parenthesized so the macro expands as a single value):
+ */
+#define SVGA_HOST_SURFACE_CACHE_BYTES (16*1024*1024)
+
+/* Maximum number of discrete surfaces in the cache:
+ */
+#define SVGA_HOST_SURFACE_CACHE_SIZE 1024
+
+/* Number of hash buckets:
+ */
+#define SVGA_HOST_SURFACE_CACHE_BUCKETS 256
+
+
+struct svga_winsys_surface;
+struct svga_screen;
+
+/**
+ * Cache key: the svga_winsys_screen::surface_create arguments + cachable.
+ */
+struct svga_host_surface_cache_key
+{
+   SVGA3dSurfaceFlags flags;
+   SVGA3dSurfaceFormat format;
+   SVGA3dSize size;             /* width / height / depth */
+   uint32_t numFaces:24;
+   uint32_t numMipLevels:7;
+   uint32_t cachable:1;         /* False if this is a shared surface */
+};
+
+
+struct svga_host_surface_cache_entry 
+{
+   /** 
+    * Head for the LRU list, svga_host_surface_cache::unused, and
+    * svga_host_surface_cache::empty
+    */
+   struct list_head head;
+   
+   /** Head for the bucket lists. */
+   struct list_head bucket_head;
+   /** Key and surface handle stored in this entry. */
+   struct svga_host_surface_cache_key key;
+   struct svga_winsys_surface *handle;
+   /** NOTE(review): presumably the fence of stages 3/4 of the cache -- confirm. */
+   struct pipe_fence_handle *fence;
+};
+
+
+/**
+ * Cache of the host surfaces.
+ * 
+ * A cache entry can be in the following stages:
+ * 1. empty
+ * 2. holding a buffer in a validate list
+ * 3. holding a flushed buffer (not in any validate list) with an active fence
+ * 4. holding a flushed buffer with an expired fence
+ * 
+ * An entry progresses from 1 -> 2 -> 3 -> 4. When we need an entry to put a 
+ * buffer into we preferentially take from 1, or the least recently used 
+ * buffer from 3/4.
+ */
+struct svga_host_surface_cache 
+{
+   pipe_mutex mutex;   /* NOTE(review): presumably guards the lists below -- confirm */
+   
+   /* Unused buffers are put in buckets to speed up lookups */
+   struct list_head bucket[SVGA_HOST_SURFACE_CACHE_BUCKETS];
+   
+   /* Entries with unused buffers, ordered from most to least recently used 
+    * (3 and 4) */
+   struct list_head unused;
+   
+   /* Entries with buffers still in validate lists (2) */
+   struct list_head validated;
+   
+   /** Empty entries (1) */
+   struct list_head empty;
+
+   /** The actual storage for the entries */
+   struct svga_host_surface_cache_entry entries[SVGA_HOST_SURFACE_CACHE_SIZE];
+};
+
+
+void
+svga_screen_cache_cleanup(struct svga_screen *svgascreen);
+
+void
+svga_screen_cache_flush(struct svga_screen *svgascreen,
+                        struct pipe_fence_handle *fence);
+
+enum pipe_error
+svga_screen_cache_init(struct svga_screen *svgascreen);
+
+
+struct svga_winsys_surface *
+svga_screen_surface_create(struct svga_screen *svgascreen,
+                           struct svga_host_surface_cache_key *key);
+
+void
+svga_screen_surface_destroy(struct svga_screen *svgascreen,
+                            const struct svga_host_surface_cache_key *key,
+                            struct svga_winsys_surface **handle);
+
+
+#endif /* SVGA_SCREEN_CACHE_H_ */
diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c
new file mode 100644
index 0000000000..1c21d3acfe
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state.c
@@ -0,0 +1,278 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_debug.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_state.h"
+#include "svga_draw.h"
+#include "svga_cmd.h"
+#include "svga_hw_reg.h"
+
+/* This is just enough to decide whether we need to use the draw
+ * module (swtnl) or not.
+ */
+static const struct svga_tracked_state *need_swtnl_state[] =
+{
+   &svga_update_need_swvfetch,
+   &svga_update_need_pipeline,
+   &svga_update_need_swtnl,
+   NULL
+};
+
+
+/* Atoms to update hardware state prior to emitting a clear or draw
+ * packet.
+ */
+static const struct svga_tracked_state *hw_clear_state[] =
+{
+   &svga_hw_scissor,
+   &svga_hw_viewport,
+   &svga_hw_framebuffer,
+   NULL
+};
+
+
+/* Atoms to update hardware state prior to emitting a draw packet.
+ */
+static const struct svga_tracked_state *hw_draw_state[] =
+{
+   &svga_hw_update_zero_stride,
+   &svga_hw_fs,
+   &svga_hw_vs,
+   &svga_hw_rss,
+   &svga_hw_tss,
+   &svga_hw_tss_binding,
+   &svga_hw_clip_planes,
+   &svga_hw_vdecl,
+   &svga_hw_fs_parameters,
+   &svga_hw_vs_parameters,
+   NULL
+};
+
+
+static const struct svga_tracked_state *swtnl_draw_state[] =
+{
+   &svga_update_swtnl_draw,
+   &svga_update_swtnl_vdecl,
+   NULL
+};
+
+/* Flattens the graph of state dependencies.  Could swap the positions
+ * of hw_clear_state and need_swtnl_state without breaking anything.
+ * NOTE(review): order appears to mirror SVGA_STATE_* -- confirm. */
+static const struct svga_tracked_state **state_levels[] = 
+{
+   need_swtnl_state,   /* cf. SVGA_STATE_NEED_SWTNL (0) */
+   hw_clear_state,     /* cf. SVGA_STATE_HW_CLEAR (1) */
+   hw_draw_state,      /* cf. SVGA_STATE_HW_DRAW (2) */
+   swtnl_draw_state    /* cf. SVGA_STATE_SWTNL_DRAW (3) */
+};
+
+
+
+/* Return the bits that are set in both masks (non-zero on overlap). */
+static unsigned check_state( unsigned a, unsigned b )
+{
+   unsigned common = a & b;
+   return common;
+}
+
+/* Merge the bits of b into the mask pointed to by a. */
+static void accumulate_state( unsigned *a, unsigned b )
+{
+   *a = *a | b;
+}
+
+/* Store in *result the bits in which a and b differ. */
+static void xor_states( unsigned *result, unsigned a, unsigned b )
+{
+   unsigned differing = a ^ b;
+   *result = differing;
+}
+
+/* Flush hwtnl, then call update() on each atom of the NULL-terminated list
+ * whose dirty mask intersects *state.  Returns 0 or the first failure. */
+static int update_state( struct svga_context *svga,
+                         const struct svga_tracked_state *atoms[],
+                         unsigned *state )
+{
+   boolean debug = TRUE;   /* hard-wired: the checking branch below always runs */
+   enum pipe_error ret = 0;
+   unsigned i;
+
+   ret = svga_hwtnl_flush( svga->hwtnl );
+   if (ret != 0)
+      return ret;
+
+   if (debug) {
+      /* Debug version which enforces various sanity checks on the
+       * state flags which are generated and checked to help ensure
+       * state atoms are ordered correctly in the list.
+       */
+      unsigned examined, prev;      
+
+      examined = 0;
+      prev = *state;
+
+      for (i = 0; atoms[i] != NULL; i++) {	 
+	 unsigned generated;
+
+	 assert(atoms[i]->dirty); 
+	 assert(atoms[i]->update);
+
+	 if (check_state(*state, atoms[i]->dirty)) {
+	    if (0)
+               debug_printf("update: %s\n", atoms[i]->name);
+	    ret = atoms[i]->update( svga, *state );
+            if (ret != 0)
+               return ret;
+	 }
+
+	 /* generated = (prev ^ state)
+	  * if (examined & generated)
+	  *     fail;
+	  */
+	 xor_states(&generated, prev, *state);
+	 if (check_state(examined, generated)) {
+	    debug_printf("state atom %s generated state already examined\n", 
+                         atoms[i]->name);
+	    assert(0);
+	 }
+			 
+	 prev = *state;
+	 accumulate_state(&examined, atoms[i]->dirty);
+      }
+   }
+   else {
+      for (i = 0; atoms[i] != NULL; i++) {	 
+	 if (check_state(*state, atoms[i]->dirty)) {
+	    ret = atoms[i]->update( svga, *state );
+            if (ret != 0)
+               return ret;
+         }
+      }
+   }
+
+   return 0;
+}
+
+
+/* Bring all state levels up to and including max_level up to date. */
+enum pipe_error svga_update_state( struct svga_context *svga,
+                                   unsigned max_level )
+{
+   struct svga_screen *screen = svga_screen(svga->pipe.screen);
+   enum pipe_error ret = PIPE_OK;
+   unsigned i;
+
+   /* state_levels[] has exactly SVGA_STATE_MAX entries. */
+   assert( max_level < SVGA_STATE_MAX );
+
+   /* Check for updates to bound textures.  This can't be done in an
+    * atom as there is no flag which could provoke this test, and we
+    * cannot create one.
+    */
+   if (svga->state.texture_timestamp != screen->texture_timestamp) {
+      svga->state.texture_timestamp = screen->texture_timestamp;
+      svga->dirty |= SVGA_NEW_TEXTURE;
+   }
+
+   for (i = 0; i <= max_level; i++) {
+      svga->dirty |= svga->state.dirty[i];
+      if (svga->dirty) {
+         ret = update_state( svga, state_levels[i], &svga->dirty );
+         if (ret != PIPE_OK)
+            return ret;
+         svga->state.dirty[i] = 0;
+      }
+   }
+
+   /* Levels not reached keep the still-pending bits for later. */
+   for (; i < SVGA_STATE_MAX; i++)
+      svga->state.dirty[i] |= svga->dirty;
+
+   svga->dirty = 0;
+   return PIPE_OK;
+}
+
+
+
+
+void svga_update_state_retry( struct svga_context *svga,
+                              unsigned max_level )
+{
+   int status = svga_update_state( svga, max_level );
+
+   /* On an out-of-memory error, flush the context and retry once. */
+   if (status == PIPE_ERROR_OUT_OF_MEMORY) {
+      svga_context_flush(svga, NULL);
+      status = svga_update_state( svga, max_level );
+   }
+
+   /* Returns void, so a remaining error can only be asserted on. */
+   assert( status == 0 );
+}
+
+
+/* Append one render state to _rs[_count]; arguments are parenthesized on use. */
+#define EMIT_RS(_rs, _count, _name, _value)     \
+do {                                            \
+   (_rs)[(_count)].state = (_name);             \
+   (_rs)[(_count)].uintValue = (_value);        \
+   (_count)++;                                  \
+} while (0)
+
+
+/* Emit the render states that stay fixed for the whole lifetime of a
+ * context.
+ */
+enum pipe_error svga_emit_initial_state( struct svga_context *svga )
+{
+   const unsigned COUNT = 2;
+   SVGA3dRenderState *rs;
+   enum pipe_error ret;
+   unsigned count = 0;
+
+   /* The call hands back, through rs, the COUNT entries to fill in. */
+   ret = SVGA3D_BeginSetRenderState( svga->swc, &rs, COUNT );
+   if (ret != 0)
+      return ret;
+
+   /* The D3D style coordinate space is the only one implemented on
+    * all backends, so it is used unconditionally.
+    */
+   EMIT_RS(rs, count, SVGA3D_RS_COORDINATETYPE, SVGA3D_COORDINATE_LEFTHANDED );
+   EMIT_RS(rs, count, SVGA3D_RS_FRONTWINDING, SVGA3D_FRONTWINDING_CW );
+
+   assert( COUNT == count );
+   SVGA_FIFOCommitAll( svga->swc );
+
+   return 0;
+}
diff --git a/src/gallium/drivers/svga/svga_state.h b/src/gallium/drivers/svga/svga_state.h
new file mode 100644
index 0000000000..22d5a6d552
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state.h
@@ -0,0 +1,95 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_STATE_H
+#define SVGA_STATE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+
+struct svga_context;
+
+
+void svga_init_state( struct svga_context *svga );
+void svga_destroy_state( struct svga_context *svga );
+
+/* A state atom: update() runs when the context's dirty bits hit 'dirty'. */
+struct svga_tracked_state {
+   const char *name;      /* printed in debug messages */
+   unsigned dirty;        /* mask of dirty flags that trigger update() */
+   int (*update)( struct svga_context *svga, unsigned dirty );
+};
+
+/* NEED_SWTNL
+ */
+extern struct svga_tracked_state svga_update_need_swvfetch;
+extern struct svga_tracked_state svga_update_need_pipeline;
+extern struct svga_tracked_state svga_update_need_swtnl;
+
+/* HW_CLEAR
+ */
+extern struct svga_tracked_state svga_hw_viewport;
+extern struct svga_tracked_state svga_hw_scissor;
+extern struct svga_tracked_state svga_hw_framebuffer;
+
+/* HW_DRAW
+ */
+extern struct svga_tracked_state svga_hw_vs;
+extern struct svga_tracked_state svga_hw_fs;
+extern struct svga_tracked_state svga_hw_rss;
+extern struct svga_tracked_state svga_hw_tss;
+extern struct svga_tracked_state svga_hw_tss_binding;
+extern struct svga_tracked_state svga_hw_clip_planes;
+extern struct svga_tracked_state svga_hw_vdecl;
+extern struct svga_tracked_state svga_hw_fs_parameters;
+extern struct svga_tracked_state svga_hw_vs_parameters;
+extern struct svga_tracked_state svga_hw_update_zero_stride;
+
+/* SWTNL_DRAW
+ */
+extern struct svga_tracked_state svga_update_swtnl_draw;
+extern struct svga_tracked_state svga_update_swtnl_vdecl;
+
+/* Bring the hardware fully up-to-date so that we can emit draw
+ * commands.
+ */
+#define SVGA_STATE_NEED_SWTNL        0
+#define SVGA_STATE_HW_CLEAR          1
+#define SVGA_STATE_HW_DRAW           2
+#define SVGA_STATE_SWTNL_DRAW        3
+#define SVGA_STATE_MAX               4
+
+
+enum pipe_error svga_update_state( struct svga_context *svga,
+                                   unsigned level );
+
+void svga_update_state_retry( struct svga_context *svga,
+                              unsigned level );
+
+
+enum pipe_error svga_emit_initial_state( struct svga_context *svga );
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
new file mode 100644
index 0000000000..97c818cd37
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -0,0 +1,241 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+
+/***********************************************************************
+ * Hardware update 
+ */
+
+/* Map a PIPE_SHADER_* value onto SVGA3D_SHADERTYPE_*, which is one higher.
+ */
+static int svga_shader_type( int unit )
+{
+   return 1 + unit;
+}
+
+
+static int emit_const( struct svga_context *svga,
+                       int unit,
+                       int i,
+                       const float *value )
+{
+   const unsigned vec4_bytes = 4 * sizeof(float);
+   void *shadow = svga->state.hw_draw.cb[unit][i];
+   int status;
+
+   /* Nothing to do when the shadow copy already holds this value. */
+   if (memcmp(shadow, value, vec4_bytes) == 0)
+      return PIPE_OK;
+
+   if (SVGA_DEBUG & DEBUG_CONSTS)
+      debug_printf("%s %s %d: %f %f %f %f\n",
+                   __FUNCTION__,
+                   unit == PIPE_SHADER_VERTEX ? "VERT" : "FRAG",
+                   i,
+                   value[0], value[1], value[2], value[3]);
+
+   status = SVGA3D_SetShaderConst( svga->swc, i,
+                                   svga_shader_type(unit),
+                                   SVGA3D_CONST_TYPE_FLOAT,
+                                   value );
+   if (status)
+      return status;
+
+   /* Only remember the value once the command was emitted. */
+   memcpy(shadow, value, vec4_bytes);
+   return status;
+}
+/* Emit each float[4] of the constant buffer bound to 'unit', from index 'offset'. */
+static int emit_consts( struct svga_context *svga,
+                        int offset,
+                        int unit )
+{
+   struct pipe_transfer *transfer = NULL;
+   unsigned count;
+   const float (*data)[4] = NULL;
+   unsigned i;
+   int ret = PIPE_OK;
+
+   if (svga->curr.cb[unit] == NULL)
+      goto done;   /* no buffer bound: returns PIPE_OK */
+
+   count = svga->curr.cb[unit]->width0 / (4 * sizeof(float));   /* float[4] elements */
+
+   data = (const float (*)[4])pipe_buffer_map(&svga->pipe,
+                                              svga->curr.cb[unit],
+                                              PIPE_TRANSFER_READ,
+					      &transfer);
+   if (data == NULL) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto done;
+   }
+
+   for (i = 0; i < count; i++) {
+      ret = emit_const( svga, unit, offset + i, data[i] );
+      if (ret)
+         goto done;
+   }
+
+done:   /* unmap only when the map above succeeded */
+   if (data)
+      pipe_buffer_unmap(&svga->pipe, svga->curr.cb[unit], transfer);
+
+   return ret;
+}
+/* Upload the fs constant buffer plus 1/size constants for unnormalized coords. */
+static int emit_fs_consts( struct svga_context *svga,
+                           unsigned dirty )
+{
+   const struct svga_shader_result *result = svga->state.hw_draw.fs;
+   const struct svga_fs_compile_key *key;
+   unsigned offset;
+   int ret, i;
+
+   ret = emit_consts( svga, 0, PIPE_SHADER_FRAGMENT );
+   if (ret)
+      return ret;
+
+   /* The internally generated fragment shader for xor blending
+    * doesn't have a 'result' struct.  It should be fixed to avoid this
+    * special case; until then bail out before the key is derived.
+    */
+   if (result == NULL)
+      return 0;
+
+   key = &result->key.fkey;
+   if (!key->num_unnormalized_coords)
+      return 0;
+
+   offset = result->shader->info.file_max[TGSI_FILE_CONSTANT] + 1;
+   for (i = 0; i < key->num_textures; i++) {
+      if (key->tex[i].unnormalized) {
+         struct pipe_resource *tex = svga->curr.sampler_views[i]->texture;
+         float data[4];
+
+         data[0] = 1.0 / (float)tex->width0;
+         data[1] = 1.0 / (float)tex->height0;
+         data[2] = 1.0;
+         data[3] = 1.0;
+
+         ret = emit_const( svga,
+                           PIPE_SHADER_FRAGMENT,
+                           key->tex[i].width_height_idx + offset,
+                           data );
+         if (ret)
+            return ret;
+      }
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_fs_parameters = 
+{
+   "hw fs params",
+   (SVGA_NEW_FS_CONST_BUFFER |
+    SVGA_NEW_FS_RESULT |
+    SVGA_NEW_TEXTURE_BINDING),
+   emit_fs_consts
+};
+
+/***********************************************************************
+ */
+
+static int emit_vs_consts( struct svga_context *svga,
+                           unsigned dirty )
+{
+   const struct svga_shader_result *result = svga->state.hw_draw.vs;
+   const struct svga_vs_compile_key *key;
+   unsigned offset;
+   int ret;
+
+   /* SVGA_NEW_VS_RESULT: nothing to do without a shader result. */
+   if (result == NULL)
+      return 0;
+
+   /* Only take the key's address once 'result' is known to be valid. */
+   key = &result->key.vkey;
+
+   /* SVGA_NEW_VS_CONST_BUFFER */
+   ret = emit_consts( svga, 0, PIPE_SHADER_VERTEX );
+   if (ret)
+      return ret;
+
+   offset = result->shader->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+   /* SVGA_NEW_VS_RESULT, SVGA_NEW_PRESCALE */
+   if (key->need_prescale) {
+      ret = emit_const( svga, PIPE_SHADER_VERTEX, offset++,
+                        svga->state.hw_clear.prescale.scale );
+      if (ret)
+         return ret;
+
+      ret = emit_const( svga, PIPE_SHADER_VERTEX, offset++,
+                        svga->state.hw_clear.prescale.translate );
+      if (ret)
+         return ret;
+   }
+
+   /* SVGA_NEW_ZERO_STRIDE */
+   if (key->zero_stride_vertex_elements) {
+      unsigned i, curr_zero_stride = 0;
+      for (i = 0; i < PIPE_MAX_ATTRIBS; ++i) {
+         /* 1u: shifting a signed 1 into bit 31 would be undefined. */
+         if (key->zero_stride_vertex_elements & (1u << i)) {
+            ret = emit_const( svga, PIPE_SHADER_VERTEX, offset++,
+                              svga->curr.zero_stride_constants +
+                              4 * curr_zero_stride );
+            if (ret)
+               return ret;
+            ++curr_zero_stride;
+         }
+      }
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_vs_parameters = 
+{
+   "hw vs params",
+   (SVGA_NEW_PRESCALE |
+    SVGA_NEW_VS_CONST_BUFFER |
+    SVGA_NEW_ZERO_STRIDE |
+    SVGA_NEW_VS_RESULT),
+   emit_vs_consts
+};
+
diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c
new file mode 100644
index 0000000000..bd92f00343
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -0,0 +1,435 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+
+/***********************************************************************
+ * Hardware state update
+ */
+
+/* Rebind every color / depth-stencil target that differs from the hw shadow copy. */
+static int emit_framebuffer( struct svga_context *svga,
+                             unsigned dirty )
+{
+   const struct pipe_framebuffer_state *curr = &svga->curr.framebuffer;
+   struct pipe_framebuffer_state *hw = &svga->state.hw_clear.framebuffer;
+   unsigned i;
+   enum pipe_error ret;
+
+   /* XXX: Need shadow state in svga->hw to eliminate redundant
+    * uploads, especially of NULL buffers.
+    */
+   
+   for(i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
+      if (curr->cbufs[i] != hw->cbufs[i]) {
+         /* NOTE(review): nr_fbs counts rebinds; the limit of 8 looks like a throttle -- confirm */
+         if (svga->curr.nr_fbs++ > 8)
+            return PIPE_ERROR_OUT_OF_MEMORY;
+         ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_COLOR0 + i, curr->cbufs[i]);
+         if (ret != PIPE_OK)
+            return ret;
+         
+         pipe_surface_reference(&hw->cbufs[i], curr->cbufs[i]);
+      }
+   }
+
+   /* Depth/stencil: the stencil target is bound only for the S8_Z24 format. */
+   if (curr->zsbuf != hw->zsbuf) {
+      ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_DEPTH, curr->zsbuf);
+      if (ret != PIPE_OK)
+         return ret;
+
+      if (curr->zsbuf &&
+          curr->zsbuf->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM) {
+         ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_STENCIL, curr->zsbuf);
+         if (ret != PIPE_OK)
+            return ret;
+      }
+      else {
+         ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_STENCIL, NULL);
+         if (ret != PIPE_OK)
+            return ret;
+      }
+      
+      pipe_surface_reference(&hw->zsbuf, curr->zsbuf);
+   }
+
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_framebuffer = 
+{
+   "hw framebuffer state",
+   SVGA_NEW_FRAME_BUFFER,
+   emit_framebuffer
+};
+
+
+
+
+/*********************************************************************** 
+ */
+/* Derive viewport rect, z range and vertex prescale; emit only what changed. */
+static int emit_viewport( struct svga_context *svga,
+                          unsigned dirty )
+{
+   const struct pipe_viewport_state *viewport = &svga->curr.viewport;
+   struct svga_prescale prescale;
+   SVGA3dRect rect;
+   /* Not sure if this state is relevant with POSITIONT.  Probably
+    * not, but setting to 0,1 avoids some state pingponging.
+    */
+   float range_min = 0.0;
+   float range_max = 1.0;
+   float flip = -1.0;
+   boolean degenerate = FALSE;
+   enum pipe_error ret;
+
+   float fb_width = svga->curr.framebuffer.width;
+   float fb_height = svga->curr.framebuffer.height;
+
+   float fx =        viewport->scale[0] * -1.0 + viewport->translate[0];
+   float fy = flip * viewport->scale[1] * -1.0 + viewport->translate[1];
+   float fw =        viewport->scale[0] * 2; 
+   float fh = flip * viewport->scale[1] * 2; 
+
+   memset( &prescale, 0, sizeof(prescale) );
+
+   /* Examine gallium viewport transformation and produce a screen
+    * rectangle and possibly vertex shader pre-transformation to
+    * get the same results.
+    */
+
+   SVGA_DBG(DEBUG_VIEWPORT,
+            "\ninitial %f,%f %fx%f\n",
+            fx,
+            fy,
+            fw,
+            fh);
+
+   prescale.scale[0] = 1.0;
+   prescale.scale[1] = 1.0;
+   prescale.scale[2] = 1.0;
+   prescale.scale[3] = 1.0;
+   prescale.translate[0] = 0;
+   prescale.translate[1] = 0;
+   prescale.translate[2] = 0;
+   prescale.translate[3] = 0;
+   prescale.enabled = TRUE;
+
+   /* Make width/height positive, compensating through the prescale. */
+
+   if (fw < 0) {
+      prescale.scale[0] *= -1.0;
+      prescale.translate[0] += -fw;
+      fw = -fw;
+      fx =        viewport->scale[0] * 1.0 + viewport->translate[0];
+   }
+
+   if (fh < 0) {
+      prescale.scale[1] *= -1.0;
+      prescale.translate[1] += -fh;
+      fh = -fh;
+      fy = flip * viewport->scale[1] * 1.0 + viewport->translate[1];
+   }
+
+   if (fx < 0) {
+      prescale.translate[0] += fx;
+      prescale.scale[0] *= fw / (fw + fx); 
+      fw += fx;
+      fx = 0;
+   }
+
+   if (fy < 0) {
+      prescale.translate[1] += fy;
+      prescale.scale[1] *= fh / (fh + fy); 
+      fh += fy;
+      fy = 0;
+   }
+
+   if (fx + fw > fb_width) {
+      prescale.scale[0] *= fw / (fb_width - fx); 
+      prescale.translate[0] -= fx * (fw / (fb_width - fx));
+      prescale.translate[0] += fx;
+      fw = fb_width - fx;
+      
+   }
+
+   if (fy + fh > fb_height) {
+      prescale.scale[1] *= fh / (fb_height - fy);
+      prescale.translate[1] -= fy * (fh / (fb_height - fy));
+      prescale.translate[1] += fy;
+      fh = fb_height - fy;
+   }
+
+   if (fw < 0 || fh < 0) {
+      fw = fh = fx = fy = 0;
+      degenerate = TRUE;
+      goto out;
+   }
+
+   /* NOTE(review): zero fw/fh is not degenerate here, yet H[] below is a divisor -- confirm. */
+   /* D3D viewport is integer space.  Convert fx,fy,etc. to
+    * integers.
+    *
+    * TODO: adjust pretranslate correct for any subpixel error
+    * introduced converting to integers.
+    */
+   rect.x = fx;
+   rect.y = fy;
+   rect.w = fw;
+   rect.h = fh;
+
+   SVGA_DBG(DEBUG_VIEWPORT,
+            "viewport error %f,%f %fx%f\n",
+            fabs((float)rect.x - fx),
+            fabs((float)rect.y - fy),
+            fabs((float)rect.w - fw),
+            fabs((float)rect.h - fh));
+
+   SVGA_DBG(DEBUG_VIEWPORT,
+            "viewport %d,%d %dx%d\n",
+            rect.x,
+            rect.y,
+            rect.w,
+            rect.h);
+
+
+   /* Finally, to get GL rasterization rules, need to tweak the
+    * screen-space coordinates slightly relative to D3D which is
+    * what hardware implements natively.
+    */
+   if (svga->curr.rast->templ.gl_rasterization_rules) {
+      float adjust_x = 0.0;
+      float adjust_y = 0.0;
+
+      switch (svga->curr.reduced_prim) {
+      case PIPE_PRIM_LINES:
+         adjust_x = -0.5;
+         adjust_y = 0;
+         break;
+      case PIPE_PRIM_POINTS:
+      case PIPE_PRIM_TRIANGLES:
+         adjust_x = -0.375;
+         adjust_y = -0.5;
+         break;
+      }
+
+      prescale.translate[0] += adjust_x;
+      prescale.translate[1] += adjust_y;
+      prescale.translate[2] = 0.5; /* D3D clip space */
+      prescale.scale[2]     = 0.5; /* D3D clip space */
+   }
+
+
+   range_min = viewport->scale[2] * -1.0 + viewport->translate[2];
+   range_max = viewport->scale[2] *  1.0 + viewport->translate[2];
+
+   /* D3D (and by implication SVGA) doesn't like dealing with zmax
+    * less than zmin.  Detect that case, flip the depth range and
+    * invert our z-scale factor to achieve the same effect.
+    */
+   if (range_min > range_max) {
+      float range_tmp;
+      range_tmp = range_min; 
+      range_min = range_max; 
+      range_max = range_tmp;
+      prescale.scale[2]     = -prescale.scale[2];
+   }
+
+   if (prescale.enabled) {
+      float H[2];
+      float J[2];
+      int i;
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "prescale %f,%f %fx%f\n",
+               prescale.translate[0],
+               prescale.translate[1],
+               prescale.scale[0],
+               prescale.scale[1]);
+
+      H[0] = (float)rect.w / 2.0;
+      H[1] = -(float)rect.h / 2.0;
+      J[0] = (float)rect.x + (float)rect.w / 2.0;
+      J[1] = (float)rect.y + (float)rect.h / 2.0;
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "H %f,%f\n"
+               "J %fx%f\n",
+               H[0],
+               H[1],
+               J[0],
+               J[1]);
+
+      /* Adjust prescale to take into account the fact that it is
+       * going to be applied prior to the perspective divide and
+       * viewport transformation.
+       * 
+       * Vwin = H(Vc/Vc.w) + J
+       *
+       * We want to tweak Vwin with scale and translation from above,
+       * as in:
+       *
+       * Vwin' = S Vwin + T
+       *
+       * But we can only modify the values at Vc.  Plugging all the
+       * above together, and rearranging, eventually we get:
+       *
+       *   Vwin' = H(Vc'/Vc'.w) + J
+       * where:
+       *   Vc' = SVc + KVc.w
+       *   K = (T + (S-1)J) / H
+       *
+       * Overwrite prescale.translate with values for K:
+       */
+      for (i = 0; i < 2; i++) {
+         prescale.translate[i] = ((prescale.translate[i] +
+                                   (prescale.scale[i] - 1.0) * J[i]) / H[i]);
+      }
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "clipspace %f,%f %fx%f\n",
+               prescale.translate[0],
+               prescale.translate[1],
+               prescale.scale[0],
+               prescale.scale[1]);
+   }
+
+out:
+   if (degenerate) {
+      rect.x = 0;
+      rect.y = 0;
+      rect.w = 1;
+      rect.h = 1;
+      prescale.enabled = FALSE;
+   }
+
+   if (memcmp(&rect, &svga->state.hw_clear.viewport, sizeof(rect)) != 0) {
+      ret = SVGA3D_SetViewport(svga->swc, &rect);
+      if(ret != PIPE_OK)
+         return ret;
+
+      memcpy(&svga->state.hw_clear.viewport, &rect, sizeof(rect));
+      assert(sizeof(rect) == sizeof(svga->state.hw_clear.viewport));
+   }
+
+   if (svga->state.hw_clear.depthrange.zmin != range_min ||
+       svga->state.hw_clear.depthrange.zmax != range_max) 
+   {
+      ret = SVGA3D_SetZRange(svga->swc, range_min, range_max );
+      if(ret != PIPE_OK)
+         return ret;
+
+      svga->state.hw_clear.depthrange.zmin = range_min;
+      svga->state.hw_clear.depthrange.zmax = range_max;
+   }
+
+   if (memcmp(&prescale, &svga->state.hw_clear.prescale, sizeof prescale) != 0) {
+      svga->dirty |= SVGA_NEW_PRESCALE;
+      svga->state.hw_clear.prescale = prescale;
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_viewport = 
+{
+   "hw viewport state",
+   ( SVGA_NEW_FRAME_BUFFER |
+     SVGA_NEW_VIEWPORT |
+     SVGA_NEW_RAST |
+     SVGA_NEW_REDUCED_PRIMITIVE ),
+   emit_viewport
+};
+
+
+/***********************************************************************
+ * Scissor state
+ */
+static int emit_scissor_rect( struct svga_context *svga,
+                              unsigned dirty )
+{
+   const struct pipe_scissor_state *sc = &svga->curr.scissor;
+   SVGA3dRect box;
+
+   /* Origin + extent from min/max; whether extent needs "+ 1" is unresolved. */
+   box.w = sc->maxx - sc->minx;
+   box.h = sc->maxy - sc->miny;
+   box.x = sc->minx;
+   box.y = sc->miny;
+   return SVGA3D_SetScissorRect(svga->swc, &box);
+}
+
+
+struct svga_tracked_state svga_hw_scissor = 
+{
+   "hw scissor state",
+   SVGA_NEW_SCISSOR,
+   emit_scissor_rect
+};
+
+
+/***********************************************************************
+ * Userclip state
+ */
+
+static int emit_clip_planes( struct svga_context *svga,
+                             unsigned dirty )
+{
+   unsigned plane = 0;
+
+   /* TODO: just emit directly from svga_set_clip_state()?
+    */
+   while (plane < svga->curr.clip.nr) {
+      enum pipe_error err = SVGA3D_SetClipPlane( svga->swc,
+                                                 plane,
+                                                 svga->curr.clip.ucp[plane] );
+      if (err != PIPE_OK)
+         return err;
+
+      plane++;
+   }
+   return 0;
+}
+
+/* Atom: run emit_clip_planes when SVGA_NEW_CLIP is dirty; name is for debug output. */
+struct svga_tracked_state svga_hw_clip_planes = 
+{
+   "hw clip planes",
+   SVGA_NEW_CLIP,
+   emit_clip_planes
+};
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
new file mode 100644
index 0000000000..ad6f294713
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -0,0 +1,246 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_bitmask.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+
+#include "svga_hw_reg.h"
+
+
+
+static INLINE int compare_fs_keys( const struct svga_fs_compile_key *a,
+                                   const struct svga_fs_compile_key *b )
+{
+   /* Keys of different size never match; order those by size difference. */
+   const unsigned size_a = svga_fs_key_size( a );
+   const unsigned size_b = svga_fs_key_size( b );
+
+   if (size_a == size_b)
+      return memcmp( a, b, size_a );
+   return (int)(size_a - size_b);
+}
+
+
+static struct svga_shader_result *search_fs_key( struct svga_fragment_shader *fs,
+                                                 const struct svga_fs_compile_key *key )
+{
+   struct svga_shader_result *cur = fs->base.results;
+
+   assert(key);
+
+   /* Walk the shader's list of results until one carries an equal key. */
+   while (cur && compare_fs_keys( key, &cur->key.fkey ) != 0)
+      cur = cur->next;
+
+   /* NULL here means the list was exhausted without a match. */
+   return cur;
+}
+
+
+static enum pipe_error compile_fs( struct svga_context *svga,
+                                   struct svga_fragment_shader *fs,
+                                   const struct svga_fs_compile_key *key,
+                                   struct svga_shader_result **out_result )
+{
+   struct svga_shader_result *result;
+   enum pipe_error ret = PIPE_ERROR;
+   /* Translate fs for 'key', reserve an id in svga->fs_bm and define it. */
+   result = svga_translate_fragment_program( fs, key );
+   if (result == NULL) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto fail;
+   }
+
+   result->id = util_bitmask_add(svga->fs_bm);   /* reserve a shader id */
+   if(result->id == UTIL_BITMASK_INVALID_INDEX) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto fail;
+   }
+
+   ret = SVGA3D_DefineShader(svga->swc, 
+                             result->id,
+                             SVGA3D_SHADERTYPE_PS,
+                             result->tokens, 
+                             result->nr_tokens * sizeof result->tokens[0]);
+   if (ret)
+      goto fail;
+
+   *out_result = result;
+   result->next = fs->base.results;   /* push onto the head of fs's result list */
+   fs->base.results = result;
+   return PIPE_OK;
+
+fail:   /* *out_result is left untouched on this path */
+   if (result) {
+      if (result->id != UTIL_BITMASK_INVALID_INDEX)
+         util_bitmask_clear( svga->fs_bm, result->id );   /* give the id back */
+      svga_destroy_shader_result( result );
+   }
+   return ret;
+}
+
+
+/* Fill 'key' with the context state that selects a fragment shader
+ * variant.  Always returns 0.  Depends on:
+ * SVGA_NEW_TEXTURE_BINDING, SVGA_NEW_RAST, SVGA_NEW_NEED_SWTNL,
+ * SVGA_NEW_SAMPLER, SVGA_NEW_BLEND
+ */
+static int make_fs_key( const struct svga_context *svga,
+                        struct svga_fs_compile_key *key )
+{
+   int i;
+   int idx = 0;
+
+   memset(key, 0, sizeof *key);
+
+   /* Only need fragment shader fixup for twoside lighting if doing
+    * hwtnl.  Otherwise the draw module does the whole job for us.
+    *
+    * SVGA_NEW_SWTNL
+    */
+   if (!svga->state.sw.need_swtnl) {
+      /* SVGA_NEW_RAST
+       */
+      key->light_twoside = svga->curr.rast->templ.light_twoside;
+      key->front_ccw = svga->curr.rast->templ.front_ccw;
+   }
+
+   /* The blend workaround for simulating logicop xor behaviour
+    * requires that the incoming fragment color be white.  This change
+    * achieves that by creating a variant of the current fragment
+    * shader that overrides all output colors with 1,1,1,1
+    *
+    * This will work for most shaders, including those containing
+    * TEXKIL and/or depth-write.  However, it will break on the
+    * combination of xor-logicop plus alphatest.
+    *
+    * Ultimately, we could implement alphatest in the shader using
+    * texkil prior to overriding the outgoing fragment color.
+    *
+    * SVGA_NEW_BLEND
+    */
+   if (svga->curr.blend->need_white_fragments) {
+      key->white_fragments = 1;
+   }
+
+   /* XXX: want to limit this to the textures that the shader actually
+    * refers to.
+    *
+    * SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
+    */
+   for (i = 0; i < svga->curr.num_sampler_views; i++) {
+      /* A view with no sampler bound is skipped rather than dereferenced. */
+      if (svga->curr.sampler_views[i] && svga->curr.sampler[i]) {
+         assert(svga->curr.sampler_views[i]->texture);
+         key->tex[i].texture_target = svga->curr.sampler_views[i]->texture->target;
+         if (!svga->curr.sampler[i]->normalized_coords) {
+            key->tex[i].width_height_idx = idx++;
+            key->tex[i].unnormalized = TRUE;
+            ++key->num_unnormalized_coords;
+         }
+      }
+   }
+   key->num_textures = svga->curr.num_sampler_views;
+
+   /* Per-sampler compare mode/func.  SVGA_NEW_SAMPLER */
+   for (i = 0; i < svga->curr.num_samplers; ++i) {
+      if (svga->curr.sampler[i]) {
+         key->tex[i].compare_mode = svga->curr.sampler[i]->compare_mode;
+         key->tex[i].compare_func = svga->curr.sampler[i]->compare_func;
+      }
+   }
+
+   return 0;
+}
+
+
+
+static int emit_hw_fs( struct svga_context *svga,
+                       unsigned dirty )
+{
+   struct svga_fragment_shader *fs = svga->curr.fs;
+   struct svga_shader_result *variant = NULL;
+   struct svga_fs_compile_key key;
+   int status;
+
+   /* Build the key describing the variant the current state needs.
+    *
+    * SVGA_NEW_BLEND
+    * SVGA_NEW_TEXTURE_BINDING
+    * SVGA_NEW_RAST
+    * SVGA_NEW_NEED_SWTNL
+    * SVGA_NEW_SAMPLER
+    */
+   status = make_fs_key( svga, &key );
+   if (status != 0)
+      return status;
+
+   /* Reuse an already compiled variant, or compile one on a miss. */
+   variant = search_fs_key( fs, &key );
+   if (variant == NULL) {
+      status = compile_fs( svga, fs, &key, &variant );
+      if (status != 0)
+         return status;
+   }
+
+   assert(variant);
+   assert(variant->id != SVGA3D_INVALID_ID);
+
+   /* Nothing to send when this variant is already the bound one. */
+   if (variant == svga->state.hw_draw.fs)
+      return 0;
+
+   status = SVGA3D_SetShader(svga->swc, SVGA3D_SHADERTYPE_PS, variant->id);
+   if (status != 0)
+      return status;
+
+   /* Record the new binding and raise SVGA_NEW_FS_RESULT. */
+   svga->state.hw_draw.fs = variant;
+   svga->dirty |= SVGA_NEW_FS_RESULT;
+
+   return 0;
+}
+
+struct svga_tracked_state svga_hw_fs = 
+{
+   "fragment shader (hwtnl)",     /* label for this tracked-state entry */
+   (SVGA_NEW_FS |                 /* dirty flags listed for this entry */
+    SVGA_NEW_TEXTURE_BINDING |
+    SVGA_NEW_NEED_SWTNL |
+    SVGA_NEW_RAST |
+    SVGA_NEW_SAMPLER |
+    SVGA_NEW_BLEND),
+   emit_hw_fs                     /* function registered for this entry */
+};
+
+
+
diff --git a/src/gallium/drivers/svga/svga_state_need_swtnl.c b/src/gallium/drivers/svga/svga_state_need_swtnl.c
new file mode 100644
index 0000000000..d34d68f535
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -0,0 +1,200 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_state.h"
+
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_debug.h"
+#include "svga_hw_reg.h"
+
+/* Map a pipe_format used for vertex data to its SVGA3D declaration type.
+ * Formats absent from the table below yield SVGA3D_DECLTYPE_MAX. */
+
+static INLINE SVGA3dDeclType 
+svga_translate_vertex_format(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_R32_FLOAT:            return SVGA3D_DECLTYPE_FLOAT1;
+   case PIPE_FORMAT_R32G32_FLOAT:         return SVGA3D_DECLTYPE_FLOAT2;
+   case PIPE_FORMAT_R32G32B32_FLOAT:      return SVGA3D_DECLTYPE_FLOAT3;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:   return SVGA3D_DECLTYPE_FLOAT4;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:       return SVGA3D_DECLTYPE_D3DCOLOR;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:     return SVGA3D_DECLTYPE_UBYTE4;
+   case PIPE_FORMAT_R16G16_SSCALED:       return SVGA3D_DECLTYPE_SHORT2;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED: return SVGA3D_DECLTYPE_SHORT4;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:       return SVGA3D_DECLTYPE_UBYTE4N;
+   case PIPE_FORMAT_R16G16_SNORM:         return SVGA3D_DECLTYPE_SHORT2N;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:   return SVGA3D_DECLTYPE_SHORT4N;
+   case PIPE_FORMAT_R16G16_UNORM:         return SVGA3D_DECLTYPE_USHORT2N;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:   return SVGA3D_DECLTYPE_USHORT4N;
+   case PIPE_FORMAT_R10G10B10X2_USCALED:  return SVGA3D_DECLTYPE_UDEC3;
+   case PIPE_FORMAT_R10G10B10X2_SNORM:    return SVGA3D_DECLTYPE_DEC3N;
+   case PIPE_FORMAT_R16G16_FLOAT:         return SVGA3D_DECLTYPE_FLOAT16_2;
+   case PIPE_FORMAT_R16G16B16A16_FLOAT:   return SVGA3D_DECLTYPE_FLOAT16_4;
+
+   default:
+      /* There are many formats without hardware support.  This case
+       * will be hit regularly, meaning we'll need swvfetch.
+       */
+      return SVGA3D_DECLTYPE_MAX;   /* sentinel checked by update_need_swvfetch() */
+   }
+}
+
+
+static int update_need_swvfetch( struct svga_context *svga,
+                                 unsigned dirty )
+{
+   boolean swvfetch = FALSE;
+   unsigned elem;
+
+   /* Nothing to translate while no vertex elements are bound. */
+   if (!svga->curr.velems)
+      return 0;
+
+   /* Translate formats in order, stopping at the first unsupported one. */
+   for (elem = 0; elem < svga->curr.velems->count && !swvfetch; elem++) {
+      svga->state.sw.ve_format[elem] =
+         svga_translate_vertex_format(svga->curr.velems->velem[elem].src_format);
+      if (svga->state.sw.ve_format[elem] == SVGA3D_DECLTYPE_MAX)
+         swvfetch = TRUE;
+   }
+
+   /* Raise SVGA_NEW_NEED_SWVFETCH only when the answer actually changes. */
+   if (svga->state.sw.need_swvfetch != swvfetch) {
+      svga->state.sw.need_swvfetch = swvfetch;
+      svga->dirty |= SVGA_NEW_NEED_SWVFETCH;
+   }
+
+   return 0;
+}
+
+struct svga_tracked_state svga_update_need_swvfetch = 
+{
+   "update need_swvfetch",   /* label for this tracked-state entry */
+   ( SVGA_NEW_VELEMENT ),    /* dirty flag listed for this entry */
+   update_need_swvfetch      /* recomputes svga->state.sw.need_swvfetch */
+};
+
+
+/*********************************************************************** 
+ */
+
+static int update_need_pipeline( struct svga_context *svga,
+                                 unsigned dirty )
+{
+   /* Recompute whether current state requires the software pipeline. */
+   boolean need_pipeline = FALSE;
+   struct svga_vertex_shader *vs = svga->curr.vs;
+
+   /* SVGA_NEW_RAST, SVGA_NEW_REDUCED_PRIMITIVE
+    */
+   if (svga->curr.rast->need_pipeline & (1 << svga->curr.reduced_prim)) {
+      SVGA_DBG(DEBUG_SWTNL, "%s: rast need_pipeline (%d) & prim (%x)\n",
+                 __FUNCTION__,
+                 svga->curr.rast->need_pipeline,
+                 (1 << svga->curr.reduced_prim) );
+      need_pipeline = TRUE;
+   }
+
+   /* SVGA_NEW_VS: edgeflags.  Tolerate no vertex shader being bound.
+    */
+   if (vs && vs->base.info.writes_edgeflag) {
+      SVGA_DBG(DEBUG_SWTNL, "%s: edgeflags\n", __FUNCTION__);
+      need_pipeline = TRUE;
+   }
+
+   /* SVGA_NEW_CLIP
+    */
+   if (svga->curr.clip.nr) {
+      SVGA_DBG(DEBUG_SWTNL, "%s: userclip\n", __FUNCTION__);
+      need_pipeline = TRUE;
+   }
+
+   if (need_pipeline != svga->state.sw.need_pipeline) {
+      svga->state.sw.need_pipeline = need_pipeline;
+      svga->dirty |= SVGA_NEW_NEED_PIPELINE;   /* only raised on a change */
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_update_need_pipeline = 
+{
+   "need pipeline",               /* label for this tracked-state entry */
+   (SVGA_NEW_RAST |               /* dirty flags listed for this entry */
+    SVGA_NEW_CLIP |
+    SVGA_NEW_VS |
+    SVGA_NEW_REDUCED_PRIMITIVE),
+   update_need_pipeline           /* recomputes svga->state.sw.need_pipeline */
+};
+
+
+/*********************************************************************** 
+ */
+
+static int update_need_swtnl( struct svga_context *svga,
+                              unsigned dirty )
+{
+   boolean swtnl;
+
+   /* Debug switch: discard both software-path requirements. */
+   if (svga->debug.no_swtnl) {
+      svga->state.sw.need_swvfetch = 0;
+      svga->state.sw.need_pipeline = 0;
+   }
+
+   /* Either requirement selects swtnl; force_swtnl overrides both. */
+   if (svga->debug.force_swtnl)
+      swtnl = 1;
+   else
+      swtnl = (svga->state.sw.need_swvfetch ||
+               svga->state.sw.need_pipeline);
+
+   if (swtnl == svga->state.sw.need_swtnl)
+      return 0;
+
+   SVGA_DBG(DEBUG_SWTNL|DEBUG_PERF,
+            "%s need_swvfetch: %s, need_pipeline %s\n",
+            __FUNCTION__,
+            svga->state.sw.need_swvfetch ? "true" : "false",
+            svga->state.sw.need_pipeline ? "true" : "false");
+   svga->state.sw.need_swtnl = swtnl;
+   svga->dirty |= SVGA_NEW_NEED_SWTNL;
+   svga->swtnl.new_vdecl = TRUE;
+   return 0;
+}
+
+
+struct svga_tracked_state svga_update_need_swtnl =
+{
+   "need swtnl",                  /* label for this tracked-state entry */
+   (SVGA_NEW_NEED_PIPELINE |      /* dirty flags listed for this entry */
+    SVGA_NEW_NEED_SWVFETCH),
+   update_need_swtnl              /* recomputes svga->state.sw.need_swtnl */
+};
diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c
new file mode 100644
index 0000000000..ab13f3fdf1
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -0,0 +1,286 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+
+
+struct rs_queue {
+   unsigned rs_count;                     /* number of entries used in rs[] */
+   SVGA3dRenderState rs[SVGA3D_RS_MAX];   /* entries appended by svga_queue_rs() */
+};
+
+
+#define EMIT_RS(svga, value, token, fail)                       \
+do { /* uses the caller's local 'queue'; 'fail' is not used */  \
+   if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) {    \
+      svga_queue_rs( &queue, SVGA3D_RS_##token, value );        \
+      svga->state.hw_draw.rs[SVGA3D_RS_##token] = value;        \
+   }                                                            \
+} while (0)
+
+#define EMIT_RS_FLOAT(svga, fvalue, token, fail)                \
+do { /* as EMIT_RS, but compares and queues fui(fvalue) */      \
+   unsigned value = fui(fvalue);                                \
+   if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) {    \
+      svga_queue_rs( &queue, SVGA3D_RS_##token, value );        \
+      svga->state.hw_draw.rs[SVGA3D_RS_##token] = value;        \
+   }                                                            \
+} while (0)
+
+
+static INLINE void
+svga_queue_rs( struct rs_queue *q, unsigned rss, unsigned value )
+{
+   /* Append one (state, value) pair; the fixed-size queue must not be full. */
+   assert(q->rs_count < sizeof(q->rs)/sizeof(q->rs[0]));
+   q->rs[q->rs_count].state = rss;
+   q->rs[q->rs_count].uintValue = value;
+   q->rs_count++;
+}
+
+
+/* Compare new render states against the cached svga->state.hw_draw.rs[]
+ * and emit only the differences (for groups flagged in 'dirty') in one
+ * SetRenderState command.  Returns 0, or PIPE_ERROR_OUT_OF_MEMORY with
+ * the cache poisoned when SVGA3D_BeginSetRenderState fails. */
+static int emit_rss( struct svga_context *svga,
+                     unsigned dirty )
+{
+   struct rs_queue queue;
+
+   queue.rs_count = 0;
+
+   if (dirty & SVGA_NEW_BLEND) {
+      const struct svga_blend_state *curr = svga->curr.blend;
+
+      EMIT_RS( svga, curr->rt[0].writemask, COLORWRITEENABLE, fail );
+      EMIT_RS( svga, curr->rt[0].blend_enable, BLENDENABLE, fail );
+
+      if (curr->rt[0].blend_enable) {
+         EMIT_RS( svga, curr->rt[0].srcblend, SRCBLEND, fail );
+         EMIT_RS( svga, curr->rt[0].dstblend, DSTBLEND, fail );
+         EMIT_RS( svga, curr->rt[0].blendeq, BLENDEQUATION, fail );
+
+         EMIT_RS( svga, curr->rt[0].separate_alpha_blend_enable, 
+                  SEPARATEALPHABLENDENABLE, fail );
+
+         if (curr->rt[0].separate_alpha_blend_enable) {
+            EMIT_RS( svga, curr->rt[0].srcblend_alpha, SRCBLENDALPHA, fail );
+            EMIT_RS( svga, curr->rt[0].dstblend_alpha, DSTBLENDALPHA, fail );
+            EMIT_RS( svga, curr->rt[0].blendeq_alpha, BLENDEQUATIONALPHA, fail );
+         }
+      }
+   }
+
+   if (dirty & SVGA_NEW_BLEND_COLOR) {
+      uint32 color;
+      uint32 r = float_to_ubyte(svga->curr.blend_color.color[0]);
+      uint32 g = float_to_ubyte(svga->curr.blend_color.color[1]);
+      uint32 b = float_to_ubyte(svga->curr.blend_color.color[2]);
+      uint32 a = float_to_ubyte(svga->curr.blend_color.color[3]);
+
+      color = (a << 24) | (r << 16) | (g << 8) | b;   /* a in bits 31-24, b in 7-0 */
+
+      EMIT_RS( svga, color, BLENDCOLOR, fail );
+   }
+
+   if (dirty & (SVGA_NEW_DEPTH_STENCIL | SVGA_NEW_RAST)) {
+      const struct svga_depth_stencil_state *curr = svga->curr.depth; 
+      const struct svga_rasterizer_state *rast = svga->curr.rast; 
+
+      if (!curr->stencil[0].enabled) 
+      {
+         /* Stencil disabled
+          */
+         EMIT_RS( svga, FALSE, STENCILENABLE, fail );
+         EMIT_RS( svga, FALSE, STENCILENABLE2SIDED, fail );
+      }
+      else if (curr->stencil[0].enabled && !curr->stencil[1].enabled)
+      {
+         /* Regular stencil
+          */
+         EMIT_RS( svga, TRUE, STENCILENABLE, fail );
+         EMIT_RS( svga, FALSE, STENCILENABLE2SIDED, fail );
+
+         EMIT_RS( svga, curr->stencil[0].func,  STENCILFUNC, fail );
+         EMIT_RS( svga, curr->stencil[0].fail,  STENCILFAIL, fail );
+         EMIT_RS( svga, curr->stencil[0].zfail, STENCILZFAIL, fail );
+         EMIT_RS( svga, curr->stencil[0].pass,  STENCILPASS, fail );
+
+         EMIT_RS( svga, curr->stencil_mask, STENCILMASK, fail );
+         EMIT_RS( svga, curr->stencil_writemask, STENCILWRITEMASK, fail );
+      }
+      else 
+      {
+         int cw, ccw;   /* indices into curr->stencil[] for each winding */
+
+         /* Hardware frontwinding is always CW, so if ours is also CW,
+          * then our definition of front face agrees with hardware.
+          * Otherwise need to flip.
+          */
+         if (rast->templ.front_ccw) {
+            ccw = 0;
+            cw = 1;
+         }
+         else {
+            ccw = 1;
+            cw = 0;
+         }
+
+         /* Twoside stencil
+          */
+         EMIT_RS( svga, TRUE, STENCILENABLE, fail );
+         EMIT_RS( svga, TRUE, STENCILENABLE2SIDED, fail );
+
+         EMIT_RS( svga, curr->stencil[cw].func,  STENCILFUNC, fail );
+         EMIT_RS( svga, curr->stencil[cw].fail,  STENCILFAIL, fail );
+         EMIT_RS( svga, curr->stencil[cw].zfail, STENCILZFAIL, fail );
+         EMIT_RS( svga, curr->stencil[cw].pass,  STENCILPASS, fail );
+
+         EMIT_RS( svga, curr->stencil[ccw].func,  CCWSTENCILFUNC, fail );
+         EMIT_RS( svga, curr->stencil[ccw].fail,  CCWSTENCILFAIL, fail );
+         EMIT_RS( svga, curr->stencil[ccw].zfail, CCWSTENCILZFAIL, fail );
+         EMIT_RS( svga, curr->stencil[ccw].pass,  CCWSTENCILPASS, fail );
+
+         EMIT_RS( svga, curr->stencil_mask, STENCILMASK, fail );
+         EMIT_RS( svga, curr->stencil_writemask, STENCILWRITEMASK, fail );
+      }
+
+      EMIT_RS( svga, curr->zenable, ZENABLE, fail );
+      if (curr->zenable) {
+         EMIT_RS( svga, curr->zfunc, ZFUNC, fail );
+         EMIT_RS( svga, curr->zwriteenable, ZWRITEENABLE, fail );
+      }
+
+      EMIT_RS( svga, curr->alphatestenable, ALPHATESTENABLE, fail );
+      if (curr->alphatestenable) {
+         EMIT_RS( svga, curr->alphafunc, ALPHAFUNC, fail );
+         EMIT_RS_FLOAT( svga, curr->alpharef, ALPHAREF, fail );
+      }
+   }
+
+   if (dirty & SVGA_NEW_STENCIL_REF) {
+      EMIT_RS( svga, svga->curr.stencil_ref.ref_value[0], STENCILREF, fail );
+   }
+
+   if (dirty & (SVGA_NEW_RAST | SVGA_NEW_NEED_PIPELINE))
+   {
+      const struct svga_rasterizer_state *curr = svga->curr.rast; 
+      unsigned cullmode = curr->cullmode;
+
+      /* Shademode: still need to rearrange index list to move
+       * flat-shading PV first vertex.
+       */
+      EMIT_RS( svga, curr->shademode, SHADEMODE, fail );
+
+      /* Don't do culling while the software pipeline is active.  It
+       * does it for us, and additionally introduces potentially
+       * back-facing triangles.
+       */
+      if (svga->state.sw.need_pipeline)
+         cullmode = SVGA3D_FACE_NONE;
+
+      EMIT_RS( svga, cullmode, CULLMODE, fail );
+      EMIT_RS( svga, curr->scissortestenable, SCISSORTESTENABLE, fail );
+      EMIT_RS( svga, curr->multisampleantialias, MULTISAMPLEANTIALIAS, fail );
+      EMIT_RS( svga, curr->lastpixel, LASTPIXEL, fail );
+      EMIT_RS( svga, curr->linepattern, LINEPATTERN, fail );
+      EMIT_RS_FLOAT( svga, curr->pointsize, POINTSIZE, fail );
+      /* XXX still need to set this? */
+      EMIT_RS_FLOAT( svga, 0.0, POINTSIZEMIN, fail );
+      EMIT_RS_FLOAT( svga, SVGA_MAX_POINTSIZE, POINTSIZEMAX, fail );
+   }
+
+   if (dirty & (SVGA_NEW_RAST | SVGA_NEW_FRAME_BUFFER | SVGA_NEW_NEED_PIPELINE))
+   {
+      const struct svga_rasterizer_state *curr = svga->curr.rast; 
+      float slope = 0.0;
+      float bias  = 0.0;
+
+      /* Need to modify depth bias according to bound depthbuffer
+       * format.  Don't do hardware depthbias while the software
+       * pipeline is active.
+       */
+      if (!svga->state.sw.need_pipeline &&
+          svga->curr.framebuffer.zsbuf)
+      {
+         slope = curr->slopescaledepthbias;
+         bias  = svga->curr.depthscale * curr->depthbias;
+      }
+
+      EMIT_RS_FLOAT( svga, slope, SLOPESCALEDEPTHBIAS, fail );
+      EMIT_RS_FLOAT( svga, bias, DEPTHBIAS, fail );
+   }
+
+
+   if (queue.rs_count) {   /* no command is issued when nothing was queued */
+      SVGA3dRenderState *rs;
+
+      if (SVGA3D_BeginSetRenderState( svga->swc,
+                                      &rs,
+                                      queue.rs_count ) != PIPE_OK)
+         goto fail;
+
+      memcpy( rs,
+              queue.rs,
+              queue.rs_count * sizeof queue.rs[0]);
+
+      SVGA_FIFOCommitAll( svga->swc );
+   }
+
+   return 0;
+
+fail:
+   /* XXX: need to poison cached hardware state on failure to ensure
+    * dirty state gets re-emitted.  Fix this by re-instating partial
+    * FIFOCommit command and only updating cached hw state once the
+    * initial allocation has succeeded.
+    */
+   memset(svga->state.hw_draw.rs, 0xcd, sizeof(svga->state.hw_draw.rs));
+
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+
+struct svga_tracked_state svga_hw_rss = 
+{
+   "hw rss state",                /* label for this tracked-state entry */
+
+   (SVGA_NEW_BLEND |              /* every flag emit_rss() tests in 'dirty' */
+    SVGA_NEW_BLEND_COLOR |
+    SVGA_NEW_DEPTH_STENCIL |
+    SVGA_NEW_STENCIL_REF |
+    SVGA_NEW_RAST |
+    SVGA_NEW_FRAME_BUFFER |
+    SVGA_NEW_NEED_PIPELINE),
+
+   emit_rss                       /* function registered for this entry */
+};
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
new file mode 100644
index 0000000000..76a2dae143
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -0,0 +1,279 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_sampler_view.h"
+#include "svga_winsys.h"
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+
+
+void svga_cleanup_tss_binding(struct svga_context *svga)
+{
+   /* Cover both the current views and the count recorded in hw_draw. */
+   const unsigned count = MAX2( svga->curr.num_sampler_views,
+                                svga->state.hw_draw.num_views );
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      struct svga_hw_view_state *view = &svga->state.hw_draw.views[i];
+      /* Drop every reference held for this unit and mark the view dirty. */
+      svga_sampler_view_reference(&view->v, NULL);
+      pipe_sampler_view_reference( &svga->curr.sampler_views[i], NULL );
+      pipe_resource_reference( &view->texture, NULL );
+
+      view->dirty = 1;
+   }
+}
+
+
+static int
+update_tss_binding(struct svga_context *svga,
+                   unsigned dirty )
+{
+   unsigned i;
+   unsigned count = MAX2( svga->curr.num_sampler_views,
+                          svga->state.hw_draw.num_views );
+   unsigned min_lod;
+   unsigned max_lod;
+   /* Rebind every dirty unit (texture or LOD range changed), emitting all
+    * SVGA3D_TS_BIND_TEXTURE updates in a single SetTextureState command. */
+   struct {
+      struct {
+         unsigned unit;
+         struct svga_hw_view_state *view;
+      } bind[PIPE_MAX_SAMPLERS];
+
+      unsigned bind_count;
+   } queue;
+
+   queue.bind_count = 0;
+
+   for (i = 0; i < count; i++) {
+      const struct svga_sampler_state *s = svga->curr.sampler[i];
+      struct svga_hw_view_state *view = &svga->state.hw_draw.views[i];
+      struct pipe_resource *texture = NULL;
+
+      /* get min max lod; a unit without a sampler is treated as unbound */
+      if (svga->curr.sampler_views[i] && s) {
+         min_lod = MAX2(s->view_min_lod, 0);
+         max_lod = MIN2(s->view_max_lod, svga->curr.sampler_views[i]->texture->last_level);
+         texture = svga->curr.sampler_views[i]->texture;
+      } else {
+         min_lod = 0;
+         max_lod = 0;
+      }
+
+      if (view->texture != texture ||
+          view->min_lod != min_lod ||
+          view->max_lod != max_lod) {
+
+         svga_sampler_view_reference(&view->v, NULL);
+         pipe_resource_reference( &view->texture, texture );
+
+         view->dirty = TRUE;
+         view->min_lod = min_lod;
+         view->max_lod = max_lod;
+
+         if (texture)
+            view->v = svga_get_tex_sampler_view(&svga->pipe,
+                                                texture,
+                                                min_lod,
+                                                max_lod);
+      }
+
+      if (view->dirty) {
+         queue.bind[queue.bind_count].unit = i;
+         queue.bind[queue.bind_count].view = view;
+         queue.bind_count++;
+      }
+      else if (view->v) {
+         svga_validate_sampler_view(svga, view->v);
+      }
+   }
+
+   svga->state.hw_draw.num_views = svga->curr.num_sampler_views;
+
+   if (queue.bind_count) {
+      SVGA3dTextureState *ts;
+
+      if (SVGA3D_BeginSetTextureState( svga->swc,
+                                       &ts,
+                                       queue.bind_count ) != PIPE_OK)
+         goto fail;
+
+      for (i = 0; i < queue.bind_count; i++) {
+         ts[i].stage = queue.bind[i].unit;
+         ts[i].name = SVGA3D_TS_BIND_TEXTURE;
+
+         if (queue.bind[i].view->v) {
+            svga->swc->surface_relocation(svga->swc,
+                                          &ts[i].value,
+                                          queue.bind[i].view->v->handle,
+                                          SVGA_RELOC_READ);
+         }
+         else {
+            ts[i].value = SVGA3D_INVALID_ID;
+         }
+
+         queue.bind[i].view->dirty = FALSE;
+      }
+
+      SVGA_FIFOCommitAll( svga->swc );
+   }
+
+   return 0;
+
+fail:   /* queued views keep dirty set, so a later call queues them again */
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+
+struct svga_tracked_state svga_hw_tss_binding = {
+   "texture binding emit",        /* label for this tracked-state entry */
+   SVGA_NEW_TEXTURE_BINDING |     /* dirty flags listed for this entry */
+   SVGA_NEW_SAMPLER,
+   update_tss_binding             /* function registered for this entry */
+};
+
+
+/***********************************************************************
+ */
+
+struct ts_queue {
+   unsigned ts_count;   /* number of entries used in ts[] */
+   SVGA3dTextureState ts[PIPE_MAX_SAMPLERS*SVGA3D_TS_MAX];   /* filled by svga_queue_tss() */
+};
+
+
+#define EMIT_TS(svga, unit, val, token, fail)                           \
+do { /* uses the caller's local 'queue'; 'fail' is not used */          \
+   if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) {        \
+      svga_queue_tss( &queue, unit, SVGA3D_TS_##token, val );           \
+      svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val;            \
+   }                                                                    \
+} while (0)
+
+#define EMIT_TS_FLOAT(svga, unit, fvalue, token, fail)                  \
+do { /* as EMIT_TS, but compares and queues fui(fvalue) */              \
+   unsigned val = fui(fvalue);                                          \
+   if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) {        \
+      svga_queue_tss( &queue, unit, SVGA3D_TS_##token, val );           \
+      svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val;            \
+   }                                                                    \
+} while (0)
+
+
+static INLINE void
+svga_queue_tss( struct ts_queue *q, unsigned unit, unsigned tss, unsigned value )
+{
+   /* Append one (stage, name, value) entry to the fixed-size queue. */
+   SVGA3dTextureState *slot;
+
+   assert(q->ts_count < sizeof(q->ts)/sizeof(q->ts[0]));
+   slot = &q->ts[q->ts_count++];
+   slot->stage = unit;
+   slot->name = tss;
+   slot->value = value;
+}
+
+
+static int
+update_tss(struct svga_context *svga,
+           unsigned dirty )
+{
+   unsigned i;
+   struct ts_queue queue;
+
+   /* Queue sampler states that differ from the cached hw_draw.ts[][] copy. */
+   queue.ts_count = 0;
+   for (i = 0; i < svga->curr.num_samplers; i++) {
+      if (svga->curr.sampler[i]) {
+         const struct svga_sampler_state *curr = svga->curr.sampler[i];
+
+         EMIT_TS(svga, i, curr->mipfilter, MIPFILTER, fail);
+         EMIT_TS(svga, i, curr->min_lod, TEXTURE_MIPMAP_LEVEL, fail);
+         EMIT_TS(svga, i, curr->magfilter, MAGFILTER, fail);
+         EMIT_TS(svga, i, curr->minfilter, MINFILTER, fail);
+         EMIT_TS(svga, i, curr->aniso_level, TEXTURE_ANISOTROPIC_LEVEL, fail);
+         EMIT_TS_FLOAT(svga, i, curr->lod_bias, TEXTURE_LOD_BIAS, fail);
+         EMIT_TS(svga, i, curr->addressu, ADDRESSU, fail);
+         EMIT_TS(svga, i, curr->addressw, ADDRESSW, fail);
+         EMIT_TS(svga, i, curr->bordercolor, BORDERCOLOR, fail);
+         /* TEXCOORDINDEX -- hopefully not needed */
+
+         if (svga->curr.tex_flags.flag_1d & (1 << i)) {
+            debug_printf("wrap 1d tex %u\n", i);
+            EMIT_TS(svga, i, SVGA3D_TEX_ADDRESS_WRAP, ADDRESSV, fail);
+         }
+         else
+            EMIT_TS(svga, i, curr->addressv, ADDRESSV, fail);
+
+         if (svga->curr.tex_flags.flag_srgb & (1 << i))
+            EMIT_TS_FLOAT(svga, i, 2.2f, GAMMA, fail);
+         else
+            EMIT_TS_FLOAT(svga, i, 1.0f, GAMMA, fail);
+      }
+   }
+
+   if (queue.ts_count) {
+      SVGA3dTextureState *ts;
+
+      if (SVGA3D_BeginSetTextureState( svga->swc,
+                                       &ts,
+                                       queue.ts_count ) != PIPE_OK)
+         goto fail;
+
+      memcpy( ts,
+              queue.ts,
+              queue.ts_count * sizeof queue.ts[0]);
+
+      SVGA_FIFOCommitAll( svga->swc );
+   }
+
+   return 0;
+
+fail:
+   /* XXX: need to poison cached hardware state on failure to ensure
+    * dirty state gets re-emitted.  Fix this by re-instating partial
+    * FIFOCommit command and only updating cached hw state once the
+    * initial allocation has succeeded.
+    */
+   memset(svga->state.hw_draw.ts, 0xcd, sizeof(svga->state.hw_draw.ts));
+
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+
+struct svga_tracked_state svga_hw_tss = {
+   "texture state emit",          /* label for this tracked-state entry */
+   (SVGA_NEW_SAMPLER |            /* dirty flags listed for this entry */
+    SVGA_NEW_TEXTURE_FLAGS),
+   update_tss                     /* function registered for this entry */
+};
+
diff --git a/src/gallium/drivers/svga/svga_state_vdecl.c b/src/gallium/drivers/svga/svga_state_vdecl.c
new file mode 100644
index 0000000000..3af7bf2b35
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_vdecl.c
@@ -0,0 +1,179 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_upload_mgr.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_draw.h"
+#include "svga_tgsi.h"
+#include "svga_screen.h"
+#include "svga_resource_buffer.h"
+
+#include "svga_hw_reg.h"
+
+
+/**
+ * Replace every user vertex buffer bound in svga->curr.vb[] with its
+ * uploaded copy, uploading through svga->upload_vb on first use.
+ *
+ * \return PIPE_OK on success, otherwise the error from u_upload_buffer().
+ *         Slots handled before a failure keep their uploaded binding.
+ */
+static int
+upload_user_buffers( struct svga_context *svga )
+{
+   enum pipe_error ret = PIPE_OK;
+   int nr = svga->curr.num_vertex_buffers;
+   int i;
+
+   if (0)
+      debug_printf("%s: %d\n", __FUNCTION__, svga->curr.num_vertex_buffers);
+
+   for (i = 0; i < nr; i++) {
+      struct svga_buffer *buffer;
+
+      if (!svga_buffer_is_user_buffer(svga->curr.vb[i].buffer))
+         continue;
+      buffer = svga_buffer(svga->curr.vb[i].buffer);
+
+      if (!buffer->uploaded.buffer) {
+         ret = u_upload_buffer( svga->upload_vb, 0, buffer->b.b.width0,
+                                &buffer->b.b,
+                                &buffer->uploaded.offset,
+                                &buffer->uploaded.buffer );
+         if (ret)
+            return ret;
+
+         if (0)
+            debug_printf("%s: %d: orig buf %p upl buf %p ofs %d sz %d\n",
+                         __FUNCTION__, i, buffer, buffer->uploaded.buffer,
+                         buffer->uploaded.offset, buffer->b.b.width0);
+      }
+
+      /* Read the offset before swapping the binding: dropping the slot's
+       * reference may release the user buffer that 'buffer' points at. */
+      svga->curr.vb[i].buffer_offset = buffer->uploaded.offset;
+      pipe_resource_reference( &svga->curr.vb[i].buffer, buffer->uploaded.buffer );
+   }
+
+   if (0)
+      debug_printf("%s: DONE\n", __FUNCTION__);
+   return ret;
+}
+
+
+/***********************************************************************
+ */
+
+
+/**
+ * Build one SVGA3dVertexDecl per bound vertex element and hand each to
+ * the hwtnl module, after resetting its declaration list.
+ * The 'dirty' mask is not consulted.  Always returns 0.
+ */
+static int emit_hw_vs_vdecl( struct svga_context *svga,
+                             unsigned dirty )
+{
+   const struct pipe_vertex_element *elems = svga->curr.velems->velem;
+   const unsigned count = svga->curr.velems->count;
+   SVGA3dVertexDecl decl;
+   unsigned slot;
+
+   assert(svga->curr.velems->count >=
+          svga->curr.vs->base.info.file_count[TGSI_FILE_INPUT]);
+
+   svga_hwtnl_reset_vdecl( svga->hwtnl, count );
+
+   for (slot = 0; slot < count; slot++) {
+      const struct pipe_vertex_element *el = &elems[slot];
+      const struct pipe_vertex_buffer *vb = &svga->curr.vb[el->vertex_buffer_index];
+      unsigned usage, index;
+
+      svga_generate_vdecl_semantics( slot, &usage, &index );
+
+      /* SVGA_NEW_VELEMENT */
+      decl.identity.type = svga->state.sw.ve_format[slot];
+      decl.identity.method = SVGA3D_DECLMETHOD_DEFAULT;
+      decl.identity.usage = usage;
+      decl.identity.usageIndex = index;
+      decl.array.stride = vb->stride;
+      decl.array.offset = vb->buffer_offset + el->src_offset;
+
+      svga_hwtnl_vdecl( svga->hwtnl, slot, &decl, vb->buffer );
+   }
+
+   return 0;
+}
+
+
+/**
+ * State-atom callback for the hardware vertex declarations.
+ * Does nothing while swtnl is needed; otherwise uploads pending user
+ * vertex buffers (if combining is enabled) and emits the declarations.
+ */
+static int emit_hw_vdecl( struct svga_context *svga,
+                          unsigned dirty )
+{
+   /* SVGA_NEW_NEED_SWTNL: do not emit during swtnl. */
+   if (svga->state.sw.need_swtnl)
+      return 0;
+
+   /* Reaching this point means we are going to draw.  Uploading the
+    * user buffers now lets several of them, from several draw calls,
+    * be combined into a single host buffer for performance.
+    */
+   if (svga->curr.any_user_vertex_buffers && SVGA_COMBINE_USERBUFFERS) {
+      const int err = upload_user_buffers( svga );
+
+      if (err)
+         return err;
+      svga->curr.any_user_vertex_buffers = FALSE;
+   }
+
+   return emit_hw_vs_vdecl( svga, dirty );
+}
+
+
+/* State atom for emit_hw_vdecl: description string, dirty-flag mask,
+ * then the callback itself.
+ */
+struct svga_tracked_state svga_hw_vdecl = {
+   "hw vertex decl state (hwtnl version)",
+   (SVGA_NEW_NEED_SWTNL |
+    SVGA_NEW_VELEMENT | SVGA_NEW_VBUFFER |
+    SVGA_NEW_RAST |
+    SVGA_NEW_FS | SVGA_NEW_VS),
+   emit_hw_vdecl
+};
+
+
+
+
+
+
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
new file mode 100644
index 0000000000..5133c70593
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -0,0 +1,256 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_bitmask.h"
+#include "translate/translate.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+
+#include "svga_hw_reg.h"
+
+/***********************************************************************
+ */
+
+
+/* memcmp()-style ordering of two keys over svga_vs_key_size(a) bytes. */
+static INLINE int compare_vs_keys( const struct svga_vs_compile_key *a,
+                                   const struct svga_vs_compile_key *b )
+{
+   return memcmp( a, b, svga_vs_key_size( a ) );
+}
+
+
+/* Walk vs->base.results for a variant whose key matches; NULL if none. */
+static struct svga_shader_result *search_vs_key( struct svga_vertex_shader *vs,
+                                                 const struct svga_vs_compile_key *key )
+{
+   struct svga_shader_result *cur;
+
+   assert(key);
+
+   cur = vs->base.results;
+   while (cur && compare_vs_keys( key, &cur->key.vkey ) != 0)
+      cur = cur->next;
+
+   return cur;
+}
+
+
+/**
+ * Translate 'vs' for 'key', reserve a shader id from svga->vs_bm and
+ * define the shader on the device.  On success the new variant is
+ * pushed onto vs->base.results and returned through *out_result.
+ *
+ * \return PIPE_OK, PIPE_ERROR_OUT_OF_MEMORY when translation or id
+ *         allocation fails, or the SVGA3D_DefineShader() error.
+ */
+static enum pipe_error compile_vs( struct svga_context *svga,
+                                   struct svga_vertex_shader *vs,
+                                   const struct svga_vs_compile_key *key,
+                                   struct svga_shader_result **out_result )
+{
+   struct svga_shader_result *variant;
+   enum pipe_error err;
+
+   variant = svga_translate_vertex_program( vs, key );
+   if (!variant)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   variant->id = util_bitmask_add(svga->vs_bm);
+   if (variant->id == UTIL_BITMASK_INVALID_INDEX) {
+      err = PIPE_ERROR_OUT_OF_MEMORY;
+   } else {
+      err = SVGA3D_DefineShader(svga->swc, variant->id, SVGA3D_SHADERTYPE_VS,
+                                variant->tokens,
+                                variant->nr_tokens * sizeof variant->tokens[0]);
+      if (!err) {
+         /* Publish: link at the head of the shader's variant list. */
+         variant->next = vs->base.results;
+         vs->base.results = variant;
+         *out_result = variant;
+         return PIPE_OK;
+      }
+      /* Definition failed: hand the reserved id back. */
+      util_bitmask_clear( svga->vs_bm, variant->id );
+   }
+
+   svga_destroy_shader_result( variant );
+   return err;
+}
+
+/* Fill *key from the context state that selects a VS variant.
+ * Depends on SVGA_NEW_PRESCALE, SVGA_NEW_RAST, SVGA_NEW_ZERO_STRIDE.
+ * The key is zeroed first so it can be compared bytewise.  Returns 0.
+ */
+static int make_vs_key( struct svga_context *svga,
+                        struct svga_vs_compile_key *key )
+{
+   memset(key, 0, sizeof *key);
+   key->num_zero_stride_vertex_elements = svga->curr.num_zero_stride_vertex_elements;
+   key->zero_stride_vertex_elements = svga->curr.zero_stride_vertex_elements;
+   key->allow_psiz = svga->curr.rast->templ.point_size_per_vertex;
+   key->need_prescale = svga->state.hw_clear.prescale.enabled;
+   return 0;
+}
+
+
+
+/**
+ * Pick (compiling on a cache miss) the variant of the current vertex
+ * shader and bind it unless svga->state.hw_draw.vs already names it.
+ * Under swtnl no variant is chosen and the id used is SVGA3D_INVALID_ID.
+ */
+static int emit_hw_vs( struct svga_context *svga,
+                       unsigned dirty )
+{
+   struct svga_shader_result *variant = NULL;
+   unsigned shader_id = SVGA3D_INVALID_ID;
+   int err;
+
+   /* SVGA_NEW_NEED_SWTNL */
+   if (!svga->state.sw.need_swtnl) {
+      struct svga_vs_compile_key key;
+
+      err = make_vs_key( svga, &key );
+      if (err)
+         return err;
+      variant = search_vs_key( svga->curr.vs, &key );
+      if (variant == NULL) {
+         err = compile_vs( svga, svga->curr.vs, &key, &variant );
+         if (err)
+            return err;
+      }
+
+      assert (variant);
+      shader_id = variant->id;
+   }
+
+   if (variant == svga->state.hw_draw.vs)
+      return 0;   /* already bound */
+
+   err = SVGA3D_SetShader(svga->swc, SVGA3D_SHADERTYPE_VS, shader_id );
+   if (err)
+      return err;
+   svga->dirty |= SVGA_NEW_VS_RESULT;
+   svga->state.hw_draw.vs = variant;
+   return 0;
+}
+
+/* State atom binding the hardware vertex shader through emit_hw_vs. */
+struct svga_tracked_state svga_hw_vs = {
+   "vertex shader (hwtnl)",
+   (SVGA_NEW_VS | SVGA_NEW_NEED_SWTNL |
+    SVGA_NEW_RAST |     /* make_vs_key reads curr.rast->templ */
+    SVGA_NEW_PRESCALE |
+    SVGA_NEW_ZERO_STRIDE),
+   emit_hw_vs
+};
+
+
+/**
+ * Recompute the zero-stride bookkeeping in svga->curr: a bitmask of the
+ * vertex elements whose buffer has stride 0, their count, and each one's
+ * value as float[4] in zero_stride_constants.  Sets SVGA_NEW_ZERO_STRIDE
+ * when any exist.  Returns 0, or PIPE_ERROR_OUT_OF_MEMORY if the
+ * translate object cannot be created or the buffer cannot be mapped.
+ */
+static int update_zero_stride( struct svga_context *svga,
+                               unsigned dirty )
+{
+   unsigned i;
+
+   svga->curr.zero_stride_vertex_elements = 0;
+   svga->curr.num_zero_stride_vertex_elements = 0;
+
+   for (i = 0; i < svga->curr.velems->count; i++) {
+      const struct pipe_vertex_element *vel = &svga->curr.velems->velem[i];
+      const struct pipe_vertex_buffer *vbuffer = &svga->curr.vb[vel->vertex_buffer_index];
+      unsigned const_idx = svga->curr.num_zero_stride_vertex_elements;
+      struct pipe_transfer *transfer;
+      struct translate *translate;
+      struct translate_key key;
+      void *mapped_buffer;
+
+      if (vbuffer->stride != 0)
+         continue;
+
+      /* One element in, one R32G32B32A32_FLOAT out at slot const_idx. */
+      key.output_stride = 4 * sizeof(float);
+      key.nr_elements = 1;
+      key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
+      key.element[0].input_format = vel->src_format;
+      key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      key.element[0].input_buffer = vel->vertex_buffer_index;
+      key.element[0].input_offset = vel->src_offset;
+      key.element[0].instance_divisor = vel->instance_divisor;
+      key.element[0].output_offset = const_idx * 4 * sizeof(float);
+
+      translate_key_sanitize(&key);
+      /* translate_generic_create is technically private but we don't
+       * want to code-generate, just want generic translation */
+      translate = translate_generic_create(&key);
+      if (!translate)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      assert(vel->src_offset == 0);
+      mapped_buffer = pipe_buffer_map_range(&svga->pipe, vbuffer->buffer,
+                                            vel->src_offset,
+                                            util_format_get_blocksize(vel->src_format),
+                                            PIPE_TRANSFER_READ, &transfer);
+      if (!mapped_buffer) {
+         translate->release(translate);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
+      translate->set_buffer(translate, vel->vertex_buffer_index, mapped_buffer,
+                            vbuffer->stride, vbuffer->max_index);
+      translate->run(translate, 0, 1, 0, svga->curr.zero_stride_constants);
+      pipe_buffer_unmap(&svga->pipe, vbuffer->buffer, transfer);
+      translate->release(translate);
+
+      /* Record the element only once its constant has been written. */
+      svga->curr.zero_stride_vertex_elements |= (1 << i);
+      ++svga->curr.num_zero_stride_vertex_elements;
+   }
+
+   if (svga->curr.num_zero_stride_vertex_elements)
+      svga->dirty |= SVGA_NEW_ZERO_STRIDE;
+   return 0;
+}
+
+/* State atom for update_zero_stride, keyed on the vertex-element and
+ * vertex-buffer dirty bits. */
+struct svga_tracked_state svga_hw_update_zero_stride = {
+   "update zero_stride",
+   (SVGA_NEW_VELEMENT | SVGA_NEW_VBUFFER),
+   update_zero_stride
+};
diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c
new file mode 100644
index 0000000000..b21dc5fd9a
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_surface.c
@@ -0,0 +1,359 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "os/os_thread.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_resource_texture.h"
+#include "svga_surface.h"
+#include "svga_debug.h"
+
+
+/**
+ * Queue one SVGA3D surface-copy command covering a single box.
+ *
+ * The box is width x height x depth.  It is read at (src_x, src_y, src_z)
+ * of src_level/src_face in src_handle and written at (dst_x, dst_y, dst_z)
+ * of dst_level/dst_face in dst_handle.  If the command cannot be reserved
+ * the context is flushed and the reservation retried once; if that fails
+ * too the copy is dropped (debug builds assert first).
+ */
+void
+svga_texture_copy_handle(struct svga_context *svga,
+                         struct svga_winsys_surface *src_handle,
+                         unsigned src_x, unsigned src_y, unsigned src_z,
+                         unsigned src_level, unsigned src_face,
+                         struct svga_winsys_surface *dst_handle,
+                         unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                         unsigned dst_level, unsigned dst_face,
+                         unsigned width, unsigned height, unsigned depth)
+{
+   struct svga_surface dst, src;
+   enum pipe_error ret;
+   SVGA3dCopyBox box, *boxes = NULL;
+
+   assert(svga);
+
+   src.handle = src_handle;
+   src.real_level = src_level;
+   src.real_face = src_face;
+   src.real_zslice = 0;
+
+   dst.handle = dst_handle;
+   dst.real_level = dst_level;
+   dst.real_face = dst_face;
+   dst.real_zslice = 0;
+
+   box.x = dst_x;
+   box.y = dst_y;
+   box.z = dst_z;
+   box.w = width;
+   box.h = height;
+   box.d = depth;
+   box.srcx = src_x;
+   box.srcy = src_y;
+   box.srcz = src_z;
+
+   ret = SVGA3D_BeginSurfaceCopy(svga->swc, &src.base, &dst.base, &boxes, 1);
+   if (ret != PIPE_OK) {
+      /* Flush what is queued so far, then retry the reservation once. */
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_BeginSurfaceCopy(svga->swc, &src.base, &dst.base, &boxes, 1);
+      assert(ret == PIPE_OK);
+      if (ret != PIPE_OK || !boxes)
+         return;   /* never write through an unset 'boxes' pointer */
+   }
+   *boxes = box;
+   SVGA_FIFOCommitAll(svga->swc);
+}
+
+
+/**
+ * Create a host surface viewing mip levels start_mip .. start_mip +
+ * num_mip - 1 of 'tex' in 'format', and copy in every level/face that
+ * tex->defined[][] marks as valid.  face_pick < 0 selects all six faces
+ * of a cube map; zslice_pick < 0 selects the whole depth.  '*key' gets
+ * the new surface's cache key.  Returns NULL, with key->cachable
+ * cleared, when 'format' is invalid or surface creation fails.
+ */
+struct svga_winsys_surface *
+svga_texture_view_surface(struct pipe_context *pipe,
+                          struct svga_texture *tex,
+                          SVGA3dSurfaceFormat format,
+                          unsigned start_mip,
+                          unsigned num_mip,
+                          int face_pick,
+                          int zslice_pick,
+                          struct svga_host_surface_cache_key *key) /* OUT */
+{
+   /* pipe may be NULL (see svga_get_tex_surface): use the texture's screen. */
+   struct svga_screen *ss = svga_screen(tex->b.b.screen);
+   struct svga_winsys_surface *handle;
+   uint32_t i, j;
+   unsigned z_offset = 0;
+
+   SVGA_DBG(DEBUG_PERF, 
+            "svga: Create surface view: face %d zslice %d mips %d..%d\n",
+            face_pick, zslice_pick, start_mip, start_mip+num_mip-1);
+
+   key->flags = 0;
+   key->format = format;
+   key->numMipLevels = num_mip;
+   key->size.width = u_minify(tex->b.b.width0, start_mip);
+   key->size.height = u_minify(tex->b.b.height0, start_mip);
+   key->size.depth = zslice_pick < 0 ? u_minify(tex->b.b.depth0, start_mip) : 1;
+   key->cachable = 1;
+   assert(key->size.depth == 1);
+
+   key->numFaces = 1;
+   if(tex->b.b.target == PIPE_TEXTURE_CUBE && face_pick < 0) {
+      key->flags |= SVGA3D_SURFACE_CUBEMAP;
+      key->numFaces = 6;
+   }
+
+   if(key->format == SVGA3D_FORMAT_INVALID) {
+      key->cachable = 0;
+      return NULL;
+   }
+
+   SVGA_DBG(DEBUG_DMA, "surface_create for texture view\n");
+   handle = svga_screen_surface_create(ss, key);
+   if (!handle) {
+      key->cachable = 0;
+      return NULL;
+   }
+
+   SVGA_DBG(DEBUG_DMA, " --> got sid %p (texture view)\n", handle);
+   if (face_pick < 0)
+      face_pick = 0;
+   if (zslice_pick >= 0)
+       z_offset = zslice_pick;
+
+   for (i = 0; i < key->numMipLevels; i++) {
+      const unsigned level = i + start_mip;
+      for (j = 0; j < key->numFaces; j++) {
+         if (!tex->defined[j + face_pick][level])
+            continue;
+         svga_texture_copy_handle(svga_context(pipe), tex->handle,
+                                  0, 0, z_offset, level, j + face_pick,
+                                  handle, 0, 0, 0, i, j,
+                                  u_minify(tex->b.b.width0, level),
+                                  u_minify(tex->b.b.height0, level),
+                                  zslice_pick < 0 ? u_minify(tex->b.b.depth0, level) : 1);
+      }
+   }
+
+   return handle;
+}
+
+
+/**
+ * pipe_screen::get_tex_surface hook: allocate an svga_surface for
+ * (face, level, zslice) of 'pt'.  The surface shares the texture's host
+ * surface unless a separate "view" copy is needed: the render format
+ * differs from the sampling format, the texture is 3D, or a debug
+ * option forces it (debug.no_surface_view overrides all of these).
+ * \return the surface holding one reference, or NULL when allocating it
+ *         or creating the view surface fails.
+ */
+static struct pipe_surface *
+svga_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_resource *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned flags)
+{
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_texture *tex = svga_texture(pt);
+   struct svga_surface *s;
+   boolean render = (flags & (PIPE_BIND_RENDER_TARGET |
+                              PIPE_BIND_DEPTH_STENCIL)) ? TRUE : FALSE;
+   boolean view;
+   SVGA3dSurfaceFormat format;
+
+   s = CALLOC_STRUCT(svga_surface);
+   if (!s)
+      return NULL;
+
+   pipe_reference_init(&s->base.reference, 1);
+   pipe_resource_reference(&s->base.texture, pt);
+   s->base.format = pt->format;
+   s->base.width = u_minify(pt->width0, level);
+   s->base.height = u_minify(pt->height0, level);
+   s->base.usage = flags;
+   s->base.level = level;
+   s->base.face = face;
+   s->base.zslice = zslice;
+
+   format = render ? svga_translate_format_render(pt->format)
+                   : svga_translate_format(pt->format);
+   assert(format != SVGA3D_FORMAT_INVALID);
+
+   /* First term: currently only used for compressed textures. */
+   view = (render && format != svga_translate_format(pt->format)) ||
+          pt->target == PIPE_TEXTURE_3D ||
+          ss->debug.force_surface_view ||
+          (level != 0 && ss->debug.force_level_surface_view);
+   if (ss->debug.no_surface_view)
+      view = FALSE;
+
+   if (view) {
+      SVGA_DBG(DEBUG_VIEWS, "svga: Surface view: yes %p, level %u face %u z %u, %p\n",
+               pt, level, face, zslice, s);
+      s->handle = svga_texture_view_surface(NULL, tex, format, level, 1, face, zslice,
+                                            &s->key);
+      if (!s->handle) {
+         /* No usable host surface: undo the allocation and report it. */
+         pipe_resource_reference(&s->base.texture, NULL);
+         FREE(s);
+         return NULL;
+      }
+      s->real_face = 0;
+      s->real_level = 0;
+      s->real_zslice = 0;
+   } else {
+      SVGA_DBG(DEBUG_VIEWS, "svga: Surface view: no %p, level %u, face %u, z %u, %p\n",
+               pt, level, face, zslice, s);
+      memset(&s->key, 0, sizeof s->key);
+      s->handle = tex->handle;
+      s->real_face = face;
+      s->real_level = level;
+      s->real_zslice = zslice;
+   }
+
+   return &s->base;
+}
+
+
+/* Free a surface, first returning a private view surface to the screen. */
+static void
+svga_tex_surface_destroy(struct pipe_surface *surf)
+{
+   struct svga_surface *view = svga_surface(surf);
+   struct svga_texture *parent = svga_texture(surf->texture);
+   struct svga_screen *screen = svga_screen(surf->texture->screen);
+
+   if (view->handle != parent->handle) {
+      SVGA_DBG(DEBUG_DMA, "unref sid %p (tex surface)\n", view->handle);
+      svga_screen_surface_destroy(screen, &view->key, &view->handle);
+   }
+   pipe_resource_reference(&surf->texture, NULL);
+   FREE(surf);
+}
+
+
+/* Flag a surface as rendered to.  When it shares the texture's host
+ * surface the matching face/level is marked defined immediately. */
+static INLINE void
+svga_mark_surface_dirty(struct pipe_surface *surf)
+{
+   struct svga_surface *s = svga_surface(surf);
+   struct svga_texture *tex;
+
+   if (s->dirty)
+      return;
+   s->dirty = TRUE;
+
+   /* For a separate view this happens later in svga_propagate_surface. */
+   tex = svga_texture(surf->texture);
+   if (s->handle == tex->handle)
+      tex->defined[surf->face][surf->level] = TRUE;
+}
+
+
+/* Mark all bound colour buffers, then the depth/stencil buffer, dirty. */
+void svga_mark_surfaces_dirty(struct svga_context *svga)
+{
+   unsigned i;
+
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+      if (svga->curr.framebuffer.cbufs[i])
+         svga_mark_surface_dirty(svga->curr.framebuffer.cbufs[i]);
+   if (svga->curr.framebuffer.zsbuf)
+      svga_mark_surface_dirty(svga->curr.framebuffer.zsbuf);
+}
+
+
+/**
+ * Propagate pending changes from a dirty surface to its texture: advance
+ * the age counters and, for a separate view surface, blit the level back.
+ * NOTE(review): 'pipe' was documented as optional, but the blit hands it
+ * to svga_texture_copy_handle(), which asserts on its context.
+ */
+void
+svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf)
+{
+   struct svga_surface *s = svga_surface(surf);
+   struct svga_texture *tex = svga_texture(surf->texture);
+   struct svga_screen *ss = svga_screen(surf->texture->screen);
+
+   if (s->dirty) {
+      s->dirty = FALSE;
+      ss->texture_timestamp++;
+      tex->view_age[surf->level] = ++(tex->age);
+      if (s->handle != tex->handle) {
+         SVGA_DBG(DEBUG_VIEWS, "svga: Surface propagate: tex %p, level %u, from %p\n", tex, surf->level, surf);
+         svga_texture_copy_handle(svga_context(pipe),
+                                  s->handle, 0, 0, 0, s->real_level, s->real_face,
+                                  tex->handle, 0, 0, surf->zslice, surf->level, surf->face,
+                                  u_minify(tex->b.b.width0, surf->level),
+                                  u_minify(tex->b.b.height0, surf->level), 1);
+         tex->defined[surf->face][surf->level] = TRUE;
+      }
+   }
+}
+
+/**
+ * True when 'surf' is dirty and uses a view surface of its own, i.e.
+ * when svga_propagate_surface() would have to copy data back.
+ */
+boolean
+svga_surface_needs_propagation(struct pipe_surface *surf)
+{
+   struct svga_surface *s = svga_surface(surf);
+   struct svga_texture *tex = svga_texture(surf->texture);
+   return !(!s->dirty || s->handle == tex->handle);
+}
+
+
+
+
+
+/* Install the texture-surface create/destroy hooks on the screen. */
+void
+svga_screen_init_surface_functions(struct pipe_screen *screen)
+{
+   screen->tex_surface_destroy = svga_tex_surface_destroy;
+   screen->get_tex_surface = svga_get_tex_surface;
+}
+
diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h
new file mode 100644
index 0000000000..13bd5b19b6
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_surface.h
@@ -0,0 +1,96 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SURFACE_H
+#define SVGA_SURFACE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "svga_screen_cache.h"
+
+struct pipe_context;
+struct pipe_screen;
+struct svga_context;
+struct svga_texture;
+struct svga_winsys_surface;
+enum SVGA3dSurfaceFormat;
+
+/* A pipe_surface together with the host surface and cache key backing it. */
+struct svga_surface
+{
+   struct pipe_surface base;   /* first member: svga_surface() relies on the cast */
+
+   struct svga_host_surface_cache_key key;   /* zeroed unless 'handle' is a private view */
+   struct svga_winsys_surface *handle;       /* the texture's own handle or a view surface */
+
+   unsigned real_face;     /* face/level/zslice to address within 'handle'; */
+   unsigned real_level;    /* svga_get_tex_surface() sets all three to 0    */
+   unsigned real_zslice;   /* when 'handle' is a private view               */
+
+   boolean dirty;   /* set by svga_mark_surface_dirty(), cleared by svga_propagate_surface() */
+};
+
+/* Propagate a dirty surface's changes to its texture; no-op when clean. */
+extern void
+svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf);
+/* TRUE when the surface is dirty and backed by a view surface of its own. */
+extern boolean
+svga_surface_needs_propagation(struct pipe_surface *surf);
+/* Create a view surface for a mip range of 'tex'; NULL on failure. */
+struct svga_winsys_surface *
+svga_texture_view_surface(struct pipe_context *pipe,
+                          struct svga_texture *tex,
+                          SVGA3dSurfaceFormat format,
+                          unsigned start_mip,
+                          unsigned num_mip,
+                          int face_pick,
+                          int zslice_pick,
+                          struct svga_host_surface_cache_key *key); /* OUT */
+
+/* Queue a copy of one width x height x depth box between two surfaces. */
+void
+svga_texture_copy_handle(struct svga_context *svga,
+                         struct svga_winsys_surface *src_handle,
+                         unsigned src_x, unsigned src_y, unsigned src_z,
+                         unsigned src_level, unsigned src_face,
+                         struct svga_winsys_surface *dst_handle,
+                         unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                         unsigned dst_level, unsigned dst_face,
+                         unsigned width, unsigned height, unsigned depth);
+
+/* Cast a pipe_surface to its svga_surface; asserts the pointer is non-NULL. */
+static INLINE struct svga_surface *
+svga_surface(struct pipe_surface *surface)
+{
+   assert(surface);
+   return (struct svga_surface *)surface;
+}
+/* Set screen->get_tex_surface and screen->tex_surface_destroy. */
+void
+svga_screen_init_surface_functions(struct pipe_screen *screen);
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_swtnl.h b/src/gallium/drivers/svga/svga_swtnl.h
new file mode 100644
index 0000000000..65c675f99c
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl.h
@@ -0,0 +1,52 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SWTNL_H
+#define SVGA_SWTNL_H
+
+#include "pipe/p_compiler.h"
+
+struct svga_context;
+struct pipe_context;
+struct vbuf_render;
+
+/* Presumably set up / tear down the context's swtnl state - definitions not in view. */
+boolean svga_init_swtnl( struct svga_context *svga );
+void svga_destroy_swtnl( struct svga_context *svga );
+
+/* NOTE(review): presumably draws the given range through swtnl - confirm at the definition. */
+enum pipe_error
+svga_swtnl_draw_range_elements(struct svga_context *svga,
+                               struct pipe_resource *indexBuffer,
+                               unsigned indexSize,
+                               int indexBias,
+                               unsigned min_index,
+                               unsigned max_index,
+                               unsigned prim, 
+                               unsigned start, 
+                               unsigned count);
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_swtnl_backend.c b/src/gallium/drivers/svga/svga_swtnl_backend.c
new file mode 100644
index 0000000000..ff3da84272
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_backend.c
@@ -0,0 +1,350 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_vbuf.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+
+#include "util/u_debug.h"
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_swtnl.h"
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+#include "svga_draw.h"
+#include "svga_swtnl_private.h"
+
+
+/**
+ * vbuf_render::get_vertex_info hook.
+ *
+ * Refreshes the swtnl vertex declaration first, then hands back the
+ * vertex layout stored in the renderer.
+ */
+static const struct vertex_info *
+svga_vbuf_render_get_vertex_info( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *r = svga_vbuf_render(render);
+
+   /* Make sure vertex_info reflects the currently bound shaders. */
+   svga_swtnl_update_vdecl(r->svga);
+
+   return &r->vertex_info;
+}
+
+
+/**
+ * vbuf_render::allocate_vertices hook.
+ *
+ * Reserves room for nr_vertices vertices of vertex_size bytes each.
+ * The current vertex buffer is reused (appending after the range used
+ * by the previous batch) as long as it is large enough and no new
+ * buffer was requested through svga->swtnl.new_vbuf; otherwise it is
+ * released and a new one is created.
+ *
+ * \param render       the svga vbuf renderer
+ * \param vertex_size  size of one vertex in bytes
+ * \param nr_vertices  number of vertices to reserve space for
+ * \return TRUE on success, FALSE if no vertex buffer could be created
+ *         even after flushing the context.
+ */
+static boolean
+svga_vbuf_render_allocate_vertices( struct vbuf_render *render,
+                                    ushort vertex_size,
+                                    ushort nr_vertices )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   struct pipe_screen *screen = svga->pipe.screen;
+   size_t size = (size_t)nr_vertices * (size_t)vertex_size;
+   boolean new_vbuf = FALSE;
+   boolean new_ibuf = FALSE;
+
+   /* A different vertex size means the vertex declaration is stale. */
+   if (svga_render->vertex_size != vertex_size)
+      svga->swtnl.new_vdecl = TRUE;
+   svga_render->vertex_size = vertex_size;
+
+   /* A requested new vertex buffer also drops the index buffer. */
+   if (svga->swtnl.new_vbuf)
+      new_ibuf = new_vbuf = TRUE;
+   svga->swtnl.new_vbuf = FALSE;
+
+   /* Not enough room left behind the data already written. */
+   if (svga_render->vbuf_size < svga_render->vbuf_offset + svga_render->vbuf_used + size)
+      new_vbuf = TRUE;
+
+   if (new_vbuf)
+      pipe_resource_reference(&svga_render->vbuf, NULL);
+   if (new_ibuf)
+      pipe_resource_reference(&svga_render->ibuf, NULL);
+
+   if (!svga_render->vbuf) {
+      svga_render->vbuf_size = MAX2(size, svga_render->vbuf_alloc_size);
+      svga_render->vbuf = pipe_buffer_create(screen,
+                                             PIPE_BIND_VERTEX_BUFFER,
+                                             svga_render->vbuf_size);
+      if (!svga_render->vbuf) {
+         /* Flush and try once more before giving up. */
+         svga_context_flush(svga, NULL);
+         svga_render->vbuf = pipe_buffer_create(screen,
+                                                PIPE_BIND_VERTEX_BUFFER,
+                                                svga_render->vbuf_size);
+         assert(svga_render->vbuf);
+         if (!svga_render->vbuf) {
+            /* Report the failure instead of claiming success with a
+             * NULL buffer (the assert above vanishes in release builds).
+             */
+            svga_render->vbuf_size = 0;
+            svga_render->vbuf_offset = 0;
+            svga_render->vbuf_used = 0;
+            return FALSE;
+         }
+      }
+
+      svga->swtnl.new_vdecl = TRUE;
+      svga_render->vbuf_offset = 0;
+   } else {
+      /* Append after the range consumed by the previous batch. */
+      svga_render->vbuf_offset += svga_render->vbuf_used;
+   }
+
+   svga_render->vbuf_used = 0;
+
+   /* A new declaration is emitted relative to the current write offset. */
+   if (svga->swtnl.new_vdecl)
+      svga_render->vdecl_offset = svga_render->vbuf_offset;
+
+   return TRUE;
+}
+
+/**
+ * vbuf_render::map_vertices hook.
+ *
+ * Maps the current vertex buffer for writing and returns a pointer to
+ * the start of the range reserved by the last allocate_vertices call
+ * (i.e. the mapping advanced by vbuf_offset).
+ *
+ * \return pointer to the writable vertex storage, or NULL when there
+ *         is no vertex buffer or it could not be mapped.
+ */
+static void *
+svga_vbuf_render_map_vertices( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   char *ptr;
+
+   /* Nothing to map if the preceding allocation failed. */
+   if (!svga_render->vbuf)
+      return NULL;
+
+   ptr = (char*)pipe_buffer_map(&svga->pipe,
+                                svga_render->vbuf,
+                                PIPE_TRANSFER_WRITE |
+                                PIPE_TRANSFER_FLUSH_EXPLICIT |
+                                PIPE_TRANSFER_DISCARD |
+                                PIPE_TRANSFER_UNSYNCHRONIZED,
+                                &svga_render->vbuf_transfer);
+
+   /* Do not turn a failed mapping into a bogus non-NULL pointer. */
+   if (!ptr)
+      return NULL;
+
+   return ptr + svga_render->vbuf_offset;
+}
+
+/**
+ * vbuf_render::unmap_vertices hook.
+ *
+ * Flushes the byte range covering vertices [min_index, max_index],
+ * unmaps the vertex buffer, and records the index range and the number
+ * of bytes consumed so far.
+ */
+static void
+svga_vbuf_render_unmap_vertices( struct vbuf_render *render,
+                                 ushort min_index,
+                                 ushort max_index )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   const unsigned stride = svga_render->vertex_size;
+   /* bytes consumed from the start of this batch up to max_index */
+   const size_t bytes_used = stride * ((size_t)max_index + 1);
+   /* dirty range, expressed relative to the start of the buffer */
+   const unsigned first_byte = svga_render->vbuf_offset + stride * min_index;
+   const unsigned nr_bytes = stride * (max_index + 1 - min_index);
+
+   pipe_buffer_flush_mapped_range(&svga->pipe,
+                                  svga_render->vbuf_transfer,
+                                  first_byte, nr_bytes);
+   pipe_buffer_unmap(&svga->pipe, svga_render->vbuf, svga_render->vbuf_transfer);
+
+   svga_render->min_index = min_index;
+   svga_render->max_index = max_index;
+
+   /* vbuf_used only ever grows within a batch */
+   if (bytes_used > svga_render->vbuf_used)
+      svga_render->vbuf_used = bytes_used;
+}
+
+/**
+ * vbuf_render::set_primitive hook.
+ *
+ * Remembers the primitive type for the following draw calls.
+ * Always returns TRUE.
+ */
+static boolean
+svga_vbuf_render_set_primitive( struct vbuf_render *render,
+                                unsigned prim )
+{
+   svga_vbuf_render(render)->prim = prim;
+
+   return TRUE;
+}
+
+/**
+ * Push the swtnl vertex declaration and the flatshade/unfilled
+ * settings to the hwtnl module.
+ *
+ * Does nothing unless svga->swtnl.new_vdecl is set; clears that flag
+ * once the state has been submitted.
+ */
+static void
+svga_vbuf_sumbit_state( struct svga_vbuf_render *svga_render )
+{
+   struct svga_context *svga = svga_render->svga;
+   SVGA3dVertexDecl decls[PIPE_MAX_ATTRIBS];
+   enum pipe_error ret;
+   unsigned idx;
+
+   /* Nothing to do when neither the vdecl nor the vbuf changed. */
+   if (!svga->swtnl.new_vdecl)
+      return;
+
+   /* Work on a private copy; the stored declarations stay untouched. */
+   memcpy(decls, svga_render->vdecl, sizeof(decls));
+
+   /* Flush whatever the hwtnl module has queued up. */
+   ret = svga_hwtnl_flush(svga->hwtnl);
+   if (ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = svga_hwtnl_flush(svga->hwtnl);
+      /* if we hit this path we might become synced with hw */
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == 0);
+   }
+
+   svga_hwtnl_reset_vdecl(svga->hwtnl, svga_render->vdecl_count);
+
+   /* Emit every declaration, rebased onto the current vdecl offset. */
+   for (idx = 0; idx < svga_render->vdecl_count; idx++) {
+      SVGA3dVertexDecl *decl = &decls[idx];
+
+      decl->array.offset += svga_render->vdecl_offset;
+      svga_hwtnl_vdecl(svga->hwtnl, idx, decl, svga_render->vbuf);
+   }
+
+   /* We have already taken care of flatshading, so let the hwtnl
+    * module use whatever is most convenient:
+    */
+   if (!svga->state.sw.need_pipeline) {
+      svga_hwtnl_set_flatshade(svga->hwtnl,
+                               svga->curr.rast->templ.flatshade,
+                               svga->curr.rast->templ.flatshade_first);
+      svga_hwtnl_set_unfilled(svga->hwtnl,
+                              svga->curr.rast->hw_unfilled);
+   }
+   else {
+      svga_hwtnl_set_flatshade(svga->hwtnl, FALSE, FALSE);
+      svga_hwtnl_set_unfilled(svga->hwtnl, PIPE_POLYGON_MODE_FILL);
+   }
+
+   svga->swtnl.new_vdecl = FALSE;
+}
+
+/**
+ * vbuf_render::draw_arrays hook.
+ *
+ * Submits pending swtnl state and issues a non-indexed draw of nr
+ * vertices.  The start vertex is shifted by the number of vertices
+ * lying between the vdecl offset and the current write offset.
+ */
+static void
+svga_vbuf_render_draw_arrays( struct vbuf_render *render,
+                              unsigned start,
+                              uint nr )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   /* distance, in vertices, from the declared base to this batch */
+   const unsigned bias =
+      (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
+   const unsigned first = start + bias;
+   enum pipe_error ret;
+
+   svga_vbuf_sumbit_state(svga_render);
+
+   /* Need to call update_state() again as the draw module may have
+    * altered some of our state behind our backs.  Testcase:
+    * redbook/polys.c
+    */
+   svga_update_state_retry( svga, SVGA_STATE_HW_DRAW );
+
+   ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, first, nr);
+   if (ret == PIPE_OK)
+      return;
+
+   /* First attempt failed: flush the context and try exactly once more. */
+   svga_context_flush(svga, NULL);
+   ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, first, nr);
+   svga->swtnl.new_vbuf = TRUE;
+   assert(ret == PIPE_OK);
+}
+
+
+/**
+ * vbuf_render::draw_elements hook.
+ *
+ * Copies nr_indices 16-bit indices into the renderer's index buffer
+ * (allocating a new one when the current buffer is missing or too
+ * small), submits pending swtnl state and issues an indexed draw.
+ * If no index buffer can be allocated the draw is dropped.
+ *
+ * \param render      the svga vbuf renderer
+ * \param indices     16-bit vertex indices
+ * \param nr_indices  number of entries in indices
+ */
+static void
+svga_vbuf_render_draw_elements( struct vbuf_render *render,
+                                const ushort *indices,
+                                uint nr_indices)
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   struct pipe_screen *screen = svga->pipe.screen;
+   int bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
+   /* The hwtnl draw calls return enum pipe_error, not a boolean. */
+   enum pipe_error ret;
+   size_t size = 2 * nr_indices;
+
+   assert(( svga_render->vbuf_offset - svga_render->vdecl_offset) % svga_render->vertex_size == 0);
+
+   /* Drop the index buffer if the new indices do not fit behind the
+    * ones already written.
+    */
+   if (svga_render->ibuf_size < svga_render->ibuf_offset + size)
+      pipe_resource_reference(&svga_render->ibuf, NULL);
+
+   if (!svga_render->ibuf) {
+      svga_render->ibuf_size = MAX2(size, svga_render->ibuf_alloc_size);
+      svga_render->ibuf = pipe_buffer_create(screen,
+                                             PIPE_BIND_INDEX_BUFFER,
+                                             svga_render->ibuf_size);
+      if (!svga_render->ibuf) {
+         /* Same recovery as for the vertex buffer: flush and retry. */
+         svga_context_flush(svga, NULL);
+         svga->swtnl.new_vbuf = TRUE;
+         svga_render->ibuf = pipe_buffer_create(screen,
+                                                PIPE_BIND_INDEX_BUFFER,
+                                                svga_render->ibuf_size);
+         if (!svga_render->ibuf) {
+            /* Still nothing to write the indices into: skip the draw
+             * rather than dereferencing a NULL buffer.
+             */
+            svga_render->ibuf_size = 0;
+            svga_render->ibuf_offset = 0;
+            return;
+         }
+      }
+      svga_render->ibuf_offset = 0;
+   }
+
+   pipe_buffer_write_nooverlap(&svga->pipe, svga_render->ibuf,
+                               svga_render->ibuf_offset, size, indices);
+
+
+   /* off to hardware */
+   svga_vbuf_sumbit_state(svga_render);
+
+   /* Need to call update_state() again as the draw module may have
+    * altered some of our state behind our backs.  Testcase:
+    * redbook/polys.c
+    */
+   svga_update_state_retry( svga, SVGA_STATE_HW_DRAW );
+
+   ret = svga_hwtnl_draw_range_elements(svga->hwtnl,
+                                        svga_render->ibuf,
+                                        2,
+                                        bias,
+                                        svga_render->min_index,
+                                        svga_render->max_index,
+                                        svga_render->prim,
+                                        svga_render->ibuf_offset / 2, nr_indices);
+   if(ret != PIPE_OK) {
+      /* Flush the context and retry exactly once. */
+      svga_context_flush(svga, NULL);
+      ret = svga_hwtnl_draw_range_elements(svga->hwtnl,
+                                           svga_render->ibuf,
+                                           2,
+                                           bias,
+                                           svga_render->min_index,
+                                           svga_render->max_index,
+                                           svga_render->prim,
+                                           svga_render->ibuf_offset / 2, nr_indices);
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == PIPE_OK);
+   }
+
+   /* Next batch of indices goes behind the ones just written. */
+   svga_render->ibuf_offset += size;
+}
+
+
+/**
+ * vbuf_render::release_vertices hook.
+ *
+ * Intentionally empty: this function performs no work.
+ */
+static void
+svga_vbuf_render_release_vertices( struct vbuf_render *render )
+{
+
+}
+
+
+/**
+ * vbuf_render::destroy hook.
+ *
+ * Drops the references to the vertex and index buffers and frees the
+ * renderer itself.
+ */
+static void
+svga_vbuf_render_destroy( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *r = svga_vbuf_render(render);
+
+   /* Release the buffers before the structure that owns the references. */
+   pipe_resource_reference(&r->vbuf, NULL);
+   pipe_resource_reference(&r->ibuf, NULL);
+
+   FREE(r);
+}
+
+
+/**
+ * Create a new primitive render.
+ *
+ * Allocates a zero-initialised svga_vbuf_render, sets its default
+ * buffer sizes and plugs in the vbuf_render callbacks.
+ *
+ * \param svga  the context the renderer belongs to
+ * \return the embedded vbuf_render, or NULL if the allocation failed.
+ */
+struct vbuf_render *
+svga_vbuf_render_create( struct svga_context *svga )
+{
+   struct svga_vbuf_render *svga_render = CALLOC_STRUCT(svga_vbuf_render);
+
+   /* Out of memory: let the caller deal with it. */
+   if (!svga_render)
+      return NULL;
+
+   svga_render->svga = svga;
+   svga_render->ibuf_size = 0;
+   svga_render->vbuf_size = 0;
+   /* Minimum sizes used when a new index / vertex buffer is created. */
+   svga_render->ibuf_alloc_size = 4*1024;
+   svga_render->vbuf_alloc_size = 64*1024;
+   svga_render->base.max_vertex_buffer_bytes = 64*1024/10;
+   svga_render->base.max_indices = 65536;
+   svga_render->base.get_vertex_info = svga_vbuf_render_get_vertex_info;
+   svga_render->base.allocate_vertices = svga_vbuf_render_allocate_vertices;
+   svga_render->base.map_vertices = svga_vbuf_render_map_vertices;
+   svga_render->base.unmap_vertices = svga_vbuf_render_unmap_vertices;
+   svga_render->base.set_primitive = svga_vbuf_render_set_primitive;
+   svga_render->base.draw_elements = svga_vbuf_render_draw_elements;
+   svga_render->base.draw_arrays = svga_vbuf_render_draw_arrays;
+   svga_render->base.release_vertices = svga_vbuf_render_release_vertices;
+   svga_render->base.destroy = svga_vbuf_render_destroy;
+
+   return &svga_render->base;
+}
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
new file mode 100644
index 0000000000..eb71c23195
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -0,0 +1,178 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_inlines.h"
+#include "pipe/p_state.h"
+
+#include "svga_context.h"
+#include "svga_swtnl.h"
+#include "svga_state.h"
+#include "svga_swtnl_private.h"
+
+
+
+/**
+ * Draw through the draw module (swtnl path).
+ *
+ * Updates the swtnl draw state, maps the bound vertex buffers, the
+ * optional index buffer and the vertex shader constant buffer, hands
+ * the mappings to the draw module, draws and flushes, and finally
+ * unmaps everything again and clears the pointers held by the draw
+ * module.
+ *
+ * \param indexBuffer  index buffer, or NULL for non-indexed drawing
+ * \param indexSize    size of one index, passed on to the draw module
+ * \param indexBias    index bias, passed on to the draw module
+ * \param min_index    lowest index referenced
+ * \param max_index    highest index referenced
+ * \param prim         primitive type
+ * \param start        first vertex / index
+ * \param count        number of vertices / indices
+ * \return the result of the swtnl state update.
+ */
+enum pipe_error
+svga_swtnl_draw_range_elements(struct svga_context *svga,
+                               struct pipe_resource *indexBuffer,
+                               unsigned indexSize,
+                               int indexBias,
+                               unsigned min_index,
+                               unsigned max_index,
+                               unsigned prim, unsigned start, unsigned count)
+{
+   struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
+   struct pipe_transfer *ib_transfer = NULL;
+   struct pipe_transfer *cb_transfer = NULL;
+   struct pipe_resource *vs_cb;
+   struct draw_context *draw = svga->swtnl.draw;
+   unsigned i;
+   const void *map;
+   enum pipe_error ret;
+
+   assert(!svga->dirty);
+   assert(svga->state.sw.need_swtnl);
+   assert(draw);
+
+   ret = svga_update_state(svga, SVGA_STATE_SWTNL_DRAW);
+   if (ret) {
+      /* Flush the context and retry the state update once. */
+      svga_context_flush(svga, NULL);
+      ret = svga_update_state(svga, SVGA_STATE_SWTNL_DRAW);
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == PIPE_OK);
+   }
+
+   /*
+    * Map vertex buffers
+    */
+   for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
+      map = pipe_buffer_map(&svga->pipe,
+                            svga->curr.vb[i].buffer,
+                            PIPE_TRANSFER_READ,
+                            &vb_transfer[i]);
+
+      draw_set_mapped_vertex_buffer(draw, i, map);
+   }
+
+   /* Map index buffer, if present */
+   if (indexBuffer) {
+      map = pipe_buffer_map(&svga->pipe, indexBuffer,
+                            PIPE_TRANSFER_READ,
+                            &ib_transfer);
+
+      draw_set_mapped_element_buffer_range(draw,
+                                           indexSize, indexBias,
+                                           min_index,
+                                           max_index,
+                                           map);
+   }
+
+   /* Remember which constant buffer gets mapped so that the very same
+    * buffer is unmapped below.
+    */
+   vs_cb = svga->curr.cb[PIPE_SHADER_VERTEX];
+   if (vs_cb) {
+      map = pipe_buffer_map(&svga->pipe,
+                            vs_cb,
+                            PIPE_TRANSFER_READ,
+                            &cb_transfer);
+      assert(map);
+      draw_set_mapped_constant_buffer(
+         draw, PIPE_SHADER_VERTEX, 0,
+         map,
+         vs_cb->width0);
+   }
+
+   draw_arrays(svga->swtnl.draw, prim, start, count);
+
+   draw_flush(svga->swtnl.draw);
+
+   /* Ensure the draw module didn't touch this */
+   assert(i == svga->curr.num_vertex_buffers);
+
+   /*
+    * unmap vertex/index buffers
+    */
+   for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
+      pipe_buffer_unmap(&svga->pipe, svga->curr.vb[i].buffer,
+                        vb_transfer[i]);
+      draw_set_mapped_vertex_buffer(draw, i, NULL);
+   }
+
+   if (indexBuffer) {
+      pipe_buffer_unmap(&svga->pipe, indexBuffer, ib_transfer);
+      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+   }
+
+   if (vs_cb) {
+      pipe_buffer_unmap(&svga->pipe,
+                        vs_cb,
+                        cb_transfer);
+      /* Like the vertex and index buffers above, do not leave the draw
+       * module holding a pointer into an unmapped buffer.
+       */
+      draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, NULL, 0);
+   }
+
+   return ret;
+}
+
+
+
+
+/**
+ * Initialise the swtnl path of an svga context.
+ *
+ * Creates the vbuf render backend and a draw context, plugs the
+ * backend into the draw context as its rasterize stage / renderer and
+ * installs the aaline, aapoint and pstipple stages.
+ *
+ * \return TRUE on success.  On failure everything created so far is
+ *         destroyed, the swtnl pointers are reset to NULL and FALSE is
+ *         returned.
+ */
+boolean svga_init_swtnl( struct svga_context *svga )
+{
+   svga->swtnl.backend = svga_vbuf_render_create(svga);
+   if(!svga->swtnl.backend)
+      goto fail;
+
+   /*
+    * Create drawing context and plug our rendering stage into it.
+    */
+   svga->swtnl.draw = draw_create(&svga->pipe);
+   if (svga->swtnl.draw == NULL)
+      goto fail;
+
+
+   /* NOTE(review): the result of draw_vbuf_stage() is not checked here;
+    * confirm its failure/ownership semantics before adding a check.
+    */
+   draw_set_rasterize_stage(svga->swtnl.draw,
+                            draw_vbuf_stage( svga->swtnl.draw, svga->swtnl.backend ));
+
+   draw_set_render(svga->swtnl.draw, svga->swtnl.backend);
+
+   draw_install_aaline_stage(svga->swtnl.draw, &svga->pipe);
+   draw_install_aapoint_stage(svga->swtnl.draw, &svga->pipe);
+   draw_install_pstipple_stage(svga->swtnl.draw, &svga->pipe);
+
+   draw_set_driver_clipping(svga->swtnl.draw, debug_get_bool_option("SVGA_SWTNL_FSE", FALSE));
+
+   return TRUE;
+
+fail:
+   /* Destroy what was created and clear the pointers so that nothing
+    * can later touch (or free again) the destroyed objects.
+    */
+   if (svga->swtnl.backend) {
+      svga->swtnl.backend->destroy( svga->swtnl.backend );
+      svga->swtnl.backend = NULL;
+   }
+
+   if (svga->swtnl.draw) {
+      draw_destroy( svga->swtnl.draw );
+      svga->swtnl.draw = NULL;
+   }
+
+   return FALSE;
+}
+
+
+/**
+ * Tear down the swtnl path: destroys the draw context, if any, and
+ * clears the pointer so a repeated call is harmless.
+ *
+ * NOTE(review): svga->swtnl.backend is not destroyed here; presumably
+ * it is released together with the draw context's vbuf stage -- confirm.
+ */
+void svga_destroy_swtnl( struct svga_context *svga )
+{
+   if (svga->swtnl.draw) {
+      draw_destroy( svga->swtnl.draw );
+      svga->swtnl.draw = NULL;
+   }
+}
diff --git a/src/gallium/drivers/svga/svga_swtnl_private.h b/src/gallium/drivers/svga/svga_swtnl_private.h
new file mode 100644
index 0000000000..8d08070843
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_private.h
@@ -0,0 +1,95 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SWTNL_PRIVATE_H
+#define SVGA_SWTNL_PRIVATE_H
+
+#include "svga_swtnl.h"
+#include "draw/draw_vertex.h"
+
+#include "svga_types.h"
+#include "svga3d_reg.h"
+
+/**
+ * Primitive renderer for svga.
+ *
+ * Implements the draw module's vbuf_render interface (embedded as
+ * 'base', which must stay the first member because svga_vbuf_render()
+ * casts a vbuf_render pointer to this type).
+ */
+struct svga_vbuf_render {
+   struct vbuf_render base;
+
+   struct svga_context *svga;
+   /* vertex layout, rebuilt by svga_swtnl_update_vdecl() */
+   struct vertex_info vertex_info;
+
+   /* size in bytes of one vertex, as passed to allocate_vertices */
+   unsigned vertex_size;
+
+   /* primitive type recorded by set_primitive */
+   unsigned prim;
+
+   struct pipe_resource *vbuf;
+   struct pipe_resource *ibuf;
+   /* transfer of the currently mapped vertex buffer */
+   struct pipe_transfer *vbuf_transfer;
+   struct pipe_transfer *ibuf_transfer;
+
+   /* current size of buffer */
+   size_t vbuf_size;
+   size_t ibuf_size;
+
+   /* minimum size used when a new buffer is allocated */
+   size_t vbuf_alloc_size;
+   size_t ibuf_alloc_size;
+
+   /* current write place (byte offset into the buffer) */
+   size_t vbuf_offset;
+   size_t ibuf_offset;
+
+   /* bytes used by the current batch, starting at vbuf_offset */
+   size_t vbuf_used;
+
+   SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
+   /* vertex buffer offset added to every declaration when submitted */
+   unsigned vdecl_offset;
+   /* number of valid entries in vdecl */
+   unsigned vdecl_count;
+
+   /* index range recorded by unmap_vertices, used for indexed draws */
+   ushort min_index;
+   ushort max_index;
+};
+
+/**
+ * Downcast a generic vbuf_render to the svga renderer that embeds it.
+ * The pointer must not be NULL.
+ */
+static INLINE struct svga_vbuf_render *
+svga_vbuf_render( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *svga_render = (struct svga_vbuf_render *)render;
+
+   assert(svga_render);
+   return svga_render;
+}
+
+
+/* Create the vbuf render backend for the given svga context. */
+struct vbuf_render *
+svga_vbuf_render_create( struct svga_context *svga );
+
+
+/* Rebuild the swtnl vertex layout and vertex declarations of the
+ * context's render backend.
+ */
+int
+svga_swtnl_update_vdecl( struct svga_context *svga );
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
new file mode 100644
index 0000000000..a759238293
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -0,0 +1,237 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "util/u_inlines.h"
+#include "pipe/p_state.h"
+
+#include "svga_context.h"
+#include "svga_swtnl.h"
+#include "svga_state.h"
+
+#include "svga_swtnl_private.h"
+
+
+/* Per-primitive-type adjustments added to the viewport translation
+ * handed to the draw module (see set_draw_viewport).
+ */
+#define SVGA_POINT_ADJ_X -0.375
+#define SVGA_POINT_ADJ_Y -0.5
+
+#define SVGA_LINE_ADJ_X -0.5
+#define SVGA_LINE_ADJ_Y -0.5
+
+#define SVGA_TRIANGLE_ADJ_X -0.375
+#define SVGA_TRIANGLE_ADJ_Y -0.5
+
+
+/**
+ * Hand the current viewport to the draw module, with its x/y
+ * translation shifted by an adjustment that depends on the reduced
+ * primitive type.  Other primitive types leave the viewport unchanged.
+ */
+static void set_draw_viewport( struct svga_context *svga )
+{
+   struct pipe_viewport_state vp = svga->curr.viewport;
+   float adjx = 0;
+   float adjy = 0;
+
+   if (svga->curr.reduced_prim == PIPE_PRIM_POINTS) {
+      adjx = SVGA_POINT_ADJ_X;
+      adjy = SVGA_POINT_ADJ_Y;
+   }
+   else if (svga->curr.reduced_prim == PIPE_PRIM_LINES) {
+      /* XXX: This is to compensate for the fact that wide lines are
+       * going to be drawn with triangles, but we're not catching all
+       * cases where that will happen.
+       */
+      const boolean wide = svga->curr.rast->templ.line_width > 1.0;
+
+      adjx = wide ? SVGA_LINE_ADJ_X + 0.175 : SVGA_LINE_ADJ_X;
+      adjy = wide ? SVGA_LINE_ADJ_Y - 0.175 : SVGA_LINE_ADJ_Y;
+   }
+   else if (svga->curr.reduced_prim == PIPE_PRIM_TRIANGLES) {
+      adjx += SVGA_TRIANGLE_ADJ_X;
+      adjy += SVGA_TRIANGLE_ADJ_Y;
+   }
+
+   /* Only the x and y translation are touched. */
+   vp.translate[0] += adjx;
+   vp.translate[1] += adjy;
+
+   draw_set_viewport_state(svga->swtnl.draw, &vp);
+}
+
+/**
+ * Forward changed context state to the draw module.
+ *
+ * The draw module is flushed first; afterwards each piece of state
+ * whose SVGA_NEW_* bit is set in 'dirty' is passed on.  Always
+ * returns 0.
+ */
+static int update_swtnl_draw( struct svga_context *svga,
+                              unsigned dirty )
+{
+   struct draw_context *draw = svga->swtnl.draw;
+   const unsigned viewport_bits = (SVGA_NEW_VIEWPORT |
+                                   SVGA_NEW_REDUCED_PRIMITIVE |
+                                   SVGA_NEW_RAST);
+
+   draw_flush(draw);
+
+   if (dirty & SVGA_NEW_VS) {
+      draw_bind_vertex_shader(draw, svga->curr.vs->draw_shader);
+   }
+
+   if (dirty & SVGA_NEW_VBUFFER) {
+      draw_set_vertex_buffers(draw,
+                              svga->curr.num_vertex_buffers,
+                              svga->curr.vb);
+   }
+
+   if (dirty & SVGA_NEW_VELEMENT) {
+      draw_set_vertex_elements(draw,
+                               svga->curr.velems->count,
+                               svga->curr.velems->velem);
+   }
+
+   if (dirty & SVGA_NEW_CLIP) {
+      draw_set_clip_state(draw, &svga->curr.clip);
+   }
+
+   /* The viewport adjustment depends on primitive type and line width. */
+   if (dirty & viewport_bits) {
+      set_draw_viewport(svga);
+   }
+
+   if (dirty & SVGA_NEW_RAST) {
+      draw_set_rasterizer_state(draw,
+                                &svga->curr.rast->templ,
+                                (void *) svga->curr.rast);
+   }
+
+   if (dirty & SVGA_NEW_FRAME_BUFFER) {
+      draw_set_mrd(draw, svga->curr.depthscale);
+   }
+
+   return 0;
+}
+
+
+/* Tracked-state entry for update_swtnl_draw: a descriptive name, the
+ * SVGA_NEW_* bits it is associated with, and the update function.
+ */
+struct svga_tracked_state svga_update_swtnl_draw =
+{
+   "update draw module state",
+   (SVGA_NEW_VS |
+    SVGA_NEW_VBUFFER |
+    SVGA_NEW_VELEMENT |
+    SVGA_NEW_CLIP |
+    SVGA_NEW_VIEWPORT |
+    SVGA_NEW_RAST |
+    SVGA_NEW_FRAME_BUFFER |
+    SVGA_NEW_REDUCED_PRIMITIVE),
+   update_swtnl_draw
+};
+
+
+/**
+ * Rebuild the vertex layout emitted by the draw module and the
+ * matching SVGA3D vertex declarations.
+ *
+ * Position always comes first, followed by one attribute per fragment
+ * shader input (color, generic and fog semantics).  When the resulting
+ * declarations differ from the stored ones they are saved and
+ * svga->swtnl.new_vdecl is raised.  Always returns 0.
+ */
+int svga_swtnl_update_vdecl( struct svga_context *svga )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(svga->swtnl.backend);
+   struct draw_context *draw = svga->swtnl.draw;
+   struct vertex_info *vinfo = &svga_render->vertex_info;
+   const struct svga_fragment_shader *fs = svga->curr.fs;
+   const enum interp_mode colorInterp =
+      svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+   SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
+   int offset = 0;      /* running byte offset inside one vertex */
+   int nr_decls = 0;    /* declarations emitted so far */
+   int src, i;
+
+   memset(vinfo, 0, sizeof(*vinfo));
+   memset(vdecl, 0, sizeof(vdecl));
+
+   /* always add position */
+   src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
+   vinfo->attrib[0].emit = EMIT_4F;
+   vdecl[0].array.offset = offset;
+   vdecl[0].identity.type = SVGA3D_DECLTYPE_FLOAT4;
+   vdecl[0].identity.usage = SVGA3D_DECLUSAGE_POSITIONT;
+   vdecl[0].identity.usageIndex = 0;
+   offset += 16;
+   nr_decls++;
+
+   for (i = 0; i < fs->base.info.num_inputs; i++) {
+      const unsigned name = fs->base.info.input_semantic_name[i];
+      const unsigned index = fs->base.info.input_semantic_index[i];
+      SVGA3dVertexDecl *decl = &vdecl[nr_decls];
+      int attr_bytes;
+
+      src = draw_find_shader_output(draw, name, index);
+      decl->array.offset = offset;
+      decl->identity.usageIndex = index;
+
+      switch (name) {
+      case TGSI_SEMANTIC_COLOR:
+         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         decl->identity.usage = SVGA3D_DECLUSAGE_COLOR;
+         decl->identity.type = SVGA3D_DECLTYPE_FLOAT4;
+         attr_bytes = 16;
+         break;
+      case TGSI_SEMANTIC_GENERIC:
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         decl->identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
+         decl->identity.type = SVGA3D_DECLTYPE_FLOAT4;
+         /* generics are shifted up by one texcoord slot */
+         decl->identity.usageIndex += 1;
+         attr_bytes = 16;
+         break;
+      case TGSI_SEMANTIC_FOG:
+         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         decl->identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
+         decl->identity.type = SVGA3D_DECLTYPE_FLOAT1;
+         assert(decl->identity.usageIndex == 0);
+         attr_bytes = 4;
+         break;
+      case TGSI_SEMANTIC_POSITION:
+         /* generated internally, not a vertex shader output */
+         continue;
+      default:
+         assert(0);
+         continue;
+      }
+
+      /* An attribute was emitted: account for it. */
+      offset += attr_bytes;
+      nr_decls++;
+   }
+
+   draw_compute_vertex_size(vinfo);
+
+   /* Every declaration shares the stride of the complete vertex. */
+   svga_render->vdecl_count = nr_decls;
+   for (i = 0; i < svga_render->vdecl_count; i++)
+      vdecl[i].array.stride = offset;
+
+   /* Only flag a new vdecl when something actually changed. */
+   if (memcmp(svga_render->vdecl, vdecl, sizeof(vdecl)) != 0) {
+      memcpy(svga_render->vdecl, vdecl, sizeof(vdecl));
+      svga->swtnl.new_vdecl = TRUE;
+   }
+
+   return 0;
+}
+
+
+/**
+ * Tracked-state callback: rebuilds the swtnl vertex declaration.
+ * The dirty mask is not consulted.
+ */
+static int update_swtnl_vdecl( struct svga_context *svga,
+                               unsigned dirty )
+{
+   (void) dirty;
+
+   return svga_swtnl_update_vdecl( svga );
+}
+
+
+/* Tracked-state entry for update_swtnl_vdecl: a descriptive name, the
+ * SVGA_NEW_* bits it is associated with, and the update function.
+ */
+struct svga_tracked_state svga_update_swtnl_vdecl =
+{
+   "update draw module vdecl",
+   (SVGA_NEW_VS |
+    SVGA_NEW_FS),
+   update_swtnl_vdecl
+};
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
new file mode 100644
index 0000000000..0cd620189b
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -0,0 +1,282 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_defines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_memory.h"
+#include "util/u_bitmask.h"
+
+#include "svgadump/svga_shader_dump.h"
+
+#include "svga_context.h"
+#include "svga_tgsi.h"
+#include "svga_tgsi_emit.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+
+/* Sinkhole used only in error conditions: svga_shader_expand() points the
+ * emitter here when it cannot grow, so later token writes still have a target. */
+static char err_buf[128];
+
+#if 0 /* compiled out */
+static void svga_destroy_shader_emitter( struct svga_shader_emitter *emit )
+{
+   if (emit->buf != err_buf)   /* err_buf is static storage, never heap memory */
+      FREE(emit->buf);
+}
+#endif
+
+
+/* Double the emitter's buffer.  When growth is impossible (allocation failed,
+ * or the emitter is already on err_buf) park the emitter on err_buf and fail. */
+static boolean svga_shader_expand( struct svga_shader_emitter *emit )
+{
+   const unsigned grown_size = emit->size * 2;
+   char *grown = NULL;
+
+   /* err_buf is static storage and is never handed to REALLOC. */
+   if (emit->buf != err_buf)
+      grown = REALLOC(emit->buf, emit->size, grown_size);
+
+   if (grown != NULL) {
+      /* Keep the write cursor at the same offset inside the new block. */
+      emit->ptr = grown + (emit->ptr - emit->buf);
+      emit->buf = grown;
+      emit->size = grown_size;
+      return TRUE;
+   }
+   emit->buf = emit->ptr = err_buf;
+   emit->size = sizeof(err_buf);
+   return FALSE;
+}
+
+/* Ensure room for nr_dwords more 32-bit tokens, doubling as often as needed
+ * (one doubling may not be enough).  FALSE once the buffer cannot grow. */
+static INLINE boolean reserve( struct svga_shader_emitter *emit, unsigned nr_dwords )
+{
+   while (emit->ptr - emit->buf + nr_dwords * sizeof(unsigned) >= emit->size) {
+      if (!svga_shader_expand( emit ))
+         return FALSE;
+   }
+   return TRUE;
+}
+
+/* Append one 32-bit token to the shader buffer; FALSE if no room could be made. */
+boolean svga_shader_emit_dword( struct svga_shader_emitter *emit, unsigned dword )
+{
+   if (reserve(emit, 1)) {
+      *(unsigned *)emit->ptr = dword;
+      emit->ptr += sizeof(unsigned);
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Append nr 32-bit tokens copied from dwords; FALSE if no room could be made. */
+boolean svga_shader_emit_dwords( struct svga_shader_emitter *emit,
+                                 const unsigned *dwords, unsigned nr )
+{
+   if (reserve(emit, nr)) {
+      memcpy( emit->ptr, dwords, nr * sizeof(unsigned) );
+      emit->ptr += nr * sizeof(unsigned);
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Write an opcode token and, if an earlier instruction is on record, patch
+ * that one's size field with the token count that followed its opcode. */
+boolean svga_shader_emit_opcode( struct svga_shader_emitter *emit,
+                                 unsigned opcode )
+{
+   SVGA3dShaderInstToken *cur, *last;
+
+   if (!reserve(emit, 1))
+      return FALSE;
+   cur = (SVGA3dShaderInstToken *)emit->ptr;
+   cur->value = opcode;
+   if (emit->insn_offset != 0) {
+      last = (SVGA3dShaderInstToken *)(emit->buf + emit->insn_offset);
+      last->size = (cur - last) - 1;
+   }
+
+   /* Remember where this opcode lives so the next call can size it. */
+   emit->insn_offset = emit->ptr - emit->buf;
+   emit->ptr += sizeof(unsigned);
+   return TRUE;
+}
+
+#define SVGA3D_PS_2X (SVGA3D_PS_20 | 1)   /* pixel-shader header value used when use_sm30 is off */
+#define SVGA3D_VS_2X (SVGA3D_VS_20 | 1)   /* vertex-shader counterpart */
+
+/* Emit the leading version token.  emit->unit selects the pixel- or
+ * vertex-shader flavour and emit->use_sm30 selects 3.0 over 2.x; any other
+ * unit leaves the token at zero.
+ */
+static boolean svga_shader_emit_header( struct svga_shader_emitter *emit )
+{
+   SVGA3dShaderVersion version;
+
+   memset( &version, 0, sizeof version );
+
+   if (emit->unit == PIPE_SHADER_FRAGMENT)
+      version.value = emit->use_sm30 ? SVGA3D_PS_30 : SVGA3D_PS_2X;
+   else if (emit->unit == PIPE_SHADER_VERTEX)
+      version.value = emit->use_sm30 ? SVGA3D_VS_30 : SVGA3D_VS_2X;
+
+   return svga_shader_emit_dword( emit, version.value );
+}
+
+
+
+
+
+/* Parse a TGSI shader and translate it to the SVGA/DX9 serialized
+ * representation.
+ *
+ * The SVGA shader is emitted into an in-memory buffer that grows on
+ * demand.  On success the returned result owns that buffer (it is released
+ * by svga_destroy_shader_result()); on any failure NULL is returned.
+ */
+static struct svga_shader_result *
+svga_tgsi_translate( const struct svga_shader *shader,
+                     union svga_compile_key key,
+                     unsigned unit )
+{
+   struct svga_shader_result *result = NULL;
+   struct svga_shader_emitter emit;
+
+   memset(&emit, 0, sizeof(emit));
+
+   emit.use_sm30 = shader->use_sm30;
+   emit.size = 1024;
+   emit.buf = MALLOC(emit.size);
+   if (emit.buf == NULL)
+      goto fail;
+
+   emit.ptr = emit.buf;
+   emit.unit = unit;
+   emit.key = key;
+
+   tgsi_scan_shader( shader->tokens, &emit.info);
+
+   /* imm_start: first slot past the shader's own constants plus the extra
+    * constants the compile key asks for. */
+   emit.imm_start = emit.info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+   if (unit == PIPE_SHADER_FRAGMENT)
+      emit.imm_start += key.fkey.num_unnormalized_coords;
+
+   if (unit == PIPE_SHADER_VERTEX) {
+      emit.imm_start += key.vkey.need_prescale ? 2 : 0;
+      emit.imm_start += key.vkey.num_zero_stride_vertex_elements;
+   }
+
+   emit.nr_hw_const = (emit.imm_start + emit.info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+   emit.nr_hw_temp = emit.info.file_max[TGSI_FILE_TEMPORARY] + 1;
+   emit.in_main_func = TRUE;
+
+   if (!svga_shader_emit_header( &emit ))
+      goto fail;
+
+   if (!svga_shader_emit_instructions( &emit, shader->tokens ))
+      goto fail;
+
+   result = CALLOC_STRUCT(svga_shader_result);
+   if (result == NULL)
+      goto fail;
+
+   result->shader = shader;
+   result->tokens = (const unsigned *)emit.buf;
+   result->nr_tokens = (emit.ptr - emit.buf) / sizeof(unsigned);
+   memcpy(&result->key, &key, sizeof key);
+   result->id = UTIL_BITMASK_INVALID_INDEX;
+
+   if (SVGA_DEBUG & DEBUG_TGSI)
+   {
+      debug_printf( "#####################################\n" );
+      debug_printf( "Shader %u below\n", shader->id );
+      tgsi_dump( shader->tokens, 0 );
+      debug_printf( "Shader %u compiled below\n", shader->id );
+      svga_shader_dump( result->tokens,
+                        result->nr_tokens ,
+                        FALSE );
+      debug_printf( "#####################################\n" );
+   }
+
+   return result;
+
+fail:
+   /* After a failed expansion emit.buf is the static err_buf, which must
+    * never be passed to FREE(). */
+   FREE(result);
+   if (emit.buf != err_buf)
+      FREE(emit.buf);
+   return NULL;
+}
+
+
+
+
+/* Compile a fragment shader for fkey.  The key union is zeroed first so every
+ * byte later copied into the result's key is defined. */
+struct svga_shader_result *
+svga_translate_fragment_program( const struct svga_fragment_shader *fs,
+                                 const struct svga_fs_compile_key *fkey )
+{
+   union svga_compile_key key;
+   memset(&key, 0, sizeof key);
+   memcpy(&key.fkey, fkey, sizeof *fkey);
+   return svga_tgsi_translate( &fs->base, key, PIPE_SHADER_FRAGMENT );
+}
+
+/* Compile a vertex shader for vkey.  The key union is zeroed first: vkey does
+ * not fill it, and the whole union is later copied into the result's key. */
+struct svga_shader_result *
+svga_translate_vertex_program( const struct svga_vertex_shader *vs,
+                               const struct svga_vs_compile_key *vkey )
+{
+   union svga_compile_key key;
+   memset(&key, 0, sizeof key);
+   memcpy(&key.vkey, vkey, sizeof *vkey);
+   return svga_tgsi_translate( &vs->base, key, PIPE_SHADER_VERTEX );
+}
+
+
+/* Release a compilation result and the token buffer it owns (NULL: no-op). */
+void svga_destroy_shader_result( struct svga_shader_result *result )
+{
+   if (result) FREE((unsigned *)result->tokens);
+   FREE(result);
+}
+
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
new file mode 100644
index 0000000000..7ea909c37b
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -0,0 +1,137 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_TGSI_H
+#define SVGA_TGSI_H
+
+#include "pipe/p_state.h"
+
+#include "svga_hw_reg.h"
+
+struct svga_fragment_shader;
+struct svga_vertex_shader;
+struct svga_shader;
+struct tgsi_shader_info;
+struct tgsi_token;
+
+
+struct svga_vs_compile_key
+{
+   unsigned zero_stride_vertex_elements;         /* bitmask tested per input as (1 << idx) */
+   unsigned need_prescale:1;                     /* when set, two extra constant slots are reserved */
+   unsigned allow_psiz:1;                        /* when clear, the VS 3.0 PSIZE output is not declared */
+   unsigned num_zero_stride_vertex_elements:6;   /* extra constant slots reserved for those elements */
+};
+
+struct svga_fs_compile_key
+{
+   unsigned light_twoside:1;             /* PS 3.0 colour inputs also declare a back colour and the face register */
+   unsigned front_ccw:1;
+   unsigned white_fragments:1;           /* PS 3.0 colour outputs are staged in a temporary */
+   unsigned num_textures:8;              /* number of tex[] entries counted by svga_fs_key_size() */
+   unsigned num_unnormalized_coords:8;   /* extra constant slots reserved ahead of the immediates */
+   struct {
+      unsigned compare_mode:1;
+      unsigned compare_func:3;
+      unsigned unnormalized:1;
+      unsigned width_height_idx:7;
+      unsigned texture_target:8;
+   } tex[PIPE_MAX_SAMPLERS];             /* per-sampler state, one entry per sampler slot */
+};
+
+union svga_compile_key {               /* one key type shared by both shader stages */
+   struct svga_vs_compile_key vkey;    /* filled for PIPE_SHADER_VERTEX */
+   struct svga_fs_compile_key fkey;    /* filled for PIPE_SHADER_FRAGMENT */
+};
+
+struct svga_shader_result
+{
+   const struct svga_shader *shader;   /* shader this result was compiled from */
+
+   /* Parameters used to generate this compilation result:
+    */
+   union svga_compile_key key;
+
+   /* Compiled shader tokens (freed by svga_destroy_shader_result()):
+    */
+   const unsigned *tokens;
+   unsigned nr_tokens;                 /* length of tokens[] in 32-bit words */
+
+   /* SVGA Shader ID (UTIL_BITMASK_INVALID_INDEX right after translation):
+    */
+   unsigned id;
+   
+   /* Next compilation result:
+    */
+   struct svga_shader_result *next;
+};
+
+
+/* TGSI doesn't provide us with VS input semantics (they're actually
+ * pretty meaningless), so we just generate some plausible ones here:
+ * input 0 becomes POSITION 0, input N > 0 becomes TEXCOORD N-1.
+ * This is called both from within the TGSI translator and when
+ * building vdecls to ensure they match up.
+ *
+ * The real use of this information is matching vertex elements to
+ * fragment shader inputs in the case where vertex shader is disabled.
+ */
+static INLINE void svga_generate_vdecl_semantics( unsigned idx,
+                                                  unsigned *usage,
+                                                  unsigned *usage_index )
+{
+   if (idx != 0) {
+      *usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      *usage_index = idx - 1;
+      return;
+   }
+   *usage = SVGA3D_DECLUSAGE_POSITION;
+   *usage_index = 0;
+}
+
+
+
+static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
+{
+   return sizeof *key;   /* vertex keys have a fixed size: the whole struct */
+}
+
+static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
+{  /* size = byte offset of tex[num_textures], i.e. only the tex[] entries in use are counted */
+   return (const char *)&key->tex[key->num_textures] - (const char *)key;
+}
+
+/* Compile entry points (each returns NULL on failure) and the matching destructor. */
+struct svga_shader_result *
+svga_translate_fragment_program( const struct svga_fragment_shader *fs,
+                                 const struct svga_fs_compile_key *fkey );
+
+struct svga_shader_result *
+svga_translate_vertex_program( const struct svga_vertex_shader *vs,
+                               const struct svga_vs_compile_key *vkey );
+
+void svga_destroy_shader_result( struct svga_shader_result *result );
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
new file mode 100644
index 0000000000..1ae9906761
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
@@ -0,0 +1,277 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+
+#include "svga_tgsi_emit.h"
+
+
+/* Declare one PS 2.x input and record in emit->input_map[idx] the hardware
+ * register it maps to: POSITION comes from the MISCTYPE position register,
+ * COLOR n from input register n, FOG from texture register 0 and GENERIC n
+ * from texture register n+1.
+ */
+static boolean ps20_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   const SVGA3dShaderInstToken dcl_op = inst_token( SVGA3DOP_DCL );
+   struct src_register hw_reg;
+   SVGA3DOpDclArgs args;
+
+   if (semantic.Name == TGSI_SEMANTIC_POSITION) {
+      /* Special case: */
+      hw_reg = src_register( SVGA3DREG_MISCTYPE, SVGA3DMISCREG_POSITION );
+   }
+   else if (semantic.Name == TGSI_SEMANTIC_COLOR) {
+      hw_reg = src_register( SVGA3DREG_INPUT, semantic.Index );
+   }
+   else if (semantic.Name == TGSI_SEMANTIC_FOG) {
+      assert(semantic.Index == 0);
+      hw_reg = src_register( SVGA3DREG_TEXTURE, 0 );
+   }
+   else if (semantic.Name == TGSI_SEMANTIC_GENERIC) {
+      hw_reg = src_register( SVGA3DREG_TEXTURE, semantic.Index + 1 );
+   }
+   else {
+      /* Unknown semantic: nothing is declared, yet success is reported. */
+      assert(0);
+      return TRUE;
+   }
+
+   emit->input_map[idx] = hw_reg;
+
+   /* Usage and index are left at 0 for PS 2.x inputs; bit 31 of the first
+    * dword is set, as in the other DCL emitters. */
+   args.values[0] = 0;
+   args.values[1] = 0;
+   args.dst = dst( hw_reg );
+   args.usage = 0;
+   args.index = 0;
+   args.values[0] |= 1<<31;
+
+   if (!emit_instruction(emit, dcl_op))
+      return FALSE;
+   return svga_shader_emit_dwords( emit, args.values, Elements(args.values) );
+}
+
+
+/* Map one PS 2.x output; no DCL is emitted.  COLOR n and POSITION (depth)
+ * are staged in a fresh temporary, with the true register kept alongside. */
+static boolean ps20_output( struct svga_shader_emitter *emit,
+                            struct tgsi_declaration_semantic semantic,
+                            unsigned idx )
+{
+   switch (semantic.Name) {
+   case TGSI_SEMANTIC_COLOR:
+      if (semantic.Index < PIPE_MAX_COLOR_BUFS) {
+         unsigned cbuf = semantic.Index;
+
+         emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                               emit->nr_hw_temp++ );
+         emit->temp_col[cbuf] = emit->output_map[idx];
+         emit->true_col[cbuf] = dst_register( SVGA3DREG_COLOROUT, semantic.Index );
+      }
+      else {
+         /* Out-of-range colour buffer: fall back to COLOROUT 0 so the map
+          * entry is a real register even when asserts are compiled out. */
+         assert(0);
+         emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      }
+      break;
+   case TGSI_SEMANTIC_POSITION:
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT, semantic.Index );
+      break;
+   default:
+      assert(0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      break;
+   }
+
+   return TRUE;
+}
+
+
+/* Declare VS 2.x input idx as hardware input register idx and record the
+ * mapping in emit->input_map.  The usage / usage index come from
+ * svga_generate_vdecl_semantics(), as in vs30_input().
+ */
+static boolean vs20_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+   unsigned usage, index;
+
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   emit->input_map[idx] = src_register( SVGA3DREG_INPUT, idx );
+   dcl.dst = dst_register( SVGA3DREG_INPUT, idx );
+
+   assert(dcl.dst.reserved0);
+
+   /* Mesa doesn't provide us with VS input semantics (they're
+    * actually pretty meaningless), so we generate some plausible
+    * ones with the helper that the vdecl code also uses; sharing
+    * it is what keeps the two in agreement.
+    */
+   svga_generate_vdecl_semantics( idx, &usage, &index );
+   dcl.usage = usage;
+   dcl.index = index;
+
+   dcl.values[0] |= 1<<31;
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+
+/* Record where VS 2.x output idx lives.  No DCL instruction is emitted for
+ * VS 2.x outputs; only the register map is built.  POSITION and PSIZE are
+ * staged in fresh temporaries (their RASTOUT registers are kept in true_pos /
+ * true_psiz); FOG, COLOR and GENERIC map directly to output registers.
+ */
+static boolean vs20_output( struct svga_shader_emitter *emit,
+                         struct tgsi_declaration_semantic semantic,
+                         unsigned idx )
+{
+   const unsigned name = semantic.Name;
+
+   if (name == TGSI_SEMANTIC_POSITION) {
+      assert(semantic.Index == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dst_register( SVGA3DREG_RASTOUT, SVGA3DRASTOUT_POSITION );
+   }
+   else if (name == TGSI_SEMANTIC_PSIZE) {
+      assert(semantic.Index == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_psiz = emit->output_map[idx];
+      emit->true_psiz = dst_register( SVGA3DREG_RASTOUT, SVGA3DRASTOUT_PSIZE );
+   }
+   else if (name == TGSI_SEMANTIC_FOG) {
+      /* Fog uses texcoord output 0. */
+      assert(semantic.Index == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEXCRDOUT, 0 );
+   }
+   else if (name == TGSI_SEMANTIC_COLOR) {
+      /* oD0 */
+      emit->output_map[idx] = dst_register( SVGA3DREG_ATTROUT, semantic.Index );
+   }
+   else if (name == TGSI_SEMANTIC_GENERIC) {
+      /* Generic n uses texcoord output n+1, after the fog slot. */
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEXCRDOUT,
+                                            semantic.Index + 1 );
+   }
+   else {
+      assert(0);
+      emit->output_map[idx] = dst_register(  SVGA3DREG_TEMP, 0 );
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* Emit the DCL for PS 2.x sampler idx.  NOTE(review): unlike the other DCL
+ * emitters here, bit 31 of values[0] is not set - confirm this is intended. */
+static boolean ps20_sampler( struct svga_shader_emitter *emit,
+                          struct tgsi_declaration_semantic semantic,
+                          unsigned idx )
+{
+   const SVGA3dShaderInstToken dcl_op = inst_token( SVGA3DOP_DCL );
+   SVGA3DOpDclArgs args;
+
+   args.values[0] = 0;
+   args.values[1] = 0;
+   args.dst = dst_register( SVGA3DREG_SAMPLER, idx );
+   args.type = svga_tgsi_sampler_type( emit, idx );
+   if (!emit_instruction(emit, dcl_op))
+      return FALSE;
+   return svga_shader_emit_dwords( emit, args.values, Elements(args.values) );
+}
+
+
+/* Translate one TGSI declaration for shader model 2.x.
+ *
+ * Each register in decl->Range is handed to the sampler, input or output
+ * handler for emit->unit; other register files need no declaration.
+ * Returns FALSE as soon as a handler fails, TRUE otherwise.
+ */
+boolean svga_translate_decl_sm20( struct svga_shader_emitter *emit,
+                             const struct tgsi_full_declaration *decl )
+{
+   unsigned first = decl->Range.First;
+   unsigned last = decl->Range.Last;
+   unsigned idx;
+
+   for( idx = first; idx <= last; idx++ ) {
+      boolean ok;
+
+      switch (decl->Declaration.File) {
+      case TGSI_FILE_SAMPLER:
+         assert (emit->unit == PIPE_SHADER_FRAGMENT);
+         ok = ps20_sampler( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_INPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs20_input( emit, decl->Semantic, idx );
+         else
+            ok = ps20_input( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs20_output( emit, decl->Semantic, idx );
+         else
+            ok = ps20_output( emit, decl->Semantic, idx );
+         break;
+
+      default:
+         /* don't need to declare other vars */
+         ok = TRUE;
+         break;
+      }
+
+      if (!ok)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
new file mode 100644
index 0000000000..73102a72a8
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -0,0 +1,395 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+
+#include "svga_tgsi_emit.h"
+
+/* Map a TGSI semantic onto an SVGA3D declaration usage / usage-index pair.
+ * An unknown semantic stores TEXCOORD 0 and returns FALSE. */
+static boolean translate_vs_ps_semantic( struct tgsi_declaration_semantic semantic,
+                                         unsigned *usage,
+                                         unsigned *idx )
+{
+   unsigned decl_usage;
+   unsigned decl_index = semantic.Index;
+   switch (semantic.Name) {
+   case TGSI_SEMANTIC_POSITION:
+      decl_usage = SVGA3D_DECLUSAGE_POSITION;
+      break;
+   case TGSI_SEMANTIC_COLOR:
+      decl_usage = SVGA3D_DECLUSAGE_COLOR;
+      break;
+   case TGSI_SEMANTIC_BCOLOR:
+      decl_usage = SVGA3D_DECLUSAGE_COLOR;
+      decl_index += 2; /* sharing with COLOR */
+      break;
+   case TGSI_SEMANTIC_FOG:
+      assert(semantic.Index == 0);
+      decl_usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      decl_index = 0;
+      break;
+   case TGSI_SEMANTIC_PSIZE:
+      decl_usage = SVGA3D_DECLUSAGE_PSIZE;
+      break;
+   case TGSI_SEMANTIC_GENERIC:
+      decl_usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      decl_index += 1; /* texcoord[0] is reserved for fog */
+      break;
+   case TGSI_SEMANTIC_NORMAL:
+      decl_usage = SVGA3D_DECLUSAGE_NORMAL;
+      break;
+   default:
+      assert(0);
+      *usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      *idx = 0;
+      return FALSE;
+   }
+   *usage = decl_usage;
+   *idx = decl_index;
+   return TRUE;
+}
+
+
+/* Emit a DCL instruction binding reg to the given usage / usage index;
+ * bit 31 of the first argument dword is always set. */
+static boolean emit_decl( struct svga_shader_emitter *emit,
+                          SVGA3dShaderDestToken reg,
+                          unsigned usage, unsigned index )
+{
+   const SVGA3dShaderInstToken dcl_op = inst_token( SVGA3DOP_DCL );
+   SVGA3DOpDclArgs args;
+
+   args.values[0] = 0;
+   args.values[1] = 0;
+   args.dst = reg;
+   args.usage = usage;
+   args.index = index;
+   args.values[0] |= 1<<31;
+
+   if (!emit_instruction(emit, dcl_op))
+      return FALSE;
+   return svga_shader_emit_dwords( emit, args.values, Elements(args.values) );
+}
+
+/* Declare the MISCTYPE face register, at most once per shader. */
+static boolean emit_vface_decl( struct svga_shader_emitter *emit )
+{
+   SVGA3dShaderDestToken face_reg;
+
+   if (emit->emitted_vface)
+      return TRUE;
+
+   face_reg = dst_register( SVGA3DREG_MISCTYPE, SVGA3DMISCREG_FACE );
+   if (!emit_decl( emit, face_reg, 0, 0 ))
+      return FALSE;
+   emit->emitted_vface = TRUE;
+   return TRUE;
+}
+
+static boolean ps30_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   unsigned usage, index;
+   SVGA3dShaderDestToken reg;
+   /* Declare PS 3.0 input idx and record its register in emit->input_map. */
+   if (semantic.Name == TGSI_SEMANTIC_POSITION) {
+      emit->input_map[idx] = src_register( SVGA3DREG_MISCTYPE,
+                                           SVGA3DMISCREG_POSITION );
+      /* Reads use an X,Y,Y,Y swizzle; only .xy is declared below. */
+      emit->input_map[idx].base.swizzle = TRANSLATE_SWIZZLE( TGSI_SWIZZLE_X,
+                                                             TGSI_SWIZZLE_Y,
+                                                             TGSI_SWIZZLE_Y,
+                                                             TGSI_SWIZZLE_Y );
+
+      reg = writemask( dst(emit->input_map[idx]),
+                       TGSI_WRITEMASK_XY );
+
+      return emit_decl( emit, reg, 0, 0 );
+   }
+   else if (emit->key.fkey.light_twoside &&
+            (semantic.Name == TGSI_SEMANTIC_COLOR)) {
+      /* Two-sided lighting: the front colour takes the next input register... */
+      if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+         return FALSE;
+
+      emit->internal_color_idx[emit->internal_color_count] = idx;
+      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, emit->ps30_input_count );
+      emit->ps30_input_count++;
+      emit->internal_color_count++;
+
+      reg = dst( emit->input_map[idx] );
+
+      if (!emit_decl( emit, reg, usage, index ))
+         return FALSE;
+      /* ...and the matching back colour (BCOLOR) the register after it. */
+      semantic.Name = TGSI_SEMANTIC_BCOLOR;
+      if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+         return FALSE;
+
+      reg = dst_register( SVGA3DREG_INPUT, emit->ps30_input_count++ );
+
+      if (!emit_decl( emit, reg, usage, index ))
+         return FALSE;
+      /* The face register is declared as well (once per shader). */
+      if (!emit_vface_decl( emit ))
+         return FALSE;
+
+      return TRUE;
+   }
+   else if (semantic.Name == TGSI_SEMANTIC_FACE) {
+      if (!emit_vface_decl( emit ))
+         return FALSE;
+      emit->emit_frontface = TRUE;
+      emit->internal_frontface_idx = idx;
+      return TRUE;
+   }
+   else {
+      /* Any other semantic simply takes the next free input register. */
+      if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+         return FALSE;
+
+      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, emit->ps30_input_count++ );
+      reg = dst( emit->input_map[idx] );
+
+      return emit_decl( emit, reg, usage, index );
+   }
+
+}
+
+
+/* PS output registers are the same as 2.0, except that a colour output is
+ * staged in a temporary only when the key's white_fragments flag is set.
+ */
+static boolean ps30_output( struct svga_shader_emitter *emit,
+                            struct tgsi_declaration_semantic semantic,
+                            unsigned idx )
+{
+   switch (semantic.Name) {
+   case TGSI_SEMANTIC_COLOR:
+      if (emit->unit == PIPE_SHADER_FRAGMENT &&
+          emit->key.fkey.white_fragments) {
+         /* NOTE(review): indexed by idx here, by semantic.Index in ps20_output(). */
+         emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                               emit->nr_hw_temp++ );
+         emit->temp_col[idx] = emit->output_map[idx];
+         emit->true_col[idx] = dst_register( SVGA3DREG_COLOROUT,
+                                              semantic.Index );
+      }
+      else {
+         emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT,
+                                               semantic.Index );
+      }
+      break;
+   case TGSI_SEMANTIC_POSITION:
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT, semantic.Index );
+      break;
+   default:
+      /* Unknown semantic: fall back to COLOROUT 0 so the map entry is a
+       * real register even when asserts are compiled out. */
+      assert(0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      break;
+   }
+
+   return TRUE;
+}
+
+
+/* We still make up the input semantics the same as in 2.0.  An input flagged
+ * in the key's zero_stride_vertex_elements mask is not declared; it is mapped
+ * to a constant register instead. */
+static boolean vs30_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   const unsigned zero_stride_mask = emit->key.vkey.zero_stride_vertex_elements;
+   SVGA3DOpDclArgs args;
+   SVGA3dShaderInstToken dcl_op;
+   unsigned usage, index;
+
+   dcl_op = inst_token( SVGA3DOP_DCL );
+   args.values[0] = 0;
+   args.values[1] = 0;
+
+   if (!(zero_stride_mask & (1 << idx))) {
+      /* Ordinary attribute: declare input register idx. */
+      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, idx );
+      args.dst = dst_register( SVGA3DREG_INPUT, idx );
+      assert(args.dst.reserved0);
+      svga_generate_vdecl_semantics( idx, &usage, &index );
+      args.usage = usage;
+      args.index = index;
+      args.values[0] |= 1<<31;
+
+      return  (emit_instruction(emit, dcl_op) &&
+               svga_shader_emit_dwords( emit, args.values, Elements(args.values)));
+   }
+   else {
+      /* Slot = first one past the shader's constants and the optional two
+       * prescale constants, plus the zero-stride elements below idx. */
+      unsigned slot = emit->info.file_max[TGSI_FILE_CONSTANT] + 1;
+      unsigned i;
+
+      slot += emit->key.vkey.need_prescale ? 2 : 0;
+      for (i = 0; i < PIPE_MAX_ATTRIBS && i < idx; ++i) {
+         if (zero_stride_mask & (1<<i))
+            ++slot;
+      }
+      emit->input_map[idx] = src_register( SVGA3DREG_CONST, slot );
+   }
+   return TRUE;
+}
+
+/* VS3.0 outputs have proper declarations and semantic info for
+ * matching against PS inputs.  FALSE if the semantic is unknown or emission fails.
+ */
+static boolean vs30_output( struct svga_shader_emitter *emit,
+                         struct tgsi_declaration_semantic semantic,
+                         unsigned idx )
+{
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+   unsigned usage, index;
+
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+      return FALSE;
+   /* Build the DCL for output register idx up front; it is emitted at the end. */
+   dcl.dst = dst_register( SVGA3DREG_OUTPUT, idx );
+   dcl.usage = usage;
+   dcl.index = index;
+   dcl.values[0] |= 1<<31;
+   /* POSITION and PSIZE are staged in temporaries; all else maps straight to dcl.dst. */
+   if (semantic.Name == TGSI_SEMANTIC_POSITION) {
+      assert(idx == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dcl.dst;
+   }
+   else if (semantic.Name == TGSI_SEMANTIC_PSIZE) {
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_psiz = emit->output_map[idx];
+
+      /* This has the effect of not declaring psiz (below) and not 
+       * emitting the final MOV to true_psiz in the postamble.
+       */
+      if (!emit->key.vkey.allow_psiz)
+         return TRUE;
+
+      emit->true_psiz = dcl.dst;
+   }
+   else {
+      emit->output_map[idx] = dcl.dst;
+   }
+
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+/* Emit the DCL for PS 3.0 sampler idx, typed via svga_tgsi_sampler_type(). */
+static boolean ps30_sampler( struct svga_shader_emitter *emit,
+                          struct tgsi_declaration_semantic semantic,
+                          unsigned idx )
+{
+   const SVGA3dShaderInstToken dcl_op = inst_token( SVGA3DOP_DCL );
+   SVGA3DOpDclArgs args;
+
+   args.values[0] = 0;
+   args.values[1] = 0;
+   args.dst = dst_register( SVGA3DREG_SAMPLER, idx );
+   args.type = svga_tgsi_sampler_type( emit, idx );
+   args.values[0] |= 1<<31;
+
+   if (!emit_instruction(emit, dcl_op))
+      return FALSE;
+   return svga_shader_emit_dwords( emit, args.values, Elements(args.values) );
+}
+
+
+/* Translate one TGSI declaration for shader model 3.0.
+ *
+ * Each register in decl->Range is handed to the sampler, input or output
+ * handler for emit->unit; other register files need no declaration.
+ * Returns FALSE as soon as a handler fails, TRUE otherwise.
+ */
+boolean svga_translate_decl_sm30( struct svga_shader_emitter *emit,
+                             const struct tgsi_full_declaration *decl )
+{
+   unsigned first = decl->Range.First;
+   unsigned last = decl->Range.Last;
+   unsigned idx;
+
+   for( idx = first; idx <= last; idx++ ) {
+      boolean ok;
+
+      switch (decl->Declaration.File) {
+      case TGSI_FILE_SAMPLER:
+         assert (emit->unit == PIPE_SHADER_FRAGMENT);
+         ok = ps30_sampler( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_INPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs30_input( emit, decl->Semantic, idx );
+         else
+            ok = ps30_input( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs30_output( emit, decl->Semantic, idx );
+         else
+            ok = ps30_output( emit, decl->Semantic, idx );
+         break;
+
+      default:
+         /* don't need to declare other vars */
+         ok = TRUE;
+         break;
+      }
+
+      if (!ok)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
new file mode 100644
index 0000000000..48eced2ece
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -0,0 +1,366 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_TGSI_EMIT_H
+#define SVGA_TGSI_EMIT_H
+
+#include "tgsi/tgsi_scan.h"
+#include "svga_hw_reg.h"
+#include "svga_tgsi.h"
+#include "svga3d_shaderdefs.h"
+
+struct src_register   /* one source operand as written by emit_src() */
+{
+   SVGA3dShaderSrcToken base;      /* always emitted */
+   SVGA3dShaderSrcToken indirect;  /* emitted after base only when base.relAddr is set */
+};
+
+
+struct svga_arl_consts {   /* one adjustment value tied to an ARL */
+   int number;    /* the value itself; stored into a float constant by create_arl_consts() */
+   int idx;       /* hardware constant register that holds 'number' */
+   int swizzle;   /* component selector of 'number' inside that constant */
+   int arl_num;   /* compared against svga_shader_emitter::current_arl to find this entry */
+};
+
+/* Internal functions:
+ */
+
+struct svga_shader_emitter
+{
+   boolean use_sm30;
+   /* NOTE(review): size/buf/ptr look like the output token buffer and its write cursor -- confirm */
+   unsigned size;
+   char *buf;
+   char *ptr;
+
+   union svga_compile_key key;      /* key.fkey.tex[] supplies per-sampler texture information */
+   struct tgsi_shader_info info;    /* info.file_max[] is read by get_tex_dimensions() */
+   int unit;                        /* compared against PIPE_SHADER_VERTEX / PIPE_SHADER_FRAGMENT */
+
+   int imm_start;                   /* added to TGSI immediate indices to form CONST register numbers */
+
+   int nr_hw_const;                 /* next free hardware constant; the create_*() helpers post-increment it */
+   int nr_hw_temp;                  /* first TEMP register number handed out by get_temp() */
+   
+   int insn_offset;
+
+   int internal_temp_count;         /* temps handed out by get_temp() since the last reset_temp_regs() */
+   int internal_imm_count;
+
+   int internal_color_idx[2]; /* diffuse, specular */
+   int internal_color_count;
+
+   boolean emitted_vface;           /* asserted by get_vface() before the face register is used */
+   boolean emit_frontface;
+   int internal_frontface_idx;
+
+   int ps30_input_count;
+
+   int dynamic_branching_level;     /* incremented by emit_if(), decremented by emit_endif() */
+
+   boolean in_main_func;
+
+   boolean created_zero_immediate;  /* set by create_zero_immediate() */
+   int zero_immediate_idx;          /* CONST register holding {0, 0, 0, 1} */
+
+   boolean created_loop_const;      /* set by create_loop_const() */
+   int loop_const_idx;              /* CONSTINT register holding the loop parameters */
+
+   boolean created_sincos_consts;   /* set by create_sincos_consts() */
+   int sincos_consts_idx;           /* first of two consecutive CONST registers */
+
+   unsigned label[32];
+   unsigned nr_labels;
+
+   struct src_register input_map[PIPE_MAX_ATTRIBS];      /* indexed by TGSI input register index */
+   SVGA3dShaderDestToken output_map[PIPE_MAX_ATTRIBS];   /* indexed by TGSI output register index */
+
+   struct src_register imm_0055;
+   SVGA3dShaderDestToken temp_pos;
+   SVGA3dShaderDestToken true_pos;
+
+   SVGA3dShaderDestToken temp_col[PIPE_MAX_COLOR_BUFS];
+   SVGA3dShaderDestToken true_col[PIPE_MAX_COLOR_BUFS];
+
+   SVGA3dShaderDestToken temp_psiz;
+   SVGA3dShaderDestToken true_psiz;
+
+   struct svga_arl_consts arl_consts[12];   /* ARL adjustment values; see create_arl_consts() */
+   int num_arl_consts;                      /* number of valid arl_consts[] entries */
+   int current_arl;                         /* matched against arl_consts[].arl_num */
+};
+
+
+boolean svga_shader_emit_dword( struct svga_shader_emitter *emit,
+                                unsigned dword );
+
+boolean svga_shader_emit_dwords( struct svga_shader_emitter *emit,
+                                 const unsigned *dwords,
+                                 unsigned nr );
+
+boolean svga_shader_emit_opcode( struct svga_shader_emitter *emit,
+                                 unsigned opcode );
+
+boolean svga_shader_emit_instructions( struct svga_shader_emitter *emit,
+                                       const struct tgsi_token *tokens );
+
+boolean svga_translate_decl_sm20( struct svga_shader_emitter *emit,
+                               const struct tgsi_full_declaration *decl );
+
+boolean svga_translate_decl_sm30( struct svga_shader_emitter *emit,
+                               const struct tgsi_full_declaration *decl );
+
+
+static INLINE boolean emit_dst( struct svga_shader_emitter *emit,
+                         SVGA3dShaderDestToken dest )
+{  /* Append one destination token (dest.value) to the shader. */
+   assert(dest.reserved0);  /* dst_register() sets this bit on every token it builds */
+   assert(dest.mask);       /* a zero write mask is rejected */
+   return svga_shader_emit_dword( emit, dest.value );
+}
+
+/* Write a source operand into the shader: the base token, then the
+ * indirect token as well when base.relAddr marks relative addressing.
+ */
+static INLINE boolean emit_src( struct svga_shader_emitter *emit,
+                         const struct src_register src )
+{
+   assert(src.base.reserved0);
+   if (!src.base.relAddr)
+      return svga_shader_emit_dword( emit, src.base.value );
+
+   assert(src.indirect.reserved0);
+   return (svga_shader_emit_dword( emit, src.base.value ) &&
+           svga_shader_emit_dword( emit, src.indirect.value ));
+}
+
+
+static INLINE boolean emit_instruction( struct svga_shader_emitter *emit,
+                                 SVGA3dShaderInstToken opcode )
+{  /* Start an instruction: pass the packed opcode token to svga_shader_emit_opcode(). */
+   return svga_shader_emit_opcode( emit, opcode.value );
+}
+
+
+/* Emit "inst dest, src0"; stops at the first token that fails to emit. */
+static INLINE boolean
+emit_op1( struct svga_shader_emitter *emit, SVGA3dShaderInstToken inst,
+          SVGA3dShaderDestToken dest, struct src_register src0 )
+{
+   if (!emit_instruction( emit, inst ) || !emit_dst( emit, dest ))
+      return FALSE;
+   return emit_src( emit, src0 ) ? TRUE : FALSE;
+}
+
+/* Emit "inst dest, src0, src1"; stops at the first token that fails. */
+static INLINE boolean
+emit_op2( struct svga_shader_emitter *emit, SVGA3dShaderInstToken inst,
+          SVGA3dShaderDestToken dest,
+          struct src_register src0, struct src_register src1 )
+{
+   if (!emit_instruction( emit, inst ) || !emit_dst( emit, dest ))
+      return FALSE;
+
+   return (emit_src( emit, src0 ) && emit_src( emit, src1 ));
+}
+
+/* Emit "inst dest, src0, src1, src2"; stops at the first token that
+ * fails to emit.
+ */
+static INLINE boolean
+emit_op3( struct svga_shader_emitter *emit, SVGA3dShaderInstToken inst,
+          SVGA3dShaderDestToken dest, struct src_register src0,
+          struct src_register src1, struct src_register src2 )
+{
+   if (!emit_instruction( emit, inst ) || !emit_dst( emit, dest ))
+      return FALSE;
+
+   return (emit_src( emit, src0 ) && emit_src( emit, src1 ) && emit_src( emit, src2 ));
+}
+
+
+/* Emit "inst dest, src0, src1, src2, src3"; stops at the first token
+ * that fails to emit.
+ */
+static INLINE boolean
+emit_op4( struct svga_shader_emitter *emit, SVGA3dShaderInstToken inst,
+          SVGA3dShaderDestToken dest,
+          struct src_register src0, struct src_register src1,
+          struct src_register src2, struct src_register src3)
+{
+   if (!emit_instruction( emit, inst ) || !emit_dst( emit, dest ))
+      return FALSE;
+
+   return (emit_src( emit, src0 ) && emit_src( emit, src1 ) &&
+           emit_src( emit, src2 ) && emit_src( emit, src3 ));
+}
+
+
+#define TRANSLATE_SWIZZLE(x,y,z,w)  ((x) | ((y) << 2) | ((z) << 4) | ((w) << 6)) /* pack four 2-bit selectors, x in the lowest bits */
+#define SWIZZLE_XYZW  /* identity: each component selects itself */ \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_W)
+#define SWIZZLE_XXXX  /* X replicated to all four components */ \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,TGSI_SWIZZLE_X,TGSI_SWIZZLE_X,TGSI_SWIZZLE_X)
+#define SWIZZLE_YYYY  /* Y replicated to all four components */ \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Y)
+#define SWIZZLE_ZZZZ  /* Z replicated to all four components */ \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_Z,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_Z)
+#define SWIZZLE_WWWW  /* W replicated to all four components */ \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_W,TGSI_SWIZZLE_W,TGSI_SWIZZLE_W,TGSI_SWIZZLE_W)
+
+
+
+static INLINE SVGA3dShaderInstToken
+inst_token( unsigned opcode )
+{
+   SVGA3dShaderInstToken inst;
+   /* Build an instruction token: clear every field, then store the opcode. */
+   inst.value = 0;
+   inst.op = opcode;
+
+   return inst;
+}
+
+static INLINE SVGA3dShaderDestToken 
+dst_register( unsigned file,
+              int number )
+{
+   SVGA3dShaderDestToken dest;
+   /* Build a destination token for register 'number' of register type 'file'. */
+   dest.value = 0;
+   dest.num = number;
+   dest.type_upper = file >> 3;     /* the register type is split: bits above the low three go here... */
+   dest.relAddr = 0;
+   dest.reserved1 = 0;
+   dest.mask = 0xf;                 /* default write mask: all four mask bits set */
+   dest.dstMod = 0;
+   dest.shfScale = 0;
+   dest.type_lower = file & 0x7;    /* ...and the low three bits here */
+   dest.reserved0 = 1;          /* is_reg */
+   
+   return dest;
+}
+
+static INLINE SVGA3dShaderDestToken
+writemask( SVGA3dShaderDestToken dest,
+           unsigned mask )
+{  /* Return 'dest' with its write mask narrowed to the bits also set in 'mask'. */
+   assert(dest.mask & mask);  /* the narrowed mask must keep at least one bit */
+   dest.mask &= mask;
+   return dest;
+}
+
+
+static INLINE SVGA3dShaderSrcToken 
+src_token( unsigned file, int number )
+{
+   SVGA3dShaderSrcToken src;
+   /* Build a source token for register 'number' of register type 'file'. */
+   src.value = 0;
+   src.num = number;
+   src.type_upper = file >> 3;      /* the register type is split: bits above the low three go here... */
+   src.relAddr = 0;                 /* direct addressing by default */
+   src.reserved1 = 0;
+   src.swizzle = SWIZZLE_XYZW;      /* identity swizzle */
+   src.srcMod = 0;
+   src.type_lower = file & 0x7;     /* ...and the low three bits here */
+   src.reserved0 = 1;           /* is_reg */
+
+   return src;
+}
+
+
+static INLINE struct src_register 
+absolute( struct src_register src )
+{  /* Return 'src' with the absolute-value source modifier. */
+   src.base.srcMod = SVGA3DSRCMOD_ABS;  /* replaces whatever modifier was set before */
+
+   return src;
+}
+
+
+/* Toggle negation on a source operand: NONE <-> NEG and ABS <-> ABSNEG.
+ * Any other source modifier is returned unchanged.
+ */
+static INLINE struct src_register
+negate( struct src_register src )
+{
+   const unsigned mod = src.base.srcMod;
+
+   if (mod == SVGA3DSRCMOD_NONE)
+      src.base.srcMod = SVGA3DSRCMOD_NEG;
+   else if (mod == SVGA3DSRCMOD_NEG)
+      src.base.srcMod = SVGA3DSRCMOD_NONE;
+   else if (mod == SVGA3DSRCMOD_ABS)
+      src.base.srcMod = SVGA3DSRCMOD_ABSNEG;
+   else if (mod == SVGA3DSRCMOD_ABSNEG)
+      src.base.srcMod = SVGA3DSRCMOD_ABS;
+
+   return src;
+}
+
+
+static INLINE struct src_register 
+src_register( unsigned file, int number )
+{
+   struct src_register src;
+   /* Build a directly addressed source operand; the indirect token is zeroed. */
+   src.base = src_token( file, number );
+   src.indirect.value = 0;
+
+   return src;
+}
+
+static INLINE SVGA3dShaderDestToken dst( struct src_register src )
+{  /* Fresh destination token for the same register type and number as 'src'; swizzle and modifier are not carried over. */
+   return dst_register( SVGA3dShaderGetRegType( src.base.value ),
+                        src.base.num );
+}
+
+static INLINE struct src_register src( SVGA3dShaderDestToken dst )
+{  /* Fresh source operand for the same register type and number as 'dst'; write mask and modifier are not carried over. */
+   return src_register( SVGA3dShaderGetRegType( dst.value ),
+                        dst.num );
+}
+
+/* Map the texture target recorded in the fragment key for sampler idx to an SVGA3D sampler type. */
+static INLINE ubyte svga_tgsi_sampler_type( struct svga_shader_emitter *emit,
+                                            int idx )
+{
+   switch (emit->key.fkey.tex[idx].texture_target) {
+   case PIPE_TEXTURE_1D:      /* 1D textures are declared as 2D samplers */
+   case PIPE_TEXTURE_2D:
+      return SVGA3DSAMP_2D;
+   case PIPE_TEXTURE_3D:
+      return SVGA3DSAMP_VOLUME;
+   case PIPE_TEXTURE_CUBE:
+      return SVGA3DSAMP_CUBE;
+   default:
+      return SVGA3DSAMP_UNKNOWN;
+   }
+}
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
new file mode 100644
index 0000000000..67e1f22a70
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -0,0 +1,2928 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+
+#include "svga_tgsi_emit.h"
+#include "svga_context.h"
+
+
+static boolean emit_vs_postamble( struct svga_shader_emitter *emit );
+static boolean emit_ps_postamble( struct svga_shader_emitter *emit );
+
+
+
+ 
+/* Map a TGSI opcode onto the SVGA3D opcode that implements it one-to-one.
+ * Anything else is reported, asserts, and yields SVGA3DOP_LAST_INST. */
+static unsigned translate_opcode( uint opcode )
+{
+   switch (opcode) {
+   case TGSI_OPCODE_ABS:        return SVGA3DOP_ABS;
+   case TGSI_OPCODE_ADD:        return SVGA3DOP_ADD;
+   case TGSI_OPCODE_BREAKC:     return SVGA3DOP_BREAKC;
+   case TGSI_OPCODE_DP2A:       return SVGA3DOP_DP2ADD;
+   case TGSI_OPCODE_DP3:        return SVGA3DOP_DP3;
+   case TGSI_OPCODE_DP4:        return SVGA3DOP_DP4;
+   case TGSI_OPCODE_FRC:        return SVGA3DOP_FRC;
+   case TGSI_OPCODE_MAD:        return SVGA3DOP_MAD;
+   case TGSI_OPCODE_MAX:        return SVGA3DOP_MAX;
+   case TGSI_OPCODE_MIN:        return SVGA3DOP_MIN;
+   case TGSI_OPCODE_MOV:        return SVGA3DOP_MOV;
+   case TGSI_OPCODE_MUL:        return SVGA3DOP_MUL;
+   case TGSI_OPCODE_NOP:        return SVGA3DOP_NOP;
+   case TGSI_OPCODE_NRM4:       return SVGA3DOP_NRM;
+   case TGSI_OPCODE_SSG:        return SVGA3DOP_SGN;
+   default:
+      debug_printf("Unknown opcode %u\n", opcode);
+      assert( 0 );
+      return SVGA3DOP_LAST_INST;
+   }
+}
+
+
+static unsigned translate_file( unsigned file )  /* TGSI register file -> SVGA3D register type */
+{
+   switch (file) {
+   case TGSI_FILE_TEMPORARY: return SVGA3DREG_TEMP;
+   case TGSI_FILE_INPUT:     return SVGA3DREG_INPUT;
+   case TGSI_FILE_OUTPUT:    return SVGA3DREG_OUTPUT; /* VS3.0+ only */
+   case TGSI_FILE_IMMEDIATE: return SVGA3DREG_CONST;  /* immediates share the constant register type */
+   case TGSI_FILE_CONSTANT:  return SVGA3DREG_CONST;
+   case TGSI_FILE_SAMPLER:   return SVGA3DREG_SAMPLER;
+   case TGSI_FILE_ADDRESS:   return SVGA3DREG_ADDR;
+   default:
+      assert( 0 );            /* any other file is not handled here */
+      return SVGA3DREG_TEMP;  /* value returned when asserts are compiled out */
+   }
+}
+
+
+
+
+
+
+/* Build the SVGA3D destination token for destination operand 'idx' of a
+ * TGSI instruction, applying its write mask and saturate modifier.
+ */
+static SVGA3dShaderDestToken
+translate_dst_register( struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn,
+                        unsigned idx )
+{
+   const struct tgsi_full_dst_register *tgsi_dst = &insn->Dst[idx];
+   const unsigned file = tgsi_dst->Register.File;
+   const int index = tgsi_dst->Register.Index;
+   SVGA3dShaderDestToken token;
+
+   if (file == TGSI_FILE_OUTPUT) {
+      /* Output registers encode semantic information in their name,
+       * so look up the token in the table built at decl time.
+       */
+      token = emit->output_map[index];
+   }
+   else {
+      token = dst_register( translate_file( file ), index );
+   }
+
+   token.mask = tgsi_dst->Register.WriteMask;
+   assert(token.mask);
+   if (insn->Instruction.Saturate)
+      token.dstMod = SVGA3DDSTMOD_SATURATE;
+
+   return token;
+}
+
+
+/* Compose a swizzle onto a source operand: each result component takes
+ * whatever the operand's current swizzle holds in slot x, y, z or w.
+ */
+static struct src_register
+swizzle( struct src_register src, int x, int y, int z, int w )
+{
+   const unsigned cur = src.base.swizzle;
+   const unsigned sel_x = (cur >> (x * 2)) & 0x3;
+   const unsigned sel_y = (cur >> (y * 2)) & 0x3;
+   const unsigned sel_z = (cur >> (z * 2)) & 0x3;
+   const unsigned sel_w = (cur >> (w * 2)) & 0x3;
+
+   src.base.swizzle = TRANSLATE_SWIZZLE(sel_x, sel_y, sel_z, sel_w);
+
+   return src;
+}
+
+static struct src_register
+scalar( struct src_register src,
+        int comp )
+{  /* Replicate slot 'comp' of the operand's current swizzle to all four components. */
+   return swizzle( src, comp, comp, comp, comp );
+}
+
+/* TRUE when some arl_consts[] entry has arl_num equal to emit->current_arl. */
+static INLINE boolean
+svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
+{
+   int n = 0;
+
+   while (n < emit->num_arl_consts && emit->arl_consts[n].arl_num != emit->current_arl)
+      n++;
+
+   return (n < emit->num_arl_consts) ? TRUE : FALSE;
+}
+
+/* Return the 'number' of the first arl_consts[] entry whose arl_num
+ * equals emit->current_arl, or 0 when there is none. */
+static INLINE int
+svga_arl_adjustment( const struct svga_shader_emitter *emit )
+{
+   int n = 0;
+   while (n < emit->num_arl_consts && emit->arl_consts[n].arl_num != emit->current_arl)
+      n++;
+
+   return (n < emit->num_arl_consts) ? emit->arl_consts[n].number : 0;
+}
+
+/* Translate a TGSI source operand into an SVGA3D src_register.
+ *
+ * In order: choose the base register (inputs through input_map,
+ * immediates offset by imm_start, everything else by direct file
+ * translation), add relative addressing when requested, compose the
+ * TGSI swizzle onto the register, then set the abs/negate modifier.
+ */
+static struct src_register
+translate_src_register( const struct svga_shader_emitter *emit,
+                        const struct tgsi_full_src_register *reg )
+{
+   const unsigned file = reg->Register.File;
+   const int index = reg->Register.Index;
+   struct src_register result;
+
+   if (file == TGSI_FILE_INPUT) {
+      /* Inputs are referred to by semantic name rather than by index,
+       * so reuse the register recorded in input_map from the decls.
+       */
+      result = emit->input_map[index];
+   }
+   else if (file == TGSI_FILE_IMMEDIATE) {
+      /* Immediates are appended after the TGSI constants in the D3D
+       * constant buffer, hence the imm_start offset.
+       */
+      result = src_register( translate_file( file ),
+                             index + emit->imm_start );
+   }
+   else {
+      result = src_register( translate_file( file ), index );
+   }
+
+   /* Indirect addressing (for constant buffer lookups only)
+    */
+   if (reg->Register.Indirect) {
+      /* we shift the offset towards the minimum: subtract the
+       * adjustment recorded for the current ARL, if there is one
+       */
+      if (svga_arl_needs_adjustment( emit ))
+         result.base.num -= svga_arl_adjustment( emit );
+
+      result.base.relAddr = 1;
+
+      /* The original author was not sure what belongs in the second
+       * token; an ADDR register token with SWIZZLE_XXXX is used.
+       */
+      result.indirect = src_token( SVGA3DREG_ADDR,
+                                   reg->Indirect.Index );
+      result.indirect.swizzle = SWIZZLE_XXXX;
+   }
+
+   result = swizzle( result,
+                     reg->Register.SwizzleX,
+                     reg->Register.SwizzleY,
+                     reg->Register.SwizzleZ,
+                     reg->Register.SwizzleW );
+
+   /* srcMod is not a bitfield of independent flags, so each abs/negate
+    * combination has to be selected explicitly.
+    * See tgsi_util_get_full_src_register_sign_mode for details.
+    */
+   if (reg->Register.Absolute) {
+      result.base.srcMod = reg->Register.Negate ? SVGA3DSRCMOD_ABSNEG
+                                                : SVGA3DSRCMOD_ABS;
+   }
+   else {
+      result.base.srcMod = reg->Register.Negate ? SVGA3DSRCMOD_NEG
+                                                : SVGA3DSRCMOD_NONE;
+   }
+
+   return result;
+}
+
+
+/* Allocate an internal temporary: TEMP register number
+ * nr_hw_temp + internal_temp_count, after which the count is bumped.
+ * No availability check is made and -1 is never returned. */
+static INLINE SVGA3dShaderDestToken 
+get_temp( struct svga_shader_emitter *emit )
+{
+   int i = emit->nr_hw_temp + emit->internal_temp_count++;
+
+   return dst_register( SVGA3DREG_TEMP, i );
+}
+
+/* Release a single temp.  Only effective if it was the most recently
+ * allocated temp (get_temp() numbers temps from nr_hw_temp upwards);
+ * otherwise release is delayed until the next reset_temp_regs().
+ */
+static INLINE void 
+release_temp( struct svga_shader_emitter *emit,
+              SVGA3dShaderDestToken temp )
+{
+   if (temp.num == emit->nr_hw_temp + emit->internal_temp_count - 1)
+      emit->internal_temp_count--;
+}
+
+static void reset_temp_regs( struct svga_shader_emitter *emit )
+{  /* Drop all internal temps so that get_temp() starts again at nr_hw_temp. */
+   emit->internal_temp_count = 0;
+}
+   
+
+static boolean submit_op0( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest )
+{  /* Emit an instruction that has a destination but no source operands. */
+   return (emit_instruction( emit, inst ) && 
+           emit_dst( emit, dest ));
+}
+
+static boolean submit_op1( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0 )
+{  /* One source cannot conflict with another, so this simply forwards to emit_op1(). */
+   return emit_op1( emit, inst, dest, src0 );
+}
+
+
+/* Emit a two-source instruction while honouring an SVGA restriction:
+ * one instruction may not refer to more than one constant register,
+ * nor to more than one input register.
+ *
+ * If src0 and src1 are different registers of the same restricted
+ * file, src0 is first copied with MOV into an internal temporary and
+ * the instruction reads the temporary instead.
+ *
+ * Returns FALSE if any emit step fails, TRUE otherwise.
+ */
+static boolean submit_op2( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0,
+                           struct src_register src1 )
+{
+   const SVGA3dShaderRegType file0 = SVGA3dShaderGetRegType( src0.base.value );
+   const SVGA3dShaderRegType file1 = SVGA3dShaderGetRegType( src1.base.value );
+   SVGA3dShaderDestToken scratch;
+   boolean conflict;
+
+   /* Same restricted file on both sides, but different registers? */
+   conflict = ((file0 == SVGA3DREG_CONST || file0 == SVGA3DREG_INPUT) &&
+               file0 == file1 &&
+               src0.base.num != src1.base.num);
+
+   if (!conflict) {
+      /* Common case: the operands can be used as they are. */
+      if (!emit_op2( emit, inst, dest, src0, src1 ))
+         return FALSE;
+
+      return TRUE;
+   }
+
+   /* Stage src0 in a temporary so that only src1 reads the restricted
+    * register file.
+    */
+   scratch = get_temp( emit );
+
+   if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), scratch, src0 ))
+      return FALSE;
+
+   if (!emit_op2( emit, inst, dest, src( scratch ), src1 ))
+      return FALSE;
+
+   release_temp( emit, scratch );
+
+   return TRUE;
+}
+
+
+/* SVGA shaders may not refer to >1 constant register (nor >1 input
+ * register) in a single instruction.  Conflicting src0/src1 operands
+ * are moved to temporaries; SINCOS skips the constant check only.
+ */
+static boolean submit_op3( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0,
+                           struct src_register src1,
+                           struct src_register src2 )
+{
+   SVGA3dShaderDestToken temp0;
+   SVGA3dShaderDestToken temp1;
+   boolean need_temp0 = FALSE;
+   boolean need_temp1 = FALSE;
+   SVGA3dShaderRegType type0, type1, type2;
+
+   temp0.value = 0;
+   temp1.value = 0;
+   type0 = SVGA3dShaderGetRegType( src0.base.value );
+   type1 = SVGA3dShaderGetRegType( src1.base.value );
+   type2 = SVGA3dShaderGetRegType( src2.base.value );
+   /* Constant conflicts: src0 against src1/src2, then src1 against src2. */
+   if (inst.op != SVGA3DOP_SINCOS) {
+      if (type0 == SVGA3DREG_CONST &&
+          ((type1 == SVGA3DREG_CONST && src0.base.num != src1.base.num) ||
+           (type2 == SVGA3DREG_CONST && src0.base.num != src2.base.num)))
+         need_temp0 = TRUE;
+
+      if (type1 == SVGA3DREG_CONST &&
+          (type2 == SVGA3DREG_CONST && src1.base.num != src2.base.num))
+         need_temp1 = TRUE;
+   }
+   /* Input conflicts use the same pairing and apply to every opcode. */
+   if (type0 == SVGA3DREG_INPUT &&
+       ((type1 == SVGA3DREG_INPUT && src0.base.num != src1.base.num) ||
+        (type2 == SVGA3DREG_INPUT && src0.base.num != src2.base.num)))
+      need_temp0 = TRUE;
+
+   if (type1 == SVGA3DREG_INPUT &&
+       (type2 == SVGA3DREG_INPUT && src1.base.num != src2.base.num))
+      need_temp1 = TRUE;
+   /* Copy each conflicting operand into a temp and read the temp instead. */
+   if (need_temp0)
+   {
+      temp0 = get_temp( emit );
+ 
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp0, src0 ))
+         return FALSE;
+         
+      src0 = src( temp0 );
+   }
+
+   if (need_temp1)
+   {
+      temp1 = get_temp( emit );
+
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp1, src1 ))
+         return FALSE;
+
+      src1 = src( temp1 );
+   }
+
+   if (!emit_op3( emit, inst, dest, src0, src1, src2 ))
+      return FALSE;
+   /* Release in reverse order of allocation: release_temp() only frees the newest temp. */
+   if (need_temp1)
+      release_temp( emit, temp1 );
+   if (need_temp0)
+      release_temp( emit, temp0 );
+   return TRUE;
+}
+
+
+
+
+/* Four-source variant of the ">1 constant / >1 input register per
+ * instruction" workaround.  Only src0 and src3 are ever staged in
+ * temporaries; src1 is asserted to be a sampler.
+ */
+static boolean submit_op4( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0,
+                           struct src_register src1,
+                           struct src_register src2,
+                           struct src_register src3)
+{
+   SVGA3dShaderDestToken temp0;
+   SVGA3dShaderDestToken temp3;
+   boolean need_temp0 = FALSE;
+   boolean need_temp3 = FALSE;
+   SVGA3dShaderRegType type0, type1, type2, type3;
+
+   temp0.value = 0;
+   temp3.value = 0;
+   type0 = SVGA3dShaderGetRegType( src0.base.value );
+   type1 = SVGA3dShaderGetRegType( src1.base.value );
+   type2 = SVGA3dShaderGetRegType( src2.base.value );
+   type3 = SVGA3dShaderGetRegType( src3.base.value );
+
+   /* Make life a little easier - this is only used by the TXD
+    * instruction which is guaranteed not to have a constant/input reg
+    * in one slot at least:
+    */
+   assert(type1 == SVGA3DREG_SAMPLER);
+   /* Constant conflicts: src0 against src3/src2, then src3 against src2. */
+   if (type0 == SVGA3DREG_CONST &&
+       ((type3 == SVGA3DREG_CONST && src0.base.num != src3.base.num) ||
+        (type2 == SVGA3DREG_CONST && src0.base.num != src2.base.num)))
+      need_temp0 = TRUE;
+
+   if (type3 == SVGA3DREG_CONST &&
+       (type2 == SVGA3DREG_CONST && src3.base.num != src2.base.num))
+      need_temp3 = TRUE;
+   /* Input conflicts, same pairing. */
+   if (type0 == SVGA3DREG_INPUT &&
+       ((type3 == SVGA3DREG_INPUT && src0.base.num != src3.base.num) ||
+        (type2 == SVGA3DREG_INPUT && src0.base.num != src2.base.num)))
+      need_temp0 = TRUE;
+
+   if (type3 == SVGA3DREG_INPUT &&
+       (type2 == SVGA3DREG_INPUT && src3.base.num != src2.base.num))
+      need_temp3 = TRUE;
+   /* Copy each conflicting operand into a temp and read the temp instead. */
+   if (need_temp0)
+   {
+      temp0 = get_temp( emit );
+
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp0, src0 ))
+         return FALSE;
+
+      src0 = src( temp0 );
+   }
+
+   if (need_temp3)
+   {
+      temp3 = get_temp( emit );
+
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp3, src3 ))
+         return FALSE;
+
+      src3 = src( temp3 );
+   }
+
+   if (!emit_op4( emit, inst, dest, src0, src1, src2, src3 ))
+      return FALSE;
+   /* Release in reverse order of allocation: release_temp() only frees the newest temp. */
+   if (need_temp3)
+      release_temp( emit, temp3 );
+   if (need_temp0)
+      release_temp( emit, temp0 );
+   return TRUE;
+}
+
+
+/* Emit a DEF (float) or DEFI (integer) instruction giving hardware
+ * constant 'idx' the value {a, b, c, d}; for DEFI the floats are cast to
+ * int.  Returns FALSE on emit failure or an unsupported 'type'.
+ */
+static boolean emit_def_const( struct svga_shader_emitter *emit,
+                               SVGA3dShaderConstType type, unsigned idx,
+                               float a, float b, float c, float d )
+{
+   SVGA3DOpDefArgs def;
+   SVGA3dShaderInstToken opcode;
+
+   switch (type) {
+   case SVGA3D_CONST_TYPE_FLOAT:
+      opcode = inst_token( SVGA3DOP_DEF );
+      def.dst = dst_register( SVGA3DREG_CONST, idx );
+      def.constValues[0] = a;
+      def.constValues[1] = b;
+      def.constValues[2] = c;
+      def.constValues[3] = d;
+      break;
+   case SVGA3D_CONST_TYPE_INT:
+      opcode = inst_token( SVGA3DOP_DEFI );
+      def.dst = dst_register( SVGA3DREG_CONSTINT, idx );
+      def.constIValues[0] = (int)a;
+      def.constIValues[1] = (int)b;
+      def.constIValues[2] = (int)c;
+      def.constIValues[3] = (int)d;
+      break;
+   default:
+      /* 'def' was never filled in, so there is nothing valid to emit */
+      assert(0);
+      return FALSE;
+   }
+
+   if (!emit_instruction(emit, opcode) ||
+       !svga_shader_emit_dwords( emit, def.values, Elements(def.values)))
+      return FALSE;
+
+   return TRUE;
+}
+
+static INLINE boolean
+create_zero_immediate( struct svga_shader_emitter *emit )
+{
+   unsigned idx = emit->nr_hw_const++;  /* claim the next hardware constant; kept even if the DEF fails */
+   /* Define it as the float constant {0, 0, 0, 1}. */
+   if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT,
+                        idx, 0, 0, 0, 1 ))
+      return FALSE;
+   /* Remember where it lives for get_zero_immediate(). */
+   emit->zero_immediate_idx = idx;
+   emit->created_zero_immediate = TRUE;
+
+   return TRUE;
+}
+
+static INLINE boolean
+create_loop_const( struct svga_shader_emitter *emit )
+{
+   unsigned idx = emit->nr_hw_const++;  /* claim the next hardware constant; kept even if the DEFI fails */
+   /* Define an integer constant describing the loop (values passed as floats, cast by emit_def_const). */
+   if (!emit_def_const( emit, SVGA3D_CONST_TYPE_INT, idx,
+                        255, /* iteration count */
+                        0, /* initial value */
+                        1, /* step size */
+                        0 /* not used, must be 0 */))
+      return FALSE;
+   /* Remember where it lives for get_loop_const(). */
+   emit->loop_const_idx = idx;
+   emit->created_loop_const = TRUE;
+
+   return TRUE;
+}
+
+static INLINE boolean
+create_sincos_consts( struct svga_shader_emitter *emit )
+{
+   unsigned idx = emit->nr_hw_const++;
+   /* First of two float constants.  NOTE(review): presumably the coefficient vectors SINCOS expects -- confirm. */
+   if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, idx,
+                        -1.5500992e-006f,
+                        -2.1701389e-005f,
+                        0.0026041667f,
+                        0.00026041668f ))
+      return FALSE;
+   /* Only the first index is stored; get_sincos_const() reaches the second as idx + 1. */
+   emit->sincos_consts_idx = idx;
+   idx = emit->nr_hw_const++;
+   /* Second constant, allocated immediately after the first. */
+   if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, idx,
+                        -0.020833334f,
+                        -0.12500000f,
+                        1.0f,
+                        0.50000000f ))
+      return FALSE;
+
+   emit->created_sincos_consts = TRUE;
+
+   return TRUE;
+}
+
+/* Define the float constants that hold the ARL adjustment values.
+ *
+ * arl_consts[] entries are packed four at a time into consecutive new
+ * hardware constants (unused components are zero).  Each entry records
+ * the constant register (idx) and the component (swizzle) where its
+ * 'number' was stored, so get_fake_arl_const() can address it later.
+ *
+ * Returns FALSE if emitting a DEF fails.
+ */
+static INLINE boolean
+create_arl_consts( struct svga_shader_emitter *emit )
+{
+   static const int component[4] = {
+      TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W
+   };
+   int i;
+
+   for (i = 0; i < emit->num_arl_consts; i += 4) {
+      int j;
+      unsigned idx = emit->nr_hw_const++;
+      float vals[4];
+
+      for (j = 0; j < 4 && (j + i) < emit->num_arl_consts; ++j) {
+         vals[j] = emit->arl_consts[i + j].number;
+         emit->arl_consts[i + j].idx = idx;
+         /* entry i + j lives in component j of the new constant */
+         emit->arl_consts[i + j].swizzle = component[j];
+      }
+      while (j < 4)
+         vals[j++] = 0;
+
+      if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, idx,
+                           vals[0], vals[1], vals[2], vals[3]))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+static INLINE struct src_register
+get_vface( struct svga_shader_emitter *emit )
+{  /* Source operand for the FACE miscellaneous register. */
+   assert(emit->emitted_vface);  /* the emitted_vface flag must already be set */
+   return src_register(SVGA3DREG_MISCTYPE, 
+                       SVGA3DMISCREG_FACE);
+}
+
+/* Returns the CONST register defined by create_zero_immediate(): {0, 0, 0, 1}. */
+static INLINE struct src_register
+get_zero_immediate( struct svga_shader_emitter *emit )
+{
+   assert(emit->created_zero_immediate);   /* create_zero_immediate() must have run */
+   assert(emit->zero_immediate_idx >= 0);
+   return src_register( SVGA3DREG_CONST,
+                        emit->zero_immediate_idx );
+}
+
+/* Returns the CONSTINT register defined by create_loop_const(). */
+static INLINE struct src_register
+get_loop_const( struct svga_shader_emitter *emit )
+{
+   assert(emit->created_loop_const);   /* create_loop_const() must have run */
+   assert(emit->loop_const_idx >= 0);
+   return src_register( SVGA3DREG_CONSTINT,
+                        emit->loop_const_idx );
+}
+
+/* Returns one of the two CONST registers defined by create_sincos_consts(); index is 0 or 1. */
+static INLINE struct src_register
+get_sincos_const( struct svga_shader_emitter *emit,
+                  unsigned index )
+{
+   assert(emit->created_sincos_consts);   /* create_sincos_consts() must have run */
+   assert(emit->sincos_consts_idx >= 0);
+   assert(index == 0 || index == 1);      /* only two constants were defined */
+   return src_register( SVGA3DREG_CONST,
+                        emit->sincos_consts_idx + index );  /* the two constants are consecutive */
+}
+
+/* CONST source (scalar-replicated) holding the adjustment for current_arl;
+ * the last matching arl_consts[] entry wins, else constant 0 / selector 0. */
+static INLINE struct src_register
+get_fake_arl_const( struct svga_shader_emitter *emit )
+{
+   int const_idx = 0, comp = 0, n;
+
+   for (n = emit->num_arl_consts - 1; n >= 0; --n) {
+      if (emit->arl_consts[n].arl_num != emit->current_arl)
+         continue;
+      const_idx = emit->arl_consts[n].idx;
+      comp = emit->arl_consts[n].swizzle;
+      break;
+   }
+   return scalar( src_register( SVGA3DREG_CONST, const_idx ), comp );
+}
+
+/* Return the CONST register that holds the dimensions for sampler_num.
+ *
+ * The width/height constants start right after the highest TGSI
+ * constant (info.file_max[TGSI_FILE_CONSTANT] + 1).
+ */
+static INLINE struct src_register
+get_tex_dimensions( struct svga_shader_emitter *emit, int sampler_num )
+{
+   const int first_dim_const = emit->info.file_max[TGSI_FILE_CONSTANT] + 1;
+   const int slot = emit->key.fkey.tex[sampler_num].width_height_idx;
+
+   return src_register( SVGA3DREG_CONST, slot + first_dim_const );
+}
+
+static boolean emit_fake_arl(struct svga_shader_emitter *emit,
+                             const struct tgsi_full_instruction *insn)
+{  /* ARL variant that adds the constant recorded for the current ARL before loading the address register. */
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->Src[0] );
+   struct src_register src1 = get_fake_arl_const( emit );
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   SVGA3dShaderDestToken tmp = get_temp( emit );
+   /* MOV tmp, src0 */
+   if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), tmp, src0))
+      return FALSE;
+   /* ADD tmp, tmp, arl_const */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ), tmp, src( tmp ),
+                    src1))
+      return FALSE;
+
+   /* replicate the original swizzle */
+   src1 = src(tmp);
+   src1.base.swizzle = src0.base.swizzle;
+   /* MOVA dst, tmp -- tmp is not released by this function */
+   return submit_op1( emit, inst_token( SVGA3DOP_MOVA ),
+                      dst, src1 );
+}
+
+/* Emit "IFC src0 != zero.x" and enter one more level of dynamic
+ * branching.  The level is incremented before emission is attempted,
+ * so it is bumped even when FALSE is returned.
+ */
+static boolean emit_if(struct svga_shader_emitter *emit,
+                       const struct tgsi_full_instruction *insn)
+{
+   const struct src_register cond = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register zero_x =
+      scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_X );
+   SVGA3dShaderInstToken ifc = inst_token( SVGA3DOP_IFC );
+
+   ifc.control = SVGA3DOPCOMPC_NE;
+
+   emit->dynamic_branching_level++;
+
+   /* Instruction token followed by its two source operands. */
+   return emit_instruction( emit, ifc ) &&
+          emit_src( emit, cond ) &&
+          emit_src( emit, zero_x );
+}
+
+/* Emit ENDIF and leave one level of dynamic branching. */
+static boolean emit_endif(struct svga_shader_emitter *emit,
+                       const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderInstToken endif_token;
+
+   emit->dynamic_branching_level--;
+
+   endif_token = inst_token( SVGA3DOP_ENDIF );
+   return emit_instruction( emit, endif_token );
+}
+
+/* Emit ELSE; the dynamic branching level is left unchanged. */
+static boolean emit_else(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn)
+{
+   const SVGA3dShaderInstToken else_token = inst_token( SVGA3DOP_ELSE );
+
+   return emit_instruction( emit, else_token );
+}
+
+/* Lower TGSI FLR:
+ *    FLR  DST, SRC
+ * becomes
+ *    FRC  TMP, SRC
+ *    ADD  DST, SRC, -TMP
+ */
+static boolean emit_floor(struct svga_shader_emitter *emit,
+                          const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register value = translate_src_register(
+      emit, &insn->Src[0] );
+   SVGA3dShaderDestToken frac = get_temp( emit );
+
+   /* FRC  TMP, SRC  followed by  ADD  DST, SRC, -TMP; the second is
+    * only attempted when the first was emitted successfully.
+    */
+   return submit_op1( emit, inst_token( SVGA3DOP_FRC ), frac, value ) &&
+          submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst, value,
+                      negate( src( frac ) ) );
+}
+
+
+/* Lower TGSI CMP to SVGA3D CMP.  The last two operands are exchanged
+ * on the way:
+ *    TGSI:    CMP  DST, SRC0, SRC1, SRC2
+ *    SVGA3D:  CMP  DST, SRC0, SRC2, SRC1
+ */
+static boolean emit_cmp(struct svga_shader_emitter *emit,
+                          const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register cond = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register tgsi_src1 = translate_src_register(
+      emit, &insn->Src[1] );
+   const struct src_register tgsi_src2 = translate_src_register(
+      emit, &insn->Src[2] );
+   const SVGA3dShaderInstToken cmp = inst_token( SVGA3DOP_CMP );
+
+   /* Note the swapped order of the two value operands. */
+   return submit_op3( emit, cmp, dst, cond, tgsi_src2, tgsi_src1 );
+}
+
+
+
+/* Lower TGSI DIV using one reciprocal per enabled channel.  For a
+ * destination mask of .xy, for example:
+ *    DIV  DST.xy, SRC0, SRC1
+ * becomes
+ *    RCP  TMP.x, SRC1.xxxx
+ *    RCP  TMP.y, SRC1.yyyy
+ *    MUL  DST.xy, SRC0, TMP
+ */
+static boolean emit_div(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register numer = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register denom = translate_src_register(
+      emit, &insn->Src[1] );
+   SVGA3dShaderDestToken recip = get_temp( emit );
+   int chan;
+
+   /* RCP is scalar in SVGA3D, so issue it once for every channel
+    * that is enabled in the destination write mask.
+    */
+   for (chan = 0; chan < 4; chan++) {
+      const unsigned bit = 1 << chan;
+
+      if (!(dst.mask & bit))
+         continue;
+
+      /* RCP  TMP.?, SRC1.???? */
+      if (!submit_op1( emit, inst_token( SVGA3DOP_RCP ),
+                       writemask(recip, bit),
+                       scalar(denom, chan) ))
+         return FALSE;
+   }
+
+   /* A single MUL then produces all quotients:
+    *
+    * MUL  DST, SRC0, TMP
+    */
+   if (submit_op2( emit, inst_token( SVGA3DOP_MUL ), dst, numer,
+                   src( recip ) ))
+      return TRUE;
+
+   return FALSE;
+}
+
+/* Lower TGSI DP2 (two-component dot product):
+ *    DP2  DST, SRC0, SRC1
+ * becomes
+ *    MUL  TMP, SRC0, SRC1
+ *    ADD  DST, TMP.xxxx, TMP.yyyy
+ */
+static boolean emit_dp2(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register a = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register b = translate_src_register(
+      emit, &insn->Src[1] );
+   SVGA3dShaderDestToken prod = get_temp( emit );
+
+   /* MUL  TMP, SRC0, SRC1 */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ), prod, a, b ))
+      return FALSE;
+
+   /* ADD  DST, TMP.xxxx, TMP.yyyy */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
+                    scalar(src( prod ), TGSI_SWIZZLE_X),
+                    scalar(src( prod ), TGSI_SWIZZLE_Y) ))
+      return FALSE;
+
+   return TRUE;
+}
+
+
+/* Lower TGSI DPH (homogeneous dot product):
+ *    DPH  DST, SRC0, SRC1
+ * becomes
+ *    DP3  TMP, SRC0, SRC1
+ *    ADD  DST, TMP, SRC1.wwww
+ */
+static boolean emit_dph(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register a = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register b = translate_src_register(
+      emit, &insn->Src[1] );
+   SVGA3dShaderDestToken dot3 = get_temp( emit );
+   struct src_register b_w;
+
+   /* DP3  TMP, SRC0, SRC1 */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_DP3 ), dot3, a, b ))
+      return FALSE;
+
+   b_w = scalar(b, TGSI_SWIZZLE_W);
+
+   /* ADD  DST, TMP, SRC1.wwww */
+   if (submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
+                   src( dot3 ), b_w ))
+      return TRUE;
+
+   return FALSE;
+}
+
+/* Lower TGSI NRM (normalize):
+ *    NRM  DST, SRC
+ * becomes
+ *    DP3  TMP, SRC, SRC
+ *    RSQ  TMP, TMP
+ *    MUL  DST, SRC, TMP
+ */
+static boolean emit_nrm(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register vec = translate_src_register(
+      emit, &insn->Src[0] );
+   SVGA3dShaderDestToken scale = get_temp( emit );
+
+   /* Each step is only attempted when the previous one succeeded:
+    *    DP3  TMP, SRC, SRC
+    *    RSQ  TMP, TMP
+    *    MUL  DST, SRC, TMP
+    */
+   return submit_op2( emit, inst_token( SVGA3DOP_DP3 ), scale, vec, vec ) &&
+          submit_op1( emit, inst_token( SVGA3DOP_RSQ ), scale, src( scale )) &&
+          submit_op2( emit, inst_token( SVGA3DOP_MUL ), dst,
+                      vec, src( scale ));
+}
+
+/* Emit an SVGA3D SINCOS of src0.x into dst.
+ *
+ * When emit->use_sm30 is set the instruction takes a single source;
+ * otherwise the two sincos helper constants are passed as additional
+ * sources.
+ */
+static boolean do_emit_sincos(struct svga_shader_emitter *emit,
+                              SVGA3dShaderDestToken dst,
+                              struct src_register src0)
+{
+   const struct src_register angle = scalar(src0, TGSI_SWIZZLE_X);
+
+   if (!emit->use_sm30) {
+      const struct src_register c0 = get_sincos_const( emit, 0 );
+      const struct src_register c1 = get_sincos_const( emit, 1 );
+
+      return submit_op3( emit, inst_token( SVGA3DOP_SINCOS ),
+                         dst, angle, c0, c1 );
+   }
+
+   return submit_op1( emit, inst_token( SVGA3DOP_SINCOS ),
+                      dst, angle );
+}
+
+/* Compute SINCOS into the X and Y channels of a temporary, then copy
+ * the temporary to the destination:
+ *    SCS  TMP.xy, SRC
+ *    MOV  DST, TMP
+ */
+static boolean emit_sincos(struct svga_shader_emitter *emit,
+                           const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register angle = translate_src_register(
+      emit, &insn->Src[0] );
+   SVGA3dShaderDestToken scs = get_temp( emit );
+
+   /* The MOV is only attempted when the SCS was emitted. */
+   return do_emit_sincos(emit, writemask(scs, TGSI_WRITEMASK_XY), angle ) &&
+          submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, src( scs ) );
+}
+
+/* Sine is taken from the Y channel of a SINCOS result:
+ *    SCS  TMP.y, SRC
+ *    MOV  DST, TMP.yyyy
+ */
+static boolean emit_sin(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register angle = translate_src_register(
+      emit, &insn->Src[0] );
+   SVGA3dShaderDestToken scs = get_temp( emit );
+   struct src_register sine;
+
+   /* SCS  TMP.y, SRC */
+   if (!do_emit_sincos(emit, writemask(scs, TGSI_WRITEMASK_Y), angle))
+      return FALSE;
+
+   sine = scalar(src( scs ), TGSI_SWIZZLE_Y);
+
+   /* MOV  DST, TMP.yyyy */
+   if (submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, sine ))
+      return TRUE;
+
+   return FALSE;
+}
+
+/* Cosine is taken from the X channel of a SINCOS result:
+ *    SCS  TMP.x, SRC
+ *    MOV  DST, TMP.xxxx
+ */
+static boolean emit_cos(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register angle = translate_src_register(
+      emit, &insn->Src[0] );
+   SVGA3dShaderDestToken scs = get_temp( emit );
+   struct src_register cosine;
+
+   /* SCS  TMP.x, SRC */
+   if (!do_emit_sincos( emit, writemask(scs, TGSI_WRITEMASK_X), angle ))
+      return FALSE;
+
+   cosine = scalar(src( scs ), TGSI_SWIZZLE_X);
+
+   /* MOV  DST, TMP.xxxx */
+   if (submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, cosine ))
+      return TRUE;
+
+   return FALSE;
+}
+
+
+/* Lower TGSI SUB to an ADD with the second operand negated:
+ *    ADD  DST, SRC0, negate(SRC1)
+ */
+static boolean emit_sub(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register lhs = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register rhs = translate_src_register(
+      emit, &insn->Src[1] );
+   const struct src_register minus_rhs = negate(rhs);
+
+   if (submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
+                   lhs, minus_rhs ))
+      return TRUE;
+
+   return FALSE;
+}
+
+
+/* Translate TGSI KIL to SVGA3D TEXKILL.
+ *
+ * TEXKILL receives its operand as a destination token (dst(src0)).
+ * The translated source is used directly only when the TGSI operand is
+ * a TEMPORARY without abs/negate/indirect modifiers and with identity
+ * swizzle on x, y and z; in every other case it is first copied into a
+ * fresh temporary with MOV and TEXKILL is applied to that temporary.
+ *
+ * Returns FALSE if either the MOV or the TEXKILL could not be emitted.
+ */
+static boolean emit_kil(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderInstToken inst;
+   const struct tgsi_full_src_register *reg = &insn->Src[0];
+   struct src_register src0;
+
+   inst = inst_token( SVGA3DOP_TEXKILL );
+   src0 = translate_src_register( emit, reg );
+
+   if (reg->Register.Absolute ||
+       reg->Register.Negate ||
+       reg->Register.Indirect ||
+       reg->Register.SwizzleX != 0 ||
+       reg->Register.SwizzleY != 1 ||
+       reg->Register.SwizzleZ != 2 ||
+       reg->Register.File != TGSI_FILE_TEMPORARY)
+   {
+      SVGA3dShaderDestToken temp = get_temp( emit );
+
+      /* Do not carry on to the TEXKILL if the copy failed: the
+       * temporary would be read without ever having been written.
+       */
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), temp, src0 ))
+         return FALSE;
+
+      src0 = src( temp );
+   }
+
+   return submit_op0( emit, inst, dst(src0) );
+}
+
+
+/* Emit TGSI KILP as an unconditional kill, which is how the mesa state
+ * tracker always uses it.
+ *
+ * TEXKILL doesn't allow negation on its operand, so the negated {1}
+ * (W channel of the zero immediate) is moved into a temporary first
+ * and TEXKILL is applied to that temporary.
+ */
+static boolean emit_kilp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   const struct src_register one = scalar( get_zero_immediate( emit ),
+                                           TGSI_SWIZZLE_W );
+   const SVGA3dShaderInstToken texkill = inst_token( SVGA3DOP_TEXKILL );
+   const SVGA3dShaderDestToken minus_one = get_temp( emit );
+
+   /* MOV  TMP, -{1} */
+   if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), minus_one,
+                    negate( one ) ))
+      return FALSE;
+
+   /* TEXKILL  TMP */
+   return submit_op0( emit, texkill, minus_one );
+}
+
+/* Write either 'pass' or 'fail' to dst depending on how src0 compares
+ * with src1, using predicate register 0.
+ *
+ * PIPE_FUNC_NEVER and PIPE_FUNC_ALWAYS reduce to a single MOV of 'fail'
+ * or 'pass' respectively.  The other compare functions emit:
+ *
+ *    SETP p0, src0, cmp, src1  -- done first to avoid aliasing problems
+ *    MOV  dst, fail
+ *    MOV  dst, pass            -- predicated on p0
+ */
+static boolean
+emit_conditional(struct svga_shader_emitter *emit,
+                 unsigned compare_func,
+                 SVGA3dShaderDestToken dst,
+                 struct src_register src0,
+                 struct src_register src1,
+                 struct src_register pass,
+                 struct src_register fail)
+{
+   SVGA3dShaderDestToken pred_reg = dst_register( SVGA3DREG_PREDICATE, 0 );
+   SVGA3dShaderInstToken setp_token = inst_token( SVGA3DOP_SETP );
+   SVGA3dShaderInstToken mov_token;
+
+   switch (compare_func) {
+   /* The two trivial functions need no predicate at all. */
+   case PIPE_FUNC_NEVER:
+      return submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                         dst, fail );
+   case PIPE_FUNC_ALWAYS:
+      return submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                         dst, pass );
+
+   /* Everything else selects the SETP comparison. */
+   case PIPE_FUNC_LESS:
+      setp_token.control = SVGA3DOPCOMP_LT;
+      break;
+   case PIPE_FUNC_LEQUAL:
+      setp_token.control = SVGA3DOPCOMP_LE;
+      break;
+   case PIPE_FUNC_EQUAL:
+      setp_token.control = SVGA3DOPCOMP_EQ;
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      setp_token.control = SVGA3DOPCOMPC_NE;
+      break;
+   case PIPE_FUNC_GEQUAL:
+      setp_token.control = SVGA3DOPCOMP_GE;
+      break;
+   case PIPE_FUNC_GREATER:
+      setp_token.control = SVGA3DOPCOMP_GT;
+      break;
+   }
+
+   /* SETP src0, COMPOP, src1 */
+   if (!submit_op2( emit, setp_token, pred_reg, src0, src1 ))
+      return FALSE;
+
+   /* MOV dst, fail */
+   mov_token = inst_token( SVGA3DOP_MOV );
+   if (!submit_op1( emit, mov_token, dst, fail ))
+      return FALSE;
+
+   /* MOV dst, pass (predicated)
+    *
+    * The predicate register (and possible modifiers) travels as the
+    * first source argument of the predicated instruction.
+    */
+   mov_token.predicated = 1;
+   if (submit_op2( emit, mov_token, dst, src( pred_reg ), pass ))
+      return TRUE;
+
+   return FALSE;
+}
+
+
+/* Write 1.0 to dst where "src0 <compare_func> src1" holds and 0.0
+ * where it does not.
+ *
+ * Vertex shaders can use SGE/SLT directly for the four ordering
+ * comparisons (swapping the operands where needed); everything else
+ * goes through emit_conditional() with the immediates one and zero as
+ * the pass and fail values.
+ */
+static boolean
+emit_select(struct svga_shader_emitter *emit,
+            unsigned compare_func,
+            SVGA3dShaderDestToken dst,
+            struct src_register src0,
+            struct src_register src1 )
+{
+   struct src_register imm, one, zero;
+
+   /* Some selects exist as SVGA instructions, but only in the vertex
+    * shader.
+    */
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      if (compare_func == PIPE_FUNC_GEQUAL)
+         return submit_op2( emit, inst_token( SVGA3DOP_SGE ), dst, src0, src1 );
+
+      if (compare_func == PIPE_FUNC_LEQUAL)
+         return submit_op2( emit, inst_token( SVGA3DOP_SGE ), dst, src1, src0 );
+
+      if (compare_func == PIPE_FUNC_GREATER)
+         return submit_op2( emit, inst_token( SVGA3DOP_SLT ), dst, src1, src0 );
+
+      if (compare_func == PIPE_FUNC_LESS)
+         return submit_op2( emit, inst_token( SVGA3DOP_SLT ), dst, src0, src1 );
+   }
+
+   /* Fall back to the setp approach.  The zero immediate is 0,0,0,1,
+    * so its W channel supplies 1.0 and its X channel 0.0.
+    */
+   imm  = get_zero_immediate( emit );
+   one  = scalar( imm, TGSI_SWIZZLE_W );
+   zero = scalar( imm, TGSI_SWIZZLE_X );
+
+   return emit_conditional( emit, compare_func, dst,
+                            src0, src1,
+                            one, zero );
+}
+
+
+/* Translate a two-operand TGSI comparison instruction by handing its
+ * destination and both sources to emit_select() together with the
+ * requested compare function.
+ */
+static boolean emit_select_op(struct svga_shader_emitter *emit,
+                              unsigned compare,
+                              const struct tgsi_full_instruction *insn)
+{
+   const SVGA3dShaderDestToken result =
+      translate_dst_register( emit, insn, 0 );
+   const struct src_register lhs =
+      translate_src_register( emit, &insn->Src[0] );
+   const struct src_register rhs =
+      translate_src_register( emit, &insn->Src[1] );
+
+   return emit_select( emit, compare, result, lhs, rhs );
+}
+
+
+/* Translate the two-source texture instructions (TEX, TXP, TXB, TXL)
+ * to their SVGA3D representation, writing the sample to 'dst'.
+ *
+ * Src[0] is the texture coordinate and Src[1] the sampler.  The
+ * coordinate may be rewritten through a temporary before sampling:
+ *  - inside dynamic branching, a TEX-opcode lookup whose coordinate
+ *    lives in a TEMP register is turned into TEXLDL with coord.w = 0;
+ *  - for samplers flagged 'unnormalized' in the shader key, the
+ *    coordinate is multiplied by the per-sampler dimensions constant.
+ *
+ * Returns FALSE on an unsupported opcode or if emission fails.
+ */
+static boolean emit_tex2(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn,
+                         SVGA3dShaderDestToken dst )
+{
+   SVGA3dShaderInstToken inst;
+   struct src_register texcoord;
+   struct src_register sampler;
+   SVGA3dShaderDestToken tmp;
+   
+   inst.value = 0;
+
+   /* Pick the SVGA3D opcode; TXP and TXB are TEX with a control flag. */
+   switch (insn->Instruction.Opcode) {
+   case TGSI_OPCODE_TEX:
+      inst.op = SVGA3DOP_TEX;
+      break;
+   case TGSI_OPCODE_TXP:
+      inst.op = SVGA3DOP_TEX;
+      inst.control = SVGA3DOPCONT_PROJECT;
+      break;
+   case TGSI_OPCODE_TXB:
+      inst.op = SVGA3DOP_TEX;
+      inst.control = SVGA3DOPCONT_BIAS;
+      break;
+   case TGSI_OPCODE_TXL:
+      inst.op = SVGA3DOP_TEXLDL;
+      break;
+   default:
+      assert(0);
+      return FALSE;
+   }
+
+   texcoord = translate_src_register( emit, &insn->Src[0] );
+   sampler = translate_src_register( emit, &insn->Src[1] );
+
+   /* 'tmp' is only allocated when one of the two rewrites below can
+    * apply; each later use of it is guarded by a condition that implies
+    * this one, so it is never read uninitialized.
+    */
+   if (emit->key.fkey.tex[sampler.base.num].unnormalized ||
+       emit->dynamic_branching_level > 0)
+      tmp = get_temp( emit );
+
+   /* Can't do mipmapping inside dynamic branch constructs.  Force LOD
+    * zero in that case.
+    */
+   if (emit->dynamic_branching_level > 0 &&
+       inst.op == SVGA3DOP_TEX &&
+       SVGA3dShaderGetRegType(texcoord.base.value) == SVGA3DREG_TEMP) {
+      struct src_register zero = get_zero_immediate( emit );
+
+      /* MOV  tmp, texcoord */
+      if (!submit_op1( emit,
+                       inst_token( SVGA3DOP_MOV ),
+                       tmp,
+                       texcoord ))
+         return FALSE;
+
+      /* MOV  tmp.w, zero */
+      if (!submit_op1( emit, 
+                       inst_token( SVGA3DOP_MOV ),
+                       writemask( tmp, TGSI_WRITEMASK_W ), 
+                       scalar( zero, TGSI_SWIZZLE_X )))
+         return FALSE;
+      
+      /* Sample from the copy.  Only inst.op is changed here, so a
+       * control flag set above for TXP/TXB stays in the token.
+       */
+      texcoord = src( tmp );
+      inst.op = SVGA3DOP_TEXLDL;
+   }
+
+   /* Explicit normalization of texcoords:
+    */
+   if (emit->key.fkey.tex[sampler.base.num].unnormalized) {
+      /* NOTE(review): presumably this constant holds reciprocal
+       * width/height so that the MUL normalizes -- confirm where the
+       * constant is uploaded.
+       */
+      struct src_register wh = get_tex_dimensions( emit, sampler.base.num );
+
+      /* MUL  tmp, SRC0, WH */
+      if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                       tmp, texcoord, wh ))
+         return FALSE;
+
+      texcoord = src( tmp );
+   }
+
+   return submit_op2( emit, inst, dst, texcoord, sampler );
+}
+
+
+
+
+/* Translate the texture instruction with explicit derivatives (TXD)
+ * to SVGA3D TEXLDD.
+ *
+ * The TGSI operand order is coord, ddx, ddy, sampler; TEXLDD is
+ * submitted as coord, sampler, ddx, ddy.  Any other opcode asserts and
+ * returns FALSE.
+ */
+static boolean emit_tex4(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn,
+                         SVGA3dShaderDestToken dst )
+{
+   const struct src_register coord =
+      translate_src_register( emit, &insn->Src[0] );
+   const struct src_register deriv_x =
+      translate_src_register( emit, &insn->Src[1] );
+   const struct src_register deriv_y =
+      translate_src_register( emit, &insn->Src[2] );
+   const struct src_register sampler =
+      translate_src_register( emit, &insn->Src[3] );
+   SVGA3dShaderInstToken texldd;
+
+   texldd.value = 0;
+
+   if (insn->Instruction.Opcode != TGSI_OPCODE_TXD) {
+      assert(0);
+      return FALSE;
+   }
+
+   texldd.op = SVGA3DOP_TEXLDD; /* 4 args! */
+
+   return submit_op4( emit, texldd, dst, coord, sampler, deriv_x, deriv_y );
+}
+
+
+/* Translate a TGSI texture instruction (TEX, TXB, TXP, TXL or TXD),
+ * including the compare step for samplers whose compare_mode in the
+ * shader key is PIPE_TEX_COMPARE_R_TO_TEXTURE.
+ *
+ * The raw sample is written to a temporary instead of DST when a
+ * compare has to be performed on it afterwards, or when SM 3.0 is not
+ * in use and DST has a partial write mask; in the latter case the
+ * temporary is then copied to DST with a MOV.
+ *
+ * Returns FALSE if any instruction could not be emitted or if the
+ * opcode is not one of the texture opcodes listed above.
+ */
+static boolean emit_tex(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = 
+      translate_dst_register( emit, insn, 0 );
+   struct src_register src0 =
+      translate_src_register( emit, &insn->Src[0] );
+   struct src_register src1 =
+      translate_src_register( emit, &insn->Src[1] );
+
+   SVGA3dShaderDestToken tex_result;
+
+   /* check for shadow samplers */
+   boolean compare = (emit->key.fkey.tex[src1.base.num].compare_mode ==
+                      PIPE_TEX_COMPARE_R_TO_TEXTURE);
+
+
+   /* If doing compare processing, need to put this value into a
+    * temporary so it can be used as a source later on.
+    */
+   if (compare ||
+       (!emit->use_sm30 && dst.mask != TGSI_WRITEMASK_XYZW) ) {
+      tex_result = get_temp( emit );
+   }
+   else {
+      tex_result = dst;
+   }
+
+   switch(insn->Instruction.Opcode) {
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXL:
+      if (!emit_tex2( emit, insn, tex_result ))
+         return FALSE;
+      break;
+   case TGSI_OPCODE_TXD:
+      if (!emit_tex4( emit, insn, tex_result ))
+         return FALSE;
+      break;
+   default:
+      /* Fail like emit_tex2()/emit_tex4() do instead of carrying on
+       * with a result register that was never written when asserts
+       * are compiled out.
+       */
+      assert(0);
+      return FALSE;
+   }
+
+
+   if (compare) {
+      if (dst.mask & TGSI_WRITEMASK_XYZ) {
+         SVGA3dShaderDestToken src0_zdivw = get_temp( emit );
+         /* NOTE(review): despite its name this reads the Y channel of
+          * the sampled value -- confirm that is the intended channel.
+          */
+         struct src_register tex_src_x = scalar(src(tex_result), TGSI_SWIZZLE_Y);
+
+         /* Divide texcoord R by Q */
+         if (!submit_op1( emit, inst_token( SVGA3DOP_RCP ),
+                          writemask(src0_zdivw, TGSI_WRITEMASK_X),
+                          scalar(src0, TGSI_SWIZZLE_W) ))
+            return FALSE;
+
+         if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                          writemask(src0_zdivw, TGSI_WRITEMASK_X),
+                          scalar(src0, TGSI_SWIZZLE_Z),
+                          scalar(src(src0_zdivw), TGSI_SWIZZLE_X) ))
+            return FALSE;
+
+         /* dst.xyz = (R/Q <compare_func> sample) ? 1 : 0 */
+         if (!emit_select(
+                emit,
+                emit->key.fkey.tex[src1.base.num].compare_func,
+                writemask( dst, TGSI_WRITEMASK_XYZ ),
+                scalar(src(src0_zdivw), TGSI_SWIZZLE_X),
+                tex_src_x))
+            return FALSE;
+      }
+
+      /* dst.w = 1.0 (W channel of the zero immediate) */
+      if (dst.mask & TGSI_WRITEMASK_W) {
+         struct src_register one =
+            scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_W );
+
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                          writemask( dst, TGSI_WRITEMASK_W ),
+                          one ))
+            return FALSE;
+      }
+
+      return TRUE;
+   }
+   else if (!emit->use_sm30 && dst.mask != TGSI_WRITEMASK_XYZW) 
+   {
+      /* Copy the full-mask sample into the partially masked DST. */
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, src(tex_result) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* Emit LOOP with the loop register and the integer loop constant as
+ * operands, and enter one more level of dynamic branching.  The level
+ * is incremented before emission is attempted.
+ */
+static boolean emit_bgnloop2( struct svga_shader_emitter *emit,
+                              const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken loop_token = inst_token( SVGA3DOP_LOOP );
+   const struct src_register counter = src_register( SVGA3DREG_LOOP, 0 );
+   const struct src_register loop_const = get_loop_const( emit );
+
+   emit->dynamic_branching_level++;
+
+   return emit_instruction( emit, loop_token ) &&
+          emit_src( emit, counter ) &&
+          emit_src( emit, loop_const );
+}
+
+/* Emit ENDLOOP and leave one level of dynamic branching. */
+static boolean emit_endloop2( struct svga_shader_emitter *emit,
+                              const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken endloop_token =
+      inst_token( SVGA3DOP_ENDLOOP );
+
+   --emit->dynamic_branching_level;
+
+   return emit_instruction( emit, endloop_token );
+}
+
+/* Emit BREAK; it takes no operands. */
+static boolean emit_brk( struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn )
+{
+   return emit_instruction( emit, inst_token( SVGA3DOP_BREAK ) );
+}
+
+/* Emit a one-source instruction whose source is read as a scalar: the
+ * translated Src[0] is passed through scalar() with the X swizzle
+ * before being submitted with the given SVGA3D opcode.
+ */
+static boolean emit_scalar_op1( struct svga_shader_emitter *emit,
+                                unsigned opcode,
+                                const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken token = inst_token( opcode );
+   const SVGA3dShaderDestToken result =
+      translate_dst_register( emit, insn, 0 );
+   const struct src_register arg =
+      translate_src_register( emit, &insn->Src[0] );
+
+   return submit_op1( emit, token, result,
+                      scalar( arg, TGSI_SWIZZLE_X ) );
+}
+
+
+/* Emit a TGSI instruction that maps one-to-one onto an SVGA3D opcode.
+ *
+ * The destination and the first NumSrcRegs sources are translated and
+ * submitted unchanged.  Instructions with more than three sources are
+ * not supported: they assert and return FALSE.
+ */
+static boolean emit_simple_instruction(struct svga_shader_emitter *emit,
+                                       unsigned opcode,
+                                       const struct tgsi_full_instruction *insn )
+{
+   const struct tgsi_full_src_register *tgsi_srcs = insn->Src;
+   const SVGA3dShaderInstToken token = inst_token( opcode );
+   const SVGA3dShaderDestToken dest = translate_dst_register( emit, insn, 0 );
+   const unsigned count = insn->Instruction.NumSrcRegs;
+   struct src_register args[3];
+   unsigned i;
+
+   if (count > 3) {
+      assert(0);
+      return FALSE;
+   }
+
+   for (i = 0; i < count; i++)
+      args[i] = translate_src_register( emit, &tgsi_srcs[i] );
+
+   if (count == 0)
+      return submit_op0( emit, token, dest );
+
+   if (count == 1)
+      return submit_op1( emit, token, dest, args[0] );
+
+   if (count == 2)
+      return submit_op2( emit, token, dest, args[0], args[1] );
+
+   return submit_op3( emit, token, dest, args[0], args[1], args[2] );
+}
+
+
+/* Translate TGSI DDX/DDY to SVGA3D DSX/DSY.
+ *
+ * Deriv opcodes are not valid inside dynamic branching.  When inside a
+ * branch and the source is a TGSI TEMPORARY, the workaround is to zero
+ * out the destination instead.  Opcodes other than DDX/DDY return
+ * FALSE on the regular path.
+ */
+static boolean emit_deriv(struct svga_shader_emitter *emit,
+                          const struct tgsi_full_instruction *insn )
+{
+   const boolean temp_src_in_branch =
+      emit->dynamic_branching_level > 0 &&
+      insn->Src[0].Register.File == TGSI_FILE_TEMPORARY;
+   unsigned opcode;
+
+   if (temp_src_in_branch) {
+      const struct src_register zero = get_zero_immediate( emit );
+      const SVGA3dShaderDestToken result =
+         translate_dst_register( emit, insn, 0 );
+
+      /* MOV  DST, 0 */
+      if (submit_op1( emit,
+                      inst_token( SVGA3DOP_MOV ),
+                      result,
+                      scalar(zero, TGSI_SWIZZLE_X) ))
+         return TRUE;
+
+      return FALSE;
+   }
+
+   if (insn->Instruction.Opcode == TGSI_OPCODE_DDX)
+      opcode = SVGA3DOP_DSX;
+   else if (insn->Instruction.Opcode == TGSI_OPCODE_DDY)
+      opcode = SVGA3DOP_DSY;
+   else
+      return FALSE;
+
+   return emit_simple_instruction( emit, opcode, insn );
+}
+
+/* Translate TGSI ARL.  Every ARL advances emit->current_arl; if
+ * svga_arl_needs_adjustment() reports that an adjustment is needed the
+ * fake-ARL sequence is emitted, otherwise a plain MOVA.
+ */
+static boolean emit_arl(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   emit->current_arl++;
+
+   /* no need to adjust, just emit straight arl */
+   if (!svga_arl_needs_adjustment( emit ))
+      return emit_simple_instruction(emit, SVGA3DOP_MOVA, insn);
+
+   return emit_fake_arl( emit, insn );
+}
+
+/* Report whether a source and a destination name the same register,
+ * i.e. have both the same register number and the same register type.
+ * Swizzles and write masks are not taken into account.
+ */
+static boolean alias_src_dst( struct src_register src,
+                              SVGA3dShaderDestToken dst )
+{
+   if (src.base.num == dst.num &&
+       SVGA3dShaderGetRegType(dst.value) ==
+       SVGA3dShaderGetRegType(src.base.value))
+      return TRUE;
+
+   return FALSE;
+}
+
+/* Translate TGSI POW to SVGA3D POW, a scalar op reading the X channel
+ * of both sources.
+ *
+ * POW can only output to a temporary and its second source must not be
+ * the same register as the destination.  When either rule would be
+ * violated, the result is computed into TMP.x and then copied:
+ *    POW  TMP.x, SRC0.x, SRC1.x
+ *    MOV  DST, TMP.xxxx
+ */
+static boolean emit_pow(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register base = translate_src_register(
+      emit, &insn->Src[0] );
+   struct src_register exponent = translate_src_register(
+      emit, &insn->Src[1] );
+   const boolean need_tmp =
+      insn->Dst[0].Register.File != TGSI_FILE_TEMPORARY ||
+      alias_src_dst( exponent, dst );
+   SVGA3dShaderDestToken pow_dst;
+
+   /* it's a scalar op */
+   base = scalar( base, TGSI_SWIZZLE_X );
+   exponent = scalar( exponent, TGSI_SWIZZLE_X );
+
+   if (!need_tmp)
+      return submit_op2(emit, inst_token( SVGA3DOP_POW ), dst, base, exponent);
+
+   pow_dst = writemask(get_temp( emit ), TGSI_WRITEMASK_X );
+
+   if (!submit_op2(emit, inst_token( SVGA3DOP_POW ), pow_dst, base, exponent))
+      return FALSE;
+
+   return submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, scalar(src(pow_dst), 0) );
+}
+
+/* Translate TGSI XPD (cross product) to SVGA3D CRS.
+ *
+ * CRS is always emitted with a write mask limited to x/y/z (DX9
+ * restriction); if W is enabled in the destination mask it is written
+ * separately with 1.0.  A temporary receives the CRS result when the
+ * real destination is not a TEMP register or aliases either source.
+ *
+ * Returns FALSE if any instruction could not be emitted.
+ */
+static boolean emit_xpd(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->Src[1] );
+   boolean need_dst_tmp = FALSE;
+
+   /* XPD can only output to a temporary */
+   if (SVGA3dShaderGetRegType(dst.value) != SVGA3DREG_TEMP)
+      need_dst_tmp = TRUE;
+
+   /* The dst reg must not be the same as src0 or src1*/
+   if (alias_src_dst(src0, dst) ||
+       alias_src_dst(src1, dst))
+      need_dst_tmp = TRUE;
+
+   if (need_dst_tmp) {
+      SVGA3dShaderDestToken tmp = get_temp( emit );
+
+      /* Obey DX9 restrictions on mask:
+       */
+      tmp.mask = dst.mask & TGSI_WRITEMASK_XYZ;
+
+      if (!submit_op2(emit, inst_token( SVGA3DOP_CRS ), tmp, src0, src1))
+         return FALSE;
+
+      if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( tmp )))
+         return FALSE;
+   }
+   else {
+      SVGA3dShaderDestToken crs_dst = dst;
+
+      /* The DX9 mask restriction holds when writing DST directly as
+       * well; W, if enabled, is filled in below.
+       */
+      crs_dst.mask = dst.mask & TGSI_WRITEMASK_XYZ;
+
+      if (!submit_op2(emit, inst_token( SVGA3DOP_CRS ), crs_dst, src0, src1))
+         return FALSE;
+   }
+
+   /* Need to emit 1.0 to dst.w?
+    */
+   if (dst.mask & TGSI_WRITEMASK_W) {
+      /* The "zero" immediate is 0,0,0,1: read unswizzled under a W
+       * write mask, its W channel supplies the 1.0.
+       */
+      struct src_register zero = get_zero_immediate( emit );
+
+      if (!submit_op1(emit,
+                      inst_token( SVGA3DOP_MOV ),
+                      writemask(dst, TGSI_WRITEMASK_W),
+                      zero))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+/* Translate TGSI LRP to SVGA3D LRP.
+ *
+ * The destination must not be the same register as src0 or src2.  If
+ * it is, the result is computed into a temporary carrying the
+ * destination's write mask and then copied to the destination.
+ */
+static boolean emit_lrp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   SVGA3dShaderDestToken lrp_dst;
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->Src[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->Src[1] );
+   const struct src_register src2 = translate_src_register(
+      emit, &insn->Src[2] );
+   const boolean need_dst_tmp =
+      alias_src_dst(src0, dst) || alias_src_dst(src2, dst);
+
+   lrp_dst = dst;
+   if (need_dst_tmp) {
+      lrp_dst = get_temp( emit );
+      lrp_dst.mask = dst.mask;
+   }
+
+   /* LRP  (DST or TMP), SRC0, SRC1, SRC2 */
+   if (!submit_op3(emit, inst_token( SVGA3DOP_LRP ), lrp_dst, src0, src1, src2))
+      return FALSE;
+
+   if (!need_dst_tmp)
+      return TRUE;
+
+   /* MOV  DST, TMP */
+   if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( lrp_dst )))
+      return FALSE;
+
+   return TRUE;
+}
+
+
+/* Translate the TGSI DST instruction.
+ *
+ * Vertex shaders use the native SVGA3D DST opcode.  For other shader
+ * units the result is assembled from MOV and MUL instructions, going
+ * through a temporary when the destination is not a TEMP register or
+ * aliases one of the sources.
+ *
+ * Returns FALSE if any instruction could not be emitted.
+ */
+static boolean emit_dst_insn(struct svga_shader_emitter *emit,
+                             const struct tgsi_full_instruction *insn )
+{
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      /* SVGA/DX9 has a DST instruction, but only for vertex shaders:
+       */
+      return emit_simple_instruction(emit, SVGA3DOP_DST, insn);
+   }
+   else {
+
+      /* result[0] = 1    * 1;
+       * result[1] = a[1] * b[1];
+       * result[2] = a[2] * 1;
+       * result[3] = 1    * b[3];
+       */
+
+      SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+      SVGA3dShaderDestToken tmp;
+      const struct src_register src0 = translate_src_register(
+         emit, &insn->Src[0] );
+      const struct src_register src1 = translate_src_register(
+         emit, &insn->Src[1] );
+      struct src_register zero = get_zero_immediate( emit );
+      boolean need_tmp = FALSE;
+
+      /* The sequence below reads back what it has written and reads the
+       * sources after the first writes, so it needs a destination that
+       * is a TEMP register and distinct from both sources.
+       */
+      if (SVGA3dShaderGetRegType(dst.value) != SVGA3DREG_TEMP ||
+          alias_src_dst(src0, dst) ||
+          alias_src_dst(src1, dst))
+         need_tmp = TRUE;
+
+      /* When writing DST directly, tmp carries DST's write mask, so the
+       * mask tests below skip steps that only touch disabled channels.
+       */
+      if (need_tmp) {
+         tmp = get_temp( emit );
+      }
+      else {
+         tmp = dst;
+      }
+
+      /* tmp.xw = 1.0
+       */
+      if (tmp.mask & TGSI_WRITEMASK_XW) {
+         /* swizzle 3 selects the W channel of the immediate */
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          writemask(tmp, TGSI_WRITEMASK_XW ),
+                          scalar( zero, 3 )))
+            return FALSE;
+      }
+      
+      /* tmp.yz = src0
+       */
+      if (tmp.mask & TGSI_WRITEMASK_YZ) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          writemask(tmp, TGSI_WRITEMASK_YZ ),
+                          src0))
+            return FALSE;
+      }
+
+      /* tmp.yw = tmp * src1
+       */
+      if (tmp.mask & TGSI_WRITEMASK_YW) {
+         if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ), 
+                          writemask(tmp, TGSI_WRITEMASK_YW ),
+                          src(tmp),
+                          src1))
+            return FALSE;
+      }
+
+      /* dst = tmp
+       */
+      if (need_tmp) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          dst,
+                          src(tmp)))
+            return FALSE;
+      }      
+   }
+   
+   return TRUE;
+}
+
+
+/**
+ * Translate the TGSI EXP instruction, which produces up to four results
+ * from a single source:
+ *
+ *   dst.x = 2 ^ floor(src0)
+ *   dst.y = src0 - floor(src0)
+ *   dst.z = 2 ^ src0   (partial precision, via EXPP)
+ *   dst.w = 1
+ *
+ * Only the channels enabled in the destination writemask are emitted.
+ *
+ * \param emit  shader emitter state
+ * \param insn  the TGSI instruction being translated
+ * \return FALSE as soon as any instruction fails to emit, TRUE otherwise
+ */
+static boolean emit_exp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 =
+      translate_src_register( emit, &insn->Src[0] );
+   struct src_register zero = get_zero_immediate( emit );
+   SVGA3dShaderDestToken fraction;
+
+   /* The fractional part is needed both for the Y result and as an
+    * intermediate for X.  Store it straight into dst when Y is written;
+    * otherwise borrow a temporary (released below once X is done).
+    */
+   if (dst.mask & TGSI_WRITEMASK_Y)
+      fraction = dst;
+   else if (dst.mask & TGSI_WRITEMASK_X)
+      fraction = get_temp( emit );
+   else
+      fraction.value = 0;
+
+   /* If x or y is being written, compute src0 - floor(src0) into the
+    * Y channel of 'fraction'.
+    *
+    * NOTE(review): src0 is passed without a scalar X swizzle while the
+    * writemask selects Y -- confirm this yields the fraction of src0.x.
+    */
+   if (dst.mask & TGSI_WRITEMASK_XY) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_FRC ),
+                       writemask( fraction, TGSI_WRITEMASK_Y ),
+                       src0 ))
+         return FALSE;
+   }
+
+   /* If x is being written, fill it with 2 ^ floor(src0).
+    *
+    * floor(src0) is formed as src0 - fraction.y, then EXP is applied in
+    * place on dst.x.
+    */
+   if (dst.mask & TGSI_WRITEMASK_X) {
+      if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ),
+                       writemask( dst, TGSI_WRITEMASK_X ),
+                       src0,
+                       scalar( negate( src( fraction ) ), TGSI_SWIZZLE_Y ) ) )
+         return FALSE;
+
+      if (!submit_op1( emit, inst_token( SVGA3DOP_EXP ),
+                       writemask( dst, TGSI_WRITEMASK_X ),
+                       scalar( src( dst ), TGSI_SWIZZLE_X ) ) )
+         return FALSE;
+
+      /* 'fraction' was a temporary only when Y is not being written. */
+      if (!(dst.mask & TGSI_WRITEMASK_Y))
+         release_temp( emit, fraction );
+   }
+
+   /* If z is being written, fill it with 2 ^ src0 (partial precision).
+    */
+   if (dst.mask & TGSI_WRITEMASK_Z) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_EXPP ),
+                       writemask( dst, TGSI_WRITEMASK_Z ),
+                       src0 ) )
+         return FALSE;
+   }
+
+   /* If w is being written, fill it with one (the W channel of the
+    * immediate returned by get_zero_immediate()).
+    */
+   if (dst.mask & TGSI_WRITEMASK_W) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                       writemask(dst, TGSI_WRITEMASK_W),
+                       scalar( zero, TGSI_SWIZZLE_W ) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/**
+ * Translate the TGSI LIT instruction.
+ *
+ * Vertex shaders emit SVGA3DOP_LIT directly through
+ * emit_simple_instruction().  Other shader units get an expansion built
+ * from POW, MOV, SETP and a predicated MOV, following the GL variant
+ * described in the comment below.
+ *
+ * \param emit  shader emitter state
+ * \param insn  the TGSI instruction being translated
+ * \return FALSE as soon as any instruction fails to emit, TRUE otherwise
+ */
+static boolean emit_lit(struct svga_shader_emitter *emit,
+                             const struct tgsi_full_instruction *insn )
+{
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      /* SVGA/DX9 has a LIT instruction, but only for vertex shaders:
+       */
+      return emit_simple_instruction(emit, SVGA3DOP_LIT, insn);
+   }
+   else {
+
+      /* D3D vs. GL semantics can be fairly easily accommodated by
+       * variations on this sequence.
+       *
+       * GL:
+       *   tmp.y = src.x
+       *   tmp.z = pow(src.y,src.w)
+       *   p0 = src0.xxxx > 0
+       *   result = zero.wxxw
+       *   (p0) result.yz = tmp
+       *
+       * D3D:
+       *   tmp.y = src.x
+       *   tmp.z = pow(src.y,src.w)
+       *   p0 = src0.xxyy > 0
+       *   result = zero.wxxw
+       *   (p0) result.yz = tmp
+       *
+       * Will implement the GL version for now.
+       */
+
+      SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+      SVGA3dShaderDestToken tmp = get_temp( emit );
+      const struct src_register src0 = translate_src_register(
+         emit, &insn->Src[0] );
+      struct src_register zero = get_zero_immediate( emit );
+
+      /* tmp = pow(src.y, src.w)
+       *
+       * Written with tmp's own writemask; only tmp.z is consumed by the
+       * predicated MOV at the end.
+       */
+      if (dst.mask & TGSI_WRITEMASK_Z) {
+         if (!submit_op2(emit, inst_token( SVGA3DOP_POW ),
+                         tmp,
+                         scalar(src0, 1),
+                         scalar(src0, 3)))
+            return FALSE;
+      }
+
+      /* tmp.y = src.x
+       */
+      if (dst.mask & TGSI_WRITEMASK_Y) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                          writemask(tmp, TGSI_WRITEMASK_Y ),
+                          scalar(src0, 0)))
+            return FALSE;
+      }
+
+      /* Can't quite do this with emit conditional due to the extra
+       * writemask on the predicated mov:
+       */
+      {
+         SVGA3dShaderDestToken pred_reg = dst_register( SVGA3DREG_PREDICATE, 0 );
+         SVGA3dShaderInstToken setp_token, mov_token;
+         struct src_register predsrc;
+
+         setp_token = inst_token( SVGA3DOP_SETP );
+         mov_token = inst_token( SVGA3DOP_MOV );
+
+         setp_token.control = SVGA3DOPCOMP_GT;
+
+         /* D3D vs GL semantics: the D3D swizzle is kept for reference but
+          * compiled out, so the GL swizzle is always used.
+          */
+         if (0)
+            predsrc = swizzle(src0, 0, 0, 1, 1); /* D3D */
+         else
+            predsrc = swizzle(src0, 0, 0, 0, 0); /* GL */
+
+         /* SETP predsrc, GT, {0}.x   (predsrc is src0.xxxx on the GL path) */
+         if (!submit_op2( emit, setp_token, pred_reg,
+                          predsrc,
+                          swizzle(zero, 0, 0, 0, 0) ))
+            return FALSE;
+
+         /* MOV dst, zero.wxxw -- the result used wherever the predicate
+          * fails; Y and Z are overwritten below where it passes.
+          */
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst,
+                          swizzle(zero, 3, 0, 0, 3 )))
+             return FALSE;
+
+         /* MOV dst.yz, tmp (predicated)
+          *
+          * Note that the predicate reg (and possible modifiers) is passed
+          * as the first source argument.
+          */
+         if (dst.mask & TGSI_WRITEMASK_YZ) {
+            mov_token.predicated = 1;
+            if (!submit_op2( emit, mov_token,
+                             writemask(dst, TGSI_WRITEMASK_YZ),
+                             src( pred_reg ), src( tmp ) ))
+               return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+
+
+/**
+ * Translate TGSI EX2 (dst = 2 ^ src0.x) to SVGA3DOP_EXP.
+ *
+ * When the destination writemask covers all four channels the EXP is
+ * issued straight into dst.  For any partial writemask the value is
+ * computed into a temporary first and then copied to dst from the
+ * temporary's X channel.
+ *
+ * \return the result of the final submit, or FALSE if the first
+ *         instruction of the two-step path fails to emit
+ */
+static boolean emit_ex2( struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken exp_token = inst_token( SVGA3DOP_EXP );
+   const SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register arg =
+      scalar( translate_src_register( emit, &insn->Src[0] ), TGSI_SWIZZLE_X );
+   SVGA3dShaderDestToken scratch;
+
+   /* Full writemask: a single instruction is enough. */
+   if (dst.mask == TGSI_WRITEMASK_XYZW)
+      return submit_op1( emit, exp_token, dst, arg );
+
+   /* Partial writemask: go through a temporary. */
+   scratch = get_temp( emit );
+
+   if (!submit_op1( emit, exp_token, scratch, arg ))
+      return FALSE;
+
+   return submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                      dst,
+                      scalar( src( scratch ), TGSI_SWIZZLE_X ) );
+}
+
+
+/**
+ * Translate the TGSI LOG instruction, which produces up to four results
+ * from the X channel of its source:
+ *
+ *   dst.x = floor( log2( abs( src0.x ) ) )
+ *   dst.y = abs( src0.x ) / ( 2 ^ floor( log2( abs( src0.x ) ) ) )
+ *   dst.z = log2( abs( src0.x ) )
+ *   dst.w = 1
+ *
+ * Only the channels enabled in the destination writemask are emitted;
+ * temporaries stand in for X and Z when those are needed as
+ * intermediates but are not part of the writemask.
+ *
+ * \param emit  shader emitter state
+ * \param insn  the TGSI instruction being translated
+ * \return FALSE as soon as any instruction fails to emit, TRUE otherwise
+ */
+static boolean emit_log(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 =
+      translate_src_register( emit, &insn->Src[0] );
+   struct src_register zero = get_zero_immediate( emit );
+   SVGA3dShaderDestToken abs_tmp;
+   struct src_register abs_src0;
+   SVGA3dShaderDestToken log2_abs;
+
+   abs_tmp.value = 0;
+
+   /* log2(abs(src0)) is the Z result and also feeds X and Y.  Keep it in
+    * dst.z when Z is written, otherwise in a temporary.
+    */
+   if (dst.mask & TGSI_WRITEMASK_Z)
+      log2_abs = dst;
+   else if (dst.mask & TGSI_WRITEMASK_XY)
+      log2_abs = get_temp( emit );
+   else
+      log2_abs.value = 0;
+
+   /* If x, y or z is being written, compute log2( abs( src0 ) ) into the
+    * Z channel of 'log2_abs'.
+    */
+   if (dst.mask & TGSI_WRITEMASK_XYZ) {
+      /* A source with no modifier, or with ABS already, can take the
+       * absolute() modifier directly.  Any other modifier is first
+       * resolved by copying the source into a temporary.
+       */
+      if (!src0.base.srcMod || src0.base.srcMod == SVGA3DSRCMOD_ABS)
+         abs_src0 = src0;
+      else {
+         abs_tmp = get_temp( emit );
+
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                          abs_tmp,
+                          src0 ) )
+            return FALSE;
+
+         abs_src0 = src( abs_tmp );
+      }
+
+      abs_src0 = absolute( scalar( abs_src0, TGSI_SWIZZLE_X ) );
+
+      if (!submit_op1( emit, inst_token( SVGA3DOP_LOG ),
+                       writemask( log2_abs, TGSI_WRITEMASK_Z ),
+                       abs_src0 ) )
+         return FALSE;
+   }
+
+   if (dst.mask & TGSI_WRITEMASK_XY) {
+      SVGA3dShaderDestToken floor_log2;
+
+      /* floor(log2(...)) is the X result and also feeds Y. */
+      if (dst.mask & TGSI_WRITEMASK_X)
+         floor_log2 = dst;
+      else
+         floor_log2 = get_temp( emit );
+
+      /* If x or y is being written, compute floor( log2( abs( src0 ) ) )
+       * into floor_log2.x: take the fraction with FRC, then subtract it
+       * from the full value.
+       */
+      if (!submit_op1( emit, inst_token( SVGA3DOP_FRC ),
+                       writemask( floor_log2, TGSI_WRITEMASK_X ),
+                       scalar( src( log2_abs ), TGSI_SWIZZLE_Z ) ) )
+         return FALSE;
+
+      if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ),
+                       writemask( floor_log2, TGSI_WRITEMASK_X ),
+                       scalar( src( log2_abs ), TGSI_SWIZZLE_Z ),
+                       negate( src( floor_log2 ) ) ) )
+         return FALSE;
+
+      /* If y is being written, fill it with
+       * abs ( src0 ) / ( 2 ^ floor( log2( abs( src0 ) ) ) ).
+       *
+       * The division is done as a multiply by 2 ^ -floor(...).
+       */
+      if (dst.mask & TGSI_WRITEMASK_Y) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_EXP ),
+                          writemask( dst, TGSI_WRITEMASK_Y ),
+                          negate( scalar( src( floor_log2 ),
+                                          TGSI_SWIZZLE_X ) ) ) )
+            return FALSE;
+
+         /* NOTE(review): abs_src0 is read here after dst.x/dst.z may
+          * already have been written; confirm behaviour when dst aliases
+          * src0.
+          */
+         if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                          writemask( dst, TGSI_WRITEMASK_Y ),
+                          src( dst ),
+                          abs_src0 ) )
+            return FALSE;
+      }
+
+      /* Give back whichever intermediates were temporaries. */
+      if (!(dst.mask & TGSI_WRITEMASK_X))
+         release_temp( emit, floor_log2 );
+
+      if (!(dst.mask & TGSI_WRITEMASK_Z))
+         release_temp( emit, log2_abs );
+   }
+
+   /* Same condition under which abs_tmp was allocated above. */
+   if (dst.mask & TGSI_WRITEMASK_XYZ && src0.base.srcMod &&
+       src0.base.srcMod != SVGA3DSRCMOD_ABS)
+      release_temp( emit, abs_tmp );
+
+   /* If w is being written, fill it with one.
+    */
+   if (dst.mask & TGSI_WRITEMASK_W) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                       writemask(dst, TGSI_WRITEMASK_W),
+                       scalar( zero, TGSI_SWIZZLE_W ) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Translate TGSI BGNSUB: close the code emitted so far with RET and open
+ * the subroutine with a LABEL instruction.
+ *
+ * The label number is the index in emit->label[] whose recorded
+ * position equals 'position'.  Not finding one trips an assert; with
+ * asserts disabled the function emits nothing and reports TRUE.
+ *
+ * \param emit      shader emitter state
+ * \param position  instruction index of this BGNSUB in the TGSI program
+ * \param insn      unused
+ */
+static boolean emit_bgnsub( struct svga_shader_emitter *emit,
+                           unsigned position,
+                           const struct tgsi_full_instruction *insn )
+{
+   unsigned label_idx = 0;
+
+   /* From here on we are emitting subroutines rather than the main
+    * function, which changes how the generated shader is terminated.
+    */
+   emit->in_main_func = FALSE;
+
+   /* Find the label registered for this position. */
+   while (label_idx < emit->nr_labels &&
+          emit->label[label_idx] != position)
+      label_idx++;
+
+   if (label_idx == emit->nr_labels) {
+      assert(0);
+      return TRUE;
+   }
+
+   return (emit_instruction( emit, inst_token( SVGA3DOP_RET ) ) &&
+           emit_instruction( emit, inst_token( SVGA3DOP_LABEL ) ) &&
+           emit_src( emit, src_register( SVGA3DREG_LABEL, label_idx )));
+}
+
+/**
+ * Translate TGSI CAL into an SVGA3DOP_CALL to a numbered label.
+ *
+ * The call target (insn->Label.Label) is looked up in emit->label[].  A
+ * target seen for the first time is appended to the table and its index
+ * becomes the label number.
+ *
+ * \param emit  shader emitter state
+ * \param insn  the TGSI CAL instruction
+ * \return FALSE if a new label is needed but the table is full, or if
+ *         emitting the instruction fails; TRUE otherwise
+ */
+static boolean emit_call( struct svga_shader_emitter *emit,
+                           const struct tgsi_full_instruction *insn )
+{
+   unsigned position = insn->Label.Label;
+   unsigned i;
+
+   for (i = 0; i < emit->nr_labels; i++) {
+      if (emit->label[i] == position)
+         break;
+   }
+
+   if (i == emit->nr_labels) {
+      /* New call target.  Only this path needs a free slot: a call to an
+       * already-registered label must keep working when the table is
+       * full.
+       */
+      if (emit->nr_labels == Elements(emit->label))
+         return FALSE;
+
+      emit->label[i] = position;
+      emit->nr_labels++;
+   }
+
+   return (emit_instruction( emit, inst_token( SVGA3DOP_CALL ) ) &&
+           emit_src( emit, src_register( SVGA3DREG_LABEL, i )));
+}
+
+
+/**
+ * Translate TGSI END by emitting the postamble that matches the shader
+ * unit: the vertex postamble for vertex shaders, the pixel postamble for
+ * everything else.
+ */
+static boolean emit_end( struct svga_shader_emitter *emit )
+{
+   return (emit->unit == PIPE_SHADER_VERTEX)
+      ? emit_vs_postamble( emit )
+      : emit_ps_postamble( emit );
+}
+
+
+
+/**
+ * Translate a single TGSI instruction by dispatching on its opcode.
+ *
+ * Opcodes that need a custom expansion are routed to their emit_*()
+ * helper.  Anything not listed falls to the default case, which maps the
+ * opcode with translate_opcode() and emits it with
+ * emit_simple_instruction().
+ *
+ * \param emit      shader emitter state
+ * \param position  index of this instruction in the TGSI program; only
+ *                  consumed by BGNSUB to find its label
+ * \param insn      the TGSI instruction to translate
+ * \return FALSE if the opcode is unsupported or emission fails,
+ *         TRUE otherwise
+ */
+static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
+                                      unsigned position,
+                                      const struct tgsi_full_instruction *insn )
+{
+   switch (insn->Instruction.Opcode) {
+
+   case TGSI_OPCODE_ARL:
+      return emit_arl( emit, insn );
+
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXD:
+      return emit_tex( emit, insn );
+
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+      return emit_deriv( emit, insn );
+
+   case TGSI_OPCODE_BGNSUB:
+      return emit_bgnsub( emit, position, insn );
+
+   case TGSI_OPCODE_ENDSUB:
+      /* Nothing to emit here. */
+      return TRUE;
+
+   case TGSI_OPCODE_CAL:
+      return emit_call( emit, insn );
+
+   case TGSI_OPCODE_FLR:
+   case TGSI_OPCODE_TRUNC:        /* should be TRUNC, not FLR */
+      /* NOTE(review): TRUNC currently shares the floor path, as the
+       * note above says it should not.
+       */
+      return emit_floor( emit, insn );
+
+   case TGSI_OPCODE_CMP:
+      return emit_cmp( emit, insn );
+
+   case TGSI_OPCODE_DIV:
+      return emit_div( emit, insn );
+
+   case TGSI_OPCODE_DP2:
+      return emit_dp2( emit, insn );
+
+   case TGSI_OPCODE_DPH:
+      return emit_dph( emit, insn );
+
+   case TGSI_OPCODE_NRM:
+      return emit_nrm( emit, insn );
+
+   case TGSI_OPCODE_COS:
+      return emit_cos( emit, insn );
+
+   case TGSI_OPCODE_SIN:
+      return emit_sin( emit, insn );
+
+   case TGSI_OPCODE_SCS:
+      return emit_sincos( emit, insn );
+
+   case TGSI_OPCODE_END:
+      /* TGSI always finishes the main func with an END */
+      return emit_end( emit );
+
+   case TGSI_OPCODE_KIL:
+      return emit_kil( emit, insn );
+
+      /* Selection opcodes.  The underlying language is fairly
+       * non-orthogonal about these.
+       */
+   case TGSI_OPCODE_SEQ:
+      return emit_select_op( emit, PIPE_FUNC_EQUAL, insn );
+
+   case TGSI_OPCODE_SNE:
+      return emit_select_op( emit, PIPE_FUNC_NOTEQUAL, insn );
+
+   case TGSI_OPCODE_SGT:
+      return emit_select_op( emit, PIPE_FUNC_GREATER, insn );
+
+   case TGSI_OPCODE_SGE:
+      return emit_select_op( emit, PIPE_FUNC_GEQUAL, insn );
+
+   case TGSI_OPCODE_SLT:
+      return emit_select_op( emit, PIPE_FUNC_LESS, insn );
+
+   case TGSI_OPCODE_SLE:
+      return emit_select_op( emit, PIPE_FUNC_LEQUAL, insn );
+
+   case TGSI_OPCODE_SUB:
+      return emit_sub( emit, insn );
+
+   case TGSI_OPCODE_POW:
+      return emit_pow( emit, insn );
+
+   case TGSI_OPCODE_EX2:
+      return emit_ex2( emit, insn );
+
+   case TGSI_OPCODE_EXP:
+      return emit_exp( emit, insn );
+
+   case TGSI_OPCODE_LOG:
+      return emit_log( emit, insn );
+
+   case TGSI_OPCODE_LG2:
+      return emit_scalar_op1( emit, SVGA3DOP_LOG, insn );
+
+   case TGSI_OPCODE_RSQ:
+      return emit_scalar_op1( emit, SVGA3DOP_RSQ, insn );
+
+   case TGSI_OPCODE_RCP:
+      return emit_scalar_op1( emit, SVGA3DOP_RCP, insn );
+
+   case TGSI_OPCODE_CONT:
+   case TGSI_OPCODE_RET:
+      /* This is a noop -- we tell mesa that we can't support RET
+       * within a function (early return), so this will always be
+       * followed by an ENDSUB.
+       */
+      return TRUE;
+
+      /* These aren't actually used by any of the frontends we care
+       * about, so they are rejected rather than translated:
+       */
+   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_ROUND:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_SHL:
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_XOR:
+      return FALSE;
+
+   case TGSI_OPCODE_IF:
+      return emit_if( emit, insn );
+   case TGSI_OPCODE_ELSE:
+      return emit_else( emit, insn );
+   case TGSI_OPCODE_ENDIF:
+      return emit_endif( emit, insn );
+
+   case TGSI_OPCODE_BGNLOOP:
+      return emit_bgnloop2( emit, insn );
+   case TGSI_OPCODE_ENDLOOP:
+      return emit_endloop2( emit, insn );
+   case TGSI_OPCODE_BRK:
+      return emit_brk( emit, insn );
+
+   case TGSI_OPCODE_XPD:
+      return emit_xpd( emit, insn );
+
+   case TGSI_OPCODE_KILP:
+      return emit_kilp( emit, insn );
+
+   case TGSI_OPCODE_DST:
+      return emit_dst_insn( emit, insn );
+
+   case TGSI_OPCODE_LIT:
+      return emit_lit( emit, insn );
+
+   case TGSI_OPCODE_LRP:
+      return emit_lrp( emit, insn );
+
+   default: {
+      /* Generic path: opcodes with a direct SVGA3D counterpart. */
+      unsigned opcode = translate_opcode(insn->Instruction.Opcode);
+
+      /* SVGA3DOP_LAST_INST signals that there is no translation. */
+      if (opcode == SVGA3DOP_LAST_INST)
+         return FALSE;
+
+      if (!emit_simple_instruction( emit, opcode, insn ))
+         return FALSE;
+   }
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Emit a TGSI immediate as a float constant definition.
+ *
+ * The immediate supplies NrTokens - 1 float values; any channels it does
+ * not supply take the defaults {0, 0, 0, 1}.  The constant is placed at
+ * emit->imm_start + emit->internal_imm_count, and the counter is
+ * advanced.
+ *
+ * \param emit  shader emitter state
+ * \param imm   the parsed TGSI immediate
+ * \return the result of emit_def_const()
+ */
+static boolean svga_emit_immediate( struct svga_shader_emitter *emit,
+                                    struct tgsi_full_immediate *imm)
+{
+   /* Start from the defaults and overwrite what the immediate provides. */
+   float value[4] = { 0, 0, 0, 1 };
+   unsigned chan;
+
+   assert(1 <= imm->Immediate.NrTokens && imm->Immediate.NrTokens <= 5);
+
+   for (chan = 0; chan < imm->Immediate.NrTokens - 1; chan++)
+      value[chan] = imm->u[chan].Float;
+
+   return emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT,
+                          emit->imm_start + emit->internal_imm_count++,
+                          value[0], value[1], value[2], value[3]);
+}
+
+/**
+ * Define a new float constant holding {a, b, c, d} and hand back a
+ * source register that refers to it.
+ *
+ * The constant slot is taken from emit->nr_hw_const, which is advanced
+ * before the definition is emitted (and so stays advanced on failure).
+ *
+ * \param emit     shader emitter state
+ * \param a,b,c,d  the four channel values
+ * \param out      receives the SVGA3DREG_CONST source register; only
+ *                 written on success
+ * \return TRUE on success, FALSE if the definition could not be emitted
+ */
+static boolean make_immediate( struct svga_shader_emitter *emit,
+                               float a,
+                               float b,
+                               float c,
+                               float d,
+                               struct src_register *out )
+{
+   const unsigned slot = emit->nr_hw_const++;
+
+   if (emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT,
+                       slot, a, b, c, d )) {
+      *out = src_register( SVGA3DREG_CONST, slot );
+      return TRUE;
+   }
+
+   return FALSE;
+}
+
+/**
+ * Emit constants needed ahead of a vertex shader's body.
+ *
+ * When prescale is not requested, the {0, 0, .5, .5} immediate is
+ * defined and stored in emit->imm_0055.  With prescale enabled nothing
+ * is emitted.
+ *
+ * \return FALSE if the immediate could not be defined, TRUE otherwise
+ */
+static boolean emit_vs_preamble( struct svga_shader_emitter *emit )
+{
+   if (emit->key.vkey.need_prescale)
+      return TRUE;
+
+   if (make_immediate( emit, 0, 0, .5, .5,
+                       &emit->imm_0055))
+      return TRUE;
+
+   return FALSE;
+}
+
+/**
+ * Emit the pixel shader preamble.
+ *
+ * For SM20 the temporaries that stand in for colour outputs must be
+ * initialised: shaders which don't set all of these values are likely to
+ * be rejected by the DX9 runtime.  Each temp_col[] whose true_col[]
+ * entry has a non-zero register type is therefore set from the zero
+ * immediate.  SM30 shaders need nothing here.
+ *
+ * \return FALSE as soon as a MOV fails to emit, TRUE otherwise
+ */
+static boolean emit_ps_preamble( struct svga_shader_emitter *emit )
+{
+   struct src_register zero;
+   unsigned buf;
+
+   if (emit->use_sm30)
+      return TRUE;
+
+   zero = get_zero_immediate( emit );
+
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      /* Skip colour outputs that are not in use. */
+      if (SVGA3dShaderGetRegType(emit->true_col[buf].value) == 0)
+         continue;
+
+      if (!submit_op1( emit,
+                       inst_token(SVGA3DOP_MOV),
+                       emit->temp_col[buf],
+                       zero ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/**
+ * Emit the pixel shader postamble: copy the redirected depth and colour
+ * temporaries into the real output registers.
+ *
+ * \return FALSE as soon as a MOV fails to emit, TRUE otherwise
+ */
+static boolean emit_ps_postamble( struct svga_shader_emitter *emit )
+{
+   unsigned buf;
+
+   /* PS oDepth is incredibly fragile and it's very hard to catch the
+    * types of usage that break it during shader emit.  The main program
+    * writes a temporary instead, and oDepth is only touched by this one
+    * hand-crafted MOV of the temporary's Z channel.
+    */
+   if (SVGA3dShaderGetRegType(emit->true_pos.value) != 0 &&
+       !submit_op1( emit,
+                    inst_token(SVGA3DOP_MOV),
+                    emit->true_pos,
+                    scalar(src(emit->temp_pos), TGSI_SWIZZLE_Z) ))
+      return FALSE;
+
+   /* Colour outputs get the same treatment (needed for SM20; SM30 is
+    * less fragile).
+    */
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      struct src_register value;
+
+      if (SVGA3dShaderGetRegType(emit->true_col[buf].value) == 0)
+         continue;
+
+      if (emit->unit == PIPE_SHADER_FRAGMENT &&
+          emit->key.fkey.white_fragments) {
+         /* XOR logicop workaround: force the output to white, using
+          * the W channel of the zero immediate as the constant one.
+          */
+         value = scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_W );
+      }
+      else {
+         value = src(emit->temp_col[buf]);
+      }
+
+      if (!submit_op1( emit,
+                       inst_token(SVGA3DOP_MOV),
+                       emit->true_col[buf],
+                       value ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/**
+ * Emit the vertex shader postamble: write the real point-size output
+ * and convert the position from GL to D3D-style clip space.
+ *
+ * \return FALSE as soon as an instruction fails to emit, TRUE otherwise
+ */
+static boolean emit_vs_postamble( struct svga_shader_emitter *emit )
+{
+   SVGA3dShaderDestToken work_pos;
+   SVGA3dShaderDestToken out_pos;
+
+   /* PSIZ output is incredibly fragile and it's very hard to catch
+    * the types of usage that break it during shader emit.  The main
+    * program writes a temporary instead, and PSIZ is only touched by
+    * this one hand-crafted MOV.
+    */
+   if (SVGA3dShaderGetRegType(emit->true_psiz.value) != 0 &&
+       !submit_op1( emit,
+                    inst_token(SVGA3DOP_MOV),
+                    emit->true_psiz,
+                    scalar(src(emit->temp_psiz), TGSI_SWIZZLE_X) ))
+      return FALSE;
+
+   work_pos = emit->temp_pos;
+   out_pos = emit->true_pos;
+
+   /* Need to perform various manipulations on vertex position to cope
+    * with the different GL and D3D clip spaces.
+    */
+   if (!emit->key.vkey.need_prescale) {
+      struct src_register half_zw = emit->imm_0055;
+
+      /* Adjust GL clipping coordinate space to hardware (D3D-style):
+       *
+       * DP4 temp_pos.z, {0,0,.5,.5}, temp_pos
+       * MOV result.position, temp_pos
+       */
+      if (!submit_op2( emit,
+                       inst_token(SVGA3DOP_DP4),
+                       writemask(work_pos, TGSI_WRITEMASK_Z),
+                       half_zw,
+                       src(work_pos) ))
+         return FALSE;
+
+      if (!submit_op1( emit,
+                       inst_token(SVGA3DOP_MOV),
+                       out_pos,
+                       src(work_pos) ))
+         return FALSE;
+   }
+   else {
+      /* The two prescale constants sit right after the last constant
+       * the shader itself declares.
+       */
+      unsigned first_const = emit->info.file_max[TGSI_FILE_CONSTANT] + 1;
+      struct src_register scale = src_register( SVGA3DREG_CONST,
+                                                first_const + 0 );
+      struct src_register trans = src_register( SVGA3DREG_CONST,
+                                                first_const + 1 );
+
+      /* MUL temp_pos.xyz,    temp_pos,      prescale.scale
+       * MAD result.position, temp_pos.wwww, prescale.trans, temp_pos
+       *   --> Note that prescale.trans.w == 0
+       */
+      if (!submit_op2( emit,
+                       inst_token(SVGA3DOP_MUL),
+                       writemask(work_pos, TGSI_WRITEMASK_XYZ),
+                       src(work_pos),
+                       scale ))
+         return FALSE;
+
+      if (!submit_op3( emit,
+                       inst_token(SVGA3DOP_MAD),
+                       out_pos,
+                       swizzle(src(work_pos), 3, 3, 3, 3),
+                       trans,
+                       src(work_pos)))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/**
+ * Emit the two-sided lighting prologue, selecting between front and
+ * back colour inputs based on VFACE:
+ *
+ *   0: IF VFACE :4
+ *   1:   COLOR = FrontColor;
+ *   2: ELSE
+ *   3:   COLOR = BackColor;
+ *   4: ENDIF
+ *
+ * For each internal colour a fresh temporary receives the selected
+ * value, and emit->input_map[] is redirected to that temporary so later
+ * reads of the colour input see it.
+ *
+ * \return TRUE if there are no internal colours or everything was
+ *         emitted, FALSE as soon as an instruction fails to emit
+ */
+static boolean emit_light_twoside( struct svga_shader_emitter *emit )
+{
+   struct src_register vface, zero;
+   /* NOTE(review): these arrays hold two entries while 'count' is not
+    * range-checked here -- presumably internal_color_count never
+    * exceeds 2; confirm.
+    */
+   struct src_register front[2];
+   struct src_register back[2];
+   SVGA3dShaderDestToken color[2];
+   int count =  emit->internal_color_count;
+   int i;
+   SVGA3dShaderInstToken if_token;
+
+   if (count == 0)
+      return TRUE;
+
+   vface = get_vface( emit );
+   zero = get_zero_immediate( emit );
+
+   /* Can't use get_temp() to allocate the color reg as such
+    * temporaries will be reclaimed after each instruction by the call
+    * to reset_temp_regs().
+    */
+   for (i = 0; i < count; i++) {
+      color[i] = dst_register( SVGA3DREG_TEMP,
+                               emit->nr_hw_temp++ );
+
+      front[i] = emit->input_map[emit->internal_color_idx[i]];
+
+      /* Back is always the next input:
+       */
+      back[i] = front[i];
+      back[i].base.num = front[i].base.num + 1;
+
+      /* Reassign the input_map to the temporary that will hold the
+       * selected (front or back) color:
+       */
+      emit->input_map[emit->internal_color_idx[i]] = src(color[i]);
+   }
+
+   if_token = inst_token( SVGA3DOP_IFC );
+
+   /* The comparison of VFACE against zero flips with the winding that
+    * counts as front-facing.
+    */
+   if (emit->key.fkey.front_ccw)
+      if_token.control = SVGA3DOPCOMP_LT;
+   else
+      if_token.control = SVGA3DOPCOMP_GT;
+
+   zero = scalar(zero, TGSI_SWIZZLE_X);
+
+   /* IFC vface, zero.x */
+   if (!(emit_instruction( emit, if_token ) &&
+         emit_src( emit, vface ) &&
+         emit_src( emit, zero ) ))
+      return FALSE;
+
+   /* Taken branch: front colors. */
+   for (i = 0; i < count; i++) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), color[i], front[i] ))
+         return FALSE;
+   }
+
+   if (!(emit_instruction( emit, inst_token( SVGA3DOP_ELSE))))
+      return FALSE;
+
+   /* Else branch: back colors. */
+   for (i = 0; i < count; i++) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), color[i], back[i] ))
+         return FALSE;
+   }
+
+   if (!emit_instruction( emit, inst_token( SVGA3DOP_ENDIF ) ))
+      return FALSE;
+
+   return TRUE;
+}
+
+/**
+ * Materialise a fake front-face input register.
+ *
+ * A dedicated temporary is loaded through emit_conditional() with one of
+ * two channels of the zero immediate (X or W), depending on whether
+ * VFACE > 0.  Which channel is used on pass and which on fail is swapped
+ * when key.fkey.front_ccw is set.  emit->input_map[] is then redirected
+ * so reads of the front-face input use that temporary.
+ *
+ * \return FALSE if the conditional could not be emitted, TRUE otherwise
+ */
+static boolean emit_frontface( struct svga_shader_emitter *emit )
+{
+   const struct src_register vface = get_vface( emit );
+   const struct src_register zero = get_zero_immediate( emit );
+   /* get_temp() is no good for this register: such temporaries are
+    * reclaimed after every instruction by reset_temp_regs(), so a
+    * hardware temp is reserved directly instead.
+    */
+   const SVGA3dShaderDestToken face_reg =
+      dst_register( SVGA3DREG_TEMP, emit->nr_hw_temp++ );
+   const unsigned pass_chan =
+      emit->key.fkey.front_ccw ? TGSI_SWIZZLE_X : TGSI_SWIZZLE_W;
+   const unsigned fail_chan =
+      emit->key.fkey.front_ccw ? TGSI_SWIZZLE_W : TGSI_SWIZZLE_X;
+   const struct src_register pass = scalar( zero, pass_chan );
+   const struct src_register fail = scalar( zero, fail_chan );
+
+   /* face_reg = (vface > 0) ? pass : fail */
+   if (!emit_conditional(emit, PIPE_FUNC_GREATER,
+                         face_reg, vface, scalar( zero, TGSI_SWIZZLE_X ),
+                         pass, fail))
+      return FALSE;
+
+   /* Point the front-face input at the register just written. */
+   emit->input_map[emit->internal_frontface_idx] = src(face_reg);
+
+   return TRUE;
+}
+
+/**
+ * Decide whether the shader needs the shared "zero" immediate (read
+ * elsewhere in this file through get_zero_immediate(), using its X
+ * channel as 0 and its W channel as 1).
+ *
+ * The immediate is required when:
+ *  - the shader is a fragment shader that is SM20, uses two-sided
+ *    lighting, the white-fragments workaround or a fake front-face
+ *    register, contains DST or LIT, or samples a texture with
+ *    R_TO_TEXTURE compare mode;
+ *  - any shader contains one of the opcodes listed below, whose
+ *    expansions read the immediate.
+ *
+ * \return TRUE if the immediate must be created, FALSE otherwise
+ */
+static INLINE boolean
+needs_to_create_zero( struct svga_shader_emitter *emit )
+{
+   int i;
+
+   if (emit->unit == PIPE_SHADER_FRAGMENT) {
+      if (!emit->use_sm30)
+         return TRUE;
+
+      if (emit->key.fkey.light_twoside)
+         return TRUE;
+
+      if (emit->key.fkey.white_fragments)
+         return TRUE;
+
+      if (emit->emit_frontface)
+         return TRUE;
+
+      if (emit->info.opcode_count[TGSI_OPCODE_DST] >= 1 ||
+          emit->info.opcode_count[TGSI_OPCODE_LIT] >= 1)
+         return TRUE;
+
+      /* The fragment key is only meaningful for fragment shaders, so
+       * the shadow-compare check belongs inside this branch.  Other
+       * shader units fill key.vkey instead (presumably key is a union
+       * -- confirm), and reading fkey for them would inspect
+       * unrelated data.
+       */
+      for (i = 0; i < emit->key.fkey.num_textures; i++) {
+         if (emit->key.fkey.tex[i].compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+            return TRUE;
+      }
+   }
+
+   if (emit->info.opcode_count[TGSI_OPCODE_IF] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_BGNLOOP] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_DDX] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_DDY] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SGE] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SGT] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SLE] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SLT] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SNE] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SEQ] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_EXP] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_LOG] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_XPD] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_KILP] >= 1)
+      return TRUE;
+
+   return FALSE;
+}
+
+/**
+ * The loop constant is needed as soon as the shader contains at least
+ * one BGNLOOP.
+ */
+static INLINE boolean
+needs_to_create_loop_const( struct svga_shader_emitter *emit )
+{
+   if (emit->info.opcode_count[TGSI_OPCODE_BGNLOOP] >= 1)
+      return TRUE;
+
+   return FALSE;
+}
+
+/**
+ * The sin/cos helper constants are needed only for non-SM30 shaders
+ * that contain at least one SIN, COS or SCS instruction.
+ */
+static INLINE boolean
+needs_to_create_sincos_consts( struct svga_shader_emitter *emit )
+{
+   if (emit->use_sm30)
+      return FALSE;
+
+   if (emit->info.opcode_count[TGSI_OPCODE_SIN] >= 1)
+      return TRUE;
+
+   if (emit->info.opcode_count[TGSI_OPCODE_COS] >= 1)
+      return TRUE;
+
+   if (emit->info.opcode_count[TGSI_OPCODE_SCS] >= 1)
+      return TRUE;
+
+   return FALSE;
+}
+
+/**
+ * ARL constants are needed when the pre-parse pass recorded at least
+ * one entry in emit->arl_consts[].
+ */
+static INLINE boolean
+needs_to_create_arl_consts( struct svga_shader_emitter *emit )
+{
+   if (emit->num_arl_consts > 0)
+      return TRUE;
+
+   return FALSE;
+}
+
+/**
+ * Record a negative register index used with indirect addressing.
+ *
+ * emit->arl_consts[] keeps one entry per ARL instruction number.  The
+ * entry for 'current_arl' is created on first use, and its 'number'
+ * field is lowered to 'num' whenever 'num' is smaller than the value
+ * stored there.
+ *
+ * NOTE(review): no bound check is made against the size of
+ * arl_consts[], and a new entry's 'number' is read before it is first
+ * written here -- presumably the emitter is zero-initialised; confirm.
+ *
+ * \param emit         shader emitter state
+ * \param num          the (negative) register index seen
+ * \param current_arl  number of the ARL instruction in effect
+ * \return always TRUE
+ */
+static INLINE boolean
+pre_parse_add_indirect( struct svga_shader_emitter *emit,
+                        int num, int current_arl)
+{
+   int slot = 0;
+   assert(num < 0);
+
+   /* Look for the entry belonging to this ARL. */
+   while (slot < emit->num_arl_consts &&
+          emit->arl_consts[slot].arl_num != current_arl)
+      ++slot;
+
+   /* Not found: claim the next free entry. */
+   if (slot == emit->num_arl_consts)
+      ++emit->num_arl_consts;
+
+   /* Keep the smallest index seen for this ARL. */
+   if (emit->arl_consts[slot].number > num)
+      emit->arl_consts[slot].number = num;
+
+   emit->arl_consts[slot].arl_num = current_arl;
+   return TRUE;
+}
+
+/**
+ * Scan the first three source operands of an instruction for indirect
+ * accesses through the address register that use a negative index, and
+ * record each one with pre_parse_add_indirect().
+ *
+ * \param emit         shader emitter state
+ * \param insn         the instruction to inspect
+ * \param current_arl  number of the ARL instruction in effect
+ * \return always TRUE
+ */
+static boolean
+pre_parse_instruction( struct svga_shader_emitter *emit,
+                       const struct tgsi_full_instruction *insn,
+                       int current_arl)
+{
+   unsigned operand;
+
+   for (operand = 0; operand < 3; operand++) {
+      const struct tgsi_full_src_register *reg = &insn->Src[operand];
+
+      /* Only indirect operands addressed via the ADDRESS file matter. */
+      if (!reg->Register.Indirect)
+         continue;
+
+      if (reg->Indirect.File != TGSI_FILE_ADDRESS)
+         continue;
+
+      if (reg->Register.Index < 0)
+         pre_parse_add_indirect(emit, reg->Register.Index, current_arl);
+   }
+
+   return TRUE;
+}
+
+/**
+ * Walk the whole token stream once before translation and collect the
+ * negative indirect register indices used after each ARL instruction.
+ *
+ * ARL instructions are counted as they are met; every instruction
+ * (including the ARL itself) is then handed to pre_parse_instruction()
+ * together with the current count.  Immediates and declarations are
+ * skipped.
+ *
+ * \param emit    shader emitter state
+ * \param tokens  the TGSI token stream
+ * \return FALSE if pre_parse_instruction() reports failure,
+ *         TRUE otherwise
+ */
+static boolean
+pre_parse_tokens( struct svga_shader_emitter *emit,
+                  const struct tgsi_token *tokens )
+{
+   struct tgsi_parse_context parse;
+   int current_arl = 0;
+   boolean ok = TRUE;
+
+   tgsi_parse_init( &parse, tokens );
+
+   while (ok && !tgsi_parse_end_of_tokens( &parse )) {
+      tgsi_parse_token( &parse );
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         break;
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (parse.FullToken.FullInstruction.Instruction.Opcode ==
+             TGSI_OPCODE_ARL) {
+            ++current_arl;
+         }
+         if (!pre_parse_instruction( emit, &parse.FullToken.FullInstruction,
+                                     current_arl ))
+            ok = FALSE;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* Pair tgsi_parse_init() with tgsi_parse_free() on every exit path,
+    * as svga_shader_emit_instructions() does for its own context.
+    */
+   tgsi_parse_free( &parse );
+
+   return ok;
+}
+
+/**
+ * Emit everything that must precede the first real instruction: the
+ * helper constants the shader needs and, for fragment shaders, the
+ * pixel shader preamble plus the optional two-sided lighting and
+ * front-face prologues.
+ *
+ * NOTE(review): the results of the create_*() calls are not checked
+ * here -- confirm whether they can fail.
+ *
+ * \return FALSE as soon as a fragment prologue step fails, TRUE otherwise
+ */
+static boolean svga_shader_emit_helpers( struct svga_shader_emitter *emit )
+
+{
+   /* Helper constants, each created only on demand. */
+   if (needs_to_create_zero( emit ))
+      create_zero_immediate( emit );
+
+   if (needs_to_create_loop_const( emit ))
+      create_loop_const( emit );
+
+   if (needs_to_create_sincos_consts( emit ))
+      create_sincos_consts( emit );
+
+   if (needs_to_create_arl_consts( emit ))
+      create_arl_consts( emit );
+
+   /* Everything below is fragment-shader only. */
+   if (emit->unit != PIPE_SHADER_FRAGMENT)
+      return TRUE;
+
+   if (!emit_ps_preamble( emit ))
+      return FALSE;
+
+   if (emit->key.fkey.light_twoside && !emit_light_twoside( emit ))
+      return FALSE;
+
+   if (emit->emit_frontface && !emit_frontface( emit ))
+      return FALSE;
+
+   return TRUE;
+}
+
+/**
+ * Translate a complete TGSI token stream into SVGA3D shader tokens.
+ *
+ * Steps, in order:
+ *  - vertex shaders first get their preamble constants;
+ *  - a pre-parse pass records indirect-addressing information;
+ *  - each token is translated: immediates become constant definitions,
+ *    declarations go to the SM20 or SM30 declaration translator, and
+ *    instructions go to svga_emit_instruction().  The helper constants
+ *    and prologues are emitted once, right before the first
+ *    instruction;
+ *  - the shader is closed with END, preceded by RET when the last code
+ *    emitted belongs to a subroutine.
+ *
+ * \param emit    shader emitter state
+ * \param tokens  the TGSI token stream to translate
+ * \return TRUE on success, FALSE as soon as any step fails
+ */
+boolean svga_shader_emit_instructions( struct svga_shader_emitter *emit,
+                                       const struct tgsi_token *tokens )
+{
+   struct tgsi_parse_context parse;
+   boolean ret = TRUE;
+   boolean helpers_emitted = FALSE;
+   unsigned line_nr = 0;
+
+   tgsi_parse_init( &parse, tokens );
+   emit->internal_imm_count = 0;
+
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      ret = emit_vs_preamble( emit );
+      if (!ret)
+         goto done;
+   }
+
+   ret = pre_parse_tokens(emit, tokens);
+   if (!ret)
+      goto done;
+
+   while (!tgsi_parse_end_of_tokens( &parse )) {
+      tgsi_parse_token( &parse );
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         ret = svga_emit_immediate( emit, &parse.FullToken.FullImmediate );
+         if (!ret)
+            goto done;
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (emit->use_sm30)
+            ret = svga_translate_decl_sm30( emit, &parse.FullToken.FullDeclaration );
+         else
+            ret = svga_translate_decl_sm20( emit, &parse.FullToken.FullDeclaration );
+         if (!ret)
+            goto done;
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (!helpers_emitted) {
+            /* Record the result in 'ret' so that a failure here is
+             * reported to the caller instead of leaving the function
+             * with the earlier TRUE.
+             */
+            ret = svga_shader_emit_helpers( emit );
+            if (!ret)
+               goto done;
+            helpers_emitted = TRUE;
+         }
+         /* line_nr counts instructions only; BGNSUB uses it to find
+          * the label registered by an earlier CAL.
+          */
+         ret = svga_emit_instruction( emit,
+                                      line_nr++,
+                                      &parse.FullToken.FullInstruction );
+         if (!ret)
+            goto done;
+         break;
+      default:
+         break;
+      }
+
+      /* Temporaries obtained with get_temp() live for one token only. */
+      reset_temp_regs( emit );
+   }
+
+   /* Need to terminate the current subroutine.  Note that the
+    * hardware doesn't tolerate shaders without sub-routines
+    * terminating with RET+END.
+    */
+   if (!emit->in_main_func) {
+      ret = emit_instruction( emit, inst_token( SVGA3DOP_RET ) );
+      if (!ret)
+         goto done;
+   }
+
+   assert(emit->dynamic_branching_level == 0);
+
+   /* Need to terminate the whole shader:
+    */
+   ret = emit_instruction( emit, inst_token( SVGA3DOP_END ) );
+   if (!ret)
+      goto done;
+
+done:
+   assert(ret);
+   tgsi_parse_free( &parse );
+   return ret;
+}
+
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
new file mode 100644
index 0000000000..a2dcc84f7d
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -0,0 +1,308 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * VMware SVGA specific winsys interface.
+ * 
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * 
+ * Documentation taken from the VMware SVGA DDK.
+ */
+
+#ifndef SVGA_WINSYS_H_
+#define SVGA_WINSYS_H_
+
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+
+
+struct svga_winsys_screen;
+struct svga_winsys_buffer;
+struct pipe_screen;
+struct pipe_context;
+struct pipe_fence_handle;
+struct pipe_resource;
+struct svga_region;
+struct winsys_handle;
+
+/* Bits for the `usage` bitmask passed to svga_winsys_screen::buffer_create. */
+#define SVGA_BUFFER_USAGE_PINNED  (1 << 0)
+#define SVGA_BUFFER_USAGE_WRAPPED (1 << 1)
+
+/* Bits for the `flags` bitmask of the surface_relocation/region_relocation callbacks. */
+#define SVGA_RELOC_WRITE 0x1
+#define SVGA_RELOC_READ  0x2
+
+
+
+/** Opaque surface handle */
+struct svga_winsys_surface;
+
+/** Opaque buffer handle */
+struct svga_winsys_handle;
+
+
+/**
+ * SVGA per-context winsys interface: a table of callbacks plus the context ID.
+ */
+struct svga_winsys_context
+{
+   void
+   (*destroy)(struct svga_winsys_context *swc);
+   /** NOTE(review): presumably reserves nr_bytes of command space and nr_relocs relocation slots -- confirm. */
+   void *       
+   (*reserve)(struct svga_winsys_context *swc, 
+	      uint32_t nr_bytes, uint32_t nr_relocs );
+   
+   /**
+    * Emit a relocation for a host surface.
+    *
+    * @param flags bitmask of SVGA_RELOC_* flags
+    *
+    * NOTE: The order of these calls matters. Relocations must be emitted
+    * in the same order as they appear in the command buffer.
+    */
+   void
+   (*surface_relocation)(struct svga_winsys_context *swc, 
+	                 uint32 *sid, 
+	                 struct svga_winsys_surface *surface,
+	                 unsigned flags);
+   
+   /**
+    * Emit a relocation for a guest memory region.
+    *
+    * @param flags bitmask of SVGA_RELOC_* flags
+    *
+    * NOTE: The order of these calls matters. Relocations must be emitted
+    * in the same order as they appear in the command buffer.
+    */
+   void
+   (*region_relocation)(struct svga_winsys_context *swc, 
+	                struct SVGAGuestPtr *ptr, 
+	                struct svga_winsys_buffer *buffer,
+	                uint32 offset,
+                        unsigned flags);
+   /** NOTE(review): presumably finalizes the space obtained from reserve() -- confirm. */
+   void
+   (*commit)(struct svga_winsys_context *swc);
+   /** NOTE(review): presumably submits queued commands; pfence looks like a fence out-parameter -- confirm. */
+   enum pipe_error
+   (*flush)(struct svga_winsys_context *swc, 
+	    struct pipe_fence_handle **pfence);
+
+   /**
+    * Context ID used to fill in the commands.
+    *
+    * Context IDs are arbitrary small non-negative integers,
+    * global to the entire SVGA device.
+    */
+   uint32 cid;
+};
+
+
+/**
+ * SVGA per-screen winsys interface: a table of callbacks supplied by the winsys.
+ */
+struct svga_winsys_screen
+{
+   void
+   (*destroy)(struct svga_winsys_screen *sws);
+   /** NOTE(review): presumably fills *result for capability `index` and returns TRUE on success -- confirm. */
+   boolean
+   (*get_cap)(struct svga_winsys_screen *sws,
+              SVGA3dDevCapIndex index,
+              SVGA3dDevCapResult *result);
+   
+   /**
+    * Create a new context.
+    *
+    * Context objects encapsulate all render state, and shader
+    * objects are per-context.
+    *
+    * Surfaces are not per-context. The same surface can be shared
+    * between multiple contexts, and surface operations can occur
+    * without a context.
+    */
+   struct svga_winsys_context *
+   (*context_create)(struct svga_winsys_screen *sws);
+   
+   
+   /**
+    * This creates a "surface" object in the SVGA3D device,
+    * and returns the surface ID (sid). Surfaces are generic
+    * containers for host VRAM objects like textures, vertex
+    * buffers, and depth/stencil buffers.
+    *
+    * Surfaces are hierarchical:
+    *
+    * - Surface may have multiple faces (for cube maps)
+    *
+    * - Each face has a list of mipmap levels
+    *
+    * - Each mipmap image may have multiple volume
+    *   slices, if the image is three dimensional.
+    *
+    * - Each slice is a 2D array of 'blocks'
+    *
+    * - Each block may be one or more pixels.
+    *   (Usually 1, more for DXT or YUV formats.)
+    *
+    * Surfaces are generic host VRAM objects. The SVGA3D device
+    * may optimize surfaces according to the format they were
+    * created with, but this format does not limit the ways in
+    * which the surface may be used. For example, a depth surface
+    * can be used as a texture, or a floating point image may
+    * be used as a vertex buffer. Some surface usages may be
+    * lower performance, due to software emulation, but any
+    * usage should work with any surface.
+    */
+   struct svga_winsys_surface *
+   (*surface_create)(struct svga_winsys_screen *sws,
+                     SVGA3dSurfaceFlags flags,
+                     SVGA3dSurfaceFormat format,
+                     SVGA3dSize size,
+                     uint32 numFaces,
+                     uint32 numMipLevels);
+
+   /**
+    * Creates a surface from a winsys handle.
+    * Used to implement pipe_screen::resource_from_handle.
+    */
+   struct svga_winsys_surface *
+   (*surface_from_handle)(struct svga_winsys_screen *sws,
+                          struct winsys_handle *whandle,
+                          SVGA3dSurfaceFormat *format);
+
+   /**
+    * Get a winsys_handle from a surface.
+    * Used to implement pipe_screen::resource_get_handle.
+    */
+   boolean
+   (*surface_get_handle)(struct svga_winsys_screen *sws,
+                         struct svga_winsys_surface *surface,
+                         unsigned stride,
+                         struct winsys_handle *whandle);
+
+   /**
+    * Whether this surface is sitting in a validate list.
+    * NOTE(review): the name suggests TRUE means "already flushed" -- confirm polarity. */
+   boolean
+   (*surface_is_flushed)(struct svga_winsys_screen *sws,
+                         struct svga_winsys_surface *surface);
+
+   /**
+    * Reference a SVGA3D surface object. This allows sharing of a
+    * surface between different objects.
+    */
+   void 
+   (*surface_reference)(struct svga_winsys_screen *sws,
+			struct svga_winsys_surface **pdst,
+			struct svga_winsys_surface *src);
+
+   /**
+    * Buffer management. Buffer attributes are mostly fixed over its lifetime.
+    *
+    * @param usage bitmask of SVGA_BUFFER_USAGE_* flags.
+    *
+    * alignment indicates the client's alignment requirements, e.g. for
+    * SSE instructions.
+    */
+   struct svga_winsys_buffer *
+   (*buffer_create)( struct svga_winsys_screen *sws, 
+	             unsigned alignment, 
+	             unsigned usage,
+	             unsigned size );
+
+   /**
+    * Map the entire data store of a buffer object into the client's address space.
+    * flags is a bitmask of PIPE_TRANSFER_* values.
+    */
+   void *
+   (*buffer_map)( struct svga_winsys_screen *sws, 
+	          struct svga_winsys_buffer *buf,
+		  unsigned flags );
+   /** NOTE(review): presumably undoes a prior buffer_map on buf -- confirm. */
+   void 
+   (*buffer_unmap)( struct svga_winsys_screen *sws, 
+                    struct svga_winsys_buffer *buf );
+   /** NOTE(review): presumably releases a buffer obtained from buffer_create -- confirm. */
+   void 
+   (*buffer_destroy)( struct svga_winsys_screen *sws,
+	              struct svga_winsys_buffer *buf );
+
+
+   /**
+    * Reference a fence object.
+    */
+   void
+   (*fence_reference)( struct svga_winsys_screen *sws,
+                       struct pipe_fence_handle **pdst,
+                       struct pipe_fence_handle *src );
+
+   /**
+    * Checks whether the fence has been signalled.
+    * \param flag  driver-specific meaning
+    * \return zero on success.
+    */
+   int (*fence_signalled)( struct svga_winsys_screen *sws,
+                           struct pipe_fence_handle *fence,
+                           unsigned flag );
+
+   /**
+    * Wait for the fence to finish.
+    * \param flag  driver-specific meaning
+    * \return zero on success.
+    */
+   int (*fence_finish)( struct svga_winsys_screen *sws,
+                        struct pipe_fence_handle *fence,
+                        unsigned flag );
+
+};
+
+/* Driver entry point: builds a pipe_screen from a winsys screen (defined outside this header). */
+struct pipe_screen *
+svga_screen_create(struct svga_winsys_screen *sws);
+/* NOTE(review): presumably returns the winsys screen backing `screen` -- confirm. */
+struct svga_winsys_screen *
+svga_winsys_screen(struct pipe_screen *screen);
+/* NOTE(review): presumably returns the winsys context backing `context` -- confirm. */
+struct svga_winsys_context *
+svga_winsys_context(struct pipe_context *context);
+/* NOTE(review): presumably wraps an existing winsys surface as a pipe_resource -- confirm. */
+struct pipe_resource *
+svga_screen_buffer_wrap_surface(struct pipe_screen *screen,
+				enum SVGA3dSurfaceFormat format,
+				struct svga_winsys_surface *srf);
+/* NOTE(review): presumably returns the winsys surface behind a buffer resource -- confirm. */
+struct svga_winsys_surface *
+svga_screen_buffer_get_winsys_surface(struct pipe_resource *buffer);
+
+#endif /* SVGA_WINSYS_H_ */
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.c b/src/gallium/drivers/svga/svgadump/svga_dump.c
new file mode 100644
index 0000000000..d59fb89a58
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.c
@@ -0,0 +1,1784 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Dump SVGA commands.
+ *
+ * Generated automatically from svga3d_reg.h by svga_dump.py.
+ */
+
+#include "svga_types.h"
+#include "svga_shader_dump.h"
+#include "svga3d_reg.h"
+
+#include "util/u_debug.h"
+#include "svga_dump.h"
+
+/* Print an SVGA3dVertexDecl via _debug_printf; known enum values by name, others as integers. */
+static void dump_SVGA3dVertexDecl(const SVGA3dVertexDecl *cmd)
+{
+   switch (cmd->identity.type) {
+   case SVGA3D_DECLTYPE_FLOAT1:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT1\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT2:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT2\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT3:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT3\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT4:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT4\n");
+      break;
+   case SVGA3D_DECLTYPE_D3DCOLOR:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_D3DCOLOR\n");
+      break;
+   case SVGA3D_DECLTYPE_UBYTE4:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT2:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT4:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4\n");
+      break;
+   case SVGA3D_DECLTYPE_UBYTE4N:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4N\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT2N:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2N\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT4N:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4N\n");
+      break;
+   case SVGA3D_DECLTYPE_USHORT2N:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT2N\n");
+      break;
+   case SVGA3D_DECLTYPE_USHORT4N:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT4N\n");
+      break;
+   case SVGA3D_DECLTYPE_UDEC3:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UDEC3\n");
+      break;
+   case SVGA3D_DECLTYPE_DEC3N:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_DEC3N\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT16_2:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_2\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT16_4:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_4\n");
+      break;
+   case SVGA3D_DECLTYPE_MAX:
+      _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.identity.type = %i\n", cmd->identity.type);
+      break;
+   }
+   switch (cmd->identity.method) {
+   case SVGA3D_DECLMETHOD_DEFAULT:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_DEFAULT\n");
+      break;
+   case SVGA3D_DECLMETHOD_PARTIALU:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALU\n");
+      break;
+   case SVGA3D_DECLMETHOD_PARTIALV:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALV\n");
+      break;
+   case SVGA3D_DECLMETHOD_CROSSUV:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_CROSSUV\n");
+      break;
+   case SVGA3D_DECLMETHOD_UV:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_UV\n");
+      break;
+   case SVGA3D_DECLMETHOD_LOOKUP:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUP\n");
+      break;
+   case SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED:
+      _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED\n");
+      break;
+   default:
+      _debug_printf("\t\t.identity.method = %i\n", cmd->identity.method);
+      break;
+   }
+   switch (cmd->identity.usage) {
+   case SVGA3D_DECLUSAGE_POSITION:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITION\n");
+      break;
+   case SVGA3D_DECLUSAGE_BLENDWEIGHT:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDWEIGHT\n");
+      break;
+   case SVGA3D_DECLUSAGE_BLENDINDICES:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDINDICES\n");
+      break;
+   case SVGA3D_DECLUSAGE_NORMAL:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_NORMAL\n");
+      break;
+   case SVGA3D_DECLUSAGE_PSIZE:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_PSIZE\n");
+      break;
+   case SVGA3D_DECLUSAGE_TEXCOORD:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TEXCOORD\n");
+      break;
+   case SVGA3D_DECLUSAGE_TANGENT:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TANGENT\n");
+      break;
+   case SVGA3D_DECLUSAGE_BINORMAL:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BINORMAL\n");
+      break;
+   case SVGA3D_DECLUSAGE_TESSFACTOR:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TESSFACTOR\n");
+      break;
+   case SVGA3D_DECLUSAGE_POSITIONT:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITIONT\n");
+      break;
+   case SVGA3D_DECLUSAGE_COLOR:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_COLOR\n");
+      break;
+   case SVGA3D_DECLUSAGE_FOG:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_FOG\n");
+      break;
+   case SVGA3D_DECLUSAGE_DEPTH:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_DEPTH\n");
+      break;
+   case SVGA3D_DECLUSAGE_SAMPLE:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_SAMPLE\n");
+      break;
+   case SVGA3D_DECLUSAGE_MAX:
+      _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.identity.usage = %i\n", cmd->identity.usage);
+      break;
+   }
+   _debug_printf("\t\t.identity.usageIndex = %u\n", cmd->identity.usageIndex);
+   _debug_printf("\t\t.array.surfaceId = %u\n", cmd->array.surfaceId);
+   _debug_printf("\t\t.array.offset = %u\n", cmd->array.offset);
+   _debug_printf("\t\t.array.stride = %u\n", cmd->array.stride);
+   _debug_printf("\t\t.rangeHint.first = %u\n", cmd->rangeHint.first);
+   _debug_printf("\t\t.rangeHint.last = %u\n", cmd->rangeHint.last);
+}
+
+/* Print an SVGA3dTextureState: stage, state name (symbolic when known), then value as %u and %f. */
+static void dump_SVGA3dTextureState(const SVGA3dTextureState *cmd)
+{
+   _debug_printf("\t\t.stage = %u\n", cmd->stage);
+   switch (cmd->name) {
+   case SVGA3D_TS_INVALID:
+      _debug_printf("\t\t.name = SVGA3D_TS_INVALID\n");
+      break;
+   case SVGA3D_TS_BIND_TEXTURE:
+      _debug_printf("\t\t.name = SVGA3D_TS_BIND_TEXTURE\n");
+      break;
+   case SVGA3D_TS_COLOROP:
+      _debug_printf("\t\t.name = SVGA3D_TS_COLOROP\n");
+      break;
+   case SVGA3D_TS_COLORARG1:
+      _debug_printf("\t\t.name = SVGA3D_TS_COLORARG1\n");
+      break;
+   case SVGA3D_TS_COLORARG2:
+      _debug_printf("\t\t.name = SVGA3D_TS_COLORARG2\n");
+      break;
+   case SVGA3D_TS_ALPHAOP:
+      _debug_printf("\t\t.name = SVGA3D_TS_ALPHAOP\n");
+      break;
+   case SVGA3D_TS_ALPHAARG1:
+      _debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG1\n");
+      break;
+   case SVGA3D_TS_ALPHAARG2:
+      _debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG2\n");
+      break;
+   case SVGA3D_TS_ADDRESSU:
+      _debug_printf("\t\t.name = SVGA3D_TS_ADDRESSU\n");
+      break;
+   case SVGA3D_TS_ADDRESSV:
+      _debug_printf("\t\t.name = SVGA3D_TS_ADDRESSV\n");
+      break;
+   case SVGA3D_TS_MIPFILTER:
+      _debug_printf("\t\t.name = SVGA3D_TS_MIPFILTER\n");
+      break;
+   case SVGA3D_TS_MAGFILTER:
+      _debug_printf("\t\t.name = SVGA3D_TS_MAGFILTER\n");
+      break;
+   case SVGA3D_TS_MINFILTER:
+      _debug_printf("\t\t.name = SVGA3D_TS_MINFILTER\n");
+      break;
+   case SVGA3D_TS_BORDERCOLOR:
+      _debug_printf("\t\t.name = SVGA3D_TS_BORDERCOLOR\n");
+      break;
+   case SVGA3D_TS_TEXCOORDINDEX:
+      _debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDINDEX\n");
+      break;
+   case SVGA3D_TS_TEXTURETRANSFORMFLAGS:
+      _debug_printf("\t\t.name = SVGA3D_TS_TEXTURETRANSFORMFLAGS\n");
+      break;
+   case SVGA3D_TS_TEXCOORDGEN:
+      _debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDGEN\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT00:
+      _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT00\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT01:
+      _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT01\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT10:
+      _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT10\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT11:
+      _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT11\n");
+      break;
+   case SVGA3D_TS_TEXTURE_MIPMAP_LEVEL:
+      _debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_MIPMAP_LEVEL\n");
+      break;
+   case SVGA3D_TS_TEXTURE_LOD_BIAS:
+      _debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_LOD_BIAS\n");
+      break;
+   case SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL:
+      _debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL\n");
+      break;
+   case SVGA3D_TS_ADDRESSW:
+      _debug_printf("\t\t.name = SVGA3D_TS_ADDRESSW\n");
+      break;
+   case SVGA3D_TS_GAMMA:
+      _debug_printf("\t\t.name = SVGA3D_TS_GAMMA\n");
+      break;
+   case SVGA3D_TS_BUMPENVLSCALE:
+      _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLSCALE\n");
+      break;
+   case SVGA3D_TS_BUMPENVLOFFSET:
+      _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLOFFSET\n");
+      break;
+   case SVGA3D_TS_COLORARG0:
+      _debug_printf("\t\t.name = SVGA3D_TS_COLORARG0\n");
+      break;
+   case SVGA3D_TS_ALPHAARG0:
+      _debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG0\n");
+      break;
+   case SVGA3D_TS_MAX:
+      _debug_printf("\t\t.name = SVGA3D_TS_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.name = %i\n", cmd->name);
+      break;
+   }
+   _debug_printf("\t\t.value = %u\n", cmd->value);
+   _debug_printf("\t\t.floatValue = %f\n", cmd->floatValue);
+}
+
+/* Print the nine fields of an SVGA3dCopyBox (x/y/z, w/h/d, srcx/srcy/srcz) as unsigned values. */
+static void dump_SVGA3dCopyBox(const SVGA3dCopyBox *cmd)
+{
+   _debug_printf("\t\t.x = %u\n", cmd->x);
+   _debug_printf("\t\t.y = %u\n", cmd->y);
+   _debug_printf("\t\t.z = %u\n", cmd->z);
+   _debug_printf("\t\t.w = %u\n", cmd->w);
+   _debug_printf("\t\t.h = %u\n", cmd->h);
+   _debug_printf("\t\t.d = %u\n", cmd->d);
+   _debug_printf("\t\t.srcx = %u\n", cmd->srcx);
+   _debug_printf("\t\t.srcy = %u\n", cmd->srcy);
+   _debug_printf("\t\t.srcz = %u\n", cmd->srcz);
+}
+
+/* Print a SetClipPlane command: context id, plane index and the four plane coefficients. */
+static void dump_SVGA3dCmdSetClipPlane(const SVGA3dCmdSetClipPlane *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.index = %u\n", cmd->index);
+   _debug_printf("\t\t.plane[0] = %f\n", cmd->plane[0]);
+   _debug_printf("\t\t.plane[1] = %f\n", cmd->plane[1]);
+   _debug_printf("\t\t.plane[2] = %f\n", cmd->plane[2]);
+   _debug_printf("\t\t.plane[3] = %f\n", cmd->plane[3]);
+}
+
+/* Print a WaitForQuery command: context id, query type and the guestResult gmrId/offset. */
+static void dump_SVGA3dCmdWaitForQuery(const SVGA3dCmdWaitForQuery *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_QUERYTYPE_OCCLUSION:
+      _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+      break;
+   case SVGA3D_QUERYTYPE_MAX:
+      _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   _debug_printf("\t\t.guestResult.gmrId = %u\n", cmd->guestResult.gmrId);
+   _debug_printf("\t\t.guestResult.offset = %u\n", cmd->guestResult.offset);
+}
+
+/* Print a SetRenderTarget command; any type other than DEPTH/STENCIL is shown as SVGA3D_RT_COLOR<n>. */
+static void dump_SVGA3dCmdSetRenderTarget(const SVGA3dCmdSetRenderTarget *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_RT_DEPTH:
+      _debug_printf("\t\t.type = SVGA3D_RT_DEPTH\n");
+      break;
+   case SVGA3D_RT_STENCIL:
+      _debug_printf("\t\t.type = SVGA3D_RT_STENCIL\n");
+      break;
+   default:
+      _debug_printf("\t\t.type = SVGA3D_RT_COLOR%u\n", cmd->type - SVGA3D_RT_COLOR0);
+      break;
+   }
+   _debug_printf("\t\t.target.sid = %u\n", cmd->target.sid);
+   _debug_printf("\t\t.target.face = %u\n", cmd->target.face);
+   _debug_printf("\t\t.target.mipmap = %u\n", cmd->target.mipmap);
+}
+
+/* Print the header of a SetTextureState command; only the context id is dumped here. */
+static void dump_SVGA3dCmdSetTextureState(const SVGA3dCmdSetTextureState *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/* Print a SurfaceCopy command: sid/face/mipmap of the source image, then of the destination. */
+static void dump_SVGA3dCmdSurfaceCopy(const SVGA3dCmdSurfaceCopy *cmd)
+{
+   _debug_printf("\t\t.src.sid = %u\n", cmd->src.sid);
+   _debug_printf("\t\t.src.face = %u\n", cmd->src.face);
+   _debug_printf("\t\t.src.mipmap = %u\n", cmd->src.mipmap);
+   _debug_printf("\t\t.dest.sid = %u\n", cmd->dest.sid);
+   _debug_printf("\t\t.dest.face = %u\n", cmd->dest.face);
+   _debug_printf("\t\t.dest.mipmap = %u\n", cmd->dest.mipmap);
+}
+
+/* Print a SetMaterial command: context id, face (symbolic when known) and all material floats. */
+static void dump_SVGA3dCmdSetMaterial(const SVGA3dCmdSetMaterial *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->face) {
+   case SVGA3D_FACE_INVALID:
+      _debug_printf("\t\t.face = SVGA3D_FACE_INVALID\n");
+      break;
+   case SVGA3D_FACE_NONE:
+      _debug_printf("\t\t.face = SVGA3D_FACE_NONE\n");
+      break;
+   case SVGA3D_FACE_FRONT:
+      _debug_printf("\t\t.face = SVGA3D_FACE_FRONT\n");
+      break;
+   case SVGA3D_FACE_BACK:
+      _debug_printf("\t\t.face = SVGA3D_FACE_BACK\n");
+      break;
+   case SVGA3D_FACE_FRONT_BACK:
+      _debug_printf("\t\t.face = SVGA3D_FACE_FRONT_BACK\n");
+      break;
+   case SVGA3D_FACE_MAX:
+      _debug_printf("\t\t.face = SVGA3D_FACE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.face = %i\n", cmd->face);
+      break;
+   }
+   _debug_printf("\t\t.material.diffuse[0] = %f\n", cmd->material.diffuse[0]);
+   _debug_printf("\t\t.material.diffuse[1] = %f\n", cmd->material.diffuse[1]);
+   _debug_printf("\t\t.material.diffuse[2] = %f\n", cmd->material.diffuse[2]);
+   _debug_printf("\t\t.material.diffuse[3] = %f\n", cmd->material.diffuse[3]);
+   _debug_printf("\t\t.material.ambient[0] = %f\n", cmd->material.ambient[0]);
+   _debug_printf("\t\t.material.ambient[1] = %f\n", cmd->material.ambient[1]);
+   _debug_printf("\t\t.material.ambient[2] = %f\n", cmd->material.ambient[2]);
+   _debug_printf("\t\t.material.ambient[3] = %f\n", cmd->material.ambient[3]);
+   _debug_printf("\t\t.material.specular[0] = %f\n", cmd->material.specular[0]);
+   _debug_printf("\t\t.material.specular[1] = %f\n", cmd->material.specular[1]);
+   _debug_printf("\t\t.material.specular[2] = %f\n", cmd->material.specular[2]);
+   _debug_printf("\t\t.material.specular[3] = %f\n", cmd->material.specular[3]);
+   _debug_printf("\t\t.material.emissive[0] = %f\n", cmd->material.emissive[0]);
+   _debug_printf("\t\t.material.emissive[1] = %f\n", cmd->material.emissive[1]);
+   _debug_printf("\t\t.material.emissive[2] = %f\n", cmd->material.emissive[2]);
+   _debug_printf("\t\t.material.emissive[3] = %f\n", cmd->material.emissive[3]);
+   _debug_printf("\t\t.material.shininess = %f\n", cmd->material.shininess);
+}
+
+/* Print a SetLightData command: context id, light index, light type and every field of .data. */
+static void dump_SVGA3dCmdSetLightData(const SVGA3dCmdSetLightData *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.index = %u\n", cmd->index);
+   switch (cmd->data.type) {
+   case SVGA3D_LIGHTTYPE_INVALID:
+      _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_INVALID\n");
+      break;
+   case SVGA3D_LIGHTTYPE_POINT:
+      _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_POINT\n");
+      break;
+   case SVGA3D_LIGHTTYPE_SPOT1:
+      _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT1\n");
+      break;
+   case SVGA3D_LIGHTTYPE_SPOT2:
+      _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT2\n");
+      break;
+   case SVGA3D_LIGHTTYPE_DIRECTIONAL:
+      _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_DIRECTIONAL\n");
+      break;
+   case SVGA3D_LIGHTTYPE_MAX:
+      _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.data.type = %i\n", cmd->data.type);
+      break;
+   }
+   _debug_printf("\t\t.data.inWorldSpace = %u\n", cmd->data.inWorldSpace);
+   _debug_printf("\t\t.data.diffuse[0] = %f\n", cmd->data.diffuse[0]);
+   _debug_printf("\t\t.data.diffuse[1] = %f\n", cmd->data.diffuse[1]);
+   _debug_printf("\t\t.data.diffuse[2] = %f\n", cmd->data.diffuse[2]);
+   _debug_printf("\t\t.data.diffuse[3] = %f\n", cmd->data.diffuse[3]);
+   _debug_printf("\t\t.data.specular[0] = %f\n", cmd->data.specular[0]);
+   _debug_printf("\t\t.data.specular[1] = %f\n", cmd->data.specular[1]);
+   _debug_printf("\t\t.data.specular[2] = %f\n", cmd->data.specular[2]);
+   _debug_printf("\t\t.data.specular[3] = %f\n", cmd->data.specular[3]);
+   _debug_printf("\t\t.data.ambient[0] = %f\n", cmd->data.ambient[0]);
+   _debug_printf("\t\t.data.ambient[1] = %f\n", cmd->data.ambient[1]);
+   _debug_printf("\t\t.data.ambient[2] = %f\n", cmd->data.ambient[2]);
+   _debug_printf("\t\t.data.ambient[3] = %f\n", cmd->data.ambient[3]);
+   _debug_printf("\t\t.data.position[0] = %f\n", cmd->data.position[0]);
+   _debug_printf("\t\t.data.position[1] = %f\n", cmd->data.position[1]);
+   _debug_printf("\t\t.data.position[2] = %f\n", cmd->data.position[2]);
+   _debug_printf("\t\t.data.position[3] = %f\n", cmd->data.position[3]);
+   _debug_printf("\t\t.data.direction[0] = %f\n", cmd->data.direction[0]);
+   _debug_printf("\t\t.data.direction[1] = %f\n", cmd->data.direction[1]);
+   _debug_printf("\t\t.data.direction[2] = %f\n", cmd->data.direction[2]);
+   _debug_printf("\t\t.data.direction[3] = %f\n", cmd->data.direction[3]);
+   _debug_printf("\t\t.data.range = %f\n", cmd->data.range);
+   _debug_printf("\t\t.data.falloff = %f\n", cmd->data.falloff);
+   _debug_printf("\t\t.data.attenuation0 = %f\n", cmd->data.attenuation0);
+   _debug_printf("\t\t.data.attenuation1 = %f\n", cmd->data.attenuation1);
+   _debug_printf("\t\t.data.attenuation2 = %f\n", cmd->data.attenuation2);
+   _debug_printf("\t\t.data.theta = %f\n", cmd->data.theta);
+   _debug_printf("\t\t.data.phi = %f\n", cmd->data.phi);
+}
+
+/* Print a SetViewport command: context id and the x/y/w/h of its rectangle. */
+static void dump_SVGA3dCmdSetViewport(const SVGA3dCmdSetViewport *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.rect.x = %u\n", cmd->rect.x);
+   _debug_printf("\t\t.rect.y = %u\n", cmd->rect.y);
+   _debug_printf("\t\t.rect.w = %u\n", cmd->rect.w);
+   _debug_printf("\t\t.rect.h = %u\n", cmd->rect.h);
+}
+
+/* Print a SetScissorRect command: context id and the x/y/w/h of its rectangle. */
+static void dump_SVGA3dCmdSetScissorRect(const SVGA3dCmdSetScissorRect *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.rect.x = %u\n", cmd->rect.x);
+   _debug_printf("\t\t.rect.y = %u\n", cmd->rect.y);
+   _debug_printf("\t\t.rect.w = %u\n", cmd->rect.w);
+   _debug_printf("\t\t.rect.h = %u\n", cmd->rect.h);
+}
+
+/* Print the six fields of an SVGA3dCopyRect (x, y, w, h, srcx, srcy) as unsigned values. */
+static void dump_SVGA3dCopyRect(const SVGA3dCopyRect *cmd)
+{
+   _debug_printf("\t\t.x = %u\n", cmd->x);
+   _debug_printf("\t\t.y = %u\n", cmd->y);
+   _debug_printf("\t\t.w = %u\n", cmd->w);
+   _debug_printf("\t\t.h = %u\n", cmd->h);
+   _debug_printf("\t\t.srcx = %u\n", cmd->srcx);
+   _debug_printf("\t\t.srcy = %u\n", cmd->srcy);
+}
+
+/* Print a SetShader command: context id, shader type (symbolic when known) and shader id. */
+static void dump_SVGA3dCmdSetShader(const SVGA3dCmdSetShader *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_SHADERTYPE_COMPILED_DX8:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+      break;
+   case SVGA3D_SHADERTYPE_VS:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+      break;
+   case SVGA3D_SHADERTYPE_PS:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+      break;
+   case SVGA3D_SHADERTYPE_MAX:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   _debug_printf("\t\t.shid = %u\n", cmd->shid);
+}
+
+/* Print an EndQuery command: context id, query type and the guestResult gmrId/offset. */
+static void dump_SVGA3dCmdEndQuery(const SVGA3dCmdEndQuery *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_QUERYTYPE_OCCLUSION:
+      _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+      break;
+   case SVGA3D_QUERYTYPE_MAX:
+      _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   _debug_printf("\t\t.guestResult.gmrId = %u\n", cmd->guestResult.gmrId);
+   _debug_printf("\t\t.guestResult.offset = %u\n", cmd->guestResult.offset);
+}
+
+/* Print the width, height and depth of an SVGA3dSize as unsigned values. */
+static void dump_SVGA3dSize(const SVGA3dSize *cmd)
+{
+   _debug_printf("\t\t.width = %u\n", cmd->width);
+   _debug_printf("\t\t.height = %u\n", cmd->height);
+   _debug_printf("\t\t.depth = %u\n", cmd->depth);
+}
+
+/* Print a DestroySurface command; its only dumped field is the surface id. */
+static void dump_SVGA3dCmdDestroySurface(const SVGA3dCmdDestroySurface *cmd)
+{
+   _debug_printf("\t\t.sid = %u\n", cmd->sid);
+}
+
+/* Print a DefineContext command; its only dumped field is the context id. */
+static void dump_SVGA3dCmdDefineContext(const SVGA3dCmdDefineContext *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/* Print the x, y, w and h of an SVGA3dRect as unsigned values. */
+static void dump_SVGA3dRect(const SVGA3dRect *cmd)
+{
+   _debug_printf("\t\t.x = %u\n", cmd->x);
+   _debug_printf("\t\t.y = %u\n", cmd->y);
+   _debug_printf("\t\t.w = %u\n", cmd->w);
+   _debug_printf("\t\t.h = %u\n", cmd->h);
+}
+
+/**
+ * Print the members of an SVGA3dCmdBeginQuery command body.
+ *
+ * The query type is printed by name when it equals one of the
+ * SVGA3D_QUERYTYPE_* constants tested below, numerically otherwise.
+ */
+static void
+dump_SVGA3dCmdBeginQuery(const SVGA3dCmdBeginQuery *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+
+   if (cmd->type == SVGA3D_QUERYTYPE_OCCLUSION)
+      _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+   else if (cmd->type == SVGA3D_QUERYTYPE_MAX)
+      _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+   else
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+}
+
+/**
+ * Print one SVGA3dRenderState entry.
+ *
+ * The state identifier is looked up in a table of the SVGA3D_RS_* constants
+ * and printed by name; a value not present in the table is printed
+ * numerically.  The value is then printed twice, once through the
+ * uintValue member and once through the floatValue member.
+ */
+static void
+dump_SVGA3dRenderState(const SVGA3dRenderState *cmd)
+{
+   /* One entry per recognised state: the constant and the exact line to emit. */
+   static const struct {
+      int value;
+      const char *line;
+   } known_states[] = {
+      { SVGA3D_RS_INVALID, "\t\t.state = SVGA3D_RS_INVALID\n" },
+      { SVGA3D_RS_ZENABLE, "\t\t.state = SVGA3D_RS_ZENABLE\n" },
+      { SVGA3D_RS_ZWRITEENABLE, "\t\t.state = SVGA3D_RS_ZWRITEENABLE\n" },
+      { SVGA3D_RS_ALPHATESTENABLE, "\t\t.state = SVGA3D_RS_ALPHATESTENABLE\n" },
+      { SVGA3D_RS_DITHERENABLE, "\t\t.state = SVGA3D_RS_DITHERENABLE\n" },
+      { SVGA3D_RS_BLENDENABLE, "\t\t.state = SVGA3D_RS_BLENDENABLE\n" },
+      { SVGA3D_RS_FOGENABLE, "\t\t.state = SVGA3D_RS_FOGENABLE\n" },
+      { SVGA3D_RS_SPECULARENABLE, "\t\t.state = SVGA3D_RS_SPECULARENABLE\n" },
+      { SVGA3D_RS_STENCILENABLE, "\t\t.state = SVGA3D_RS_STENCILENABLE\n" },
+      { SVGA3D_RS_LIGHTINGENABLE, "\t\t.state = SVGA3D_RS_LIGHTINGENABLE\n" },
+      { SVGA3D_RS_NORMALIZENORMALS, "\t\t.state = SVGA3D_RS_NORMALIZENORMALS\n" },
+      { SVGA3D_RS_POINTSPRITEENABLE, "\t\t.state = SVGA3D_RS_POINTSPRITEENABLE\n" },
+      { SVGA3D_RS_POINTSCALEENABLE, "\t\t.state = SVGA3D_RS_POINTSCALEENABLE\n" },
+      { SVGA3D_RS_STENCILREF, "\t\t.state = SVGA3D_RS_STENCILREF\n" },
+      { SVGA3D_RS_STENCILMASK, "\t\t.state = SVGA3D_RS_STENCILMASK\n" },
+      { SVGA3D_RS_STENCILWRITEMASK, "\t\t.state = SVGA3D_RS_STENCILWRITEMASK\n" },
+      { SVGA3D_RS_FOGSTART, "\t\t.state = SVGA3D_RS_FOGSTART\n" },
+      { SVGA3D_RS_FOGEND, "\t\t.state = SVGA3D_RS_FOGEND\n" },
+      { SVGA3D_RS_FOGDENSITY, "\t\t.state = SVGA3D_RS_FOGDENSITY\n" },
+      { SVGA3D_RS_POINTSIZE, "\t\t.state = SVGA3D_RS_POINTSIZE\n" },
+      { SVGA3D_RS_POINTSIZEMIN, "\t\t.state = SVGA3D_RS_POINTSIZEMIN\n" },
+      { SVGA3D_RS_POINTSIZEMAX, "\t\t.state = SVGA3D_RS_POINTSIZEMAX\n" },
+      { SVGA3D_RS_POINTSCALE_A, "\t\t.state = SVGA3D_RS_POINTSCALE_A\n" },
+      { SVGA3D_RS_POINTSCALE_B, "\t\t.state = SVGA3D_RS_POINTSCALE_B\n" },
+      { SVGA3D_RS_POINTSCALE_C, "\t\t.state = SVGA3D_RS_POINTSCALE_C\n" },
+      { SVGA3D_RS_FOGCOLOR, "\t\t.state = SVGA3D_RS_FOGCOLOR\n" },
+      { SVGA3D_RS_AMBIENT, "\t\t.state = SVGA3D_RS_AMBIENT\n" },
+      { SVGA3D_RS_CLIPPLANEENABLE, "\t\t.state = SVGA3D_RS_CLIPPLANEENABLE\n" },
+      { SVGA3D_RS_FOGMODE, "\t\t.state = SVGA3D_RS_FOGMODE\n" },
+      { SVGA3D_RS_FILLMODE, "\t\t.state = SVGA3D_RS_FILLMODE\n" },
+      { SVGA3D_RS_SHADEMODE, "\t\t.state = SVGA3D_RS_SHADEMODE\n" },
+      { SVGA3D_RS_LINEPATTERN, "\t\t.state = SVGA3D_RS_LINEPATTERN\n" },
+      { SVGA3D_RS_SRCBLEND, "\t\t.state = SVGA3D_RS_SRCBLEND\n" },
+      { SVGA3D_RS_DSTBLEND, "\t\t.state = SVGA3D_RS_DSTBLEND\n" },
+      { SVGA3D_RS_BLENDEQUATION, "\t\t.state = SVGA3D_RS_BLENDEQUATION\n" },
+      { SVGA3D_RS_CULLMODE, "\t\t.state = SVGA3D_RS_CULLMODE\n" },
+      { SVGA3D_RS_ZFUNC, "\t\t.state = SVGA3D_RS_ZFUNC\n" },
+      { SVGA3D_RS_ALPHAFUNC, "\t\t.state = SVGA3D_RS_ALPHAFUNC\n" },
+      { SVGA3D_RS_STENCILFUNC, "\t\t.state = SVGA3D_RS_STENCILFUNC\n" },
+      { SVGA3D_RS_STENCILFAIL, "\t\t.state = SVGA3D_RS_STENCILFAIL\n" },
+      { SVGA3D_RS_STENCILZFAIL, "\t\t.state = SVGA3D_RS_STENCILZFAIL\n" },
+      { SVGA3D_RS_STENCILPASS, "\t\t.state = SVGA3D_RS_STENCILPASS\n" },
+      { SVGA3D_RS_ALPHAREF, "\t\t.state = SVGA3D_RS_ALPHAREF\n" },
+      { SVGA3D_RS_FRONTWINDING, "\t\t.state = SVGA3D_RS_FRONTWINDING\n" },
+      { SVGA3D_RS_COORDINATETYPE, "\t\t.state = SVGA3D_RS_COORDINATETYPE\n" },
+      { SVGA3D_RS_ZBIAS, "\t\t.state = SVGA3D_RS_ZBIAS\n" },
+      { SVGA3D_RS_RANGEFOGENABLE, "\t\t.state = SVGA3D_RS_RANGEFOGENABLE\n" },
+      { SVGA3D_RS_COLORWRITEENABLE, "\t\t.state = SVGA3D_RS_COLORWRITEENABLE\n" },
+      { SVGA3D_RS_VERTEXMATERIALENABLE, "\t\t.state = SVGA3D_RS_VERTEXMATERIALENABLE\n" },
+      { SVGA3D_RS_DIFFUSEMATERIALSOURCE, "\t\t.state = SVGA3D_RS_DIFFUSEMATERIALSOURCE\n" },
+      { SVGA3D_RS_SPECULARMATERIALSOURCE, "\t\t.state = SVGA3D_RS_SPECULARMATERIALSOURCE\n" },
+      { SVGA3D_RS_AMBIENTMATERIALSOURCE, "\t\t.state = SVGA3D_RS_AMBIENTMATERIALSOURCE\n" },
+      { SVGA3D_RS_EMISSIVEMATERIALSOURCE, "\t\t.state = SVGA3D_RS_EMISSIVEMATERIALSOURCE\n" },
+      { SVGA3D_RS_TEXTUREFACTOR, "\t\t.state = SVGA3D_RS_TEXTUREFACTOR\n" },
+      { SVGA3D_RS_LOCALVIEWER, "\t\t.state = SVGA3D_RS_LOCALVIEWER\n" },
+      { SVGA3D_RS_SCISSORTESTENABLE, "\t\t.state = SVGA3D_RS_SCISSORTESTENABLE\n" },
+      { SVGA3D_RS_BLENDCOLOR, "\t\t.state = SVGA3D_RS_BLENDCOLOR\n" },
+      { SVGA3D_RS_STENCILENABLE2SIDED, "\t\t.state = SVGA3D_RS_STENCILENABLE2SIDED\n" },
+      { SVGA3D_RS_CCWSTENCILFUNC, "\t\t.state = SVGA3D_RS_CCWSTENCILFUNC\n" },
+      { SVGA3D_RS_CCWSTENCILFAIL, "\t\t.state = SVGA3D_RS_CCWSTENCILFAIL\n" },
+      { SVGA3D_RS_CCWSTENCILZFAIL, "\t\t.state = SVGA3D_RS_CCWSTENCILZFAIL\n" },
+      { SVGA3D_RS_CCWSTENCILPASS, "\t\t.state = SVGA3D_RS_CCWSTENCILPASS\n" },
+      { SVGA3D_RS_VERTEXBLEND, "\t\t.state = SVGA3D_RS_VERTEXBLEND\n" },
+      { SVGA3D_RS_SLOPESCALEDEPTHBIAS, "\t\t.state = SVGA3D_RS_SLOPESCALEDEPTHBIAS\n" },
+      { SVGA3D_RS_DEPTHBIAS, "\t\t.state = SVGA3D_RS_DEPTHBIAS\n" },
+      { SVGA3D_RS_OUTPUTGAMMA, "\t\t.state = SVGA3D_RS_OUTPUTGAMMA\n" },
+      { SVGA3D_RS_ZVISIBLE, "\t\t.state = SVGA3D_RS_ZVISIBLE\n" },
+      { SVGA3D_RS_LASTPIXEL, "\t\t.state = SVGA3D_RS_LASTPIXEL\n" },
+      { SVGA3D_RS_CLIPPING, "\t\t.state = SVGA3D_RS_CLIPPING\n" },
+      { SVGA3D_RS_WRAP0, "\t\t.state = SVGA3D_RS_WRAP0\n" },
+      { SVGA3D_RS_WRAP1, "\t\t.state = SVGA3D_RS_WRAP1\n" },
+      { SVGA3D_RS_WRAP2, "\t\t.state = SVGA3D_RS_WRAP2\n" },
+      { SVGA3D_RS_WRAP3, "\t\t.state = SVGA3D_RS_WRAP3\n" },
+      { SVGA3D_RS_WRAP4, "\t\t.state = SVGA3D_RS_WRAP4\n" },
+      { SVGA3D_RS_WRAP5, "\t\t.state = SVGA3D_RS_WRAP5\n" },
+      { SVGA3D_RS_WRAP6, "\t\t.state = SVGA3D_RS_WRAP6\n" },
+      { SVGA3D_RS_WRAP7, "\t\t.state = SVGA3D_RS_WRAP7\n" },
+      { SVGA3D_RS_WRAP8, "\t\t.state = SVGA3D_RS_WRAP8\n" },
+      { SVGA3D_RS_WRAP9, "\t\t.state = SVGA3D_RS_WRAP9\n" },
+      { SVGA3D_RS_WRAP10, "\t\t.state = SVGA3D_RS_WRAP10\n" },
+      { SVGA3D_RS_WRAP11, "\t\t.state = SVGA3D_RS_WRAP11\n" },
+      { SVGA3D_RS_WRAP12, "\t\t.state = SVGA3D_RS_WRAP12\n" },
+      { SVGA3D_RS_WRAP13, "\t\t.state = SVGA3D_RS_WRAP13\n" },
+      { SVGA3D_RS_WRAP14, "\t\t.state = SVGA3D_RS_WRAP14\n" },
+      { SVGA3D_RS_WRAP15, "\t\t.state = SVGA3D_RS_WRAP15\n" },
+      { SVGA3D_RS_MULTISAMPLEANTIALIAS, "\t\t.state = SVGA3D_RS_MULTISAMPLEANTIALIAS\n" },
+      { SVGA3D_RS_MULTISAMPLEMASK, "\t\t.state = SVGA3D_RS_MULTISAMPLEMASK\n" },
+      { SVGA3D_RS_INDEXEDVERTEXBLENDENABLE, "\t\t.state = SVGA3D_RS_INDEXEDVERTEXBLENDENABLE\n" },
+      { SVGA3D_RS_TWEENFACTOR, "\t\t.state = SVGA3D_RS_TWEENFACTOR\n" },
+      { SVGA3D_RS_ANTIALIASEDLINEENABLE, "\t\t.state = SVGA3D_RS_ANTIALIASEDLINEENABLE\n" },
+      { SVGA3D_RS_COLORWRITEENABLE1, "\t\t.state = SVGA3D_RS_COLORWRITEENABLE1\n" },
+      { SVGA3D_RS_COLORWRITEENABLE2, "\t\t.state = SVGA3D_RS_COLORWRITEENABLE2\n" },
+      { SVGA3D_RS_COLORWRITEENABLE3, "\t\t.state = SVGA3D_RS_COLORWRITEENABLE3\n" },
+      { SVGA3D_RS_SEPARATEALPHABLENDENABLE, "\t\t.state = SVGA3D_RS_SEPARATEALPHABLENDENABLE\n" },
+      { SVGA3D_RS_SRCBLENDALPHA, "\t\t.state = SVGA3D_RS_SRCBLENDALPHA\n" },
+      { SVGA3D_RS_DSTBLENDALPHA, "\t\t.state = SVGA3D_RS_DSTBLENDALPHA\n" },
+      { SVGA3D_RS_BLENDEQUATIONALPHA, "\t\t.state = SVGA3D_RS_BLENDEQUATIONALPHA\n" },
+      { SVGA3D_RS_MAX, "\t\t.state = SVGA3D_RS_MAX\n" }
+   };
+   const unsigned count = sizeof(known_states) / sizeof(known_states[0]);
+   unsigned i = 0;
+
+   /* Linear search; stops at the first (and only) matching constant. */
+   while (i < count && cmd->state != known_states[i].value)
+      ++i;
+
+   if (i < count)
+      _debug_printf("%s", known_states[i].line);
+   else
+      _debug_printf("\t\t.state = %i\n", cmd->state);
+
+   _debug_printf("\t\t.uintValue = %u\n", cmd->uintValue);
+   _debug_printf("\t\t.floatValue = %f\n", cmd->floatValue);
+}
+
+/**
+ * Print the value, count, indexedData and instanceData members of an
+ * SVGA3dVertexDivisor.
+ */
+static void
+dump_SVGA3dVertexDivisor(const SVGA3dVertexDivisor *cmd)
+{
+   _debug_printf("\t\t.value = %u\n", cmd->value);
+   _debug_printf("\t\t.count = %u\n", cmd->count);
+   _debug_printf("\t\t.indexedData = %u\n", cmd->indexedData);
+   _debug_printf("\t\t.instanceData = %u\n", cmd->instanceData);
+}
+
+/**
+ * Print the members of an SVGA3dCmdDefineShader command body.
+ *
+ * The shader type is printed by name when it equals one of the
+ * SVGA3D_SHADERTYPE_* constants tested below, numerically otherwise.
+ */
+static void
+dump_SVGA3dCmdDefineShader(const SVGA3dCmdDefineShader *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.shid = %u\n", cmd->shid);
+
+   if (cmd->type == SVGA3D_SHADERTYPE_COMPILED_DX8)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+   else if (cmd->type == SVGA3D_SHADERTYPE_VS)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+   else if (cmd->type == SVGA3D_SHADERTYPE_PS)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+   else if (cmd->type == SVGA3D_SHADERTYPE_MAX)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+   else
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+}
+
+/**
+ * Print the members of an SVGA3dCmdSetShaderConst command body.
+ *
+ * The shader type and constant type are printed by name when they equal
+ * one of the constants tested below, numerically otherwise.  The four
+ * entries of values[] are printed as floats when ctype is
+ * SVGA3D_CONST_TYPE_FLOAT and as unsigned integers for every other ctype.
+ *
+ * The float interpretation goes through a union rather than a
+ * `*(const float *)&values[i]` pointer cast: reading an integer object
+ * through a float lvalue violates the C aliasing rules and may be
+ * miscompiled when strict aliasing is enabled.
+ */
+static void
+dump_SVGA3dCmdSetShaderConst(const SVGA3dCmdSetShaderConst *cmd)
+{
+   unsigned i;
+
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.reg = %u\n", cmd->reg);
+
+   switch (cmd->type) {
+   case SVGA3D_SHADERTYPE_COMPILED_DX8:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+      break;
+   case SVGA3D_SHADERTYPE_VS:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+      break;
+   case SVGA3D_SHADERTYPE_PS:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+      break;
+   case SVGA3D_SHADERTYPE_MAX:
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+      break;
+   default:
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+
+   switch (cmd->ctype) {
+   case SVGA3D_CONST_TYPE_FLOAT:
+      _debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_FLOAT\n");
+      break;
+   case SVGA3D_CONST_TYPE_INT:
+      _debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_INT\n");
+      break;
+   case SVGA3D_CONST_TYPE_BOOL:
+      _debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_BOOL\n");
+      break;
+   default:
+      _debug_printf("\t\t.ctype = %i\n", cmd->ctype);
+      break;
+   }
+
+   /* The values always follow the ctype line, in index order 0..3. */
+   for (i = 0; i < 4; ++i) {
+      if (cmd->ctype == SVGA3D_CONST_TYPE_FLOAT) {
+         /*
+          * Reinterpret the stored word as a float without aliasing.
+          * NOTE(review): assumes values[] elements are 32-bit words, as
+          * the %u format and the previous float cast implied -- confirm
+          * against the SVGA3dCmdSetShaderConst declaration.
+          */
+         union {
+            uint32_t bits;
+            float value;
+         } word;
+
+         word.bits = cmd->values[i];
+         _debug_printf("\t\t.values[%u] = %f\n", i, word.value);
+      } else {
+         _debug_printf("\t\t.values[%u] = %u\n", i, cmd->values[i]);
+      }
+   }
+}
+
+/**
+ * Print the context id and the zRange min/max members of an
+ * SVGA3dCmdSetZRange command body.
+ */
+static void
+dump_SVGA3dCmdSetZRange(const SVGA3dCmdSetZRange *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.zRange.min = %f\n", cmd->zRange.min);
+   _debug_printf("\t\t.zRange.max = %f\n", cmd->zRange.max);
+}
+
+/**
+ * Print the cid, numVertexDecls and numRanges members of an
+ * SVGA3dCmdDrawPrimitives command body.
+ */
+static void
+dump_SVGA3dCmdDrawPrimitives(const SVGA3dCmdDrawPrimitives *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.numVertexDecls = %u\n", cmd->numVertexDecls);
+   _debug_printf("\t\t.numRanges = %u\n", cmd->numRanges);
+}
+
+/**
+ * Print the cid, index and enabled members of an SVGA3dCmdSetLightEnabled
+ * command body.
+ */
+static void
+dump_SVGA3dCmdSetLightEnabled(const SVGA3dCmdSetLightEnabled *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.index = %u\n", cmd->index);
+   _debug_printf("\t\t.enabled = %u\n", cmd->enabled);
+}
+
+/**
+ * Print one SVGA3dPrimitiveRange.
+ *
+ * The primitive type is looked up in a table of the SVGA3D_PRIMITIVE_*
+ * constants and printed by name; a value not present in the table is
+ * printed numerically.  The remaining members are printed as numbers.
+ */
+static void
+dump_SVGA3dPrimitiveRange(const SVGA3dPrimitiveRange *cmd)
+{
+   /* One entry per recognised type: the constant and the exact line to emit. */
+   static const struct {
+      int value;
+      const char *line;
+   } known_prims[] = {
+      { SVGA3D_PRIMITIVE_INVALID, "\t\t.primType = SVGA3D_PRIMITIVE_INVALID\n" },
+      { SVGA3D_PRIMITIVE_TRIANGLELIST, "\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLELIST\n" },
+      { SVGA3D_PRIMITIVE_POINTLIST, "\t\t.primType = SVGA3D_PRIMITIVE_POINTLIST\n" },
+      { SVGA3D_PRIMITIVE_LINELIST, "\t\t.primType = SVGA3D_PRIMITIVE_LINELIST\n" },
+      { SVGA3D_PRIMITIVE_LINESTRIP, "\t\t.primType = SVGA3D_PRIMITIVE_LINESTRIP\n" },
+      { SVGA3D_PRIMITIVE_TRIANGLESTRIP, "\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLESTRIP\n" },
+      { SVGA3D_PRIMITIVE_TRIANGLEFAN, "\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLEFAN\n" },
+      { SVGA3D_PRIMITIVE_MAX, "\t\t.primType = SVGA3D_PRIMITIVE_MAX\n" }
+   };
+   const unsigned count = sizeof(known_prims) / sizeof(known_prims[0]);
+   unsigned i = 0;
+
+   while (i < count && cmd->primType != known_prims[i].value)
+      ++i;
+
+   if (i < count)
+      _debug_printf("%s", known_prims[i].line);
+   else
+      _debug_printf("\t\t.primType = %i\n", cmd->primType);
+
+   _debug_printf("\t\t.primitiveCount = %u\n", cmd->primitiveCount);
+   _debug_printf("\t\t.indexArray.surfaceId = %u\n", cmd->indexArray.surfaceId);
+   _debug_printf("\t\t.indexArray.offset = %u\n", cmd->indexArray.offset);
+   _debug_printf("\t\t.indexArray.stride = %u\n", cmd->indexArray.stride);
+   _debug_printf("\t\t.indexWidth = %u\n", cmd->indexWidth);
+   _debug_printf("\t\t.indexBias = %i\n", cmd->indexBias);
+}
+
+/**
+ * Print the surface id carried by an SVGA3dCmdPresent command body.
+ */
+static void
+dump_SVGA3dCmdPresent(const SVGA3dCmdPresent *cmd)
+{
+   _debug_printf("\t\t.sid = %u\n", cmd->sid);
+}
+
+/**
+ * Print the context id carried by an SVGA3dCmdSetRenderState command body.
+ */
+static void
+dump_SVGA3dCmdSetRenderState(const SVGA3dCmdSetRenderState *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/**
+ * Print the members of an SVGA3dCmdSurfaceStretchBlt command body: the
+ * src and dest image ids, the boxSrc and boxDest boxes, and the mode.
+ *
+ * The mode is printed by name when it equals one of the
+ * SVGA3D_STRETCH_BLT_* constants tested below, numerically otherwise.
+ */
+static void
+dump_SVGA3dCmdSurfaceStretchBlt(const SVGA3dCmdSurfaceStretchBlt *cmd)
+{
+   _debug_printf("\t\t.src.sid = %u\n", cmd->src.sid);
+   _debug_printf("\t\t.src.face = %u\n", cmd->src.face);
+   _debug_printf("\t\t.src.mipmap = %u\n", cmd->src.mipmap);
+
+   _debug_printf("\t\t.dest.sid = %u\n", cmd->dest.sid);
+   _debug_printf("\t\t.dest.face = %u\n", cmd->dest.face);
+   _debug_printf("\t\t.dest.mipmap = %u\n", cmd->dest.mipmap);
+
+   _debug_printf("\t\t.boxSrc.x = %u\n", cmd->boxSrc.x);
+   _debug_printf("\t\t.boxSrc.y = %u\n", cmd->boxSrc.y);
+   _debug_printf("\t\t.boxSrc.z = %u\n", cmd->boxSrc.z);
+   _debug_printf("\t\t.boxSrc.w = %u\n", cmd->boxSrc.w);
+   _debug_printf("\t\t.boxSrc.h = %u\n", cmd->boxSrc.h);
+   _debug_printf("\t\t.boxSrc.d = %u\n", cmd->boxSrc.d);
+
+   _debug_printf("\t\t.boxDest.x = %u\n", cmd->boxDest.x);
+   _debug_printf("\t\t.boxDest.y = %u\n", cmd->boxDest.y);
+   _debug_printf("\t\t.boxDest.z = %u\n", cmd->boxDest.z);
+   _debug_printf("\t\t.boxDest.w = %u\n", cmd->boxDest.w);
+   _debug_printf("\t\t.boxDest.h = %u\n", cmd->boxDest.h);
+   _debug_printf("\t\t.boxDest.d = %u\n", cmd->boxDest.d);
+
+   if (cmd->mode == SVGA3D_STRETCH_BLT_POINT)
+      _debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_POINT\n");
+   else if (cmd->mode == SVGA3D_STRETCH_BLT_LINEAR)
+      _debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_LINEAR\n");
+   else if (cmd->mode == SVGA3D_STRETCH_BLT_MAX)
+      _debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_MAX\n");
+   else
+      _debug_printf("\t\t.mode = %i\n", cmd->mode);
+}
+
+/**
+ * Print the members of an SVGA3dCmdSurfaceDMA command body: the guest
+ * pointer and pitch, the host image id, and the transfer direction.
+ *
+ * The transfer value is printed by name when it equals
+ * SVGA3D_WRITE_HOST_VRAM or SVGA3D_READ_HOST_VRAM, numerically otherwise.
+ */
+static void
+dump_SVGA3dCmdSurfaceDMA(const SVGA3dCmdSurfaceDMA *cmd)
+{
+   _debug_printf("\t\t.guest.ptr.gmrId = %u\n", cmd->guest.ptr.gmrId);
+   _debug_printf("\t\t.guest.ptr.offset = %u\n", cmd->guest.ptr.offset);
+   _debug_printf("\t\t.guest.pitch = %u\n", cmd->guest.pitch);
+
+   _debug_printf("\t\t.host.sid = %u\n", cmd->host.sid);
+   _debug_printf("\t\t.host.face = %u\n", cmd->host.face);
+   _debug_printf("\t\t.host.mipmap = %u\n", cmd->host.mipmap);
+
+   if (cmd->transfer == SVGA3D_WRITE_HOST_VRAM)
+      _debug_printf("\t\t.transfer = SVGA3D_WRITE_HOST_VRAM\n");
+   else if (cmd->transfer == SVGA3D_READ_HOST_VRAM)
+      _debug_printf("\t\t.transfer = SVGA3D_READ_HOST_VRAM\n");
+   else
+      _debug_printf("\t\t.transfer = %i\n", cmd->transfer);
+}
+
+/**
+ * Print the suffixSize, maximumOffset and the discard/unsynchronized flag
+ * members of an SVGA3dCmdSurfaceDMASuffix.
+ */
+static void
+dump_SVGA3dCmdSurfaceDMASuffix(const SVGA3dCmdSurfaceDMASuffix *cmd)
+{
+   _debug_printf("\t\t.suffixSize = %u\n", cmd->suffixSize);
+   _debug_printf("\t\t.maximumOffset = %u\n", cmd->maximumOffset);
+   _debug_printf("\t\t.flags.discard = %u\n", cmd->flags.discard);
+   _debug_printf("\t\t.flags.unsynchronized = %u\n", cmd->flags.unsynchronized);
+}
+
+/**
+ * Print the members of an SVGA3dCmdSetTransform command body.
+ *
+ * The transform type is looked up in a table of the SVGA3D_TRANSFORM_*
+ * constants and printed by name; a value not present in the table is
+ * printed numerically.  The sixteen matrix entries follow, one per line.
+ */
+static void
+dump_SVGA3dCmdSetTransform(const SVGA3dCmdSetTransform *cmd)
+{
+   /* One entry per recognised type: the constant and the exact line to emit. */
+   static const struct {
+      int value;
+      const char *line;
+   } known_transforms[] = {
+      { SVGA3D_TRANSFORM_INVALID, "\t\t.type = SVGA3D_TRANSFORM_INVALID\n" },
+      { SVGA3D_TRANSFORM_WORLD, "\t\t.type = SVGA3D_TRANSFORM_WORLD\n" },
+      { SVGA3D_TRANSFORM_VIEW, "\t\t.type = SVGA3D_TRANSFORM_VIEW\n" },
+      { SVGA3D_TRANSFORM_PROJECTION, "\t\t.type = SVGA3D_TRANSFORM_PROJECTION\n" },
+      { SVGA3D_TRANSFORM_TEXTURE0, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE0\n" },
+      { SVGA3D_TRANSFORM_TEXTURE1, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE1\n" },
+      { SVGA3D_TRANSFORM_TEXTURE2, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE2\n" },
+      { SVGA3D_TRANSFORM_TEXTURE3, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE3\n" },
+      { SVGA3D_TRANSFORM_TEXTURE4, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE4\n" },
+      { SVGA3D_TRANSFORM_TEXTURE5, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE5\n" },
+      { SVGA3D_TRANSFORM_TEXTURE6, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE6\n" },
+      { SVGA3D_TRANSFORM_TEXTURE7, "\t\t.type = SVGA3D_TRANSFORM_TEXTURE7\n" },
+      { SVGA3D_TRANSFORM_WORLD1, "\t\t.type = SVGA3D_TRANSFORM_WORLD1\n" },
+      { SVGA3D_TRANSFORM_WORLD2, "\t\t.type = SVGA3D_TRANSFORM_WORLD2\n" },
+      { SVGA3D_TRANSFORM_WORLD3, "\t\t.type = SVGA3D_TRANSFORM_WORLD3\n" },
+      { SVGA3D_TRANSFORM_MAX, "\t\t.type = SVGA3D_TRANSFORM_MAX\n" }
+   };
+   const unsigned count = sizeof(known_transforms) / sizeof(known_transforms[0]);
+   unsigned i = 0;
+
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+
+   while (i < count && cmd->type != known_transforms[i].value)
+      ++i;
+
+   if (i < count)
+      _debug_printf("%s", known_transforms[i].line);
+   else
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+
+   _debug_printf("\t\t.matrix[0] = %f\n", cmd->matrix[0]);
+   _debug_printf("\t\t.matrix[1] = %f\n", cmd->matrix[1]);
+   _debug_printf("\t\t.matrix[2] = %f\n", cmd->matrix[2]);
+   _debug_printf("\t\t.matrix[3] = %f\n", cmd->matrix[3]);
+   _debug_printf("\t\t.matrix[4] = %f\n", cmd->matrix[4]);
+   _debug_printf("\t\t.matrix[5] = %f\n", cmd->matrix[5]);
+   _debug_printf("\t\t.matrix[6] = %f\n", cmd->matrix[6]);
+   _debug_printf("\t\t.matrix[7] = %f\n", cmd->matrix[7]);
+   _debug_printf("\t\t.matrix[8] = %f\n", cmd->matrix[8]);
+   _debug_printf("\t\t.matrix[9] = %f\n", cmd->matrix[9]);
+   _debug_printf("\t\t.matrix[10] = %f\n", cmd->matrix[10]);
+   _debug_printf("\t\t.matrix[11] = %f\n", cmd->matrix[11]);
+   _debug_printf("\t\t.matrix[12] = %f\n", cmd->matrix[12]);
+   _debug_printf("\t\t.matrix[13] = %f\n", cmd->matrix[13]);
+   _debug_printf("\t\t.matrix[14] = %f\n", cmd->matrix[14]);
+   _debug_printf("\t\t.matrix[15] = %f\n", cmd->matrix[15]);
+}
+
+/**
+ * Print the members of an SVGA3dCmdDestroyShader command body.
+ *
+ * The shader type is printed by name when it equals one of the
+ * SVGA3D_SHADERTYPE_* constants tested below, numerically otherwise.
+ */
+static void
+dump_SVGA3dCmdDestroyShader(const SVGA3dCmdDestroyShader *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+   _debug_printf("\t\t.shid = %u\n", cmd->shid);
+
+   if (cmd->type == SVGA3D_SHADERTYPE_COMPILED_DX8)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+   else if (cmd->type == SVGA3D_SHADERTYPE_VS)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+   else if (cmd->type == SVGA3D_SHADERTYPE_PS)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+   else if (cmd->type == SVGA3D_SHADERTYPE_MAX)
+      _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+   else
+      _debug_printf("\t\t.type = %i\n", cmd->type);
+}
+
+/**
+ * Print the context id carried by an SVGA3dCmdDestroyContext command body.
+ */
+static void
+dump_SVGA3dCmdDestroyContext(const SVGA3dCmdDestroyContext *cmd)
+{
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/**
+ * Print the members of an SVGA3dCmdClear command body.
+ *
+ * clearFlag is decoded as a bit mask: every SVGA3D_CLEAR_* bit that is set
+ * is printed by name, joined with " | ", and any remaining unnamed bits
+ * are appended in hexadecimal.  A switch on the whole value only
+ * recognised single-flag clears, so a combined clear such as
+ * COLOR | DEPTH was printed as a bare integer.
+ *
+ * Output is unchanged for a single named flag, and for a value with none
+ * of the named bits set (still printed numerically with %i).
+ *
+ * NOTE(review): relies on the SVGA3D_CLEAR_* constants being distinct
+ * single-bit values, as in the SVGA3D register header -- confirm.
+ */
+static void
+dump_SVGA3dCmdClear(const SVGA3dCmdClear *cmd)
+{
+   static const struct {
+      unsigned bit;
+      const char *name;
+   } clear_bits[] = {
+      { SVGA3D_CLEAR_COLOR, "SVGA3D_CLEAR_COLOR" },
+      { SVGA3D_CLEAR_DEPTH, "SVGA3D_CLEAR_DEPTH" },
+      { SVGA3D_CLEAR_STENCIL, "SVGA3D_CLEAR_STENCIL" }
+   };
+   const unsigned count = sizeof(clear_bits) / sizeof(clear_bits[0]);
+   const unsigned flags = (unsigned) cmd->clearFlag;
+   unsigned known = 0;
+   unsigned i;
+
+   _debug_printf("\t\t.cid = %u\n", cmd->cid);
+
+   for (i = 0; i < count; ++i)
+      known |= clear_bits[i].bit;
+
+   if ((flags & known) == 0) {
+      /* Nothing nameable: keep the plain numeric form. */
+      _debug_printf("\t\t.clearFlag = %i\n", cmd->clearFlag);
+   } else {
+      const char *sep = "";
+
+      /* The line is assembled from several calls and ended explicitly. */
+      _debug_printf("\t\t.clearFlag = ");
+      for (i = 0; i < count; ++i) {
+         if (flags & clear_bits[i].bit) {
+            _debug_printf("%s%s", sep, clear_bits[i].name);
+            sep = " | ";
+         }
+      }
+      if (flags & ~known)
+         _debug_printf(" | 0x%x", flags & ~known);
+      _debug_printf("\n");
+   }
+
+   _debug_printf("\t\t.color = %u\n", cmd->color);
+   _debug_printf("\t\t.depth = %f\n", cmd->depth);
+   _debug_printf("\t\t.stencil = %u\n", cmd->stencil);
+}
+
+/**
+ * Print the members of an SVGA3dCmdDefineSurface command body: the
+ * surface id, the surface flags, the format and the numMipLevels of
+ * face[0] through face[5].
+ *
+ * surfaceFlags is decoded as a bit mask: every SVGA3D_SURFACE_* bit that
+ * is set is printed by name, joined with " | ", and any remaining unnamed
+ * bits are appended in hexadecimal.  A switch on the whole value only
+ * recognised surfaces with exactly one flag set, so any combination was
+ * printed as a bare integer.  Output is unchanged for a single named
+ * flag, and for a value with none of the named bits set (still printed
+ * numerically with %i).
+ *
+ * The format is looked up in a table of the SVGA3D_* format constants and
+ * printed by name; a value not present in the table is printed
+ * numerically.
+ *
+ * NOTE(review): relies on the SVGA3D_SURFACE_* constants being distinct
+ * single-bit values, as in the SVGA3D register header -- confirm.
+ */
+static void
+dump_SVGA3dCmdDefineSurface(const SVGA3dCmdDefineSurface *cmd)
+{
+   static const struct {
+      unsigned bit;
+      const char *name;
+   } surface_bits[] = {
+      { SVGA3D_SURFACE_CUBEMAP, "SVGA3D_SURFACE_CUBEMAP" },
+      { SVGA3D_SURFACE_HINT_STATIC, "SVGA3D_SURFACE_HINT_STATIC" },
+      { SVGA3D_SURFACE_HINT_DYNAMIC, "SVGA3D_SURFACE_HINT_DYNAMIC" },
+      { SVGA3D_SURFACE_HINT_INDEXBUFFER, "SVGA3D_SURFACE_HINT_INDEXBUFFER" },
+      { SVGA3D_SURFACE_HINT_VERTEXBUFFER, "SVGA3D_SURFACE_HINT_VERTEXBUFFER" }
+   };
+   /* One entry per recognised format: the constant and the exact line to emit. */
+   static const struct {
+      int value;
+      const char *line;
+   } known_formats[] = {
+      { SVGA3D_FORMAT_INVALID, "\t\t.format = SVGA3D_FORMAT_INVALID\n" },
+      { SVGA3D_X8R8G8B8, "\t\t.format = SVGA3D_X8R8G8B8\n" },
+      { SVGA3D_A8R8G8B8, "\t\t.format = SVGA3D_A8R8G8B8\n" },
+      { SVGA3D_R5G6B5, "\t\t.format = SVGA3D_R5G6B5\n" },
+      { SVGA3D_X1R5G5B5, "\t\t.format = SVGA3D_X1R5G5B5\n" },
+      { SVGA3D_A1R5G5B5, "\t\t.format = SVGA3D_A1R5G5B5\n" },
+      { SVGA3D_A4R4G4B4, "\t\t.format = SVGA3D_A4R4G4B4\n" },
+      { SVGA3D_Z_D32, "\t\t.format = SVGA3D_Z_D32\n" },
+      { SVGA3D_Z_D16, "\t\t.format = SVGA3D_Z_D16\n" },
+      { SVGA3D_Z_D24S8, "\t\t.format = SVGA3D_Z_D24S8\n" },
+      { SVGA3D_Z_D15S1, "\t\t.format = SVGA3D_Z_D15S1\n" },
+      { SVGA3D_LUMINANCE8, "\t\t.format = SVGA3D_LUMINANCE8\n" },
+      { SVGA3D_LUMINANCE4_ALPHA4, "\t\t.format = SVGA3D_LUMINANCE4_ALPHA4\n" },
+      { SVGA3D_LUMINANCE16, "\t\t.format = SVGA3D_LUMINANCE16\n" },
+      { SVGA3D_LUMINANCE8_ALPHA8, "\t\t.format = SVGA3D_LUMINANCE8_ALPHA8\n" },
+      { SVGA3D_DXT1, "\t\t.format = SVGA3D_DXT1\n" },
+      { SVGA3D_DXT2, "\t\t.format = SVGA3D_DXT2\n" },
+      { SVGA3D_DXT3, "\t\t.format = SVGA3D_DXT3\n" },
+      { SVGA3D_DXT4, "\t\t.format = SVGA3D_DXT4\n" },
+      { SVGA3D_DXT5, "\t\t.format = SVGA3D_DXT5\n" },
+      { SVGA3D_BUMPU8V8, "\t\t.format = SVGA3D_BUMPU8V8\n" },
+      { SVGA3D_BUMPL6V5U5, "\t\t.format = SVGA3D_BUMPL6V5U5\n" },
+      { SVGA3D_BUMPX8L8V8U8, "\t\t.format = SVGA3D_BUMPX8L8V8U8\n" },
+      { SVGA3D_BUMPL8V8U8, "\t\t.format = SVGA3D_BUMPL8V8U8\n" },
+      { SVGA3D_ARGB_S10E5, "\t\t.format = SVGA3D_ARGB_S10E5\n" },
+      { SVGA3D_ARGB_S23E8, "\t\t.format = SVGA3D_ARGB_S23E8\n" },
+      { SVGA3D_A2R10G10B10, "\t\t.format = SVGA3D_A2R10G10B10\n" },
+      { SVGA3D_V8U8, "\t\t.format = SVGA3D_V8U8\n" },
+      { SVGA3D_Q8W8V8U8, "\t\t.format = SVGA3D_Q8W8V8U8\n" },
+      { SVGA3D_CxV8U8, "\t\t.format = SVGA3D_CxV8U8\n" },
+      { SVGA3D_X8L8V8U8, "\t\t.format = SVGA3D_X8L8V8U8\n" },
+      { SVGA3D_A2W10V10U10, "\t\t.format = SVGA3D_A2W10V10U10\n" },
+      { SVGA3D_ALPHA8, "\t\t.format = SVGA3D_ALPHA8\n" },
+      { SVGA3D_R_S10E5, "\t\t.format = SVGA3D_R_S10E5\n" },
+      { SVGA3D_R_S23E8, "\t\t.format = SVGA3D_R_S23E8\n" },
+      { SVGA3D_RG_S10E5, "\t\t.format = SVGA3D_RG_S10E5\n" },
+      { SVGA3D_RG_S23E8, "\t\t.format = SVGA3D_RG_S23E8\n" },
+      { SVGA3D_BUFFER, "\t\t.format = SVGA3D_BUFFER\n" },
+      { SVGA3D_Z_D24X8, "\t\t.format = SVGA3D_Z_D24X8\n" },
+      { SVGA3D_FORMAT_MAX, "\t\t.format = SVGA3D_FORMAT_MAX\n" }
+   };
+   const unsigned num_bits = sizeof(surface_bits) / sizeof(surface_bits[0]);
+   const unsigned num_formats = sizeof(known_formats) / sizeof(known_formats[0]);
+   const unsigned flags = (unsigned) cmd->surfaceFlags;
+   unsigned known = 0;
+   unsigned i;
+
+   _debug_printf("\t\t.sid = %u\n", cmd->sid);
+
+   for (i = 0; i < num_bits; ++i)
+      known |= surface_bits[i].bit;
+
+   if ((flags & known) == 0) {
+      /* Nothing nameable: keep the plain numeric form. */
+      _debug_printf("\t\t.surfaceFlags = %i\n", cmd->surfaceFlags);
+   } else {
+      const char *sep = "";
+
+      /* The line is assembled from several calls and ended explicitly. */
+      _debug_printf("\t\t.surfaceFlags = ");
+      for (i = 0; i < num_bits; ++i) {
+         if (flags & surface_bits[i].bit) {
+            _debug_printf("%s%s", sep, surface_bits[i].name);
+            sep = " | ";
+         }
+      }
+      if (flags & ~known)
+         _debug_printf(" | 0x%x", flags & ~known);
+      _debug_printf("\n");
+   }
+
+   i = 0;
+   while (i < num_formats && cmd->format != known_formats[i].value)
+      ++i;
+
+   if (i < num_formats)
+      _debug_printf("%s", known_formats[i].line);
+   else
+      _debug_printf("\t\t.format = %i\n", cmd->format);
+
+   _debug_printf("\t\t.face[0].numMipLevels = %u\n", cmd->face[0].numMipLevels);
+   _debug_printf("\t\t.face[1].numMipLevels = %u\n", cmd->face[1].numMipLevels);
+   _debug_printf("\t\t.face[2].numMipLevels = %u\n", cmd->face[2].numMipLevels);
+   _debug_printf("\t\t.face[3].numMipLevels = %u\n", cmd->face[3].numMipLevels);
+   _debug_printf("\t\t.face[4].numMipLevels = %u\n", cmd->face[4].numMipLevels);
+   _debug_printf("\t\t.face[5].numMipLevels = %u\n", cmd->face[5].numMipLevels);
+}
+
+/**
+ * Print the left, top, right and bottom members of an SVGASignedRect as
+ * signed integers.
+ */
+static void
+dump_SVGASignedRect(const SVGASignedRect *cmd)
+{
+   _debug_printf("\t\t.left = %i\n", cmd->left);
+   _debug_printf("\t\t.top = %i\n", cmd->top);
+   _debug_printf("\t\t.right = %i\n", cmd->right);
+   _debug_printf("\t\t.bottom = %i\n", cmd->bottom);
+}
+
+/**
+ * Print the members of an SVGA3dCmdBlitSurfaceToScreen command body: the
+ * srcImage id, the srcRect, the destScreenId and the destRect.
+ */
+static void
+dump_SVGA3dCmdBlitSurfaceToScreen(const SVGA3dCmdBlitSurfaceToScreen *cmd)
+{
+   _debug_printf("\t\t.srcImage.sid = %u\n", cmd->srcImage.sid);
+   _debug_printf("\t\t.srcImage.face = %u\n", cmd->srcImage.face);
+   _debug_printf("\t\t.srcImage.mipmap = %u\n", cmd->srcImage.mipmap);
+
+   _debug_printf("\t\t.srcRect.left = %i\n", cmd->srcRect.left);
+   _debug_printf("\t\t.srcRect.top = %i\n", cmd->srcRect.top);
+   _debug_printf("\t\t.srcRect.right = %i\n", cmd->srcRect.right);
+   _debug_printf("\t\t.srcRect.bottom = %i\n", cmd->srcRect.bottom);
+
+   _debug_printf("\t\t.destScreenId = %u\n", cmd->destScreenId);
+
+   _debug_printf("\t\t.destRect.left = %i\n", cmd->destRect.left);
+   _debug_printf("\t\t.destRect.top = %i\n", cmd->destRect.top);
+   _debug_printf("\t\t.destRect.right = %i\n", cmd->destRect.right);
+   _debug_printf("\t\t.destRect.bottom = %i\n", cmd->destRect.bottom);
+}
+
+
+/**
+ * Dump a single SVGA3D command through _debug_printf.
+ *
+ * @param cmd_id  command identifier; the SVGA_3D_CMD_* values handled in
+ *                the switch below are decoded, any other value is printed
+ *                as a hexadecimal number.
+ * @param data    first byte of the command body (svga_dump_commands passes
+ *                the bytes that follow the SVGA3dCmdHeader).
+ * @param size    length of the command body in bytes.
+ *
+ * For a known command the fixed-size command structure is printed first,
+ * followed by the variable-length arrays that trail it.  Whatever is left
+ * of the body afterwards is printed raw: first as 32-bit words, then the
+ * remaining (fewer than four) bytes one at a time.
+ */
+void
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size)
+{
+   const uint8_t *body = (const uint8_t *)data;
+   const uint8_t *next = body + size;   /* one past the end of this command */
+
+   switch(cmd_id) {
+   case SVGA_3D_CMD_SURFACE_DEFINE:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_DEFINE\n");
+      {
+         const SVGA3dCmdDefineSurface *cmd = (const SVGA3dCmdDefineSurface *)body;
+         dump_SVGA3dCmdDefineSurface(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dSize) <= next) {
+            dump_SVGA3dSize((const SVGA3dSize *)body);
+            body += sizeof(SVGA3dSize);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_DESTROY:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_DESTROY\n");
+      {
+         const SVGA3dCmdDestroySurface *cmd = (const SVGA3dCmdDestroySurface *)body;
+         dump_SVGA3dCmdDestroySurface(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_COPY:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_COPY\n");
+      {
+         const SVGA3dCmdSurfaceCopy *cmd = (const SVGA3dCmdSurfaceCopy *)body;
+         dump_SVGA3dCmdSurfaceCopy(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dCopyBox) <= next) {
+            dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+            body += sizeof(SVGA3dCopyBox);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_STRETCHBLT:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_STRETCHBLT\n");
+      {
+         const SVGA3dCmdSurfaceStretchBlt *cmd = (const SVGA3dCmdSurfaceStretchBlt *)body;
+         dump_SVGA3dCmdSurfaceStretchBlt(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_DMA:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_DMA\n");
+      {
+         const SVGA3dCmdSurfaceDMA *cmd = (const SVGA3dCmdSurfaceDMA *)body;
+         dump_SVGA3dCmdSurfaceDMA(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dCopyBox) <= next) {
+            dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+            body += sizeof(SVGA3dCopyBox);
+         }
+         while(body + sizeof(SVGA3dCmdSurfaceDMASuffix) <= next) {
+            dump_SVGA3dCmdSurfaceDMASuffix((const SVGA3dCmdSurfaceDMASuffix *)body);
+            body += sizeof(SVGA3dCmdSurfaceDMASuffix);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_CONTEXT_DEFINE:
+      _debug_printf("\tSVGA_3D_CMD_CONTEXT_DEFINE\n");
+      {
+         const SVGA3dCmdDefineContext *cmd = (const SVGA3dCmdDefineContext *)body;
+         dump_SVGA3dCmdDefineContext(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_CONTEXT_DESTROY:
+      _debug_printf("\tSVGA_3D_CMD_CONTEXT_DESTROY\n");
+      {
+         const SVGA3dCmdDestroyContext *cmd = (const SVGA3dCmdDestroyContext *)body;
+         dump_SVGA3dCmdDestroyContext(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETTRANSFORM:
+      _debug_printf("\tSVGA_3D_CMD_SETTRANSFORM\n");
+      {
+         const SVGA3dCmdSetTransform *cmd = (const SVGA3dCmdSetTransform *)body;
+         dump_SVGA3dCmdSetTransform(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETZRANGE:
+      _debug_printf("\tSVGA_3D_CMD_SETZRANGE\n");
+      {
+         const SVGA3dCmdSetZRange *cmd = (const SVGA3dCmdSetZRange *)body;
+         dump_SVGA3dCmdSetZRange(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETRENDERSTATE:
+      _debug_printf("\tSVGA_3D_CMD_SETRENDERSTATE\n");
+      {
+         const SVGA3dCmdSetRenderState *cmd = (const SVGA3dCmdSetRenderState *)body;
+         dump_SVGA3dCmdSetRenderState(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dRenderState) <= next) {
+            dump_SVGA3dRenderState((const SVGA3dRenderState *)body);
+            body += sizeof(SVGA3dRenderState);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SETRENDERTARGET:
+      _debug_printf("\tSVGA_3D_CMD_SETRENDERTARGET\n");
+      {
+         const SVGA3dCmdSetRenderTarget *cmd = (const SVGA3dCmdSetRenderTarget *)body;
+         dump_SVGA3dCmdSetRenderTarget(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETTEXTURESTATE:
+      _debug_printf("\tSVGA_3D_CMD_SETTEXTURESTATE\n");
+      {
+         const SVGA3dCmdSetTextureState *cmd = (const SVGA3dCmdSetTextureState *)body;
+         dump_SVGA3dCmdSetTextureState(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dTextureState) <= next) {
+            dump_SVGA3dTextureState((const SVGA3dTextureState *)body);
+            body += sizeof(SVGA3dTextureState);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SETMATERIAL:
+      _debug_printf("\tSVGA_3D_CMD_SETMATERIAL\n");
+      {
+         const SVGA3dCmdSetMaterial *cmd = (const SVGA3dCmdSetMaterial *)body;
+         dump_SVGA3dCmdSetMaterial(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETLIGHTDATA:
+      _debug_printf("\tSVGA_3D_CMD_SETLIGHTDATA\n");
+      {
+         const SVGA3dCmdSetLightData *cmd = (const SVGA3dCmdSetLightData *)body;
+         dump_SVGA3dCmdSetLightData(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETLIGHTENABLED:
+      _debug_printf("\tSVGA_3D_CMD_SETLIGHTENABLED\n");
+      {
+         const SVGA3dCmdSetLightEnabled *cmd = (const SVGA3dCmdSetLightEnabled *)body;
+         dump_SVGA3dCmdSetLightEnabled(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETVIEWPORT:
+      _debug_printf("\tSVGA_3D_CMD_SETVIEWPORT\n");
+      {
+         const SVGA3dCmdSetViewport *cmd = (const SVGA3dCmdSetViewport *)body;
+         dump_SVGA3dCmdSetViewport(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETCLIPPLANE:
+      _debug_printf("\tSVGA_3D_CMD_SETCLIPPLANE\n");
+      {
+         const SVGA3dCmdSetClipPlane *cmd = (const SVGA3dCmdSetClipPlane *)body;
+         dump_SVGA3dCmdSetClipPlane(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_CLEAR:
+      _debug_printf("\tSVGA_3D_CMD_CLEAR\n");
+      {
+         const SVGA3dCmdClear *cmd = (const SVGA3dCmdClear *)body;
+         dump_SVGA3dCmdClear(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dRect) <= next) {
+            dump_SVGA3dRect((const SVGA3dRect *)body);
+            body += sizeof(SVGA3dRect);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_PRESENT:
+      _debug_printf("\tSVGA_3D_CMD_PRESENT\n");
+      {
+         const SVGA3dCmdPresent *cmd = (const SVGA3dCmdPresent *)body;
+         dump_SVGA3dCmdPresent(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dCopyRect) <= next) {
+            dump_SVGA3dCopyRect((const SVGA3dCopyRect *)body);
+            body += sizeof(SVGA3dCopyRect);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SHADER_DEFINE:
+      _debug_printf("\tSVGA_3D_CMD_SHADER_DEFINE\n");
+      {
+         const SVGA3dCmdDefineShader *cmd = (const SVGA3dCmdDefineShader *)body;
+         dump_SVGA3dCmdDefineShader(cmd);
+         body = (const uint8_t *)&cmd[1];
+         /* Everything after the fixed struct is handed to the shader dumper. */
+         svga_shader_dump((const uint32_t *)body,
+                      (unsigned)(next - body)/sizeof(uint32_t),
+                      FALSE );
+         body = next;
+      }
+      break;
+   case SVGA_3D_CMD_SHADER_DESTROY:
+      _debug_printf("\tSVGA_3D_CMD_SHADER_DESTROY\n");
+      {
+         const SVGA3dCmdDestroyShader *cmd = (const SVGA3dCmdDestroyShader *)body;
+         dump_SVGA3dCmdDestroyShader(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SET_SHADER:
+      _debug_printf("\tSVGA_3D_CMD_SET_SHADER\n");
+      {
+         const SVGA3dCmdSetShader *cmd = (const SVGA3dCmdSetShader *)body;
+         dump_SVGA3dCmdSetShader(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SET_SHADER_CONST:
+      _debug_printf("\tSVGA_3D_CMD_SET_SHADER_CONST\n");
+      {
+         const SVGA3dCmdSetShaderConst *cmd = (const SVGA3dCmdSetShaderConst *)body;
+         dump_SVGA3dCmdSetShaderConst(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_DRAW_PRIMITIVES:
+      _debug_printf("\tSVGA_3D_CMD_DRAW_PRIMITIVES\n");
+      {
+         const SVGA3dCmdDrawPrimitives *cmd = (const SVGA3dCmdDrawPrimitives *)body;
+         unsigned i, j;
+         dump_SVGA3dCmdDrawPrimitives(cmd);
+         body = (const uint8_t *)&cmd[1];
+         /*
+          * The element counts come from the command itself; also stop at
+          * the end of the command so that a truncated or corrupt command
+          * cannot make us read past it.
+          */
+         for(i = 0; i < cmd->numVertexDecls && body + sizeof(SVGA3dVertexDecl) <= next; ++i) {
+            dump_SVGA3dVertexDecl((const SVGA3dVertexDecl *)body);
+            body += sizeof(SVGA3dVertexDecl);
+         }
+         for(j = 0; j < cmd->numRanges && body + sizeof(SVGA3dPrimitiveRange) <= next; ++j) {
+            dump_SVGA3dPrimitiveRange((const SVGA3dPrimitiveRange *)body);
+            body += sizeof(SVGA3dPrimitiveRange);
+         }
+         while(body + sizeof(SVGA3dVertexDivisor) <= next) {
+            dump_SVGA3dVertexDivisor((const SVGA3dVertexDivisor *)body);
+            body += sizeof(SVGA3dVertexDivisor);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SETSCISSORRECT:
+      _debug_printf("\tSVGA_3D_CMD_SETSCISSORRECT\n");
+      {
+         const SVGA3dCmdSetScissorRect *cmd = (const SVGA3dCmdSetScissorRect *)body;
+         dump_SVGA3dCmdSetScissorRect(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_BEGIN_QUERY:
+      _debug_printf("\tSVGA_3D_CMD_BEGIN_QUERY\n");
+      {
+         const SVGA3dCmdBeginQuery *cmd = (const SVGA3dCmdBeginQuery *)body;
+         dump_SVGA3dCmdBeginQuery(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_END_QUERY:
+      _debug_printf("\tSVGA_3D_CMD_END_QUERY\n");
+      {
+         const SVGA3dCmdEndQuery *cmd = (const SVGA3dCmdEndQuery *)body;
+         dump_SVGA3dCmdEndQuery(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_WAIT_FOR_QUERY:
+      _debug_printf("\tSVGA_3D_CMD_WAIT_FOR_QUERY\n");
+      {
+         const SVGA3dCmdWaitForQuery *cmd = (const SVGA3dCmdWaitForQuery *)body;
+         dump_SVGA3dCmdWaitForQuery(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN:
+      _debug_printf("\tSVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN\n");
+      {
+         const SVGA3dCmdBlitSurfaceToScreen *cmd = (const SVGA3dCmdBlitSurfaceToScreen *)body;
+         dump_SVGA3dCmdBlitSurfaceToScreen(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGASignedRect) <= next) {
+            dump_SVGASignedRect((const SVGASignedRect *)body);
+            body += sizeof(SVGASignedRect);
+         }
+      }
+      break;
+   default:
+      _debug_printf("\t0x%08x\n", cmd_id);
+      break;
+   }
+
+   /* Dump whatever was not decoded above as raw 32-bit words... */
+   while(body + sizeof(uint32_t) <= next) {
+      _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+      body += sizeof(uint32_t);
+   }
+   /*
+    * ...and the final (fewer than four) bytes one at a time.  This loop
+    * used to repeat the sizeof(uint32_t) test above and so never ran.
+    */
+   while(body + sizeof(uint8_t) <= next)
+      _debug_printf("\t\t0x%02x\n", *body++);
+}
+
+
+/**
+ * Dump a buffer holding a sequence of SVGA commands through _debug_printf.
+ *
+ * @param commands  start of the command buffer.
+ * @param size      size of the buffer in bytes; asserted to be a multiple
+ *                  of sizeof(uint32_t).
+ *
+ * The first 32-bit word of each command is its id:
+ *  - ids in [SVGA_3D_CMD_BASE, SVGA_3D_CMD_MAX) start with a
+ *    SVGA3dCmdHeader whose size member gives the length of the body,
+ *    which is passed to svga_dump_command();
+ *  - SVGA_CMD_FENCE is followed by one 32-bit word, printed in hex;
+ *  - any other word is printed in hex and skipped.
+ *
+ * Dumping stops silently at the first command that does not fit in what
+ * is left of the buffer.
+ */
+void
+svga_dump_commands(const void *commands, uint32_t size)
+{
+   const uint8_t *next = commands;
+   const uint8_t *last = next + size;
+
+   assert(size % sizeof(uint32_t) == 0);
+
+   while(next < last) {
+      const uint32_t remaining = (uint32_t)(last - next);
+      uint32_t cmd_id;
+
+      /* A command body need not be a multiple of four bytes long. */
+      if(remaining < sizeof(uint32_t))
+         break;
+
+      cmd_id = *(const uint32_t *)next;
+
+      if(SVGA_3D_CMD_BASE <= cmd_id && cmd_id < SVGA_3D_CMD_MAX) {
+         const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
+         const uint8_t *body = (const uint8_t *)&header[1];
+
+         /*
+          * Check that the whole header is present before reading its size
+          * member, and compare sizes rather than pointers so that a bogus
+          * size cannot produce an out-of-range pointer.
+          */
+         if(remaining < sizeof *header ||
+            header->size > remaining - sizeof *header)
+            break;
+
+         next = body + header->size;
+
+         svga_dump_command(cmd_id, body, header->size);
+      }
+      else if(cmd_id == SVGA_CMD_FENCE) {
+         /* The fence value is the word after the id; it must be present. */
+         if(remaining < 2*sizeof(uint32_t))
+            break;
+
+         _debug_printf("\tSVGA_CMD_FENCE\n");
+         _debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
+         next += 2*sizeof(uint32_t);
+      }
+      else {
+         _debug_printf("\t0x%08x\n", cmd_id);
+         next += sizeof(uint32_t);
+      }
+   }
+}
+
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.h b/src/gallium/drivers/svga/svgadump/svga_dump.h
new file mode 100644
index 0000000000..ca0154361c
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.h
@@ -0,0 +1,37 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DUMP_H_
+#define SVGA_DUMP_H_
+
+#include "pipe/p_compiler.h"
+
+/**
+ * Dump one SVGA3D command through _debug_printf.
+ *
+ * cmd_id selects how the body is decoded; data and size give the start
+ * and the length in bytes of the command body.
+ */
+void            
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size);
+
+/**
+ * Dump every command found in the size bytes starting at commands.
+ */
+void
+svga_dump_commands(const void *commands, uint32_t size);
+
+#endif /* SVGA_DUMP_H_ */
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.py b/src/gallium/drivers/svga/svgadump/svga_dump.py
new file mode 100755
index 0000000000..0bc0b3ae31
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python
+'''
+Generates dumper for the SVGA 3D command stream using pygccxml.
+
+Jose Fonseca <jfonseca@vmware.com>
+'''
+
+copyright = '''
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+ '''
+
+import os
+import sys
+
+from pygccxml import parser
+from pygccxml import declarations
+
+from pygccxml.declarations import algorithm
+from pygccxml.declarations import decl_visitor
+from pygccxml.declarations import type_traits
+from pygccxml.declarations import type_visitor
+
+
+# When true, enumeration members are dumped by name through a generated C
+# switch statement; when false they are printed as plain integers (see
+# decl_dumper_t.visit_enumeration).
+enums = True
+
+
+class decl_dumper_t(decl_visitor.decl_visitor_t):
+
+    def __init__(self, instance = '', decl = None):
+        decl_visitor.decl_visitor_t.__init__(self)
+        self._instance = instance
+        self.decl = decl
+
+    def clone(self):
+        return decl_dumper_t(self._instance, self.decl)
+
+    def visit_class(self):
+        class_ = self.decl
+        assert self.decl.class_type in ('struct', 'union')
+
+        for variable in class_.variables():
+            if variable.name != '':
+                #print 'variable = %r' % variable.name
+                dump_type(self._instance + '.' + variable.name, variable.type)
+
+    def visit_enumeration(self):
+        if enums:
+            print '   switch(%s) {' % ("(*cmd)" + self._instance,)
+            for name, value in self.decl.values:
+                print '   case %s:' % (name,)
+                print '      _debug_printf("\\t\\t%s = %s\\n");' % (self._instance, name)
+                print '      break;'
+            print '   default:'
+            print '      _debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
+            print '      break;'
+            print '   }'
+        else:
+            print '   _debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
+
+
+def dump_decl(instance, decl):
+    dumper = decl_dumper_t(instance, decl)
+    algorithm.apply_visitor(dumper, decl)
+
+
+class type_dumper_t(type_visitor.type_visitor_t):
+
+    def __init__(self, instance, type_):
+        type_visitor.type_visitor_t.__init__(self)
+        self.instance = instance
+        self.type = type_
+
+    def clone(self):
+        return type_dumper_t(self.instance, self.type)
+
+    def visit_char(self):
+        self.print_instance('%i')
+        
+    def visit_unsigned_char(self):
+        self.print_instance('%u')
+
+    def visit_signed_char(self):
+        self.print_instance('%i')
+    
+    def visit_wchar(self):
+        self.print_instance('%i')
+        
+    def visit_short_int(self):
+        self.print_instance('%i')
+        
+    def visit_short_unsigned_int(self):
+        self.print_instance('%u')
+        
+    def visit_bool(self):
+        self.print_instance('%i')
+        
+    def visit_int(self):
+        self.print_instance('%i')
+        
+    def visit_unsigned_int(self):
+        self.print_instance('%u')
+        
+    def visit_long_int(self):
+        self.print_instance('%li')
+        
+    def visit_long_unsigned_int(self):
+        self.print_instance('%lu')
+        
+    def visit_long_long_int(self):
+        self.print_instance('%lli')
+        
+    def visit_long_long_unsigned_int(self):
+        self.print_instance('%llu')
+        
+    def visit_float(self):
+        self.print_instance('%f')
+        
+    def visit_double(self):
+        self.print_instance('%f')
+        
+    def visit_array(self):
+        for i in range(type_traits.array_size(self.type)):
+            dump_type(self.instance + '[%i]' % i, type_traits.base_type(self.type))
+
+    def visit_pointer(self):
+        self.print_instance('%p')
+
+    def visit_declarated(self):
+        #print 'decl = %r' % self.type.decl_string
+        decl = type_traits.remove_declarated(self.type)
+        dump_decl(self.instance, decl)
+
+    def print_instance(self, format):
+        print '   _debug_printf("\\t\\t%s = %s\\n", %s);' % (self.instance, format, "(*cmd)" + self.instance)
+
+
+def dump_type(instance, type_):
+    type_ = type_traits.remove_alias(type_)
+    visitor = type_dumper_t(instance, type_)
+    algorithm.apply_visitor(visitor, type_)
+
+
+def dump_struct(decls, class_):
+    print 'static void'
+    print 'dump_%s(const %s *cmd)' % (class_.name, class_.name)
+    print '{'
+    dump_decl('', class_)
+    print '}'
+    print ''
+
+
+# Table of the commands the generated dumper understands.  Each entry is
+#
+#    (id, header, body, footer)
+#
+# where
+#  - id     is the SVGA_3D_CMD_* constant used as the case label,
+#  - header is the name of the fixed-size command struct,
+#  - body   is a tuple of (struct, count) pairs: arrays of `struct` whose
+#           length is read from member `count` of the header struct,
+#  - footer is the name of a struct that is dumped repeatedly for as long
+#           as it fits in the rest of the command, or None.
+cmds = [
+    ('SVGA_3D_CMD_SURFACE_DEFINE', 'SVGA3dCmdDefineSurface', (), 'SVGA3dSize'),
+    ('SVGA_3D_CMD_SURFACE_DESTROY', 'SVGA3dCmdDestroySurface', (), None),
+    ('SVGA_3D_CMD_SURFACE_COPY', 'SVGA3dCmdSurfaceCopy', (), 'SVGA3dCopyBox'),
+    ('SVGA_3D_CMD_SURFACE_STRETCHBLT', 'SVGA3dCmdSurfaceStretchBlt', (), None),
+    ('SVGA_3D_CMD_SURFACE_DMA', 'SVGA3dCmdSurfaceDMA', (), 'SVGA3dCopyBox'),
+    ('SVGA_3D_CMD_CONTEXT_DEFINE', 'SVGA3dCmdDefineContext', (), None),
+    ('SVGA_3D_CMD_CONTEXT_DESTROY', 'SVGA3dCmdDestroyContext', (), None),
+    ('SVGA_3D_CMD_SETTRANSFORM', 'SVGA3dCmdSetTransform', (), None),
+    ('SVGA_3D_CMD_SETZRANGE', 'SVGA3dCmdSetZRange', (), None),
+    ('SVGA_3D_CMD_SETRENDERSTATE', 'SVGA3dCmdSetRenderState', (), 'SVGA3dRenderState'),
+    ('SVGA_3D_CMD_SETRENDERTARGET', 'SVGA3dCmdSetRenderTarget', (), None),
+    ('SVGA_3D_CMD_SETTEXTURESTATE', 'SVGA3dCmdSetTextureState', (), 'SVGA3dTextureState'),
+    ('SVGA_3D_CMD_SETMATERIAL', 'SVGA3dCmdSetMaterial', (), None),
+    ('SVGA_3D_CMD_SETLIGHTDATA', 'SVGA3dCmdSetLightData', (), None),
+    ('SVGA_3D_CMD_SETLIGHTENABLED', 'SVGA3dCmdSetLightEnabled', (), None),
+    ('SVGA_3D_CMD_SETVIEWPORT', 'SVGA3dCmdSetViewport', (), None),
+    ('SVGA_3D_CMD_SETCLIPPLANE', 'SVGA3dCmdSetClipPlane', (), None),
+    ('SVGA_3D_CMD_CLEAR', 'SVGA3dCmdClear', (), 'SVGA3dRect'),
+    ('SVGA_3D_CMD_PRESENT', 'SVGA3dCmdPresent', (), 'SVGA3dCopyRect'),
+    ('SVGA_3D_CMD_SHADER_DEFINE', 'SVGA3dCmdDefineShader', (), None),
+    ('SVGA_3D_CMD_SHADER_DESTROY', 'SVGA3dCmdDestroyShader', (), None),
+    ('SVGA_3D_CMD_SET_SHADER', 'SVGA3dCmdSetShader', (), None),
+    ('SVGA_3D_CMD_SET_SHADER_CONST', 'SVGA3dCmdSetShaderConst', (), None),
+    ('SVGA_3D_CMD_DRAW_PRIMITIVES', 'SVGA3dCmdDrawPrimitives', (('SVGA3dVertexDecl', 'numVertexDecls'), ('SVGA3dPrimitiveRange', 'numRanges')), 'SVGA3dVertexDivisor'),
+    ('SVGA_3D_CMD_SETSCISSORRECT', 'SVGA3dCmdSetScissorRect', (), None),
+    ('SVGA_3D_CMD_BEGIN_QUERY', 'SVGA3dCmdBeginQuery', (), None),
+    ('SVGA_3D_CMD_END_QUERY', 'SVGA3dCmdEndQuery', (), None),
+    ('SVGA_3D_CMD_WAIT_FOR_QUERY', 'SVGA3dCmdWaitForQuery', (), None),
+    #('SVGA_3D_CMD_PRESENT_READBACK', None, (), None),
+    ('SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN', 'SVGA3dCmdBlitSurfaceToScreen', (), 'SVGASignedRect'),
+]
+
+def dump_cmds():
+    print r'''
+void            
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size)
+{
+   const uint8_t *body = (const uint8_t *)data;
+   const uint8_t *next = body + size;
+'''
+    print '   switch(cmd_id) {'
+    indexes = 'ijklmn'
+    for id, header, body, footer in cmds:
+        print '   case %s:' % id
+        print '      _debug_printf("\\t%s\\n");' % id
+        print '      {'
+        print '         const %s *cmd = (const %s *)body;' % (header, header)
+        if len(body):
+            print '         unsigned ' + ', '.join(indexes[:len(body)]) + ';'
+        print '         dump_%s(cmd);' % header
+        print '         body = (const uint8_t *)&cmd[1];'
+        for i in range(len(body)):
+            struct, count = body[i]
+            idx = indexes[i]
+            print '         for(%s = 0; %s < cmd->%s; ++%s) {' % (idx, idx, count, idx)
+            print '            dump_%s((const %s *)body);' % (struct, struct)
+            print '            body += sizeof(%s);' % struct
+            print '         }'
+        if footer is not None:
+            print '         while(body + sizeof(%s) <= next) {' % footer
+            print '            dump_%s((const %s *)body);' % (footer, footer)
+            print '            body += sizeof(%s);' % footer
+            print '         }'
+        if id == 'SVGA_3D_CMD_SHADER_DEFINE':
+            print '         svga_shader_dump((const uint32_t *)body,'
+            print '                          (unsigned)(next - body)/sizeof(uint32_t),'
+            print '                          FALSE);'
+            print '         body = next;'
+        print '      }'
+        print '      break;'
+    print '   default:'
+    print '      _debug_printf("\\t0x%08x\\n", cmd_id);'
+    print '      break;'
+    print '   }'
+    print r'''
+   while(body + sizeof(uint32_t) <= next) {
+      _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+      body += sizeof(uint32_t);
+   }
+   while(body + sizeof(uint32_t) <= next)
+      _debug_printf("\t\t0x%02x\n", *body++);
+}
+'''
+    print r'''
+void            
+svga_dump_commands(const void *commands, uint32_t size)
+{
+   const uint8_t *next = commands;
+   const uint8_t *last = next + size;
+   
+   assert(size % sizeof(uint32_t) == 0);
+   
+   while(next < last) {
+      const uint32_t cmd_id = *(const uint32_t *)next;
+
+      if(SVGA_3D_CMD_BASE <= cmd_id && cmd_id < SVGA_3D_CMD_MAX) {
+         const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
+         const uint8_t *body = (const uint8_t *)&header[1];
+
+         next = body + header->size;
+         if(next > last)
+            break;
+
+         svga_dump_command(cmd_id, body, header->size);
+      }
+      else if(cmd_id == SVGA_CMD_FENCE) {
+         _debug_printf("\tSVGA_CMD_FENCE\n");
+         _debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
+         next += 2*sizeof(uint32_t);
+      }
+      else {
+         _debug_printf("\t0x%08x\n", cmd_id);
+         next += sizeof(uint32_t);
+      }
+   }
+}
+'''
+
+def main():
+    print copyright.strip()
+    print
+    print '/**'
+    print ' * @file'
+    print ' * Dump SVGA commands.'
+    print ' *'
+    print ' * Generated automatically from svga3d_reg.h by svga_dump.py.'
+    print ' */'
+    print
+    print '#include "svga_types.h"'
+    print '#include "svga_shader_dump.h"'
+    print '#include "svga3d_reg.h"'
+    print
+    print '#include "util/u_debug.h"'
+    print '#include "svga_dump.h"'
+    print
+
+    config = parser.config_t(
+        include_paths = ['../../../include', '../include'],
+        compiler = 'gcc',
+    )
+
+    headers = [
+        'svga_types.h', 
+        'svga3d_reg.h', 
+    ]
+
+    decls = parser.parse(headers, config, parser.COMPILATION_MODE.ALL_AT_ONCE)
+    global_ns = declarations.get_global_namespace(decls)
+
+    names = set()
+    for id, header, body, footer in cmds:
+        names.add(header)
+        for struct, count in body:
+            names.add(struct)
+        if footer is not None:
+            names.add(footer)
+
+    for class_ in global_ns.classes(lambda decl: decl.name in names):
+        dump_struct(decls, class_)
+
+    dump_cmds()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader.h b/src/gallium/drivers/svga/svgadump/svga_shader.h
new file mode 100644
index 0000000000..5db64bf135
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader.h
@@ -0,0 +1,227 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Token Definitions
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#ifndef ST_SHADER_SVGA_H
+#define ST_SHADER_SVGA_H
+
+#include "pipe/p_compiler.h"
+
+/**
+ * Shader instruction token, described as bitfields whose widths add up to
+ * 32 bits.
+ *
+ * NOTE(review): the per-field notes are inferred from the field names only;
+ * confirm against the SVGA3D shader token definitions.
+ */
+struct sh_op
+{
+   unsigned opcode:16;     /* 16-bit operation code */
+   unsigned control:8;     /* presumably opcode-specific control bits -- TODO confirm */
+   unsigned length:4;      /* presumably the instruction length -- TODO confirm units */
+   unsigned predicated:1;
+   unsigned unused:1;
+   unsigned coissue:1;
+   unsigned is_reg:1;      /* last bitfield; every sh_* token struct in this file ends with an is_reg bit */
+};
+
+/**
+ * Generic register token, described as bitfields whose widths add up to
+ * 32 bits.  The register type is split over type_lo and type_hi; use
+ * sh_reg_type() to reassemble it.
+ */
+struct sh_reg
+{
+   unsigned number:11;     /* register number */
+   unsigned type_hi:2;     /* bits 3..4 of the register type */
+   unsigned relative:1;    /* presumably set for relative addressing -- TODO confirm */
+   unsigned unused:14;
+   unsigned type_lo:3;     /* bits 0..2 of the register type */
+   unsigned is_reg:1;
+};
+
+/**
+ * Reassemble the register type of a generic register token: type_lo
+ * supplies bits 0..2 and type_hi bits 3..4.
+ */
+static INLINE unsigned
+sh_reg_type( struct sh_reg reg )
+{
+   const unsigned low_bits = reg.type_lo;
+   const unsigned high_bits = reg.type_hi;
+
+   return (high_bits << 3) | low_bits;
+}
+
+/** Four floats; the member name suggests x, y, z and w components. */
+struct sh_cdata
+{
+   float xyzw[4];
+};
+
+/**
+ * An instruction token followed by a register token and four floats.
+ * NOTE(review): presumably a float constant definition -- confirm.
+ */
+struct sh_def
+{
+   struct sh_op op;
+   struct sh_reg reg;
+   struct sh_cdata cdata;
+};
+
+/**
+ * An instruction token followed by a register token and one data word.
+ * NOTE(review): presumably a boolean constant definition -- confirm.
+ */
+struct sh_defb
+{
+   struct sh_op op;
+   struct sh_reg reg;
+   uint data;
+};
+
+/** Four ints; the member name suggests x, y, z and w components. */
+struct sh_idata
+{
+   int xyzw[4];
+};
+
+/**
+ * An instruction token followed by a register token and four ints.
+ * NOTE(review): presumably an integer constant definition -- confirm.
+ */
+struct sh_defi
+{
+   struct sh_op op;
+   struct sh_reg reg;
+   struct sh_idata idata;
+};
+
+/*
+ * Aliases for the SVGA3DSAMP_* constants.
+ * NOTE(review): presumably the values stored in
+ * sh_sampleinfo::texture_type -- confirm.
+ */
+#define PS_TEXTURETYPE_UNKNOWN   SVGA3DSAMP_UNKNOWN
+#define PS_TEXTURETYPE_2D        SVGA3DSAMP_2D
+#define PS_TEXTURETYPE_CUBE      SVGA3DSAMP_CUBE
+#define PS_TEXTURETYPE_VOLUME    SVGA3DSAMP_VOLUME
+
+/**
+ * Sampler information token (bitfield widths add up to 32 bits); one of
+ * the two alternatives of the union in struct sh_dcl.
+ */
+struct sh_sampleinfo
+{
+   unsigned unused:27;
+   unsigned texture_type:4;
+   unsigned is_reg:1;
+};
+
+/**
+ * Semantic token (bitfield widths add up to 32 bits); one of the two
+ * alternatives of the union in struct sh_dcl.
+ */
+struct sh_semantic
+{
+   unsigned usage:4;          /* presumably the declared usage -- TODO confirm value set */
+   unsigned unused1:12;
+   unsigned usage_index:4;    /* index qualifying usage */
+   unsigned unused2:11;
+   unsigned is_reg:1;
+};
+
+/*
+ * One bit per component, plus the mask with all four bits set.
+ * NOTE(review): presumably the values of sh_dstreg::write_mask -- confirm.
+ */
+#define SH_WRITEMASK_0              0x1
+#define SH_WRITEMASK_1              0x2
+#define SH_WRITEMASK_2              0x4
+#define SH_WRITEMASK_3              0x8
+#define SH_WRITEMASK_ALL            0xf
+
+/*
+ * Destination modifier flags.
+ * NOTE(review): presumably the values of sh_dstreg::modifier -- confirm.
+ */
+#define SH_DSTMOD_NONE              0x0
+#define SH_DSTMOD_SATURATE          0x1
+#define SH_DSTMOD_PARTIALPRECISION  0x2
+#define SH_DSTMOD_MSAMPCENTROID     0x4
+
+/**
+ * Destination register token, described as bitfields whose widths add up
+ * to 32 bits.  The register type is split over type_lo and type_hi; use
+ * sh_dstreg_type() to reassemble it.
+ */
+struct sh_dstreg
+{
+   unsigned number:11;        /* register number */
+   unsigned type_hi:2;        /* bits 3..4 of the register type */
+   unsigned relative:1;       /* presumably set for relative addressing -- TODO confirm */
+   unsigned unused:2;
+   unsigned write_mask:4;
+   unsigned modifier:4;
+   unsigned shift_scale:4;
+   unsigned type_lo:3;        /* bits 0..2 of the register type */
+   unsigned is_reg:1;
+};
+
+/**
+ * Reassemble the register type of a destination register token: type_lo
+ * supplies bits 0..2 and type_hi bits 3..4.
+ */
+static INLINE unsigned
+sh_dstreg_type( struct sh_dstreg reg )
+{
+   const unsigned low_bits = reg.type_lo;
+   const unsigned high_bits = reg.type_hi;
+
+   return (high_bits << 3) | low_bits;
+}
+
+/**
+ * An instruction token, followed by a token that is either sampler
+ * information or a semantic, followed by a destination register token.
+ *
+ * NOTE(review): presumably a declaration instruction; which union member
+ * applies is decided by code outside this header -- confirm.
+ */
+struct sh_dcl
+{
+   struct sh_op op;
+   union {
+      struct sh_sampleinfo sampleinfo;
+      struct sh_semantic semantic;
+   } u;
+   struct sh_dstreg reg;
+};
+
+/**
+ * Source register token, described as bitfields whose widths add up to
+ * 32 bits.  The register type is split over type_lo and type_hi; use
+ * sh_srcreg_type() to reassemble it.
+ */
+struct sh_srcreg
+{
+   unsigned number:11;     /* register number */
+   unsigned type_hi:2;     /* bits 3..4 of the register type */
+   unsigned relative:1;    /* presumably set for relative addressing -- TODO confirm */
+   unsigned unused:2;
+   unsigned swizzle_x:2;   /* two swizzle bits per component */
+   unsigned swizzle_y:2;
+   unsigned swizzle_z:2;
+   unsigned swizzle_w:2;
+   unsigned modifier:4;
+   unsigned type_lo:3;     /* bits 0..2 of the register type */
+   unsigned is_reg:1;
+};
+
+/**
+ * Reassemble the register type of a source register token: type_lo
+ * supplies bits 0..2 and type_hi bits 3..4.
+ */
+static INLINE unsigned
+sh_srcreg_type( struct sh_srcreg reg )
+{
+   const unsigned low_bits = reg.type_lo;
+   const unsigned high_bits = reg.type_hi;
+
+   return (high_bits << 3) | low_bits;
+}
+
+/** Instruction token followed by one destination register. */
+struct sh_dstop
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+};
+
+/** Instruction token followed by one source register. */
+struct sh_srcop
+{
+   struct sh_op op;
+   struct sh_srcreg src;
+};
+
+/** Instruction token followed by two source registers. */
+struct sh_src2op
+{
+   struct sh_op op;
+   struct sh_srcreg src0;
+   struct sh_srcreg src1;
+};
+
+/** Instruction token followed by a destination and one source register. */
+struct sh_unaryop
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg src;
+};
+
+/** Instruction token followed by a destination and two source registers. */
+struct sh_binaryop
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg src0;
+   struct sh_srcreg src1;
+};
+
+/** Instruction token followed by a destination and three source registers. */
+struct sh_trinaryop
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg src0;
+   struct sh_srcreg src1;
+   struct sh_srcreg src2;
+};
+
+/**
+ * Comment token: a 16-bit opcode and a 16-bit size.
+ * NOTE(review): the unit of size is not visible in this header -- confirm.
+ */
+struct sh_comment
+{
+   unsigned opcode:16;
+   unsigned size:16;
+};
+
+#endif /* ST_SHADER_SVGA_H */
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_dump.c b/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
new file mode 100644
index 0000000000..4ee1bf2c35
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
@@ -0,0 +1,722 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Dump Facilities
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#include "svga_shader.h"
+#include "svga_shader_dump.h"
+#include "svga_shader_op.h"
+#include "util/u_debug.h"
+
+#include "../svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+/**
+ * State shared by the dump routines while walking one shader.
+ */
+struct dump_info
+{
+   uint32 version;   /* version token, i.e. the first dword of the shader */
+   boolean is_ps;    /* TRUE when the upper 16 bits of version are 0xFFFF */
+   int indent;       /* current nesting level; two spaces are printed per level */
+};
+
+/* Maximum number of source operands parse_op() can store per instruction. */
+#define DUMP_MAX_OP_SRC 4
+
+/**
+ * One decoded instruction, as filled in by parse_op().
+ *
+ * The *ind members hold the extra relative-addressing token that may follow
+ * a register token; parse_op() only reads them from the stream when the
+ * corresponding register has its `relative' bit set and the shader version
+ * allows it.
+ */
+struct dump_op
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg dstind;                   /* index register for dst */
+   struct sh_srcreg src[DUMP_MAX_OP_SRC];
+   struct sh_srcreg srcind[DUMP_MAX_OP_SRC];  /* index register for src[i] */
+   struct sh_srcreg p0;                       /* predicate token, read when op.predicated */
+};
+
+/**
+ * Print two spaces for every indentation level.
+ * A zero or negative level prints nothing.
+ */
+static void
+dump_indent(int indent)
+{
+   int remaining = indent;
+
+   while (remaining > 0) {
+      _debug_printf("  ");
+      --remaining;
+   }
+}
+
+/**
+ * Print an instruction mnemonic together with its decorations:
+ * a "(p0) " prefix for predicated instructions, a "+" prefix for
+ * co-issued ones, and a suffix derived from the control field for
+ * TEX ("p"/"b") and for IFC/BREAKC/SETP (comparison such as "_gt").
+ * Every other opcode is expected to carry a zero control field.
+ */
+static void dump_op( struct sh_op op, const char *mnemonic )
+{
+   const char *suffix = NULL;
+
+   assert( op.is_reg == 0 );
+
+   if (op.predicated)
+      _debug_printf("(p0) ");
+   if (op.coissue) {
+      _debug_printf( "+" );
+   }
+   _debug_printf( "%s", mnemonic );
+
+   if (op.opcode == SVGA3DOP_TEX) {
+      /* Texture lookup flavour. */
+      switch (op.control) {
+      case 0:
+         break;
+      case 1 /* PROJECT */:
+         suffix = "p";
+         break;
+      case 2 /* BIAS */:
+         suffix = "b";
+         break;
+      default:
+         assert(0);
+      }
+   } else if (op.opcode == SVGA3DOP_IFC ||
+              op.opcode == SVGA3DOP_BREAKC ||
+              op.opcode == SVGA3DOP_SETP) {
+      /* Comparison function. */
+      switch (op.control) {
+      case SVGA3DOPCOMP_GT:
+         suffix = "_gt";
+         break;
+      case SVGA3DOPCOMP_EQ:
+         suffix = "_eq";
+         break;
+      case SVGA3DOPCOMP_GE:
+         suffix = "_ge";
+         break;
+      case SVGA3DOPCOMP_LT:
+         suffix = "_lt";
+         break;
+      case SVGA3DOPCOMPC_NE:
+         suffix = "_ne";
+         break;
+      case SVGA3DOPCOMP_LE:
+         suffix = "_le";
+         break;
+      default:
+         assert(0);
+      }
+   } else {
+      assert(op.control == 0);
+   }
+
+   if (suffix) {
+      _debug_printf("%s", suffix);
+   }
+}
+
+/**
+ * Print a register as <name><number>, or in relative-addressing form
+ * when the register's `relative' bit is set.
+ *
+ * \param name    register file prefix, e.g. "r" or "c"
+ * \param reg     register token to print
+ * \param indreg  index register used for relative addressing; may be NULL
+ *                when the caller has none
+ *
+ * A relative register uses "aL" as the index when the index register is the
+ * loop register, and "a<n>.x" otherwise.  Many callers pass a NULL indreg;
+ * if such a register nevertheless has the `relative' bit set (malformed
+ * input) the index is printed as "???" instead of dereferencing NULL in
+ * builds where assert() is compiled out.
+ */
+static void
+format_reg(const char *name,
+           const struct sh_reg reg,
+           const struct sh_srcreg *indreg)
+{
+   if (!reg.relative) {
+      _debug_printf("%s%u", name, reg.number);
+      return;
+   }
+
+   assert(indreg);
+   if (!indreg) {
+      _debug_printf("%s[???+%u]", name, reg.number);
+      return;
+   }
+
+   if (sh_srcreg_type(*indreg) == SVGA3DREG_LOOP) {
+      _debug_printf("%s[aL+%u]", name, reg.number);
+   } else {
+      _debug_printf("%s[a%u.x+%u]", name, indreg->number, reg.number);
+   }
+}
+
+/**
+ * Print the assembly name of a register.
+ *
+ * Register files that are printed as <prefix><number> select a prefix (and
+ * whether the index register is forwarded) and share a single format_reg()
+ * call at the end.  Registers with fixed names (oPos, oDepth, aL, vPos, ...)
+ * are printed directly and return early.
+ *
+ * \param reg     register token; must have is_reg set
+ * \param indreg  index register for relative addressing, forwarded only for
+ *                input, constant and VS 3.0+ output registers
+ * \param di      shader kind and version, used to disambiguate register
+ *                types that share an encoding
+ */
+static void dump_reg( struct sh_reg reg, struct sh_srcreg *indreg, const struct dump_info *di )
+{
+   const char *prefix = NULL;
+   struct sh_srcreg *ind = NULL;
+
+   assert( reg.is_reg == 1 );
+
+   switch (sh_reg_type( reg )) {
+   case SVGA3DREG_TEMP:
+      prefix = "r";
+      break;
+
+   case SVGA3DREG_INPUT:
+      prefix = "v";
+      ind = indreg;
+      break;
+
+   case SVGA3DREG_CONST:
+      prefix = "c";
+      ind = indreg;
+      break;
+
+   case SVGA3DREG_ADDR:    /* VS */
+   /* SVGA3DREG_TEXTURE */ /* PS */
+      assert(!reg.relative);
+      prefix = di->is_ps ? "t" : "a";
+      break;
+
+   case SVGA3DREG_RASTOUT:
+      assert(!reg.relative);
+      switch (reg.number) {
+      case 0 /*POSITION*/:
+         _debug_printf( "oPos" );
+         break;
+      case 1 /*FOG*/:
+         _debug_printf( "oFog" );
+         break;
+      case 2 /*POINT_SIZE*/:
+         _debug_printf( "oPts" );
+         break;
+      default:
+         assert( 0 );
+         _debug_printf( "???" );
+      }
+      return;
+
+   case SVGA3DREG_ATTROUT:
+      assert( reg.number < 2 );
+      prefix = "oD";
+      break;
+
+   case SVGA3DREG_TEXCRDOUT:  /* VS */
+   /* SVGA3DREG_OUTPUT */     /* VS3.0+ */
+      if (!di->is_ps && di->version >= SVGA3D_VS_30) {
+         prefix = "o";
+         ind = indreg;
+      } else {
+         prefix = "oT";
+      }
+      break;
+
+   case SVGA3DREG_COLOROUT:
+      prefix = "oC";
+      break;
+
+   case SVGA3DREG_DEPTHOUT:
+      assert(!reg.relative);
+      assert(reg.number == 0);
+      _debug_printf("oDepth");
+      return;
+
+   case SVGA3DREG_SAMPLER:
+      prefix = "s";
+      break;
+
+   case SVGA3DREG_CONSTBOOL:
+      prefix = "b";
+      break;
+
+   case SVGA3DREG_CONSTINT:
+      prefix = "i";
+      break;
+
+   case SVGA3DREG_LOOP:
+      assert(!reg.relative);
+      assert( reg.number == 0 );
+      _debug_printf( "aL" );
+      return;
+
+   case SVGA3DREG_MISCTYPE:
+      assert(!reg.relative);
+      switch (reg.number) {
+      case SVGA3DMISCREG_POSITION:
+         _debug_printf("vPos");
+         break;
+      case SVGA3DMISCREG_FACE:
+         _debug_printf("vFace");
+         break;
+      default:
+         assert(0);
+         _debug_printf("???");
+      }
+      return;
+
+   case SVGA3DREG_LABEL:
+      prefix = "l";
+      break;
+
+   case SVGA3DREG_PREDICATE:
+      prefix = "p";
+      break;
+
+   default:
+      assert( 0 );
+      _debug_printf( "???" );
+      return;
+   }
+
+   format_reg(prefix, reg, ind);
+}
+
+/**
+ * Print the four components of a constant definition, comma separated,
+ * using the %f conversion.
+ */
+static void
+dump_cdata( struct sh_cdata cdata )
+{
+   _debug_printf( "%f, %f, %f, %f",
+                  cdata.xyzw[0],
+                  cdata.xyzw[1],
+                  cdata.xyzw[2],
+                  cdata.xyzw[3] );
+}
+
+/**
+ * Print the four components of an integer constant definition, comma
+ * separated, using the %d conversion.
+ */
+static void
+dump_idata( struct sh_idata idata )
+{
+   _debug_printf( "%d, %d, %d, %d",
+                  idata.xyzw[0],
+                  idata.xyzw[1],
+                  idata.xyzw[2],
+                  idata.xyzw[3] );
+}
+
+/**
+ * Print a boolean constant as "TRUE" or "FALSE".
+ *
+ * The text is passed as a "%s" argument rather than as the format string
+ * itself, so the call always has a literal format.
+ */
+static void dump_bdata( boolean bdata )
+{
+   _debug_printf( "%s", bdata ? "TRUE" : "FALSE" );
+}
+
+/**
+ * Print the texture-type suffix of a sampler declaration
+ * ("_2d", "_cube" or "_volume").  Unknown texture types trip an
+ * assertion and print nothing.
+ */
+static void
+dump_sampleinfo(struct sh_sampleinfo sampleinfo)
+{
+   const char *suffix = NULL;
+
+   assert( sampleinfo.is_reg == 1 );
+
+   switch (sampleinfo.texture_type) {
+   case SVGA3DSAMP_2D:
+      suffix = "_2d";
+      break;
+   case SVGA3DSAMP_CUBE:
+      suffix = "_cube";
+      break;
+   case SVGA3DSAMP_VOLUME:
+      suffix = "_volume";
+      break;
+   default:
+      assert( 0 );
+   }
+
+   if (suffix) {
+      _debug_printf( "%s", suffix );
+   }
+}
+
+/**
+ * Print the semantic suffix of a declaration, e.g. "_texcoord3".
+ *
+ * \param usage        SVGA3D_DECLUSAGE_* value; unknown values trip an
+ *                     assertion and print "_???"
+ * \param usage_index  appended as a decimal number when non-zero
+ */
+static void
+dump_semantic(uint usage,
+              uint usage_index)
+{
+   const char *name;
+
+   switch (usage) {
+   case SVGA3D_DECLUSAGE_POSITION:
+      name = "_position";
+      break;
+   case SVGA3D_DECLUSAGE_BLENDWEIGHT:
+      name = "_blendweight";
+      break;
+   case SVGA3D_DECLUSAGE_BLENDINDICES:
+      name = "_blendindices";
+      break;
+   case SVGA3D_DECLUSAGE_NORMAL:
+      name = "_normal";
+      break;
+   case SVGA3D_DECLUSAGE_PSIZE:
+      name = "_psize";
+      break;
+   case SVGA3D_DECLUSAGE_TEXCOORD:
+      name = "_texcoord";
+      break;
+   case SVGA3D_DECLUSAGE_TANGENT:
+      name = "_tangent";
+      break;
+   case SVGA3D_DECLUSAGE_BINORMAL:
+      name = "_binormal";
+      break;
+   case SVGA3D_DECLUSAGE_TESSFACTOR:
+      name = "_tessfactor";
+      break;
+   case SVGA3D_DECLUSAGE_POSITIONT:
+      name = "_positiont";
+      break;
+   case SVGA3D_DECLUSAGE_COLOR:
+      name = "_color";
+      break;
+   case SVGA3D_DECLUSAGE_FOG:
+      name = "_fog";
+      break;
+   case SVGA3D_DECLUSAGE_DEPTH:
+      name = "_depth";
+      break;
+   case SVGA3D_DECLUSAGE_SAMPLE:
+      name = "_sample";
+      break;
+   default:
+      assert(!"Unknown usage");
+      name = "_???";
+   }
+
+   _debug_printf("%s", name);
+
+   if (usage_index != 0) {
+      _debug_printf("%u", usage_index);
+   }
+}
+
+/**
+ * Print a destination operand: instruction modifiers ("_sat", "_pp"),
+ * the result scale ("_x2" ... "_d2"), a space, the register name and,
+ * unless all four components are written, a ".xyzw"-style write mask.
+ *
+ * \param dstreg  destination register token
+ * \param indreg  index register for relative addressing (may be NULL)
+ * \param di      shader kind and version
+ */
+static void
+dump_dstreg(struct sh_dstreg dstreg,
+            struct sh_srcreg *indreg,
+            const struct dump_info *di)
+{
+   /* Write-mask bits paired with the component letter they select. */
+   static const struct {
+      unsigned bit;
+      char letter;
+   } components[] = {
+      { SVGA3DWRITEMASK_0, 'x' },
+      { SVGA3DWRITEMASK_1, 'y' },
+      { SVGA3DWRITEMASK_2, 'z' },
+      { SVGA3DWRITEMASK_3, 'w' }
+   };
+   /* Reinterprets the destination token as a generic register token. */
+   union {
+      struct sh_reg reg;
+      struct sh_dstreg dstreg;
+   } pun;
+   const char *scale = NULL;
+   unsigned c;
+
+   memset(&pun, 0, sizeof(pun));
+
+   assert( (dstreg.modifier & (SVGA3DDSTMOD_SATURATE | SVGA3DDSTMOD_PARTIALPRECISION)) == dstreg.modifier );
+
+   if (dstreg.modifier & SVGA3DDSTMOD_SATURATE) {
+      _debug_printf( "_sat" );
+   }
+   if (dstreg.modifier & SVGA3DDSTMOD_PARTIALPRECISION) {
+      _debug_printf( "_pp" );
+   }
+
+   switch (dstreg.shift_scale) {
+   case 0:
+      break;
+   case 1:
+      scale = "_x2";
+      break;
+   case 2:
+      scale = "_x4";
+      break;
+   case 3:
+      scale = "_x8";
+      break;
+   case 13:
+      scale = "_d8";
+      break;
+   case 14:
+      scale = "_d4";
+      break;
+   case 15:
+      scale = "_d2";
+      break;
+   default:
+      assert( 0 );
+   }
+   if (scale) {
+      _debug_printf( "%s", scale );
+   }
+   _debug_printf( " " );
+
+   pun.dstreg = dstreg;
+   dump_reg( pun.reg, indreg, di);
+
+   if (dstreg.write_mask == SVGA3DWRITEMASK_ALL) {
+      return;
+   }
+
+   _debug_printf( "." );
+   for (c = 0; c < sizeof(components) / sizeof(components[0]); ++c) {
+      if (dstreg.write_mask & components[c].bit) {
+         _debug_printf( "%c", components[c].letter );
+      }
+   }
+}
+
+/**
+ * Print a source operand: modifier prefix, register name, modifier suffix
+ * and swizzle.
+ *
+ * - Negating modifiers print a leading "-", COMP prints "1-", NOT prints "!".
+ * - BIAS/SIGN/X2/DZ/DW/ABS (and their negated forms) print a trailing
+ *   "_bias", "_bx2", "_x2", "_dz", "_dw" or "_abs".
+ * - The swizzle is omitted when it is the identity (.xyzw), printed as a
+ *   single letter when all four selectors are equal, and as four letters
+ *   otherwise.
+ *
+ * \param srcreg  source register token
+ * \param indreg  index register for relative addressing (may be NULL)
+ * \param di      shader kind and version
+ */
+static void dump_srcreg( struct sh_srcreg srcreg, struct sh_srcreg *indreg, const struct dump_info *di )
+{
+   /* Reinterpret the source token as a generic register token through a
+    * union, like dump_dstreg() does, instead of casting the address of
+    * srcreg to an unrelated struct pointer.
+    */
+   union {
+      struct sh_reg reg;
+      struct sh_srcreg srcreg;
+   } u;
+
+   memset(&u, 0, sizeof(u));
+
+   switch (srcreg.modifier) {
+   case SVGA3DSRCMOD_NEG:
+   case SVGA3DSRCMOD_BIASNEG:
+   case SVGA3DSRCMOD_SIGNNEG:
+   case SVGA3DSRCMOD_X2NEG:
+   case SVGA3DSRCMOD_ABSNEG:
+      _debug_printf( "-" );
+      break;
+   case SVGA3DSRCMOD_COMP:
+      _debug_printf( "1-" );
+      break;
+   case SVGA3DSRCMOD_NOT:
+      _debug_printf( "!" );
+   }
+
+   u.srcreg = srcreg;
+   dump_reg( u.reg, indreg, di );
+
+   switch (srcreg.modifier) {
+   case SVGA3DSRCMOD_NONE:
+   case SVGA3DSRCMOD_NEG:
+   case SVGA3DSRCMOD_COMP:
+   case SVGA3DSRCMOD_NOT:
+      break;
+   case SVGA3DSRCMOD_BIAS:
+   case SVGA3DSRCMOD_BIASNEG:
+      _debug_printf( "_bias" );
+      break;
+   case SVGA3DSRCMOD_SIGN:
+   case SVGA3DSRCMOD_SIGNNEG:
+      _debug_printf( "_bx2" );
+      break;
+   case SVGA3DSRCMOD_X2:
+   case SVGA3DSRCMOD_X2NEG:
+      _debug_printf( "_x2" );
+      break;
+   case SVGA3DSRCMOD_DZ:
+      _debug_printf( "_dz" );
+      break;
+   case SVGA3DSRCMOD_DW:
+      _debug_printf( "_dw" );
+      break;
+   case SVGA3DSRCMOD_ABS:
+   case SVGA3DSRCMOD_ABSNEG:
+      _debug_printf("_abs");
+      break;
+   default:
+      assert( 0 );
+   }
+   if (srcreg.swizzle_x != 0 || srcreg.swizzle_y != 1 || srcreg.swizzle_z != 2 || srcreg.swizzle_w != 3) {
+      _debug_printf( "." );
+      if (srcreg.swizzle_x == srcreg.swizzle_y && srcreg.swizzle_y == srcreg.swizzle_z && srcreg.swizzle_z == srcreg.swizzle_w) {
+         /* Replicate swizzle: one letter is enough. */
+         _debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
+      }
+      else {
+         _debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
+         _debug_printf( "%c", "xyzw"[srcreg.swizzle_y] );
+         _debug_printf( "%c", "xyzw"[srcreg.swizzle_z] );
+         _debug_printf( "%c", "xyzw"[srcreg.swizzle_w] );
+      }
+   }
+}
+
+/**
+ * Decode one instruction from the token stream into *op and advance *token
+ * past it.
+ *
+ * Tokens are consumed in this order:
+ *   1. the opcode token;
+ *   2. if num_dst is 1, the destination token, followed by its index token
+ *      when the destination is relative and the shader is VS 3.0 or later;
+ *   3. the predicate token, when the opcode is predicated;
+ *   4. num_src source tokens, each followed by its index token when the
+ *      source is relative and the shader is VS 2.0+ or PS 3.0+.
+ *
+ * \param di       shader kind and version
+ * \param token    in/out cursor into the token stream
+ * \param op       receives the decoded instruction
+ * \param num_dst  number of destination operands (0 or 1)
+ * \param num_src  number of source operands (at most DUMP_MAX_OP_SRC)
+ *
+ * *op is zeroed first so that every member that is not present in the
+ * stream (index registers, predicate, unused sources) has a defined value.
+ * Callers hand the index-register slots to the printing code, which reads
+ * them whenever a register's `relative' bit is set, even for shader
+ * versions where no index token was consumed here.
+ */
+static void
+parse_op(struct dump_info *di,
+         const uint **token,
+         struct dump_op *op,
+         uint num_dst,
+         uint num_src)
+{
+   uint i;
+
+   assert(num_dst <= 1);
+   assert(num_src <= DUMP_MAX_OP_SRC);
+
+   memset(op, 0, sizeof(*op));
+
+   op->op = *(const struct sh_op *)*token;
+   *token += sizeof(struct sh_op) / sizeof(uint);
+
+   if (num_dst >= 1) {
+      op->dst = *(const struct sh_dstreg *)*token;
+      *token += sizeof(struct sh_dstreg) / sizeof(uint);
+      if (op->dst.relative &&
+          (!di->is_ps && di->version >= SVGA3D_VS_30)) {
+         op->dstind = *(const struct sh_srcreg *)*token;
+         *token += sizeof(struct sh_srcreg) / sizeof(uint);
+      }
+   }
+
+   if (op->op.predicated) {
+      op->p0 = *(const struct sh_srcreg *)*token;
+      *token += sizeof(struct sh_srcreg) / sizeof(uint);
+   }
+
+   for (i = 0; i < num_src; ++i) {
+      op->src[i] = *(const struct sh_srcreg *)*token;
+      *token += sizeof(struct sh_srcreg) / sizeof(uint);
+      if (op->src[i].relative &&
+          ((!di->is_ps && di->version >= SVGA3D_VS_20) ||
+          (di->is_ps && di->version >= SVGA3D_PS_30))) {
+         op->srcind[i] = *(const struct sh_srcreg *)*token;
+         *token += sizeof(struct sh_srcreg) / sizeof(uint);
+      }
+   }
+}
+
+/**
+ * Print one generic instruction on its own line and advance *assem past it.
+ *
+ * The line is indented according to di->indent, which is decreased by
+ * info->pre_dedent before printing and increased by info->post_indent
+ * afterwards.  Operands are printed destination first; the first source
+ * is preceded by " " when there is no destination and by ", " otherwise.
+ *
+ * \param di     dump state (indentation, shader kind and version)
+ * \param assem  in/out cursor into the token stream
+ * \param op     the instruction's opcode token
+ * \param info   operand counts, mnemonic and indentation hints for op
+ */
+static void
+dump_inst(struct dump_info *di,
+          const unsigned **assem,
+          struct sh_op op,
+          const struct sh_opcode_info *info)
+{
+   struct dump_op parsed;
+   const char *separator = " ";
+   uint src;
+
+   assert(info->num_dst <= 1);
+
+   di->indent -= info->pre_dedent;
+   dump_indent(di->indent);
+   di->indent += info->post_indent;
+
+   dump_op(op, info->mnemonic);
+
+   parse_op(di, assem, &parsed, info->num_dst, info->num_src);
+
+   if (info->num_dst > 0) {
+      dump_dstreg(parsed.dst, &parsed.dstind, di);
+      separator = ", ";
+   }
+
+   for (src = 0; src < info->num_src; src++) {
+      _debug_printf("%s", separator);
+      dump_srcreg(parsed.src[src], &parsed.srcind[src], di);
+      separator = ", ";
+   }
+
+   _debug_printf("\n");
+}
+
+/**
+ * Print a disassembly of an SVGA3D shader through _debug_printf().
+ *
+ * The first dword is the version token; the shader is a pixel shader when
+ * its upper 16 bits are 0xFFFF, a vertex shader otherwise.  Instructions are
+ * then decoded one after another until SVGA3DOP_END is found.
+ *
+ * DCL, DEF, DEFI, DEFB, PHASE and COMMENT have dedicated layouts.  TEXCOORD,
+ * TEX and SINCOS use the generic path with a source count (and for TEX,
+ * a mnemonic) adjusted according to the shader version.  Everything else is
+ * looked up with svga_opcode_info() and printed by dump_inst().
+ *
+ * If an opcode has no usable table entry its length is unknown, so the dump
+ * reports the opcode and stops instead of dereferencing a NULL info pointer.
+ *
+ * \param assem      token stream, starting with the version token
+ * \param dwords     currently not used by this function
+ * \param do_binary  currently not used by this function
+ */
+void
+svga_shader_dump(
+   const unsigned *assem,
+   unsigned dwords,
+   unsigned do_binary )
+{
+   boolean finished = FALSE;
+   struct dump_info di;
+
+   di.version = *assem++;
+   di.is_ps = (di.version & 0xFFFF0000) == 0xFFFF0000;
+   di.indent = 0;
+
+   _debug_printf(
+      "%s_%u_%u\n",
+      di.is_ps ? "ps" : "vs",
+      (di.version >> 8) & 0xff,
+      di.version & 0xff );
+
+   while (!finished) {
+      struct sh_op op = *(struct sh_op *) assem;
+
+      switch (op.opcode) {
+      case SVGA3DOP_DCL:
+         {
+            struct sh_dcl dcl = *(struct sh_dcl *) assem;
+
+            _debug_printf( "dcl" );
+            switch (sh_dstreg_type(dcl.reg)) {
+            case SVGA3DREG_INPUT:
+               if ((di.is_ps && di.version >= SVGA3D_PS_30) ||
+                   (!di.is_ps && di.version >= SVGA3D_VS_30)) {
+                  dump_semantic(dcl.u.semantic.usage,
+                                dcl.u.semantic.usage_index);
+               }
+               break;
+            case SVGA3DREG_TEXCRDOUT:
+               if (!di.is_ps && di.version >= SVGA3D_VS_30) {
+                  dump_semantic(dcl.u.semantic.usage,
+                                dcl.u.semantic.usage_index);
+               }
+               break;
+            case SVGA3DREG_SAMPLER:
+               dump_sampleinfo( dcl.u.sampleinfo );
+               break;
+            }
+            dump_dstreg(dcl.reg, NULL, &di);
+            _debug_printf( "\n" );
+            assem += sizeof( struct sh_dcl ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_DEFB:
+         {
+            struct sh_defb defb = *(struct sh_defb *) assem;
+
+            _debug_printf( "defb " );
+            dump_reg( defb.reg, NULL, &di );
+            _debug_printf( ", " );
+            dump_bdata( defb.data );
+            _debug_printf( "\n" );
+            assem += sizeof( struct sh_defb ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_DEFI:
+         {
+            struct sh_defi defi = *(struct sh_defi *) assem;
+
+            _debug_printf( "defi " );
+            dump_reg( defi.reg, NULL, &di );
+            _debug_printf( ", " );
+            dump_idata( defi.idata );
+            _debug_printf( "\n" );
+            assem += sizeof( struct sh_defi ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_TEXCOORD:
+         {
+            /* Work on a copy so the shared table entry is not modified. */
+            struct sh_opcode_info info = *svga_opcode_info(op.opcode);
+
+            assert(di.is_ps);
+            if (di.version > SVGA3D_PS_13) {
+               assert(info.num_src == 0);
+
+               info.num_src = 1;
+            }
+
+            dump_inst(&di, &assem, op, &info);
+         }
+         break;
+
+      case SVGA3DOP_TEX:
+         {
+            /* Work on a copy so the shared table entry is not modified. */
+            struct sh_opcode_info info = *svga_opcode_info(op.opcode);
+
+            assert(di.is_ps);
+            if (di.version > SVGA3D_PS_13) {
+               assert(info.num_src == 0);
+
+               if (di.version > SVGA3D_PS_14) {
+                  info.num_src = 2;
+                  info.mnemonic = "texld";
+               } else {
+                  info.num_src = 1;
+               }
+            }
+
+            dump_inst(&di, &assem, op, &info);
+         }
+         break;
+
+      case SVGA3DOP_DEF:
+         {
+            struct sh_def def = *(struct sh_def *) assem;
+
+            _debug_printf( "def " );
+            dump_reg( def.reg, NULL, &di );
+            _debug_printf( ", " );
+            dump_cdata( def.cdata );
+            _debug_printf( "\n" );
+            assem += sizeof( struct sh_def ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_SINCOS:
+         {
+            /* Work on a copy so the shared table entry is not modified. */
+            struct sh_opcode_info info = *svga_opcode_info(op.opcode);
+
+            if ((di.is_ps && di.version >= SVGA3D_PS_30) ||
+                (!di.is_ps && di.version >= SVGA3D_VS_30)) {
+               assert(info.num_src == 3);
+
+               info.num_src = 1;
+            }
+
+            dump_inst(&di, &assem, op, &info);
+         }
+         break;
+
+      case SVGA3DOP_PHASE:
+         _debug_printf( "phase\n" );
+         assem += sizeof( struct sh_op ) / sizeof( unsigned );
+         break;
+
+      case SVGA3DOP_COMMENT:
+         {
+            struct sh_comment comment = *(struct sh_comment *)assem;
+
+            /* Ignore comment contents. */
+            assem += sizeof(struct sh_comment) / sizeof(unsigned) + comment.size;
+         }
+         break;
+
+      case SVGA3DOP_END:
+         finished = TRUE;
+         break;
+
+      default:
+         {
+            const struct sh_opcode_info *info = svga_opcode_info(op.opcode);
+
+            if (!info) {
+               /* No operand counts are known for this opcode, so the
+                * instruction cannot be skipped reliably.  Stop here.
+                */
+               _debug_printf( "??? (unknown opcode %u)\n", (unsigned) op.opcode );
+               finished = TRUE;
+            } else {
+               dump_inst(&di, &assem, op, info);
+            }
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_dump.h b/src/gallium/drivers/svga/svgadump/svga_shader_dump.h
new file mode 100644
index 0000000000..a2657acb2f
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_dump.h
@@ -0,0 +1,42 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Dump Facilities
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#ifndef SVGA_SHADER_DUMP_H
+#define SVGA_SHADER_DUMP_H
+
+/**
+ * Print a disassembly of an SVGA3D shader token stream through
+ * _debug_printf().
+ *
+ * \param assem      token stream; the first dword is the version token and
+ *                   decoding continues until an SVGA3DOP_END opcode
+ * \param dwords     not referenced by the implementation in
+ *                   svga_shader_dump.c
+ * \param do_binary  not referenced by the implementation in
+ *                   svga_shader_dump.c
+ */
+void
+svga_shader_dump(
+   const unsigned *assem,
+   unsigned dwords,
+   unsigned do_binary );
+
+#endif /* SVGA_SHADER_DUMP_H */
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_op.c b/src/gallium/drivers/svga/svgadump/svga_shader_op.c
new file mode 100644
index 0000000000..95612a8006
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_op.c
@@ -0,0 +1,168 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Token Opcode Info
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#include "util/u_debug.h"
+#include "svga_shader_op.h"
+
+#include "../svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+/* Marker stored in table entries that carry no usable information;
+ * svga_opcode_info() returns NULL for such entries.
+ */
+#define SVGA3DOP_INVALID SVGA3DOP_END
+/* NOTE(review): not referenced by the code visible in this file. */
+#define TGSI_OPCODE_INVALID TGSI_OPCODE_LAST
+
+/**
+ * Per-opcode information, indexed by SVGA3D opcode value
+ * (svga_opcode_info() checks that entry N has svga_opcode == N).
+ *
+ * Columns, in struct sh_opcode_info field order:
+ *   mnemonic, num_dst, num_src, pre_dedent, post_indent, svga_opcode
+ */
+static struct sh_opcode_info opcode_info[] =
+{
+   { "nop",          0, 0, 0, 0, SVGA3DOP_NOP          },
+   { "mov",          1, 1, 0, 0, SVGA3DOP_MOV,         },
+   { "add",          1, 2, 0, 0, SVGA3DOP_ADD,         },
+   { "sub",          1, 2, 0, 0, SVGA3DOP_SUB,         },
+   { "mad",          1, 3, 0, 0, SVGA3DOP_MAD,         },
+   { "mul",          1, 2, 0, 0, SVGA3DOP_MUL,         },
+   { "rcp",          1, 1, 0, 0, SVGA3DOP_RCP,         },
+   { "rsq",          1, 1, 0, 0, SVGA3DOP_RSQ,         },
+   { "dp3",          1, 2, 0, 0, SVGA3DOP_DP3,         },
+   { "dp4",          1, 2, 0, 0, SVGA3DOP_DP4,         },
+   { "min",          1, 2, 0, 0, SVGA3DOP_MIN,         },
+   { "max",          1, 2, 0, 0, SVGA3DOP_MAX,         },
+   { "slt",          1, 2, 0, 0, SVGA3DOP_SLT,         },
+   { "sge",          1, 2, 0, 0, SVGA3DOP_SGE,         },
+   { "exp",          1, 1, 0, 0, SVGA3DOP_EXP,         },
+   { "log",          1, 1, 0, 0, SVGA3DOP_LOG,         },
+   { "lit",          1, 1, 0, 0, SVGA3DOP_LIT,         },
+   { "dst",          1, 2, 0, 0, SVGA3DOP_DST,         },
+   { "lrp",          1, 3, 0, 0, SVGA3DOP_LRP,         },
+   { "frc",          1, 1, 0, 0, SVGA3DOP_FRC,         },
+   { "m4x4",         1, 2, 0, 0, SVGA3DOP_M4x4,        },
+   { "m4x3",         1, 2, 0, 0, SVGA3DOP_M4x3,        },
+   { "m3x4",         1, 2, 0, 0, SVGA3DOP_M3x4,        },
+   { "m3x3",         1, 2, 0, 0, SVGA3DOP_M3x3,        },
+   { "m3x2",         1, 2, 0, 0, SVGA3DOP_M3x2,        },
+   { "call",         0, 1, 0, 0, SVGA3DOP_CALL,        },
+   { "callnz",       0, 2, 0, 0, SVGA3DOP_CALLNZ,      },
+   { "loop",         0, 2, 0, 1, SVGA3DOP_LOOP,        },
+   { "ret",          0, 0, 0, 0, SVGA3DOP_RET,         },
+   { "endloop",      0, 0, 1, 0, SVGA3DOP_ENDLOOP,     },
+   { "label",        0, 1, 0, 0, SVGA3DOP_LABEL,       },
+   { "dcl",          0, 0, 0, 0, SVGA3DOP_DCL,         },
+   { "pow",          1, 2, 0, 0, SVGA3DOP_POW,         },
+   { "crs",          1, 2, 0, 0, SVGA3DOP_CRS,         },
+   { "sgn",          1, 3, 0, 0, SVGA3DOP_SGN,         },
+   { "abs",          1, 1, 0, 0, SVGA3DOP_ABS,         },
+   { "nrm",          1, 1, 0, 0, SVGA3DOP_NRM,         }, /* 3-component normalization */
+   { "sincos",       1, 3, 0, 0, SVGA3DOP_SINCOS,      },
+   { "rep",          0, 1, 0, 1, SVGA3DOP_REP,         },
+   { "endrep",       0, 0, 1, 0, SVGA3DOP_ENDREP,      },
+   { "if",           0, 1, 0, 1, SVGA3DOP_IF,          },
+   { "ifc",          0, 2, 0, 1, SVGA3DOP_IFC,         },
+   { "else",         0, 0, 1, 1, SVGA3DOP_ELSE,        },
+   { "endif",        0, 0, 1, 0, SVGA3DOP_ENDIF,       },
+   { "break",        0, 0, 0, 0, SVGA3DOP_BREAK,       },
+   { "breakc",       0, 2, 0, 0, SVGA3DOP_BREAKC,      },
+   { "mova",         1, 1, 0, 0, SVGA3DOP_MOVA,        },
+   { "defb",         0, 0, 0, 0, SVGA3DOP_DEFB,        },
+   { "defi",         0, 0, 0, 0, SVGA3DOP_DEFI,        },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, 0, 0, SVGA3DOP_INVALID,     },
+   { "texcoord",     1, 0, 0, 0, SVGA3DOP_TEXCOORD,    },
+   { "texkill",      1, 0, 0, 0, SVGA3DOP_TEXKILL,     },
+   { "tex",          1, 0, 0, 0, SVGA3DOP_TEX,         },
+   { "texbem",       1, 1, 0, 0, SVGA3DOP_TEXBEM,      },
+   { "texbeml",      1, 1, 0, 0, SVGA3DOP_TEXBEML,     },
+   { "texreg2ar",    1, 1, 0, 0, SVGA3DOP_TEXREG2AR,   },
+   { "texreg2gb",    1, 1, 0, 0, SVGA3DOP_TEXREG2GB,   },
+   { "texm3x2pad",   1, 1, 0, 0, SVGA3DOP_TEXM3x2PAD,  },
+   { "texm3x2tex",   1, 1, 0, 0, SVGA3DOP_TEXM3x2TEX,  },
+   { "texm3x3pad",   1, 1, 0, 0, SVGA3DOP_TEXM3x3PAD,  },
+   { "texm3x3tex",   1, 1, 0, 0, SVGA3DOP_TEXM3x3TEX,  },
+   { "reserved0",    0, 0, 0, 0, SVGA3DOP_RESERVED0,   },
+   { "texm3x3spec",  1, 2, 0, 0, SVGA3DOP_TEXM3x3SPEC, },
+   { "texm3x3vspec", 1, 1, 0, 0, SVGA3DOP_TEXM3x3VSPEC,},
+   { "expp",         1, 1, 0, 0, SVGA3DOP_EXPP,        },
+   { "logp",         1, 1, 0, 0, SVGA3DOP_LOGP,        },
+   { "cnd",          1, 3, 0, 0, SVGA3DOP_CND,         },
+   { "def",          0, 0, 0, 0, SVGA3DOP_DEF,         },
+   { "texreg2rgb",   1, 1, 0, 0, SVGA3DOP_TEXREG2RGB,  },
+   { "texdp3tex",    1, 1, 0, 0, SVGA3DOP_TEXDP3TEX,   },
+   { "texm3x2depth", 1, 1, 0, 0, SVGA3DOP_TEXM3x2DEPTH,},
+   { "texdp3",       1, 1, 0, 0, SVGA3DOP_TEXDP3,      },
+   { "texm3x3",      1, 1, 0, 0, SVGA3DOP_TEXM3x3,     },
+   { "texdepth",     1, 0, 0, 0, SVGA3DOP_TEXDEPTH,    },
+   { "cmp",          1, 3, 0, 0, SVGA3DOP_CMP,         },
+   { "bem",          1, 2, 0, 0, SVGA3DOP_BEM,         },
+   { "dp2add",       1, 3, 0, 0, SVGA3DOP_DP2ADD,      },
+   { "dsx",          1, 1, 0, 0, SVGA3DOP_INVALID,     },
+   { "dsy",          1, 1, 0, 0, SVGA3DOP_INVALID,     },
+   { "texldd",       1, 4, 0, 0, SVGA3DOP_INVALID,     },
+   { "setp",         1, 2, 0, 0, SVGA3DOP_SETP,        },
+   { "texldl",       1, 2, 0, 0, SVGA3DOP_INVALID,     },
+   { "breakp",       0, 1, 0, 0, SVGA3DOP_INVALID,     },
+};
+
+/**
+ * Look up the table entry describing an SVGA3D opcode.
+ *
+ * \param op  opcode value, used directly as the table index
+ * \return the entry, or NULL (after tripping an assertion) when the opcode
+ *         lies beyond the table or its entry is marked SVGA3DOP_INVALID
+ */
+const struct sh_opcode_info *svga_opcode_info( uint op )
+{
+   const uint num_entries = sizeof( opcode_info ) / sizeof( opcode_info[0] );
+   const struct sh_opcode_info *entry;
+
+   /* PHASE, COMMENT, END and anything else past the table end up here. */
+   if (op >= num_entries) {
+      assert( 0 );
+      return NULL;
+   }
+
+   entry = opcode_info + op;
+
+   /* Placeholder entry: the number of dst/src registers is not known. */
+   if (entry->svga_opcode == SVGA3DOP_INVALID) {
+      assert( 0 );
+      return NULL;
+   }
+
+   /* The table must be ordered by opcode value. */
+   assert( op == entry->svga_opcode );
+
+   return entry;
+}
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_op.h b/src/gallium/drivers/svga/svgadump/svga_shader_op.h
new file mode 100644
index 0000000000..a5ccae5ae5
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_op.h
@@ -0,0 +1,48 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Token Opcode Info
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#ifndef SVGA_SHADER_OP_H
+#define SVGA_SHADER_OP_H
+
+/**
+ * Static description of one shader opcode, used by the shader dumper.
+ */
+struct sh_opcode_info
+{
+   const char *mnemonic;     /* assembly name of the instruction */
+   unsigned num_dst:8;       /* number of destination operands */
+   unsigned num_src:8;       /* number of source operands */
+   unsigned pre_dedent:1;    /* indentation levels removed before printing */
+   unsigned post_indent:1;   /* indentation levels added after printing */
+   unsigned svga_opcode:16;  /* SVGA3DOP_* value this entry describes */
+};
+
+/**
+ * Return the description of opcode `op', or NULL when the implementation in
+ * svga_shader_op.c has no valid entry for it.
+ */
+const struct sh_opcode_info *svga_opcode_info( unsigned op );
+
+#endif /* SVGA_SHADER_OP_H */
diff --git a/src/gallium/drivers/sw/Makefile b/src/gallium/drivers/sw/Makefile
new file mode 100644
index 0000000000..2713a62ee9
--- /dev/null
+++ b/src/gallium/drivers/sw/Makefile
@@ -0,0 +1,10 @@
+# Meta-driver which combines whichever software rasterizers have been
+# built into a single convenience library.
+
+# Relative path from this directory to the top of the source tree.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+# Source list; presumably consumed by the template included below.
+# NOTE(review): no LIBNAME is assigned in this file -- confirm whether
+# ../../Makefile.template needs one to name the resulting library.
+C_SOURCES = \
+	sw.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/sw/SConscript b/src/gallium/drivers/sw/SConscript
new file mode 100644
index 0000000000..e9ebf751dd
--- /dev/null
+++ b/src/gallium/drivers/sw/SConscript
@@ -0,0 +1,38 @@
+#######################################################################
+# SConscript for swrast convenience library
+#
+# This is a meta-driver which consists of any and all of the software
+# rasterizers into a single driver.  A software rasterizer is defined
+# as any driver which takes an sw_winsys pointer as the only argument
+# to create_screen.
+
+Import('*')
+
+env = env.Clone()
+
+# To avoid targets having to check extensively or add drivers on a whim, append
+# all referenced extra drivers to the exported symbol.
+extra = []
+if True:
+    env.Append(CPPDEFINES = 'GALLIUM_SOFTPIPE')
+    env.Prepend(LIBS = [softpipe])
+    extra.append(softpipe)
+
+if env['llvm']:
+    env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE')
+    env.Tool('udis86')
+    env.Prepend(LIBS = [llvmpipe])
+    extra.append(llvmpipe)
+
+if 'cell' in env['drivers']:
+    env.Append(CPPDEFINES = 'GALLIUM_CELL')
+    env.Prepend(LIBS = [cell])
+    extra.append(cell)
+
+sw = env.ConvenienceLibrary(
+	target = 'sw',
+	source = [
+		'sw.c',
+		]
+    ) + extra
+Export('sw')
diff --git a/src/gallium/drivers/sw/sw.c b/src/gallium/drivers/sw/sw.c
new file mode 100644
index 0000000000..6b873ecc1b
--- /dev/null
+++ b/src/gallium/drivers/sw/sw.c
@@ -0,0 +1,58 @@
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "target-helpers/wrap_screen.h"
+#include "sw_public.h"
+
+
+/* Helper function to choose and instantiate one of the software rasterizers:
+ * cell, llvmpipe, softpipe.
+ */
+
+#ifdef GALLIUM_SOFTPIPE
+#include "softpipe/sp_public.h"
+#endif
+
+#ifdef GALLIUM_LLVMPIPE
+#include "llvmpipe/lp_public.h"
+#endif
+
+#ifdef GALLIUM_CELL
+#include "cell/ppu/cell_public.h"
+#endif
+
+/**
+ * Create a software-rasterizer screen on top of the given winsys.
+ *
+ * The driver name is read from the GALLIUM_DRIVER debug option; when it is
+ * not set, the default is the first compiled-in driver in the order
+ * cell, llvmpipe, softpipe.  If the selected driver is not compiled in, or
+ * fails to create a screen, softpipe (when compiled in) is used as the
+ * fallback.
+ *
+ * \param winsys  software winsys handed to the chosen driver
+ * \return the new screen, or NULL if no driver produced one
+ */
+struct pipe_screen *
+swrast_create_screen(struct sw_winsys *winsys)
+{
+   struct pipe_screen *screen = NULL;
+   const char *fallback_name;
+   const char *requested;
+
+   /* Pick the name used when GALLIUM_DRIVER is not set. */
+#if defined(GALLIUM_CELL)
+   fallback_name = "cell";
+#elif defined(GALLIUM_LLVMPIPE)
+   fallback_name = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   fallback_name = "softpipe";
+#else
+   fallback_name = "";
+#endif
+
+   requested = debug_get_option("GALLIUM_DRIVER", fallback_name);
+
+#if defined(GALLIUM_CELL)
+   if (!screen && !strcmp(requested, "cell"))
+      screen = cell_create_screen(winsys);
+#endif
+
+#if defined(GALLIUM_LLVMPIPE)
+   if (!screen && !strcmp(requested, "llvmpipe"))
+      screen = llvmpipe_create_screen(winsys);
+#endif
+
+   /* softpipe catches every remaining case, whatever name was requested. */
+#if defined(GALLIUM_SOFTPIPE)
+   if (!screen)
+      screen = softpipe_create_screen(winsys);
+#endif
+
+   return screen;
+}
diff --git a/src/gallium/drivers/sw/sw_public.h b/src/gallium/drivers/sw/sw_public.h
new file mode 100644
index 0000000000..7085c5c85a
--- /dev/null
+++ b/src/gallium/drivers/sw/sw_public.h
@@ -0,0 +1,13 @@
+#ifndef SW_PUBLIC_H
+#define SW_PUBLIC_H
+
+/* A convenience library, primarily to isolate the logic required to
+ * figure out which if any software rasterizers have been built and
+ * select between them.
+ */
+struct sw_winsys;
+
+/*
+ * Create a screen using one of the software rasterizers that were compiled
+ * in.  The GALLIUM_DRIVER debug option selects the driver by name.  Returns
+ * NULL when no driver could create a screen.
+ */
+struct pipe_screen *
+swrast_create_screen(struct sw_winsys *winsys);
+
+#endif
diff --git a/src/gallium/drivers/trace/Makefile b/src/gallium/drivers/trace/Makefile
new file mode 100644
index 0000000000..1b0c087a2a
--- /dev/null
+++ b/src/gallium/drivers/trace/Makefile
@@ -0,0 +1,14 @@
+# Makefile for the Gallium trace pipe driver library.
+
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = trace
+
+# Same set of sources as before, listed in the order the SConscript uses.
+C_SOURCES = \
+	tr_context.c \
+	tr_drm.c \
+	tr_dump.c \
+	tr_dump_state.c \
+	tr_screen.c \
+	tr_texture.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README
new file mode 100644
index 0000000000..cdcd8d2b4b
--- /dev/null
+++ b/src/gallium/drivers/trace/README
@@ -0,0 +1,64 @@
+                             TRACE PIPE DRIVER
+
+
+= About =
+
+This directory contains a Gallium3D trace debugger pipe driver.
+It can trace all incoming calls.
+
+
+= Build Instructions =
+
+To build, invoke scons on the top dir as
+ 
+ scons dri=no statetrackers=mesa winsys=xlib
+
+
+= Usage =
+
+To use do
+
+ export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib
+
+ensure the right libGL.so is being picked by doing
+
+ ldd progs/trivial/tri 
+
+== Tracing ==
+
+For tracing then do
+
+ GALLIUM_TRACE=tri.trace progs/trivial/tri
+
+which should create a tri.trace file, which is an XML file. You can view it by
+copying trace.xsl to the same directory and opening it with an XSLT-capable
+browser such as Firefox or Internet Explorer.
+
+== Remote debugging ==
+
+For remote debugging see:
+
+  src/gallium/drivers/rbug/README
+
+= Integrating =
+
+You can integrate the trace pipe driver either inside the state tracker or the
+target. The procedure in both cases is the same. Let's assume you have a
+pipe_screen obtained by the usual means (variable and function names are just
+for illustration purposes):
+
+  real_screen = real_screen_create(...);
+  
+The trace screen is then created by doing
+
+  trace_screen = trace_screen_create(real_screen);
+
+You can then simply use trace_screen instead of real_screen.
+
+You can create as many contexts as you wish from trace_screen::context_create;
+they are automatically wrapped by trace_screen.
+
+
+--
+Jose Fonseca <jrfonseca@tungstengraphics.com>
+Jakob Bornecrantz <jakob@vmware.com>
diff --git a/src/gallium/drivers/trace/SConscript b/src/gallium/drivers/trace/SConscript
new file mode 100644
index 0000000000..0dc43a9ec4
--- /dev/null
+++ b/src/gallium/drivers/trace/SConscript
@@ -0,0 +1,16 @@
+Import('*')
+
+env = env.Clone()
+
+trace = env.ConvenienceLibrary(
+    target = 'trace',
+    source = [
+        'tr_context.c',
+        'tr_drm.c',
+        'tr_dump.c',
+        'tr_dump_state.c',
+        'tr_screen.c',
+        'tr_texture.c',
+    ])
+
+Export('trace')
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
new file mode 100644
index 0000000000..55dd6cf883
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -0,0 +1,1500 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "pipe/p_format.h"
+#include "pipe/p_screen.h"
+
+#include "tr_dump.h"
+#include "tr_dump_state.h"
+#include "tr_public.h"
+#include "tr_screen.h"
+#include "tr_texture.h"
+#include "tr_context.h"
+
+
+
+
+
+/**
+ * Return the resource stored inside a trace resource wrapper.
+ *
+ * A NULL input is passed through as NULL.  tr_ctx is not used.
+ */
+static INLINE struct pipe_resource *
+trace_resource_unwrap(struct trace_context *tr_ctx,
+                     struct pipe_resource *resource)
+{
+   struct pipe_resource *unwrapped;
+
+   if (resource == NULL)
+      return NULL;
+
+   unwrapped = trace_resource(resource)->resource;
+   assert(unwrapped);
+
+   return unwrapped;
+}
+
+
+/**
+ * Return the surface stored inside a trace surface wrapper.
+ *
+ * NULL is passed through as NULL.  A surface without a texture is returned
+ * unchanged (after tripping an assert in debug builds).
+ */
+static INLINE struct pipe_surface *
+trace_surface_unwrap(struct trace_context *tr_ctx,
+                     struct pipe_surface *surface)
+{
+   struct trace_screen *tr_scr = trace_screen(tr_ctx->base.screen);
+   struct pipe_surface *unwrapped;
+
+   if (surface == NULL)
+      return NULL;
+
+   assert(surface->texture);
+   if (surface->texture == NULL)
+      return surface;
+
+   unwrapped = trace_surface(surface)->surface;
+
+   assert(unwrapped);
+   /* The inner surface must belong to the screen wrapped by our screen. */
+   assert(unwrapped->texture->screen == tr_scr->screen);
+   (void) tr_scr; /* only referenced by the assert above */
+
+   return unwrapped;
+}
+
+
+/**
+ * Traced pipe_context::draw_arrays.
+ *
+ * Dumps the call and its arguments, then forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_draw_arrays(struct pipe_context *_pipe,
+                          unsigned mode, unsigned start, unsigned count)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "draw_arrays");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, mode);
+   trace_dump_arg(uint, start);
+   trace_dump_arg(uint, count);
+
+   pipe->draw_arrays(pipe, mode, start, count);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::draw_elements.
+ *
+ * The index buffer is unwrapped before being dumped and handed to the
+ * wrapped context.
+ */
+static INLINE void
+trace_context_draw_elements(struct pipe_context *_pipe,
+                            struct pipe_resource *_indexBuffer,
+                            unsigned indexSize, int indexBias,
+                            unsigned mode, unsigned start, unsigned count)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   struct pipe_resource *indexBuffer = trace_resource(_indexBuffer)->resource;
+
+   trace_dump_call_begin("pipe_context", "draw_elements");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, indexBuffer);
+   trace_dump_arg(uint, indexSize);
+   trace_dump_arg(int, indexBias);
+   trace_dump_arg(uint, mode);
+   trace_dump_arg(uint, start);
+   trace_dump_arg(uint, count);
+
+   pipe->draw_elements(pipe, indexBuffer, indexSize, indexBias,
+                       mode, start, count);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::draw_range_elements.
+ *
+ * The index buffer is unwrapped before being dumped and handed to the
+ * wrapped context.
+ */
+static INLINE void
+trace_context_draw_range_elements(struct pipe_context *_pipe,
+                                  struct pipe_resource *_indexBuffer,
+                                  unsigned indexSize,
+                                  int indexBias,
+                                  unsigned minIndex,
+                                  unsigned maxIndex,
+                                  unsigned mode,
+                                  unsigned start,
+                                  unsigned count)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   struct pipe_resource *indexBuffer = trace_resource(_indexBuffer)->resource;
+
+   trace_dump_call_begin("pipe_context", "draw_range_elements");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, indexBuffer);
+   trace_dump_arg(uint, indexSize);
+   trace_dump_arg(int, indexBias);
+   trace_dump_arg(uint, minIndex);
+   trace_dump_arg(uint, maxIndex);
+   trace_dump_arg(uint, mode);
+   trace_dump_arg(uint, start);
+   trace_dump_arg(uint, count);
+
+   pipe->draw_range_elements(pipe,
+                             indexBuffer, indexSize, indexBias,
+                             minIndex, maxIndex,
+                             mode, start, count);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_query.
+ *
+ * Dumps the arguments, forwards the call to the wrapped context and dumps
+ * the query pointer it returned.  The query is returned unwrapped.
+ */
+static INLINE struct pipe_query *
+trace_context_create_query(struct pipe_context *_pipe,
+                           unsigned query_type)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   struct pipe_query *result;
+
+   trace_dump_call_begin("pipe_context", "create_query");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, query_type);
+
+   result = pipe->create_query(pipe, query_type);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::destroy_query.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_destroy_query(struct pipe_context *_pipe,
+                            struct pipe_query *query)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "destroy_query");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, query);
+
+   pipe->destroy_query(pipe, query);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::begin_query.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_begin_query(struct pipe_context *_pipe,
+                          struct pipe_query *query)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "begin_query");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, query);
+
+   pipe->begin_query(pipe, query);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::end_query.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_end_query(struct pipe_context *_pipe,
+                        struct pipe_query *query)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "end_query");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, query);
+
+   pipe->end_query(pipe, query);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::get_query_result.
+ *
+ * Dumps the context and query pointers, forwards the call to the wrapped
+ * context, then dumps the 64-bit value found in presult together with the
+ * boolean returned by the driver.
+ *
+ * \param _pipe    trace context wrapper
+ * \param query    query whose result is requested
+ * \param wait     passed through to the wrapped context unchanged
+ * \param presult  result storage; read here as a uint64_t
+ * \return whatever the wrapped context returned
+ */
+static INLINE boolean
+trace_context_get_query_result(struct pipe_context *_pipe,
+                               struct pipe_query *query,
+                               boolean wait,
+                               void *presult)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   uint64_t result;
+   boolean _result;
+
+   trace_dump_call_begin("pipe_context", "get_query_result");
+
+   trace_dump_arg(ptr, pipe);
+   /* Dump the query as the other query wrappers do, so the trace shows
+    * which query this result belongs to. */
+   trace_dump_arg(ptr, query);
+
+   _result = pipe->get_query_result(pipe, query, wait, presult);
+
+   /* Only read the storage when the driver reported success; otherwise it
+    * may not have been written, so dump 0 rather than an indeterminate
+    * value. */
+   result = _result ? *((uint64_t*)presult) : 0;
+
+   trace_dump_arg(uint, result);
+   trace_dump_ret(bool, _result);
+
+   trace_dump_call_end();
+
+   return _result;
+}
+
+
+/**
+ * Traced pipe_context::create_blend_state.
+ *
+ * Dumps the blend state, forwards the call to the wrapped context and dumps
+ * the returned handle, which is passed back unchanged.
+ */
+static INLINE void *
+trace_context_create_blend_state(struct pipe_context *_pipe,
+                                 const struct pipe_blend_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   void *result;
+
+   trace_dump_call_begin("pipe_context", "create_blend_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(blend_state, state);
+
+   result = pipe->create_blend_state(pipe, state);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_blend_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_bind_blend_state(struct pipe_context *_pipe,
+                               void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_blend_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->bind_blend_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_blend_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_blend_state(struct pipe_context *_pipe,
+                                 void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_blend_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_blend_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_sampler_state.
+ *
+ * Dumps the sampler state, forwards the call to the wrapped context and
+ * dumps the returned handle, which is passed back unchanged.
+ */
+static INLINE void *
+trace_context_create_sampler_state(struct pipe_context *_pipe,
+                                   const struct pipe_sampler_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   void *result;
+
+   trace_dump_call_begin("pipe_context", "create_sampler_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(sampler_state, state);
+
+   result = pipe->create_sampler_state(pipe, state);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_fragment_sampler_states.
+ *
+ * Dumps the count and the array of state handles, then forwards the call to
+ * the wrapped context.
+ */
+static INLINE void
+trace_context_bind_fragment_sampler_states(struct pipe_context *_pipe,
+                                           unsigned num_states,
+                                           void **states)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_fragment_sampler_states");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, num_states);
+   trace_dump_arg_array(ptr, states, num_states);
+
+   pipe->bind_fragment_sampler_states(pipe, num_states, states);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::bind_vertex_sampler_states.
+ *
+ * Dumps the count and the array of state handles, then forwards the call to
+ * the wrapped context.
+ */
+static INLINE void
+trace_context_bind_vertex_sampler_states(struct pipe_context *_pipe,
+                                         unsigned num_states,
+                                         void **states)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_vertex_sampler_states");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, num_states);
+   trace_dump_arg_array(ptr, states, num_states);
+
+   pipe->bind_vertex_sampler_states(pipe, num_states, states);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_sampler_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_sampler_state(struct pipe_context *_pipe,
+                                   void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_sampler_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_sampler_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_rasterizer_state.
+ *
+ * Dumps the rasterizer state, forwards the call to the wrapped context and
+ * dumps the returned handle, which is passed back unchanged.
+ */
+static INLINE void *
+trace_context_create_rasterizer_state(struct pipe_context *_pipe,
+                                      const struct pipe_rasterizer_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   void *result;
+
+   trace_dump_call_begin("pipe_context", "create_rasterizer_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(rasterizer_state, state);
+
+   result = pipe->create_rasterizer_state(pipe, state);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_rasterizer_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_bind_rasterizer_state(struct pipe_context *_pipe,
+                                    void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_rasterizer_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->bind_rasterizer_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_rasterizer_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_rasterizer_state(struct pipe_context *_pipe,
+                                      void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_rasterizer_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_rasterizer_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_depth_stencil_alpha_state.
+ *
+ * Dumps the depth/stencil/alpha state, forwards the call to the wrapped
+ * context and dumps the returned handle, which is passed back unchanged.
+ *
+ * \param _pipe  trace context wrapper
+ * \param state  state description handed to the wrapped context
+ * \return the handle created by the wrapped context
+ */
+static INLINE void *
+trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                               const struct pipe_depth_stencil_alpha_state *state)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   void * result;
+
+   trace_dump_call_begin("pipe_context", "create_depth_stencil_alpha_state");
+
+   /* Dump the arguments before calling into the driver, as the other
+    * create_* wrappers do, so they are already recorded if the driver call
+    * never returns. */
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(depth_stencil_alpha_state, state);
+
+   result = pipe->create_depth_stencil_alpha_state(pipe, state);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_depth_stencil_alpha_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                             void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_depth_stencil_alpha_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->bind_depth_stencil_alpha_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_depth_stencil_alpha_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
+                                               void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_depth_stencil_alpha_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_depth_stencil_alpha_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_fs_state.
+ *
+ * Dumps the shader state, forwards the call to the wrapped context and dumps
+ * the returned handle, which is passed back unchanged.
+ */
+static INLINE void *
+trace_context_create_fs_state(struct pipe_context *_pipe,
+                              const struct pipe_shader_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   void *result;
+
+   trace_dump_call_begin("pipe_context", "create_fs_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(shader_state, state);
+
+   result = pipe->create_fs_state(pipe, state);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_fs_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_bind_fs_state(struct pipe_context *_pipe,
+                            void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_fs_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->bind_fs_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_fs_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_fs_state(struct pipe_context *_pipe,
+                              void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_fs_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_fs_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_vs_state.
+ *
+ * Dumps the shader state, forwards the call to the wrapped context and dumps
+ * the returned handle, which is passed back unchanged.
+ */
+static INLINE void *
+trace_context_create_vs_state(struct pipe_context *_pipe,
+                              const struct pipe_shader_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   void *result;
+
+   trace_dump_call_begin("pipe_context", "create_vs_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(shader_state, state);
+
+   result = pipe->create_vs_state(pipe, state);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_vs_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_bind_vs_state(struct pipe_context *_pipe,
+                            void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_vs_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->bind_vs_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_vs_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_vs_state(struct pipe_context *_pipe,
+                              void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_vs_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_vs_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_vertex_elements_state.
+ *
+ * Dumps the element count and the element array, forwards the call to the
+ * wrapped context and dumps the returned handle, which is passed back
+ * unchanged.
+ */
+static INLINE void *
+trace_context_create_vertex_elements_state(struct pipe_context *_pipe,
+                                           unsigned num_elements,
+                                           const struct  pipe_vertex_element *elements)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   void *result;
+
+   trace_dump_call_begin("pipe_context", "create_vertex_elements_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, num_elements);
+
+   /* The element array is dumped as an array of structs under the
+    * argument name "elements". */
+   trace_dump_arg_begin("elements");
+   trace_dump_struct_array(vertex_element, elements, num_elements);
+   trace_dump_arg_end();
+
+   result = pipe->create_vertex_elements_state(pipe, num_elements, elements);
+
+   trace_dump_ret(ptr, result);
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::bind_vertex_elements_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_bind_vertex_elements_state(struct pipe_context *_pipe,
+                                         void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "bind_vertex_elements_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->bind_vertex_elements_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::delete_vertex_elements_state.
+ *
+ * Dumps the call and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_delete_vertex_elements_state(struct pipe_context *_pipe,
+                                           void *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "delete_vertex_elements_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+
+   pipe->delete_vertex_elements_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_blend_color.
+ *
+ * Dumps the blend color and forwards the call to the wrapped context.
+ */
+static INLINE void
+trace_context_set_blend_color(struct pipe_context *_pipe,
+                              const struct pipe_blend_color *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_blend_color");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(blend_color, state);
+
+   pipe->set_blend_color(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_stencil_ref.
+ *
+ * Dumps the stencil reference state and forwards the call to the wrapped
+ * context.
+ */
+static INLINE void
+trace_context_set_stencil_ref(struct pipe_context *_pipe,
+                              const struct pipe_stencil_ref *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_stencil_ref");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(stencil_ref, state);
+
+   pipe->set_stencil_ref(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_clip_state.
+ *
+ * Dumps the clip state and forwards the call to the wrapped context.
+ */
+static INLINE void
+trace_context_set_clip_state(struct pipe_context *_pipe,
+                             const struct pipe_clip_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_clip_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(clip_state, state);
+
+   pipe->set_clip_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+/**
+ * Traced pipe_context::set_sample_mask.
+ *
+ * Dumps the mask and forwards the call to the wrapped context.
+ */
+static INLINE void
+trace_context_set_sample_mask(struct pipe_context *_pipe,
+                              unsigned sample_mask)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_sample_mask");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, sample_mask);
+
+   pipe->set_sample_mask(pipe, sample_mask);
+
+   trace_dump_call_end();
+}
+
+/**
+ * Traced pipe_context::set_constant_buffer.
+ *
+ * The buffer is unwrapped first, so the trace and the wrapped context both
+ * see the inner resource.  A NULL buffer stays NULL.
+ */
+static INLINE void
+trace_context_set_constant_buffer(struct pipe_context *_pipe,
+                                  uint shader, uint index,
+                                  struct pipe_resource *buffer)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   /* trace_resource_unwrap() returns NULL for a NULL input, so no separate
+    * NULL test is needed here. */
+   buffer = trace_resource_unwrap(tr_ctx, buffer);
+
+   trace_dump_call_begin("pipe_context", "set_constant_buffer");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, shader);
+   trace_dump_arg(uint, index);
+   trace_dump_arg(ptr, buffer);
+
+   pipe->set_constant_buffer(pipe, shader, index, buffer);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_framebuffer_state.
+ *
+ * Builds a local copy of the state in which every bound surface has been
+ * unwrapped and unused color buffer slots are NULL, then dumps that copy and
+ * hands it to the wrapped context.
+ */
+static INLINE void
+trace_context_set_framebuffer_state(struct pipe_context *_pipe,
+                                    const struct pipe_framebuffer_state *state)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   struct pipe_framebuffer_state unwrapped_state;
+   unsigned i;
+
+   /* Start from a copy of the caller's state, then replace the surfaces. */
+   unwrapped_state = *state;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
+      if (i < state->nr_cbufs)
+         unwrapped_state.cbufs[i] = trace_surface_unwrap(tr_ctx, state->cbufs[i]);
+      else
+         unwrapped_state.cbufs[i] = NULL;
+   }
+   unwrapped_state.zsbuf = trace_surface_unwrap(tr_ctx, state->zsbuf);
+
+   /* From here on only the unwrapped copy is used. */
+   state = &unwrapped_state;
+
+   trace_dump_call_begin("pipe_context", "set_framebuffer_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(framebuffer_state, state);
+
+   pipe->set_framebuffer_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_polygon_stipple.
+ *
+ * Dumps the stipple state and forwards the call to the wrapped context.
+ */
+static INLINE void
+trace_context_set_polygon_stipple(struct pipe_context *_pipe,
+                                  const struct pipe_poly_stipple *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_polygon_stipple");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(poly_stipple, state);
+
+   pipe->set_polygon_stipple(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_scissor_state.
+ *
+ * Dumps the scissor state and forwards the call to the wrapped context.
+ */
+static INLINE void
+trace_context_set_scissor_state(struct pipe_context *_pipe,
+                                const struct pipe_scissor_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_scissor_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(scissor_state, state);
+
+   pipe->set_scissor_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::set_viewport_state.
+ *
+ * Dumps the viewport state and forwards the call to the wrapped context.
+ */
+static INLINE void
+trace_context_set_viewport_state(struct pipe_context *_pipe,
+                                 const struct pipe_viewport_state *state)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_viewport_state");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(viewport_state, state);
+
+   pipe->set_viewport_state(pipe, state);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Traced pipe_context::create_sampler_view.
+ *
+ * Creates a sampler view on the wrapped context for the unwrapped resource,
+ * dumps the call, and returns a trace_sampler_view wrapper around it.  The
+ * wrapper's base is a copy of the template with a reference count of 1, a
+ * reference on the trace resource (_resource) and _pipe as its context.
+ *
+ * \param _pipe      trace context wrapper
+ * \param _resource  trace resource wrapper the view refers to
+ * \param templ      template describing the view
+ * \return the wrapped view, or NULL if the driver could not create a view
+ *         or the wrapper could not be allocated
+ */
+static struct pipe_sampler_view *
+trace_create_sampler_view(struct pipe_context *_pipe,
+                          struct pipe_resource *_resource,
+                          const struct pipe_sampler_view *templ)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct trace_resource *tr_tex = trace_resource(_resource);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   struct pipe_resource *texture = tr_tex->resource;
+   struct pipe_sampler_view *result;
+   struct trace_sampler_view *tr_view;
+
+   trace_dump_call_begin("pipe_context", "create_sampler_view");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, texture);
+   trace_dump_arg(sampler_view_template, templ);
+
+   result = pipe->create_sampler_view(pipe, texture, templ);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+
+   /* Do not wrap a failed creation: a wrapper around NULL would look like
+    * a valid view to the caller. */
+   if (!result)
+      return NULL;
+
+   /*
+    * Wrap pipe_sampler_view
+    */
+   tr_view = CALLOC_STRUCT(trace_sampler_view);
+   if (!tr_view) {
+      /* Out of memory: release the driver's view (this destroy is not
+       * recorded in the trace) and report failure. */
+      pipe->sampler_view_destroy(pipe, result);
+      return NULL;
+   }
+
+   tr_view->base = *templ;
+   tr_view->base.reference.count = 1;
+   /* Clear the copied pointer first so that taking the reference below does
+    * not drop a reference on whatever the template pointed at. */
+   tr_view->base.texture = NULL;
+   pipe_resource_reference(&tr_view->base.texture, _resource);
+   tr_view->base.context = _pipe;
+   tr_view->sampler_view = result;
+   result = &tr_view->base;
+
+   return result;
+}
+
+
+/**
+ * Traced pipe_context::sampler_view_destroy.
+ *
+ * Destroys the inner view on the wrapped context, then drops the wrapper's
+ * resource reference and frees the wrapper itself.
+ */
+static void
+trace_sampler_view_destroy(struct pipe_context *_pipe,
+                           struct pipe_sampler_view *_view)
+{
+   struct pipe_context *pipe = trace_context(_pipe)->pipe;
+   struct pipe_sampler_view *view = trace_sampler_view(_view)->sampler_view;
+
+   trace_dump_call_begin("pipe_context", "sampler_view_destroy");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, view);
+
+   pipe->sampler_view_destroy(pipe, view);
+
+   trace_dump_call_end();
+
+   /* Release what the wrapper itself owns. */
+   pipe_resource_reference(&_view->texture, NULL);
+   FREE(_view);
+}
+
+
+/**
+ * Trace wrapper for pipe_context::set_fragment_sampler_views.
+ *
+ * Replaces each trace_sampler_view wrapper in the views array by the view
+ * it wraps (NULL entries stay NULL), records the call with the unwrapped
+ * pointers, and forwards it to the wrapped context.
+ *
+ * num must not exceed PIPE_MAX_SAMPLERS; larger values are rejected by an
+ * assertion and clamped so the on-stack array is never overrun.
+ */
+static INLINE void
+trace_context_set_fragment_sampler_views(struct pipe_context *_pipe,
+                                         unsigned num,
+                                         struct pipe_sampler_view **views)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct trace_sampler_view *tr_view;
+   struct pipe_context *pipe = tr_ctx->pipe;
+   struct pipe_sampler_view *unwrapped_views[PIPE_MAX_SAMPLERS];
+   unsigned i;
+
+   /* unwrapped_views is a fixed-size stack array: never index past it. */
+   assert(num <= PIPE_MAX_SAMPLERS);
+   if (num > PIPE_MAX_SAMPLERS)
+      num = PIPE_MAX_SAMPLERS;
+
+   for(i = 0; i < num; ++i) {
+      tr_view = trace_sampler_view(views[i]);
+      unwrapped_views[i] = tr_view ? tr_view->sampler_view : NULL;
+   }
+   /* From here on, both the dump and the driver see the unwrapped views. */
+   views = unwrapped_views;
+
+   trace_dump_call_begin("pipe_context", "set_fragment_sampler_views");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, num);
+   trace_dump_arg_array(ptr, views, num);
+
+   pipe->set_fragment_sampler_views(pipe, num, views);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Trace wrapper for pipe_context::set_vertex_sampler_views.
+ *
+ * Replaces each trace_sampler_view wrapper in the views array by the view
+ * it wraps (NULL entries stay NULL), records the call with the unwrapped
+ * pointers, and forwards it to the wrapped context.
+ *
+ * num must not exceed PIPE_MAX_VERTEX_SAMPLERS; larger values are rejected
+ * by an assertion and clamped so the on-stack array is never overrun.
+ */
+static INLINE void
+trace_context_set_vertex_sampler_views(struct pipe_context *_pipe,
+                                       unsigned num,
+                                       struct pipe_sampler_view **views)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct trace_sampler_view *tr_view;
+   struct pipe_context *pipe = tr_ctx->pipe;
+   struct pipe_sampler_view *unwrapped_views[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned i;
+
+   /* unwrapped_views is a fixed-size stack array: never index past it. */
+   assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+   if (num > PIPE_MAX_VERTEX_SAMPLERS)
+      num = PIPE_MAX_VERTEX_SAMPLERS;
+
+   for(i = 0; i < num; ++i) {
+      tr_view = trace_sampler_view(views[i]);
+      unwrapped_views[i] = tr_view ? tr_view->sampler_view : NULL;
+   }
+   /* From here on, both the dump and the driver see the unwrapped views. */
+   views = unwrapped_views;
+
+   trace_dump_call_begin("pipe_context", "set_vertex_sampler_views");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, num);
+   trace_dump_arg_array(ptr, views, num);
+
+   pipe->set_vertex_sampler_views(pipe, num, views);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Trace wrapper for pipe_context::set_vertex_buffers.
+ *
+ * Records the call with the caller's buffer descriptions, then forwards it
+ * to the wrapped context using a temporary copy of the array in which every
+ * buffer pointer has been unwrapped. With num_buffers == 0 the wrapped
+ * context is called with a NULL array.
+ *
+ * If the temporary copy cannot be allocated the call is still recorded but
+ * is not forwarded, because the wrapped context must never be handed the
+ * trace-side resource pointers.
+ */
+static INLINE void
+trace_context_set_vertex_buffers(struct pipe_context *_pipe,
+                                 unsigned num_buffers,
+                                 const struct pipe_vertex_buffer *buffers)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   unsigned i;
+
+   trace_dump_call_begin("pipe_context", "set_vertex_buffers");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, num_buffers);
+
+   trace_dump_arg_begin("buffers");
+   trace_dump_struct_array(vertex_buffer, buffers, num_buffers);
+   trace_dump_arg_end();
+
+   if (num_buffers) {
+      struct pipe_vertex_buffer *_buffers = MALLOC(num_buffers * sizeof(*_buffers));
+      /* MALLOC may fail; do not memcpy into / index a NULL pointer. */
+      if (_buffers) {
+         memcpy(_buffers, buffers, num_buffers * sizeof(*_buffers));
+         for (i = 0; i < num_buffers; i++)
+            _buffers[i].buffer = trace_resource_unwrap(tr_ctx, buffers[i].buffer);
+         pipe->set_vertex_buffers(pipe, num_buffers, _buffers);
+         FREE(_buffers);
+      }
+   } else {
+      pipe->set_vertex_buffers(pipe, num_buffers, NULL);
+   }
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Trace wrapper for pipe_context::resource_copy_region.
+ *
+ * Unwraps the destination and source resources, records the call with all
+ * of its arguments, and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_resource_copy_region(struct pipe_context *_pipe,
+                                   struct pipe_resource *dst,
+                                   struct pipe_subresource subdst,
+                                   unsigned dstx, unsigned dsty, unsigned dstz,
+                                   struct pipe_resource *src,
+                                   struct pipe_subresource subsrc,
+                                   unsigned srcx, unsigned srcy, unsigned srcz,
+                                   unsigned width, unsigned height)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   /* Both the dump and the wrapped context use the unwrapped resources. */
+   dst = trace_resource_unwrap(tr_ctx, dst);
+   src = trace_resource_unwrap(tr_ctx, src);
+
+   trace_dump_call_begin("pipe_context", "resource_copy_region");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, dst);
+   trace_dump_arg_struct(subresource, subdst);
+   trace_dump_arg(uint, dstx);
+   trace_dump_arg(uint, dsty);
+   trace_dump_arg(uint, dstz);
+   trace_dump_arg(ptr, src);
+   trace_dump_arg_struct(subresource, subsrc);
+   trace_dump_arg(uint, srcx);
+   trace_dump_arg(uint, srcy);
+   trace_dump_arg(uint, srcz);
+   trace_dump_arg(uint, width);
+   trace_dump_arg(uint, height);
+
+   pipe->resource_copy_region(pipe,
+                              dst, subdst, dstx, dsty, dstz,
+                              src, subsrc, srcx, srcy, srcz, width, height);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Trace wrapper for pipe_context::clear.
+ *
+ * Records a "clear" call (rgba is dumped as an array of four floats) and
+ * forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_clear(struct pipe_context *_pipe,
+                    unsigned buffers,
+                    const float *rgba,
+                    double depth,
+                    unsigned stencil)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   trace_dump_call_begin("pipe_context", "clear");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, buffers);
+   trace_dump_arg_array(float, rgba, 4);
+   /* depth is a double but is dumped with the float dumper. */
+   trace_dump_arg(float, depth);
+   trace_dump_arg(uint, stencil);
+
+   pipe->clear(pipe, buffers, rgba, depth, stencil);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Trace wrapper for pipe_context::clear_render_target.
+ *
+ * Unwraps the destination surface, records the call (rgba is dumped as an
+ * array of four floats), and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_clear_render_target(struct pipe_context *_pipe,
+                                  struct pipe_surface *dst,
+                                  const float *rgba,
+                                  unsigned dstx, unsigned dsty,
+                                  unsigned width, unsigned height)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   /* Both the dump and the wrapped context use the unwrapped surface. */
+   dst = trace_surface_unwrap(tr_ctx, dst);
+
+   trace_dump_call_begin("pipe_context", "clear_render_target");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, dst);
+   trace_dump_arg_array(float, rgba, 4);
+   trace_dump_arg(uint, dstx);
+   trace_dump_arg(uint, dsty);
+   trace_dump_arg(uint, width);
+   trace_dump_arg(uint, height);
+
+   pipe->clear_render_target(pipe, dst, rgba, dstx, dsty, width, height);
+
+   trace_dump_call_end();
+}
+
+/**
+ * Trace wrapper for pipe_context::clear_depth_stencil.
+ *
+ * Unwraps the destination surface, records the call with all of its
+ * arguments, and forwards it to the wrapped context.
+ */
+static INLINE void
+trace_context_clear_depth_stencil(struct pipe_context *_pipe,
+                                  struct pipe_surface *dst,
+                                  unsigned clear_flags,
+                                  double depth,
+                                  unsigned stencil,
+                                  unsigned dstx, unsigned dsty,
+                                  unsigned width, unsigned height)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   /* Both the dump and the wrapped context use the unwrapped surface. */
+   dst = trace_surface_unwrap(tr_ctx, dst);
+
+   trace_dump_call_begin("pipe_context", "clear_depth_stencil");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, dst);
+   trace_dump_arg(uint, clear_flags);
+   /* depth is a double but is dumped with the float dumper. */
+   trace_dump_arg(float, depth);
+   trace_dump_arg(uint, stencil);
+   trace_dump_arg(uint, dstx);
+   trace_dump_arg(uint, dsty);
+   trace_dump_arg(uint, width);
+   trace_dump_arg(uint, height);
+
+   pipe->clear_depth_stencil(pipe, dst, clear_flags, depth, stencil,
+                             dstx, dsty, width, height);
+
+   trace_dump_call_end();
+}
+
+/**
+ * Trace wrapper for pipe_context::flush.
+ *
+ * Records a "flush" call and forwards it to the wrapped context. When the
+ * caller supplied a fence pointer, the fence handle stored through it by the
+ * wrapped context is recorded as the call's return value.
+ */
+static INLINE void
+trace_context_flush(struct pipe_context *_pipe,
+                    unsigned flags,
+                    struct pipe_fence_handle **fence)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   trace_dump_call_begin("pipe_context", "flush");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(uint, flags);
+
+   pipe->flush(pipe, flags, fence);
+
+   /* fence is optional; only dereference it when it was provided. */
+   if(fence)
+      trace_dump_ret(ptr, *fence);
+
+   trace_dump_call_end();
+}
+
+
+/**
+ * Trace wrapper for pipe_context::destroy.
+ *
+ * Records a "destroy" call, destroys the wrapped context, and frees the
+ * trace_context wrapper.
+ */
+static INLINE void
+trace_context_destroy(struct pipe_context *_pipe)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   /* The call record is closed before the wrapped context is destroyed. */
+   trace_dump_call_begin("pipe_context", "destroy");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_call_end();
+
+   pipe->destroy(pipe);
+
+   FREE(tr_ctx);
+}
+
+/**
+ * Trace wrapper for pipe_context::is_resource_referenced.
+ *
+ * Records the call with the unwrapped resource, forwards it to the wrapped
+ * context, records the result as the return value, and returns it unchanged.
+ */
+static unsigned int
+trace_is_resource_referenced( struct pipe_context *_pipe,
+			      struct pipe_resource *_resource,
+			      unsigned face, unsigned level)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct trace_resource *tr_tex = trace_resource(_resource);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   struct pipe_resource *texture = tr_tex->resource;
+   unsigned int referenced;
+
+   trace_dump_call_begin("pipe_context", "is_resource_referenced");
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, texture);
+   trace_dump_arg(uint, face);
+   trace_dump_arg(uint, level);
+
+   referenced = pipe->is_resource_referenced(pipe, texture, face, level);
+
+   trace_dump_ret(uint, referenced);
+   trace_dump_call_end();
+
+   return referenced;
+}
+
+
+/********************************************************************
+ * transfer
+ */
+
+
+/**
+ * Wrapper for pipe_context::get_transfer.
+ *
+ * The call itself is not written to the trace: maps and transfers can't be
+ * serialized, so write transfers are turned into transfer_inline_write
+ * records elsewhere and read transfers are ignored.
+ *
+ * Returns the transfer obtained from the wrapped context, wrapped by
+ * trace_transfer_create(), or NULL if the wrapped context returned NULL.
+ */
+static struct pipe_transfer *
+trace_context_get_transfer(struct pipe_context *_context,
+                           struct pipe_resource *_resource,
+                           struct pipe_subresource sr,
+                           unsigned usage,
+                           const struct pipe_box *box)
+{
+   struct trace_context *tr_context = trace_context(_context);
+   struct trace_resource *tr_tex = trace_resource(_resource);
+   struct pipe_context *context = tr_context->pipe;
+   struct pipe_resource *texture = tr_tex->resource;
+   struct pipe_transfer *driver_transfer;
+
+   assert(texture->screen == context->screen);
+
+   driver_transfer = context->get_transfer(context, texture, sr, usage, box);
+   if (!driver_transfer)
+      return NULL;
+
+   return trace_transfer_create(tr_context, tr_tex, driver_transfer);
+}
+
+
+/**
+ * Wrapper for pipe_context::transfer_destroy: hands the wrapped transfer to
+ * trace_transfer_destroy() together with the owning trace context.
+ */
+static void
+trace_context_transfer_destroy(struct pipe_context *_context,
+                               struct pipe_transfer *_transfer)
+{
+   struct trace_context *owner = trace_context(_context);
+   struct trace_transfer *wrapper = trace_transfer(_transfer);
+
+   trace_transfer_destroy(owner, wrapper);
+}
+
+
+/**
+ * Wrapper for pipe_context::transfer_map.
+ *
+ * Maps the wrapped transfer on the wrapped context. When the mapping
+ * succeeds and the transfer has PIPE_TRANSFER_WRITE usage, the mapped
+ * pointer is remembered in the trace transfer so that
+ * trace_context_transfer_unmap() can dump the written data.
+ *
+ * Returns whatever the wrapped context returned.
+ */
+static void *
+trace_context_transfer_map(struct pipe_context *_context,
+                           struct pipe_transfer *_transfer)
+{
+   struct trace_context *tr_context = trace_context(_context);
+   struct trace_transfer *tr_trans = trace_transfer(_transfer);
+   struct pipe_context *context = tr_context->pipe;
+   struct pipe_transfer *transfer = tr_trans->transfer;
+   void *mapped = context->transfer_map(context, transfer);
+
+   if (mapped && (transfer->usage & PIPE_TRANSFER_WRITE)) {
+      assert(!tr_trans->map);
+      tr_trans->map = mapped;
+   }
+
+   return mapped;
+}
+
+
+/**
+ * Wrapper for pipe_context::transfer_flush_region: forwards the call to the
+ * wrapped context with the wrapped transfer. Nothing is written to the
+ * trace.
+ */
+static void
+trace_context_transfer_flush_region(struct pipe_context *_context,
+                                    struct pipe_transfer *_transfer,
+                                    const struct pipe_box *box)
+{
+   struct trace_context *tr_ctx = trace_context(_context);
+   struct trace_transfer *tr_trans = trace_transfer(_transfer);
+   struct pipe_context *driver_ctx = tr_ctx->pipe;
+   struct pipe_transfer *driver_transfer = tr_trans->transfer;
+
+   driver_ctx->transfer_flush_region(driver_ctx, driver_transfer, box);
+}
+
+/**
+ * Wrapper for pipe_context::transfer_unmap.
+ *
+ * If trace_context_transfer_map() remembered a mapping for this transfer
+ * (i.e. it was mapped for writing), the mapped bytes are dumped as a fake
+ * "transfer_inline_write" call before the transfer is unmapped on the
+ * wrapped context.
+ */
+static void
+trace_context_transfer_unmap(struct pipe_context *_context,
+			     struct pipe_transfer *_transfer)
+{
+   struct trace_context *tr_ctx = trace_context(_context);
+   struct trace_transfer *tr_trans = trace_transfer(_transfer);
+   struct pipe_context *context = tr_ctx->pipe;
+   struct pipe_transfer *transfer = tr_trans->transfer;
+
+   if(tr_trans->map) {
+      /*
+       * Fake a transfer_inline_write
+       */
+
+      struct pipe_resource *resource = transfer->resource;
+      struct pipe_subresource sr = transfer->sr;
+      unsigned usage = transfer->usage;
+      const struct pipe_box *box = &transfer->box;
+      unsigned stride = transfer->stride;
+      unsigned slice_stride = transfer->slice_stride;
+
+      trace_dump_call_begin("pipe_context", "transfer_inline_write");
+
+      trace_dump_arg(ptr, context);
+      trace_dump_arg(ptr, resource);
+      trace_dump_arg_struct(subresource, sr);
+      trace_dump_arg(uint, usage);
+      trace_dump_arg(box, box);
+
+      /* The data must be read while the mapping is still valid. */
+      trace_dump_arg_begin("data");
+      trace_dump_box_bytes(tr_trans->map,
+                           resource->format,
+                           box,
+                           stride,
+                           slice_stride);
+      trace_dump_arg_end();
+
+      trace_dump_arg(uint, stride);
+      trace_dump_arg(uint, slice_stride);
+
+      trace_dump_call_end();
+
+      /* Forget the mapping so a later map/unmap starts clean. */
+      tr_trans->map = NULL;
+   }
+
+   context->transfer_unmap(context, transfer);
+}
+
+
+/**
+ * Trace wrapper for pipe_context::transfer_inline_write.
+ *
+ * Records the call, including the bytes of the written box, and then
+ * forwards it to the wrapped context with the unwrapped resource.
+ */
+static void
+trace_context_transfer_inline_write(struct pipe_context *_context,
+				    struct pipe_resource *_resource,
+				    struct pipe_subresource sr,
+				    unsigned usage,
+				    const struct pipe_box *box,
+				    const void *data,
+				    unsigned stride,
+				    unsigned slice_stride)
+{
+   struct trace_context *tr_context = trace_context(_context);
+   struct trace_resource *tr_tex = trace_resource(_resource);
+   struct pipe_context *context = tr_context->pipe;
+   struct pipe_resource *resource = tr_tex->resource;
+
+   assert(resource->screen == context->screen);
+
+   trace_dump_call_begin("pipe_context", "transfer_inline_write");
+
+   trace_dump_arg(ptr, context);
+   trace_dump_arg(ptr, resource);
+   trace_dump_arg_struct(subresource, sr);
+   trace_dump_arg(uint, usage);
+   trace_dump_arg(box, box);
+
+   trace_dump_arg_begin("data");
+   trace_dump_box_bytes(data,
+                        resource->format,
+                        box,
+                        stride,
+                        slice_stride);
+   trace_dump_arg_end();
+
+   trace_dump_arg(uint, stride);
+   trace_dump_arg(uint, slice_stride);
+
+   trace_dump_call_end();
+
+   /* Unlike most wrappers here, the call record is closed before forwarding. */
+   context->transfer_inline_write(context, resource,
+				  sr, usage, box, data, stride, slice_stride);
+}
+
+
+
+
+/*
+ * Named flag values "before" (1) and "after" (2).
+ * NOTE(review): nothing between this definition and the end of the file
+ * refers to this table; it looks like a leftover from the rbug code --
+ * confirm and remove if unused.
+ */
+static const struct debug_named_value rbug_blocker_flags[] = {
+   {"before", 1, NULL},
+   {"after", 2, NULL},
+   DEBUG_NAMED_VALUE_END
+};
+
+/**
+ * Wrap a pipe_context in a tracing context.
+ *
+ * Allocates a trace_context whose entry points are the trace wrappers
+ * defined in this file, whose screen is the given trace screen and whose
+ * priv pointer mirrors the wrapped context's.
+ *
+ * @param tr_scr  trace screen the new context reports as its screen
+ * @param pipe    context to wrap
+ * @return the wrapping context, or pipe itself (unwrapped) when pipe is
+ *         NULL, tracing is not enabled, or the wrapper cannot be allocated
+ */
+struct pipe_context *
+trace_context_create(struct trace_screen *tr_scr,
+                     struct pipe_context *pipe)
+{
+   struct trace_context *tr_ctx;
+
+   if(!pipe)
+      goto error1;
+
+   if(!trace_enabled())
+      goto error1;
+
+   tr_ctx = CALLOC_STRUCT(trace_context);
+   if(!tr_ctx)
+      goto error1;
+
+   tr_ctx->base.winsys = NULL;
+   tr_ctx->base.priv = pipe->priv; /* expose wrapped priv data */
+   tr_ctx->base.screen = &tr_scr->base;
+
+   /* Install the wrapper for every context entry point. */
+   tr_ctx->base.destroy = trace_context_destroy;
+   tr_ctx->base.draw_arrays = trace_context_draw_arrays;
+   tr_ctx->base.draw_elements = trace_context_draw_elements;
+   tr_ctx->base.draw_range_elements = trace_context_draw_range_elements;
+   tr_ctx->base.create_query = trace_context_create_query;
+   tr_ctx->base.destroy_query = trace_context_destroy_query;
+   tr_ctx->base.begin_query = trace_context_begin_query;
+   tr_ctx->base.end_query = trace_context_end_query;
+   tr_ctx->base.get_query_result = trace_context_get_query_result;
+   tr_ctx->base.create_blend_state = trace_context_create_blend_state;
+   tr_ctx->base.bind_blend_state = trace_context_bind_blend_state;
+   tr_ctx->base.delete_blend_state = trace_context_delete_blend_state;
+   tr_ctx->base.create_sampler_state = trace_context_create_sampler_state;
+   tr_ctx->base.bind_fragment_sampler_states = trace_context_bind_fragment_sampler_states;
+   tr_ctx->base.bind_vertex_sampler_states = trace_context_bind_vertex_sampler_states;
+   tr_ctx->base.delete_sampler_state = trace_context_delete_sampler_state;
+   tr_ctx->base.create_rasterizer_state = trace_context_create_rasterizer_state;
+   tr_ctx->base.bind_rasterizer_state = trace_context_bind_rasterizer_state;
+   tr_ctx->base.delete_rasterizer_state = trace_context_delete_rasterizer_state;
+   tr_ctx->base.create_depth_stencil_alpha_state = trace_context_create_depth_stencil_alpha_state;
+   tr_ctx->base.bind_depth_stencil_alpha_state = trace_context_bind_depth_stencil_alpha_state;
+   tr_ctx->base.delete_depth_stencil_alpha_state = trace_context_delete_depth_stencil_alpha_state;
+   tr_ctx->base.create_fs_state = trace_context_create_fs_state;
+   tr_ctx->base.bind_fs_state = trace_context_bind_fs_state;
+   tr_ctx->base.delete_fs_state = trace_context_delete_fs_state;
+   tr_ctx->base.create_vs_state = trace_context_create_vs_state;
+   tr_ctx->base.bind_vs_state = trace_context_bind_vs_state;
+   tr_ctx->base.delete_vs_state = trace_context_delete_vs_state;
+   tr_ctx->base.create_vertex_elements_state = trace_context_create_vertex_elements_state;
+   tr_ctx->base.bind_vertex_elements_state = trace_context_bind_vertex_elements_state;
+   tr_ctx->base.delete_vertex_elements_state = trace_context_delete_vertex_elements_state;
+   tr_ctx->base.set_blend_color = trace_context_set_blend_color;
+   tr_ctx->base.set_stencil_ref = trace_context_set_stencil_ref;
+   tr_ctx->base.set_clip_state = trace_context_set_clip_state;
+   tr_ctx->base.set_sample_mask = trace_context_set_sample_mask;
+   tr_ctx->base.set_constant_buffer = trace_context_set_constant_buffer;
+   tr_ctx->base.set_framebuffer_state = trace_context_set_framebuffer_state;
+   tr_ctx->base.set_polygon_stipple = trace_context_set_polygon_stipple;
+   tr_ctx->base.set_scissor_state = trace_context_set_scissor_state;
+   tr_ctx->base.set_viewport_state = trace_context_set_viewport_state;
+   tr_ctx->base.set_fragment_sampler_views = trace_context_set_fragment_sampler_views;
+   tr_ctx->base.set_vertex_sampler_views = trace_context_set_vertex_sampler_views;
+   tr_ctx->base.create_sampler_view = trace_create_sampler_view;
+   tr_ctx->base.sampler_view_destroy = trace_sampler_view_destroy;
+   tr_ctx->base.set_vertex_buffers = trace_context_set_vertex_buffers;
+   tr_ctx->base.resource_copy_region = trace_context_resource_copy_region;
+   tr_ctx->base.clear = trace_context_clear;
+   tr_ctx->base.clear_render_target = trace_context_clear_render_target;
+   tr_ctx->base.clear_depth_stencil = trace_context_clear_depth_stencil;
+   tr_ctx->base.flush = trace_context_flush;
+   tr_ctx->base.is_resource_referenced = trace_is_resource_referenced;
+
+   /* Transfer entry points. */
+   tr_ctx->base.get_transfer = trace_context_get_transfer;
+   tr_ctx->base.transfer_destroy = trace_context_transfer_destroy;
+   tr_ctx->base.transfer_map = trace_context_transfer_map;
+   tr_ctx->base.transfer_unmap = trace_context_transfer_unmap;
+   tr_ctx->base.transfer_flush_region = trace_context_transfer_flush_region;
+   tr_ctx->base.transfer_inline_write = trace_context_transfer_inline_write;
+
+   tr_ctx->pipe = pipe;
+
+   return &tr_ctx->base;
+
+   /* Every failure path hands back the caller's context untouched. */
+error1:
+   return pipe;
+}
diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h
new file mode 100644
index 0000000000..dadbe56118
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_context.h
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TR_CONTEXT_H_
+#define TR_CONTEXT_H_
+
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "pipe/p_context.h"
+
+#include "tr_screen.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct trace_screen;
+   
+/**
+ * Tracing wrapper around a pipe_context.
+ *
+ * base must remain the first member: trace_context() converts a
+ * pipe_context pointer into a trace_context pointer with a plain cast.
+ */
+struct trace_context
+{
+   /* Context interface handed out to callers. */
+   struct pipe_context base;
+
+   /* The wrapped context. */
+   struct pipe_context *pipe;
+};
+
+
+/**
+ * Downcast a pipe_context pointer to the trace_context that contains it.
+ * The pointer must not be NULL (checked with an assertion).
+ */
+static INLINE struct trace_context *
+trace_context(struct pipe_context *pipe)
+{
+   struct trace_context *tr_ctx;
+
+   assert(pipe);
+   tr_ctx = (struct trace_context *)pipe;
+
+   return tr_ctx;
+}
+
+
+/**
+ * Wrap a pipe_context so that calls made through it are traced.
+ * NOTE(review): per the definition in tr_context.c, the original pipe is
+ * returned unwrapped when it is NULL, tracing is not enabled, or the wrapper
+ * cannot be allocated.
+ */
+struct pipe_context *
+trace_context_create(struct trace_screen *tr_scr,
+                     struct pipe_context *pipe);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TR_CONTEXT_H_ */
diff --git a/src/gallium/drivers/trace/tr_drm.c b/src/gallium/drivers/trace/tr_drm.c
new file mode 100644
index 0000000000..e685033212
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_drm.c
@@ -0,0 +1,101 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "state_tracker/drm_api.h"
+
+#include "util/u_memory.h"
+#include "rbug/rbug_public.h"
+#include "tr_drm.h"
+#include "tr_screen.h"
+#include "tr_public.h"
+
+/**
+ * Wrapper around a drm_api.
+ *
+ * base must remain the first member: trace_drm_api() converts a drm_api
+ * pointer into a trace_drm_api pointer with a plain cast.
+ */
+struct trace_drm_api
+{
+   /* Interface handed out to callers. */
+   struct drm_api base;
+
+   /* The wrapped drm_api. */
+   struct drm_api *api;
+};
+
+/** Downcast a drm_api pointer to the trace_drm_api that contains it. */
+static INLINE struct trace_drm_api *
+trace_drm_api(struct drm_api *_api)
+{
+   struct trace_drm_api *tr_api = (struct trace_drm_api *)_api;
+
+   return tr_api;
+}
+
+/**
+ * drm_api::create_screen implementation of the wrapper: creates the screen
+ * through the wrapped drm_api, then passes it through rbug_screen_create()
+ * and trace_screen_create() and returns the result.
+ */
+static struct pipe_screen *
+trace_drm_create_screen(struct drm_api *_api, int fd)
+{
+   struct drm_api *wrapped_api = trace_drm_api(_api)->api;
+   struct pipe_screen *driver_screen;
+   struct pipe_screen *rbug_screen;
+
+   /* TODO trace call */
+
+   driver_screen = wrapped_api->create_screen(wrapped_api, fd);
+   rbug_screen = rbug_screen_create(driver_screen);
+
+   return trace_screen_create(rbug_screen);
+}
+
+/**
+ * drm_api::destroy implementation of the wrapper: destroys the wrapped
+ * drm_api if it provides a destroy hook, then frees the wrapper.
+ */
+static void
+trace_drm_destroy(struct drm_api *_api)
+{
+   struct trace_drm_api *wrapper = trace_drm_api(_api);
+   struct drm_api *wrapped_api = wrapper->api;
+
+   /* destroy is optional on the wrapped api. */
+   if (wrapped_api->destroy) {
+      wrapped_api->destroy(wrapped_api);
+   }
+
+   FREE(wrapper);
+}
+
+/**
+ * Wrap a drm_api so that the screens it creates go through the rbug and
+ * trace screen wrappers.
+ *
+ * Returns the wrapper, or the api itself (unwrapped) when api is NULL, when
+ * neither tracing nor rbug is enabled, or when the wrapper cannot be
+ * allocated.
+ */
+struct drm_api *
+trace_drm_create(struct drm_api *api)
+{
+   struct trace_drm_api *wrapper;
+
+   if (!api)
+      return api;
+
+   /* Only wrap when at least one of the two debugging layers is active. */
+   if (!(trace_enabled() || rbug_enabled()))
+      return api;
+
+   wrapper = CALLOC_STRUCT(trace_drm_api);
+   if (!wrapper)
+      return api;
+
+   wrapper->api = api;
+   wrapper->base.name = api->name;
+   wrapper->base.driver_name = api->driver_name;
+   wrapper->base.create_screen = trace_drm_create_screen;
+   wrapper->base.destroy = trace_drm_destroy;
+
+   return &wrapper->base;
+}
diff --git a/src/gallium/drivers/trace/tr_drm.h b/src/gallium/drivers/trace/tr_drm.h
new file mode 100644
index 0000000000..845c66a32a
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_drm.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TR_DRM_H
+#define TR_DRM_H
+
+struct drm_api;
+
+/*
+ * Wrap a drm_api for tracing / remote debugging.
+ * NOTE(review): per the definition in tr_drm.c, api is returned unwrapped
+ * when it is NULL, when neither tracing nor rbug is enabled, or when the
+ * wrapper cannot be allocated.
+ */
+struct drm_api* trace_drm_create(struct drm_api *api);
+
+#endif /* TR_DRM_H */
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
new file mode 100644
index 0000000000..51a4ea9633
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -0,0 +1,635 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Trace dumping functions.
+ *
+ * For now we just use standard XML for dumping the trace calls, as this is
+ * simple to write, parse, and visually inspect, but the actual representation
+ * is abstracted out of this file, so that we can switch to a binary
+ * representation if/when it becomes justified.
+ *
+ * @author Jose Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE)
+#include <stdlib.h>
+#endif
+
+#include "pipe/p_compiler.h"
+#include "os/os_thread.h"
+#include "os/os_stream.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+
+#include "tr_dump.h"
+#include "tr_screen.h"
+#include "tr_texture.h"
+
+
+/* Stream the trace is written to; NULL while no trace file is open. */
+static struct os_stream *stream = NULL;
+/* Incremented by each successful trace_dump_trace_begin(); trace_dump_trace_end() closes the trace when it drops to zero. */
+static unsigned refcount = 0;
+/* Mutex taken and released by trace_dump_call_lock() / trace_dump_call_unlock(). */
+static pipe_mutex call_mutex;
+/* Reset to zero when the trace is closed. NOTE(review): presumably numbers the dumped calls -- the increment is not in this part of the file. */
+static long unsigned call_no = 0;
+/* Flag set by trace_dumping_start_locked() and cleared by trace_dumping_stop_locked(). */
+static boolean dumping = FALSE;
+/* Set by trace_dump_init() so that its initialisation runs only once. */
+static boolean initialized = FALSE;
+
+
+/** Write size bytes from buf to the trace stream; a no-op when no stream is open. */
+static INLINE void
+trace_dump_write(const char *buf, size_t size)
+{
+   if (!stream)
+      return;
+
+   os_stream_write(stream, buf, size);
+}
+
+
+/** Write a NUL-terminated string (without the terminator) to the trace. */
+static INLINE void
+trace_dump_writes(const char *s)
+{
+   size_t length = strlen(s);
+
+   trace_dump_write(s, length);
+}
+
+
+/**
+ * printf-style write to the trace.
+ *
+ * The text is formatted into a fixed 1024-byte buffer; output that does not
+ * fit is truncated to what the buffer holds, and nothing is written if the
+ * formatter reports an error.
+ */
+static INLINE void
+trace_dump_writef(const char *format, ...)
+{
+   static char buf[1024];
+   int len;
+   va_list ap;
+   va_start(ap, format);
+   len = util_vsnprintf(buf, sizeof(buf), format, ap);
+   va_end(ap);
+
+   /* A negative result means nothing usable was produced. */
+   if (len < 0)
+      return;
+
+   /*
+    * vsnprintf-style formatters return the length the full output would
+    * have had, which can exceed the buffer; never write more than the
+    * buffer actually contains.
+    */
+   if ((size_t)len >= sizeof(buf))
+      len = (int)(sizeof(buf) - 1);
+
+   trace_dump_write(buf, (size_t)len);
+}
+
+
+/**
+ * Write a string to the trace with XML escaping: the five markup characters
+ * become named entities, other printable ASCII (0x20..0x7e) is written
+ * as-is, and every remaining byte becomes a numeric character reference.
+ */
+static INLINE void
+trace_dump_escape(const char *str)
+{
+   const unsigned char *cursor;
+
+   for (cursor = (const unsigned char *)str; *cursor != 0; ++cursor) {
+      unsigned char c = *cursor;
+
+      switch (c) {
+      case '<':
+         trace_dump_writes("&lt;");
+         break;
+      case '>':
+         trace_dump_writes("&gt;");
+         break;
+      case '&':
+         trace_dump_writes("&amp;");
+         break;
+      case '\'':
+         trace_dump_writes("&apos;");
+         break;
+      case '\"':
+         trace_dump_writes("&quot;");
+         break;
+      default:
+         if (c >= 0x20 && c <= 0x7e)
+            trace_dump_writef("%c", c);
+         else
+            trace_dump_writef("&#%u;", c);
+         break;
+      }
+   }
+}
+
+
+/** Write level tab characters to the trace. */
+static INLINE void
+trace_dump_indent(unsigned level)
+{
+   while (level > 0) {
+      trace_dump_writes("\t");
+      --level;
+   }
+}
+
+
+/** Write a single newline character to the trace. */
+static INLINE void
+trace_dump_newline(void)
+{
+   trace_dump_writes("\n");
+}
+
+
+/** Write an empty element, i.e. "<name/>". The name is written unescaped. */
+static INLINE void
+trace_dump_tag(const char *name)
+{
+   trace_dump_writes("<");
+   trace_dump_writes(name);
+   trace_dump_writes("/>");
+}
+
+
+/** Write an opening tag without attributes, i.e. "<name>". The name is written unescaped. */
+static INLINE void
+trace_dump_tag_begin(const char *name)
+{
+   trace_dump_writes("<");
+   trace_dump_writes(name);
+   trace_dump_writes(">");
+}
+
+/**
+ * Write one attribute of an opening tag: the attribute name, "='", the
+ * XML-escaped value, and finally the given terminator, which closes the
+ * quote and either separates the next attribute or ends the tag.
+ */
+static INLINE void
+trace_dump_tag_attr(const char *attr, const char *value,
+                    const char *terminator)
+{
+   trace_dump_writes(attr);
+   trace_dump_writes("='");
+   trace_dump_escape(value);
+   trace_dump_writes(terminator);
+}
+
+
+/** Write an opening tag with one attribute: <name attr1='value1'>. */
+static INLINE void
+trace_dump_tag_begin1(const char *name,
+                      const char *attr1, const char *value1)
+{
+   trace_dump_writes("<");
+   trace_dump_writes(name);
+   trace_dump_writes(" ");
+   trace_dump_tag_attr(attr1, value1, "'>");
+}
+
+
+/** Write an opening tag with two attributes; values are XML-escaped. */
+static INLINE void
+trace_dump_tag_begin2(const char *name,
+                      const char *attr1, const char *value1,
+                      const char *attr2, const char *value2)
+{
+   trace_dump_writes("<");
+   trace_dump_writes(name);
+   trace_dump_writes(" ");
+   trace_dump_tag_attr(attr1, value1, "' ");
+   trace_dump_tag_attr(attr2, value2, "'>");
+}
+
+
+/** Write an opening tag with three attributes; values are XML-escaped. */
+static INLINE void
+trace_dump_tag_begin3(const char *name,
+                      const char *attr1, const char *value1,
+                      const char *attr2, const char *value2,
+                      const char *attr3, const char *value3)
+{
+   trace_dump_writes("<");
+   trace_dump_writes(name);
+   trace_dump_writes(" ");
+   trace_dump_tag_attr(attr1, value1, "' ");
+   trace_dump_tag_attr(attr2, value2, "' ");
+   trace_dump_tag_attr(attr3, value3, "'>");
+}
+
+
+static INLINE void
+trace_dump_tag_end(const char *name)
+{
+   trace_dump_writes("</");    /* emits the closing tag </name> */
+   trace_dump_writes(name);    /* name is written as-is, without escaping */
+   trace_dump_writes(">");
+}
+
+static void
+trace_dump_trace_close(void)
+{
+   if(stream) {                         /* no-op when no trace file is open */
+      trace_dump_writes("</trace>\n");  /* close the root element */
+      os_stream_close(stream);
+      stream = NULL;
+      refcount = 0;
+      call_no = 0;
+      /* call_mutex is kept: init creates it once and lockers still use it */
+   }
+}
+
+void trace_dump_init(void)
+{
+   if (initialized)
+      return;
+   /* First call only: create the call mutex and start with dumping off. */
+   pipe_mutex_init(call_mutex);
+   dumping = FALSE;
+   initialized = TRUE;
+}
+
+boolean trace_dump_trace_begin(void)
+{
+   const char *filename;
+
+   assert(initialized);
+   /* Tracing is opt-in: GALLIUM_TRACE names the output file. */
+   filename = debug_get_option("GALLIUM_TRACE", NULL);
+   if(!filename)
+      return FALSE;
+
+   if(!stream) {
+      /* No stream open yet: create the file and write the XML prologue. */
+      stream = os_file_stream_create(filename);
+      if(!stream)
+         return FALSE;
+
+      trace_dump_writes("<?xml version='1.0' encoding='UTF-8'?>\n");
+      trace_dump_writes("<?xml-stylesheet type='text/xsl' href='trace.xsl'?>\n");
+      trace_dump_writes("<trace version='0.1'>\n");
+
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE)
+      /* Linux applications rarely cleanup GL / Gallium resources so catch
+       * application exit here */
+      atexit(trace_dump_trace_close);
+#endif
+   }
+   /* Each successful begin counts one user; trace_dump_trace_end drops it. */
+   ++refcount;
+
+   return TRUE;
+}
+
+boolean trace_dump_trace_enabled(void)
+{
+   return stream != NULL ? TRUE : FALSE; /* TRUE while a trace stream is open */
+}
+
+void trace_dump_trace_end(void)
+{
+   /* Drop one reference; the last one closes the trace stream. */
+   if (stream && --refcount == 0)
+      trace_dump_trace_close();
+}
+
+/*
+ * Call lock
+ */
+
+void trace_dump_call_lock(void)
+{
+   pipe_mutex_lock(call_mutex); /* pair with trace_dump_call_unlock() */
+}
+
+void trace_dump_call_unlock(void)
+{
+   pipe_mutex_unlock(call_mutex); /* counterpart of trace_dump_call_lock() */
+}
+
+/*
+ * Dumping control
+ */
+
+void trace_dumping_start_locked(void)
+{
+   dumping = TRUE; /* no locking here; trace_dumping_start() is the locking wrapper */
+}
+
+void trace_dumping_stop_locked(void)
+{
+   dumping = FALSE; /* no locking here; trace_dumping_stop() is the locking wrapper */
+}
+
+boolean trace_dumping_enabled_locked(void)
+{
+   return dumping; /* read without taking call_mutex; see trace_dumping_enabled() */
+}
+
+void trace_dumping_start(void)
+{
+   trace_dump_call_lock();          /* switch dumping on under call_mutex */
+   trace_dumping_start_locked();
+   trace_dump_call_unlock();
+}
+
+void trace_dumping_stop(void)
+{
+   trace_dump_call_lock();          /* switch dumping off under call_mutex */
+   trace_dumping_stop_locked();
+   trace_dump_call_unlock();
+}
+
+boolean trace_dumping_enabled(void)
+{
+   boolean enabled;                 /* snapshot taken under call_mutex */
+   trace_dump_call_lock();
+   enabled = trace_dumping_enabled_locked();
+   trace_dump_call_unlock();
+   return enabled;
+}
+
+/*
+ * Dump functions
+ */
+
+void trace_dump_call_begin_locked(const char *klass, const char *method)
+{
+   if (!dumping)
+      return;
+   /* <call no='N' class='klass' method='method'> at indent level 1 */
+   call_no += 1;
+   trace_dump_indent(1);
+   trace_dump_writes("<call no='");
+   trace_dump_writef("%lu", call_no);
+   trace_dump_writes("' class='");
+   trace_dump_escape(klass);
+   trace_dump_writes("' method='");
+   trace_dump_escape(method);
+   trace_dump_writes("'>");
+   trace_dump_newline();
+}
+
+void trace_dump_call_end_locked(void)
+{
+   /* Closes the <call> element and flushes the stream after every call. */
+   if (dumping) {
+      trace_dump_indent(1);
+      trace_dump_tag_end("call");
+      trace_dump_newline();
+      os_stream_flush(stream);
+   }
+}
+
+void trace_dump_call_begin(const char *klass, const char *method)
+{
+   trace_dump_call_lock(); /* stays held until trace_dump_call_end() */
+   trace_dump_call_begin_locked(klass, method);
+}
+
+void trace_dump_call_end(void)
+{
+   trace_dump_call_end_locked();
+   trace_dump_call_unlock(); /* releases the lock taken by trace_dump_call_begin() */
+}
+
+void trace_dump_arg_begin(const char *name)
+{
+   /* Opens <arg name='...'> at indent level 2; trace_dump_arg_end closes it. */
+   if (dumping) {
+      trace_dump_indent(2);
+      trace_dump_tag_begin1("arg", "name", name);
+   }
+}
+
+void trace_dump_arg_end(void)
+{
+   /* Writes </arg> followed by a newline; nothing while dumping is off. */
+   if (dumping) {
+      trace_dump_tag_end("arg");
+      trace_dump_newline();
+   }
+}
+
+void trace_dump_ret_begin(void)
+{
+   /* Opens <ret> at indent level 2; trace_dump_ret_end closes it. */
+   if (dumping) {
+      trace_dump_indent(2);
+      trace_dump_tag_begin("ret");
+   }
+}
+
+void trace_dump_ret_end(void)
+{
+   /* Writes </ret> followed by a newline; nothing while dumping is off. */
+   if (dumping) {
+      trace_dump_tag_end("ret");
+      trace_dump_newline();
+   }
+}
+
+void trace_dump_bool(int value)
+{
+   /* <bool>1</bool> for any non-zero value, <bool>0</bool> for zero. */
+   if (dumping) {
+      trace_dump_writef("<bool>%c</bool>", value ? '1' : '0');
+   }
+}
+
+void trace_dump_int(long long int value)
+{
+   /* Signed decimal inside an <int> element; nothing while dumping is off. */
+   if (dumping) {
+      trace_dump_writef("<int>%lli</int>", value);
+   }
+}
+
+void trace_dump_uint(long long unsigned value)
+{
+   /* Unsigned decimal inside a <uint> element; nothing while dumping is off. */
+   if (dumping) {
+      trace_dump_writef("<uint>%llu</uint>", value);
+   }
+}
+
+void trace_dump_float(double value)
+{
+   /* Value printed with %g inside a <float> element. */
+   if (dumping) {
+      trace_dump_writef("<float>%g</float>", value);
+   }
+}
+
+void trace_dump_bytes(const void *data,
+                      size_t size)
+{
+   static const char hex_table[16] = "0123456789ABCDEF";
+   const uint8_t *bytes = data;
+   size_t pos;
+   /* <bytes> holds two upper-case hex digits per input byte, high nibble first. */
+   if (!dumping)
+      return;
+
+   trace_dump_writes("<bytes>");
+   for (pos = 0; pos < size; pos++) {
+      const uint8_t value = bytes[pos];
+      char digits[2];
+      digits[0] = hex_table[value >> 4];
+      digits[1] = hex_table[value & 0xf];
+      trace_dump_write(digits, 2);
+   }
+   trace_dump_writes("</bytes>");
+}
+
+void trace_dump_box_bytes(const void *data,
+                          enum pipe_format format,
+                          const struct pipe_box *box,
+                          unsigned stride,
+                          unsigned slice_stride)
+{
+   size_t nbytes;
+
+   /* Byte count: slice_stride if set, else stride if set, else one block row. */
+   if (slice_stride != 0) {
+      nbytes = box->depth * slice_stride;
+   } else if (stride != 0) {
+      nbytes = util_format_get_nblocksy(format, box->height) * stride;
+   } else {
+      nbytes = util_format_get_nblocksx(format, box->width) * util_format_get_blocksize(format);
+   }
+   trace_dump_bytes(data, nbytes);
+}
+
+void trace_dump_string(const char *str)
+{
+   /* Wraps str, passed through trace_dump_escape, in a <string> element. */
+   if (dumping) {
+      trace_dump_writes("<string>");
+      trace_dump_escape(str);
+      trace_dump_writes("</string>");
+   }
+}
+
+void trace_dump_enum(const char *value)
+{
+   /* Wraps value, passed through trace_dump_escape, in an <enum> element. */
+   if (dumping) {
+      trace_dump_writes("<enum>");
+      trace_dump_escape(value);
+      trace_dump_writes("</enum>");
+   }
+}
+
+void trace_dump_array_begin(void)
+{
+   /* Opens an <array> element; nothing is written while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("<array>");
+   }
+}
+
+void trace_dump_array_end(void)
+{
+   /* Closes an <array> element; nothing is written while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("</array>");
+   }
+}
+
+void trace_dump_elem_begin(void)
+{
+   /* Opens an <elem> element; nothing is written while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("<elem>");
+   }
+}
+
+void trace_dump_elem_end(void)
+{
+   /* Closes an <elem> element; nothing is written while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("</elem>");
+   }
+}
+
+void trace_dump_struct_begin(const char *name)
+{
+   if (!dumping)
+      return;
+   /* <struct name='...'>: same helper as <arg>, so the name gets escaped. */
+   trace_dump_tag_begin1("struct", "name", name);
+}
+
+void trace_dump_struct_end(void)
+{
+   /* Closes a <struct> element; nothing is written while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("</struct>");
+   }
+}
+
+void trace_dump_member_begin(const char *name)
+{
+   if (!dumping)
+      return;
+   /* <member name='...'>: same helper as <arg>, so the name gets escaped. */
+   trace_dump_tag_begin1("member", "name", name);
+}
+
+void trace_dump_member_end(void)
+{
+   /* Closes a <member> element; nothing is written while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("</member>");
+   }
+}
+
+void trace_dump_null(void)
+{
+   /* Writes the empty <null/> element; nothing while dumping is off. */
+   if (dumping) {
+      trace_dump_writes("<null/>");
+   }
+}
+
+void trace_dump_ptr(const void *value)
+{
+   if (!dumping)
+      return;
+   /* NULL becomes <null/>; anything else is printed as a hex address. */
+   if (value == NULL)
+      trace_dump_null();
+   else
+      trace_dump_writef("<ptr>0x%08lx</ptr>", (unsigned long)(uintptr_t)value);
+}
+
+
+void trace_dump_resource_ptr(struct pipe_resource *_resource)
+{
+   if (!dumping)
+      return;
+   /* Dumps the pointer stored in the trace wrapper, not the wrapper itself. */
+   if (!_resource) {
+      trace_dump_null();
+   } else {
+      struct trace_resource *wrapper = trace_resource(_resource);
+      trace_dump_ptr(wrapper->resource);
+   }
+}
+
+void trace_dump_surface_ptr(struct pipe_surface *_surface)
+{
+   if (!dumping)
+      return;
+   /* Dumps the pointer stored in the trace wrapper, not the wrapper itself. */
+   if (!_surface) {
+      trace_dump_null();
+   } else {
+      struct trace_surface *wrapper = trace_surface(_surface);
+      trace_dump_ptr(wrapper->surface);
+   }
+}
+
+void trace_dump_transfer_ptr(struct pipe_transfer *_transfer)
+{
+   if (!dumping)
+      return;
+   /* Dumps the pointer stored in the trace wrapper, not the wrapper itself. */
+   if (!_transfer) {
+      trace_dump_null();
+   } else {
+      struct trace_transfer *wrapper = trace_transfer(_transfer);
+      trace_dump_ptr(wrapper->transfer);
+   }
+}
diff --git a/src/gallium/drivers/trace/tr_dump.h b/src/gallium/drivers/trace/tr_dump.h
new file mode 100644
index 0000000000..74c5e83e9e
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_dump.h
@@ -0,0 +1,188 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Trace data dumping primitives.
+ */
+
+#ifndef TR_DUMP_H
+#define TR_DUMP_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+
+struct pipe_resource;
+struct pipe_surface;
+struct pipe_transfer;
+struct pipe_box;
+
+/*
+ * Call before use.
+ */
+void trace_dump_init(void);
+
+/*
+ * Low level dumping controls.
+ *
+ * Opening the trace file and checking if that is opened.
+ */
+boolean trace_dump_trace_begin(void);
+boolean trace_dump_trace_enabled(void);
+void trace_dump_trace_end(void);
+
+/*
+ * Lock and unlock the call mutex.
+ *
+ * It is used by the non-locked versions of the dumping control
+ * and begin/end call dump functions.
+ *
+ * Begin takes the lock while end unlocks it. Use the _locked
+ * version to avoid locking/unlocking it.
+ */
+void trace_dump_call_lock(void);
+void trace_dump_call_unlock(void);
+
+/*
+ * High level dumping control.
+ */
+void trace_dumping_start_locked(void);
+void trace_dumping_stop_locked(void);
+boolean trace_dumping_enabled_locked(void);
+void trace_dumping_start(void);
+void trace_dumping_stop(void);
+boolean trace_dumping_enabled(void);
+
+void trace_dump_call_begin_locked(const char *klass, const char *method);
+void trace_dump_call_end_locked(void);
+void trace_dump_call_begin(const char *klass, const char *method);
+void trace_dump_call_end(void);
+
+void trace_dump_arg_begin(const char *name);
+void trace_dump_arg_end(void);
+void trace_dump_ret_begin(void);
+void trace_dump_ret_end(void);
+void trace_dump_bool(int value);
+void trace_dump_int(long long int value);
+void trace_dump_uint(long long unsigned value);
+void trace_dump_float(double value);
+void trace_dump_bytes(const void *data, size_t size);
+void trace_dump_box_bytes(const void *data,
+			  enum pipe_format format,
+			  const struct pipe_box *box,
+			  unsigned stride,
+			  unsigned slice_stride);
+void trace_dump_string(const char *str);
+void trace_dump_enum(const char *value);
+void trace_dump_array_begin(void);
+void trace_dump_array_end(void);
+void trace_dump_elem_begin(void);
+void trace_dump_elem_end(void);
+void trace_dump_struct_begin(const char *name);
+void trace_dump_struct_end(void);
+void trace_dump_member_begin(const char *name);
+void trace_dump_member_end(void);
+void trace_dump_null(void);
+void trace_dump_ptr(const void *value);
+/* will turn a wrapped object into the real one and dump ptr */
+void trace_dump_resource_ptr(struct pipe_resource *_texture);
+void trace_dump_surface_ptr(struct pipe_surface *_surface);
+void trace_dump_transfer_ptr(struct pipe_transfer *_transfer);
+
+/*
+ * Code saving macros.
+ */
+
+#define trace_dump_arg(_type, _arg) \
+   do { /* one <arg> named after the _arg text, value via trace_dump_<_type>() */ \
+      trace_dump_arg_begin(#_arg); \
+      trace_dump_##_type(_arg); \
+      trace_dump_arg_end(); \
+   } while(0)
+
+#define trace_dump_arg_struct(_type, _arg) \
+   do { /* like trace_dump_arg, but hands the dumper the address of _arg */ \
+      trace_dump_arg_begin(#_arg); \
+      trace_dump_##_type(&(_arg)); /* parenthesised so & binds to all of _arg */ \
+      trace_dump_arg_end(); \
+   } while(0)
+
+#define trace_dump_ret(_type, _arg) \
+   do { /* one <ret> element whose value is written by trace_dump_<_type>() */ \
+      trace_dump_ret_begin(); \
+      trace_dump_##_type(_arg); \
+      trace_dump_ret_end(); \
+   } while(0)
+
+#define trace_dump_array(_type, _obj, _size) \
+   do { /* <array> with one <elem> per entry: _obj[0] .. _obj[_size - 1] */ \
+      size_t idx; /* declared inside the macro's own block */ \
+      trace_dump_array_begin(); \
+      for(idx = 0; idx < (_size); ++idx) { \
+         trace_dump_elem_begin(); \
+         trace_dump_##_type((_obj)[idx]); /* element passed by value */ \
+         trace_dump_elem_end(); \
+      } \
+      trace_dump_array_end(); \
+   } while(0)
+
+#define trace_dump_struct_array(_type, _obj, _size) \
+   do { /* <array> with one <elem> per entry: _obj[0] .. _obj[_size - 1] */ \
+      size_t idx; /* declared inside the macro's own block */ \
+      trace_dump_array_begin(); \
+      for(idx = 0; idx < (_size); ++idx) { \
+         trace_dump_elem_begin(); \
+         trace_dump_##_type(&(_obj)[idx]); /* element passed by address */ \
+         trace_dump_elem_end(); \
+      } \
+      trace_dump_array_end(); \
+   } while(0)
+
+#define trace_dump_member(_type, _obj, _member) \
+   do { /* one <member> named after the _member text, value (_obj)->_member */ \
+      trace_dump_member_begin(#_member); \
+      trace_dump_##_type((_obj)->_member); \
+      trace_dump_member_end(); \
+   } while(0)
+
+#define trace_dump_arg_array(_type, _arg, _size) \
+   do { /* one <arg> named after the _arg text, holding a trace_dump_array */ \
+      trace_dump_arg_begin(#_arg); \
+      trace_dump_array(_type, _arg, _size); \
+      trace_dump_arg_end(); \
+   } while(0)
+
+#define trace_dump_member_array(_type, _obj, _member) \
+   do { /* _member must be a true array: the element count comes from sizeof */ \
+      trace_dump_member_begin(#_member); \
+      trace_dump_array(_type, (_obj)->_member, sizeof((_obj)->_member)/sizeof((_obj)->_member[0])); \
+      trace_dump_member_end(); \
+   } while(0)
+
+
+#endif /* TR_DUMP_H */
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
new file mode 100644
index 0000000000..1727c2a020
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -0,0 +1,555 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "tr_dump.h"
+#include "tr_dump_state.h"
+
+
+void trace_dump_format(enum pipe_format format)
+{
+   /* A format is dumped as an <enum> carrying its util_format_name(). */
+   if (trace_dumping_enabled_locked()) {
+      trace_dump_enum(util_format_name(format));
+   }
+}
+
+
+void trace_dump_resource_template(const struct pipe_resource *templat)
+{
+   /* width0/height0/depth0 are dumped under the names width/height/depth. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!templat) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_resource");
+
+      trace_dump_member(int, templat, target);
+      trace_dump_member(format, templat, format);
+
+      trace_dump_member_begin("width");
+      trace_dump_uint(templat->width0);
+      trace_dump_member_end();
+
+      trace_dump_member_begin("height");
+      trace_dump_uint(templat->height0);
+      trace_dump_member_end();
+
+      trace_dump_member_begin("depth");
+      trace_dump_uint(templat->depth0);
+      trace_dump_member_end();
+
+      trace_dump_member(uint, templat, last_level);
+      trace_dump_member(uint, templat, usage);
+      trace_dump_member(uint, templat, bind);
+      trace_dump_member(uint, templat, flags);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_subresource(const struct pipe_subresource *subresource)
+{
+   /* Dumps the face and level members; a NULL pointer is dumped as <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!subresource) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_subresource");
+
+      trace_dump_member(uint, subresource, face);
+      trace_dump_member(uint, subresource, level);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_box(const struct pipe_box *box)
+{
+   /* Dumps x, y, z, width, height, depth; a NULL pointer is dumped as <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!box) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_box");
+
+      trace_dump_member(uint, box, x);
+      trace_dump_member(uint, box, y);
+      trace_dump_member(uint, box, z);
+      trace_dump_member(uint, box, width);
+      trace_dump_member(uint, box, height);
+      trace_dump_member(uint, box, depth);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state)
+{
+   if (!trace_dumping_enabled_locked())
+      return;
+   /* Members are written in the order listed below; NULL becomes <null/>. */
+   if(!state) {
+      trace_dump_null();
+      return;
+   }
+
+   trace_dump_struct_begin("pipe_rasterizer_state");
+
+   trace_dump_member(bool, state, flatshade);
+   trace_dump_member(bool, state, light_twoside);
+   trace_dump_member(uint, state, front_ccw);
+   trace_dump_member(uint, state, cull_face);
+   trace_dump_member(uint, state, fill_front);
+   trace_dump_member(uint, state, fill_back);
+   trace_dump_member(bool, state, offset_point);
+   trace_dump_member(bool, state, offset_line);
+   trace_dump_member(bool, state, offset_tri);
+   trace_dump_member(bool, state, scissor);
+   trace_dump_member(bool, state, poly_smooth);
+   trace_dump_member(bool, state, poly_stipple_enable);
+   trace_dump_member(bool, state, point_smooth);
+   trace_dump_member(uint, state, sprite_coord_enable);
+   trace_dump_member(bool, state, sprite_coord_mode);
+   trace_dump_member(bool, state, point_quad_rasterization);
+   trace_dump_member(bool, state, point_size_per_vertex);
+   trace_dump_member(bool, state, multisample);
+   trace_dump_member(bool, state, line_smooth);
+   trace_dump_member(bool, state, line_stipple_enable);
+   trace_dump_member(uint, state, line_stipple_factor);
+   trace_dump_member(uint, state, line_stipple_pattern);
+   trace_dump_member(bool, state, line_last_pixel);
+   trace_dump_member(bool, state, flatshade_first);
+   trace_dump_member(bool, state, gl_rasterization_rules);
+
+   trace_dump_member(float, state, line_width);
+   trace_dump_member(float, state, point_size);
+   trace_dump_member(float, state, offset_units);
+   trace_dump_member(float, state, offset_scale);
+
+   trace_dump_struct_end();
+}
+
+
+void trace_dump_poly_stipple(const struct pipe_poly_stipple *state)
+{
+   /* Dumps every entry of the stipple array as a uint; NULL becomes <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_poly_stipple");
+
+      trace_dump_member_begin("stipple");
+      trace_dump_array(uint,
+                       state->stipple,
+                       Elements(state->stipple));
+      trace_dump_member_end();
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_viewport_state(const struct pipe_viewport_state *state)
+{
+   /* Dumps the scale and translate float arrays; NULL becomes <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_viewport_state");
+
+      trace_dump_member_array(float, state, scale);
+      trace_dump_member_array(float, state, translate);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_scissor_state(const struct pipe_scissor_state *state)
+{
+   /* Dumps minx, miny, maxx, maxy as uints; NULL becomes <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_scissor_state");
+
+      trace_dump_member(uint, state, minx);
+      trace_dump_member(uint, state, miny);
+      trace_dump_member(uint, state, maxx);
+      trace_dump_member(uint, state, maxy);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_clip_state(const struct pipe_clip_state *state)
+{
+   unsigned plane;
+   /* "ucp" is an array of PIPE_MAX_CLIP_PLANES entries, each an array of
+    * four floats; all entries are dumped, whatever the value of nr. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_clip_state");
+
+      trace_dump_member_begin("ucp");
+      trace_dump_array_begin();
+      for (plane = 0; plane < PIPE_MAX_CLIP_PLANES; ++plane) {
+         trace_dump_elem_begin();
+         trace_dump_array(float, state->ucp[plane], 4);
+         trace_dump_elem_end();
+      }
+      trace_dump_array_end();
+      trace_dump_member_end();
+
+      trace_dump_member(uint, state, nr);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_shader_state(const struct pipe_shader_state *state)
+{
+   static char text[8192];
+   /* The token stream is dumped as one <string> in the "tokens" member. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      /* tgsi_dump_str fills the shared static buffer, bounded by sizeof. */
+      tgsi_dump_str(state->tokens, 0, text, sizeof(text));
+
+      trace_dump_struct_begin("pipe_shader_state");
+
+      trace_dump_member_begin("tokens");
+      trace_dump_string(text);
+      trace_dump_member_end();
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_depth_stencil_alpha_state(const struct pipe_depth_stencil_alpha_state *state)
+{
+   unsigned i;
+   /* Three members: "depth" struct, "stencil" array of structs, "alpha" struct. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if(!state) {
+      trace_dump_null();
+      return;
+   }
+
+   trace_dump_struct_begin("pipe_depth_stencil_alpha_state");
+
+   trace_dump_member_begin("depth");
+   trace_dump_struct_begin("pipe_depth_state");
+   trace_dump_member(bool, &state->depth, enabled);
+   trace_dump_member(bool, &state->depth, writemask);
+   trace_dump_member(uint, &state->depth, func);
+   trace_dump_struct_end();
+   trace_dump_member_end();
+
+   trace_dump_member_begin("stencil");
+   trace_dump_array_begin();
+   for(i = 0; i < Elements(state->stencil); ++i) {
+      trace_dump_elem_begin();
+      trace_dump_struct_begin("pipe_stencil_state");
+      trace_dump_member(bool, &state->stencil[i], enabled);
+      trace_dump_member(uint, &state->stencil[i], func);
+      trace_dump_member(uint, &state->stencil[i], fail_op);
+      trace_dump_member(uint, &state->stencil[i], zpass_op);
+      trace_dump_member(uint, &state->stencil[i], zfail_op);
+      trace_dump_member(uint, &state->stencil[i], valuemask);
+      trace_dump_member(uint, &state->stencil[i], writemask);
+      trace_dump_struct_end();
+      trace_dump_elem_end();
+   }
+   trace_dump_array_end();
+   trace_dump_member_end();
+
+   trace_dump_member_begin("alpha");
+   trace_dump_struct_begin("pipe_alpha_state");
+   trace_dump_member(bool, &state->alpha, enabled);
+   trace_dump_member(uint, &state->alpha, func);
+   trace_dump_member(float, &state->alpha, ref_value);
+   trace_dump_struct_end();
+   trace_dump_member_end();
+
+   trace_dump_struct_end();
+}
+
+void trace_dump_blend_state(const struct pipe_blend_state *state)
+{
+   /* Dumped as the raw bytes of the struct, not member by member. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_bytes(state, sizeof *state);
+   }
+}
+
+
+void trace_dump_blend_color(const struct pipe_blend_color *state)
+{
+   /* Dumps the color float array; a NULL pointer is dumped as <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_blend_color");
+
+      trace_dump_member_array(float, state, color);
+
+      trace_dump_struct_end();
+   }
+}
+
+void trace_dump_stencil_ref(const struct pipe_stencil_ref *state)
+{
+   /* Dumps the ref_value uint array; a NULL pointer is dumped as <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_stencil_ref");
+
+      trace_dump_member_array(uint, state, ref_value);
+
+      trace_dump_struct_end();
+   }
+}
+
+void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state)
+{
+   /* Dumps width, height, nr_cbufs, the cbufs pointer array and zsbuf. */
+   if (!trace_dumping_enabled_locked())
+      return;
+   if (!state) { trace_dump_null(); return; } /* NULL -> <null/>, as in siblings */
+
+   trace_dump_struct_begin("pipe_framebuffer_state");
+   trace_dump_member(uint, state, width);
+   trace_dump_member(uint, state, height);
+   trace_dump_member(uint, state, nr_cbufs);
+   trace_dump_member_array(ptr, state, cbufs);
+   trace_dump_member(ptr, state, zsbuf);
+   trace_dump_struct_end();
+}
+
+
+void trace_dump_sampler_state(const struct pipe_sampler_state *state)
+{
+   /* Members are written in the order listed below; NULL becomes <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_sampler_state");
+
+      trace_dump_member(uint, state, wrap_s);
+      trace_dump_member(uint, state, wrap_t);
+      trace_dump_member(uint, state, wrap_r);
+      trace_dump_member(uint, state, min_img_filter);
+      trace_dump_member(uint, state, min_mip_filter);
+      trace_dump_member(uint, state, mag_img_filter);
+      trace_dump_member(uint, state, compare_mode);
+      trace_dump_member(uint, state, compare_func);
+      trace_dump_member(bool, state, normalized_coords);
+      trace_dump_member(uint, state, max_anisotropy);
+      trace_dump_member(float, state, lod_bias);
+      trace_dump_member(float, state, min_lod);
+      trace_dump_member(float, state, max_lod);
+      trace_dump_member_array(float, state, border_color);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_sampler_view_template(const struct pipe_sampler_view *state)
+{
+   /* Dumps format, the level range and the four swizzles; NULL -> <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_sampler_view");
+
+      trace_dump_member(format, state, format);
+      trace_dump_member(uint, state, first_level);
+      trace_dump_member(uint, state, last_level);
+      trace_dump_member(uint, state, swizzle_r);
+      trace_dump_member(uint, state, swizzle_g);
+      trace_dump_member(uint, state, swizzle_b);
+      trace_dump_member(uint, state, swizzle_a);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_surface(const struct pipe_surface *state)
+{
+   /* The texture member is dumped as a plain pointer; NULL state -> <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_surface");
+
+      trace_dump_member(format, state, format);
+      trace_dump_member(uint, state, width);
+      trace_dump_member(uint, state, height);
+
+      trace_dump_member(uint, state, layout);
+      trace_dump_member(uint, state, offset);
+      trace_dump_member(uint, state, usage);
+
+      trace_dump_member(ptr, state, texture);
+      trace_dump_member(uint, state, face);
+      trace_dump_member(uint, state, level);
+      trace_dump_member(uint, state, zslice);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_transfer(const struct pipe_transfer *state)
+{
+   /* Nested fields keep their dotted spelling as names, e.g. "box.width". */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_transfer");
+
+      trace_dump_member(uint, state, box.width);
+      trace_dump_member(uint, state, box.height);
+
+      trace_dump_member(uint, state, stride);
+      trace_dump_member(uint, state, usage);
+
+      trace_dump_member(ptr, state, resource);
+      trace_dump_member(uint, state, sr.face);
+      trace_dump_member(uint, state, sr.level);
+      trace_dump_member(uint, state, box.z);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state)
+{
+   /* The buffer member goes through trace_dump_resource_ptr; NULL -> <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_vertex_buffer");
+
+      trace_dump_member(uint, state, stride);
+      trace_dump_member(uint, state, max_index);
+      trace_dump_member(uint, state, buffer_offset);
+      trace_dump_member(resource_ptr, state, buffer);
+
+      trace_dump_struct_end();
+   }
+}
+
+
+void trace_dump_vertex_element(const struct pipe_vertex_element *state)
+{
+   /* Dumps src_offset, vertex_buffer_index and src_format; NULL -> <null/>. */
+   if (!trace_dumping_enabled_locked())
+      return;
+
+   if (!state) {
+      trace_dump_null();
+   } else {
+      trace_dump_struct_begin("pipe_vertex_element");
+
+      trace_dump_member(uint, state, src_offset);
+
+      trace_dump_member(uint, state, vertex_buffer_index);
+
+      trace_dump_member(format, state, src_format);
+
+      trace_dump_struct_end();
+   }
+}
diff --git a/src/gallium/drivers/trace/tr_dump_state.h b/src/gallium/drivers/trace/tr_dump_state.h
new file mode 100644
index 0000000000..e614e8355e
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_dump_state.h
@@ -0,0 +1,81 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TR_DUMP_STATE_H_
+#define TR_DUMP_STATE_H_
+
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+
+
+void trace_dump_format(enum pipe_format format);
+
+void trace_dump_resource_template(const struct pipe_resource *templat);
+
+void trace_dump_subresource(const struct pipe_subresource *subresource);
+
+void trace_dump_box(const struct pipe_box *box);
+
+void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state);
+
+void trace_dump_poly_stipple(const struct pipe_poly_stipple *state);
+
+void trace_dump_viewport_state(const struct pipe_viewport_state *state);
+
+void trace_dump_scissor_state(const struct pipe_scissor_state *state);
+
+void trace_dump_clip_state(const struct pipe_clip_state *state);
+
+void trace_dump_token(const struct tgsi_token *token);
+
+void trace_dump_shader_state(const struct pipe_shader_state *state);
+
+void trace_dump_depth_stencil_alpha_state(const struct pipe_depth_stencil_alpha_state *state);
+
+void trace_dump_blend_state(const struct pipe_blend_state *state);
+
+void trace_dump_blend_color(const struct pipe_blend_color *state);
+
+void trace_dump_stencil_ref(const struct pipe_stencil_ref *state);
+
+void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state);
+
+void trace_dump_sampler_state(const struct pipe_sampler_state *state);
+
+void trace_dump_sampler_view_template(const struct pipe_sampler_view *view);
+
+void trace_dump_surface(const struct pipe_surface *state);
+
+void trace_dump_transfer(const struct pipe_transfer *state);
+
+void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state);
+
+void trace_dump_vertex_element(const struct pipe_vertex_element *state);
+
+
+#endif /* TR_DUMP_STATE_H_ */
diff --git a/src/gallium/drivers/trace/tr_public.h b/src/gallium/drivers/trace/tr_public.h
new file mode 100644
index 0000000000..aee4937dd4
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_public.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+#ifndef TR_PUBLIC_H
+#define TR_PUBLIC_H
+
+#include "pipe/p_compiler.h" /* boolean; keeps this header self-contained */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_screen;
+struct pipe_context;
+
+struct pipe_screen *
+trace_screen_create(struct pipe_screen *screen); /* returns screen itself when tracing is off */
+boolean
+trace_enabled(void); /* starts the dump on the first call, then reports the cached flag */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* TR_PUBLIC_H */
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
new file mode 100644
index 0000000000..32e519a68a
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -0,0 +1,588 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "tr_dump.h"
+#include "tr_dump_state.h"
+#include "tr_texture.h"
+#include "tr_context.h"
+#include "tr_screen.h"
+#include "tr_public.h"
+
+#include "pipe/p_format.h"
+
+
+static boolean trace = FALSE; /* set by trace_enabled() once trace_dump_trace_begin() succeeds */
+/** Traced get_name: logs the call, forwards it to the wrapped screen, logs the returned string. */
+static const char *
+trace_screen_get_name(struct pipe_screen *_screen)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   const char *result;
+
+   trace_dump_call_begin("pipe_screen", "get_name");
+
+   trace_dump_arg(ptr, screen);
+   /* forward to the wrapped screen; its pointer is what the trace records */
+   result = screen->get_name(screen);
+
+   trace_dump_ret(string, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+/** Traced get_vendor: logs the call, forwards it to the wrapped screen, logs the returned string. */
+static const char *
+trace_screen_get_vendor(struct pipe_screen *_screen)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   const char *result;
+
+   trace_dump_call_begin("pipe_screen", "get_vendor");
+
+   trace_dump_arg(ptr, screen);
+   /* forward to the wrapped screen; its pointer is what the trace records */
+   result = screen->get_vendor(screen);
+
+   trace_dump_ret(string, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+/** Traced get_param: logs the screen and the cap (as an int), forwards, logs the int result. */
+static int
+trace_screen_get_param(struct pipe_screen *_screen,
+                       enum pipe_cap param)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   int result;
+
+   trace_dump_call_begin("pipe_screen", "get_param");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(int, param);
+   /* forward to the wrapped screen */
+   result = screen->get_param(screen, param);
+
+   trace_dump_ret(int, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+/** Traced get_paramf: logs the screen and the cap (as an int), forwards, logs the float result. */
+static float
+trace_screen_get_paramf(struct pipe_screen *_screen,
+                        enum pipe_cap param)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   float result;
+
+   trace_dump_call_begin("pipe_screen", "get_paramf");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(int, param);
+   /* forward to the wrapped screen */
+   result = screen->get_paramf(screen, param);
+
+   trace_dump_ret(float, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+/** Traced is_format_supported: logs all six arguments, forwards the query, logs the bool result. */
+static boolean
+trace_screen_is_format_supported(struct pipe_screen *_screen,
+                                 enum pipe_format format,
+                                 enum pipe_texture_target target,
+                                 unsigned sample_count,
+                                 unsigned tex_usage,
+                                 unsigned geom_flags)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   boolean result;
+
+   trace_dump_call_begin("pipe_screen", "is_format_supported");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(format, format);
+   trace_dump_arg(int, target);
+   trace_dump_arg(uint, sample_count);
+   trace_dump_arg(uint, tex_usage);
+   trace_dump_arg(uint, geom_flags);
+   /* forward every argument unchanged to the wrapped screen */
+   result = screen->is_format_supported(screen, format, target, sample_count,
+                                        tex_usage, geom_flags);
+
+   trace_dump_ret(bool, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+/** Traced context_create: creates the driver context, then wraps it via trace_context_create(). */
+static struct pipe_context *
+trace_screen_context_create(struct pipe_screen *_screen, void *priv)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_context *result;
+
+   trace_dump_call_begin("pipe_screen", "context_create");
+
+   trace_dump_arg(ptr, screen);
+   /* priv is forwarded to the driver but not written to the trace */
+   result = screen->context_create(screen, priv);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+   /* the trace recorded the driver's pointer; the caller receives the wrapper */
+   result = trace_context_create(tr_scr, result);
+
+   return result;
+}
+
+/** Traced flush_frontbuffer: unwraps the screen and surface, logs both, forwards the flush. */
+static void
+trace_screen_flush_frontbuffer(struct pipe_screen *_screen,
+                               struct pipe_surface *_surface,
+                               void *context_private)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct trace_surface *tr_surf = trace_surface(_surface);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_surface *surface = tr_surf->surface;
+
+   trace_dump_call_begin("pipe_screen", "flush_frontbuffer");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, surface);
+   /* XXX: hide, as there is nothing we can do with this
+   trace_dump_arg(ptr, context_private);
+   */
+   /* context_private is not logged (see above) but still reaches the driver unchanged */
+   screen->flush_frontbuffer(screen, surface, context_private);
+
+   trace_dump_call_end();
+}
+
+
+/********************************************************************
+ * texture
+ */
+
+/** Traced resource_create: logs the template, creates the driver resource and wraps it. */
+static struct pipe_resource *
+trace_screen_resource_create(struct pipe_screen *_screen,
+                            const struct pipe_resource *templat)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_resource *result;
+
+   trace_dump_call_begin("pipe_screen", "resource_create");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(resource_template, templat);
+
+   result = screen->resource_create(screen, templat);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+   /* the trace recorded the driver's pointer; the caller receives the trace_resource wrapper */
+   result = trace_resource_create(tr_scr, result);
+
+   return result;
+}
+/** Untraced pass-through: imports a resource from a winsys handle and wraps the result. */
+static struct pipe_resource *
+trace_screen_resource_from_handle(struct pipe_screen *_screen,
+                                 const struct pipe_resource *templ,
+                                 struct winsys_handle *handle)
+{
+   struct trace_screen *wrapper = trace_screen(_screen);
+   struct pipe_screen *driver = wrapper->screen;
+   struct pipe_resource *imported;
+
+   /* TODO trace call */
+
+   /* Let the wrapped screen import the handle ... */
+   imported = driver->resource_from_handle(driver, templ, handle);
+
+   /* ... and hand back a trace_resource wrapper around whatever it returned. */
+   return trace_resource_create(wrapper, imported);
+}
+/** Untraced pass-through: asks the wrapped screen for the wrapped resource's winsys handle. */
+static boolean
+trace_screen_resource_get_handle(struct pipe_screen *_screen,
+                                struct pipe_resource *_texture,
+                                struct winsys_handle *handle)
+{
+   /* Unwrap both trace objects to reach the driver's own screen and resource. */
+   struct pipe_screen *driver = trace_screen(_screen)->screen;
+   struct pipe_resource *real = trace_resource(_texture)->resource;
+   boolean exported;
+
+   /* TODO trace call */
+   exported = driver->resource_get_handle(driver, real, handle);
+   return exported;
+}
+
+
+/** Traced resource_destroy: logs the call, then releases the wrapper and the driver resource. */
+static void
+trace_screen_resource_destroy(struct pipe_screen *_screen,
+			      struct pipe_resource *_texture)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct trace_resource *tr_tex = trace_resource(_texture);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_resource *texture = tr_tex->resource;
+
+   assert(texture->screen == screen);
+   /* Logged as "resource_destroy" so it pairs with the "resource_create" record. */
+   trace_dump_call_begin("pipe_screen", "resource_destroy");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, texture);
+
+   trace_dump_call_end();
+   /* No direct driver call: trace_resource_destroy() drops the driver reference. */
+   trace_resource_destroy(tr_scr, tr_tex);
+}
+
+
+/********************************************************************
+ * surface
+ */
+
+/** Traced get_tex_surface: logs the arguments, gets the driver surface and wraps it. */
+static struct pipe_surface *
+trace_screen_get_tex_surface(struct pipe_screen *_screen,
+                             struct pipe_resource *_texture,
+                             unsigned face, unsigned level,
+                             unsigned zslice,
+                             unsigned usage)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct trace_resource *tr_tex = trace_resource(_texture);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_resource *texture = tr_tex->resource;
+   struct pipe_surface *result = NULL;
+
+   assert(texture->screen == screen);
+
+   trace_dump_call_begin("pipe_screen", "get_tex_surface");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, texture);
+   trace_dump_arg(uint, face);
+   trace_dump_arg(uint, level);
+   trace_dump_arg(uint, zslice);
+   trace_dump_arg(uint, usage);
+   /* the driver is handed its own (unwrapped) resource */
+   result = screen->get_tex_surface(screen, texture, face, level, zslice, usage);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+   /* the trace recorded the driver's pointer; the caller receives the trace_surface wrapper */
+   result = trace_surface_create(tr_tex, result);
+
+   return result;
+}
+
+/** Traced tex_surface_destroy: logs the call, then tears down the surface wrapper. */
+static void
+trace_screen_tex_surface_destroy(struct pipe_surface *_surface)
+{
+   struct trace_screen *tr_scr = trace_screen(_surface->texture->screen);
+   struct trace_surface *tr_surf = trace_surface(_surface);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_surface *surface = tr_surf->surface;
+
+   trace_dump_call_begin("pipe_screen", "tex_surface_destroy");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, surface);
+
+   trace_dump_call_end();
+   /* No direct driver call: trace_surface_destroy() unreferences the wrapped surface. */
+   trace_surface_destroy(tr_surf);
+}
+
+
+
+
+
+/********************************************************************
+ * buffer
+ */
+
+
+/** Traced user_buffer_create: dumps the buffer bytes, creates the buffer, flags and wraps it. */
+static struct pipe_resource *
+trace_screen_user_buffer_create(struct pipe_screen *_screen,
+                                void *data,
+                                unsigned size,
+				unsigned usage)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_resource *result;
+
+   trace_dump_call_begin("pipe_screen", "user_buffer_create");
+   /* "data" is logged as its size bytes of content rather than as a pointer */
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg_begin("data");
+   trace_dump_bytes(data, size);
+   trace_dump_arg_end();
+   trace_dump_arg(uint, size);
+   trace_dump_arg(uint, usage);
+
+   result = screen->user_buffer_create(screen, data, size, usage);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+   /* Flag the driver resource before wrapping: the wrapper's base is a copy, so it gets the flag too. */
+   if(result) {
+      assert(!(result->flags & TRACE_FLAG_USER_BUFFER));
+      result->flags |= TRACE_FLAG_USER_BUFFER;
+   }
+
+   return trace_resource_create(tr_scr, result);
+}
+
+
+
+
+/********************************************************************
+ * fence
+ */
+
+/** Traced fence_reference: logs the old *pdst and src, then forwards the reference update. */
+static void
+trace_screen_fence_reference(struct pipe_screen *_screen,
+                             struct pipe_fence_handle **pdst,
+                             struct pipe_fence_handle *src)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_fence_handle *dst;
+
+   assert(pdst);
+   dst = *pdst;
+   /* dst is a snapshot taken for the trace only; the driver is given pdst itself */
+   trace_dump_call_begin("pipe_screen", "fence_reference");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, dst);
+   trace_dump_arg(ptr, src);
+
+   screen->fence_reference(screen, pdst, src);
+
+   trace_dump_call_end();
+}
+
+/** Traced fence_signalled: logs the fence and flags, forwards the query, logs the int result. */
+static int
+trace_screen_fence_signalled(struct pipe_screen *_screen,
+                             struct pipe_fence_handle *fence,
+                             unsigned flags)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   int result;
+
+   trace_dump_call_begin("pipe_screen", "fence_signalled");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, fence);
+   trace_dump_arg(uint, flags);
+   /* fences are not wrapped: the handle goes to the driver as received */
+   result = screen->fence_signalled(screen, fence, flags);
+
+   trace_dump_ret(int, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+/** Traced fence_finish: logs the fence and flags, forwards the call, logs the int result. */
+static int
+trace_screen_fence_finish(struct pipe_screen *_screen,
+                          struct pipe_fence_handle *fence,
+                          unsigned flags)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   int result;
+
+   trace_dump_call_begin("pipe_screen", "fence_finish");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, fence);
+   trace_dump_arg(uint, flags);
+   /* fences are not wrapped: the handle goes to the driver as received */
+   result = screen->fence_finish(screen, fence, flags);
+
+   trace_dump_ret(int, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+
+/********************************************************************
+ * screen
+ */
+/** Traced destroy: logs the call, ends the trace, destroys the wrapped screen, frees the wrapper. */
+static void
+trace_screen_destroy(struct pipe_screen *_screen)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+
+   trace_dump_call_begin("pipe_screen", "destroy");
+   trace_dump_arg(ptr, screen);
+   trace_dump_call_end();
+   trace_dump_trace_end();
+   /* the trace is already closed when the driver's destroy runs, so nothing after this is logged */
+   screen->destroy(screen);
+
+   FREE(tr_scr);
+}
+/** Report whether tracing is active, starting the dump on the first call only. */
+boolean
+trace_enabled(void)
+{
+   static boolean need_init = TRUE;
+
+   if (need_init) {
+      /* One-time setup; every later call just reports the cached flag. */
+      need_init = FALSE;
+
+      trace_dump_init();
+      if (trace_dump_trace_begin()) {
+         trace_dumping_start();
+         trace = TRUE;
+      }
+   }
+
+   return trace;
+}
+/** Wrap screen in a tracing pipe_screen; returns screen itself if it is NULL, tracing is off, or allocation fails. */
+struct pipe_screen *
+trace_screen_create(struct pipe_screen *screen)
+{
+   struct trace_screen *tr_scr;
+   struct pipe_winsys *winsys;
+
+   if(!screen)
+      goto error1;
+   /* tracing not enabled: leave the caller with the bare driver screen */
+   if (!trace_enabled())
+      goto error1;
+
+   trace_dump_call_begin("", "pipe_screen_create");
+
+   tr_scr = CALLOC_STRUCT(trace_screen);
+   if(!tr_scr)
+      goto error2;
+
+#if 0
+   winsys = trace_winsys_create(screen->winsys);
+   if(!winsys)
+      goto error3;
+#else
+   winsys = screen->winsys;
+#endif
+   tr_scr->base.winsys = winsys;
+   tr_scr->base.destroy = trace_screen_destroy;
+   tr_scr->base.get_name = trace_screen_get_name;
+   tr_scr->base.get_vendor = trace_screen_get_vendor;
+   tr_scr->base.get_param = trace_screen_get_param;
+   tr_scr->base.get_paramf = trace_screen_get_paramf;
+   tr_scr->base.is_format_supported = trace_screen_is_format_supported;
+   assert(screen->context_create);
+   tr_scr->base.context_create = trace_screen_context_create;
+   tr_scr->base.resource_create = trace_screen_resource_create;
+   tr_scr->base.resource_from_handle = trace_screen_resource_from_handle;
+   tr_scr->base.resource_get_handle = trace_screen_resource_get_handle;
+   tr_scr->base.resource_destroy = trace_screen_resource_destroy;
+   tr_scr->base.get_tex_surface = trace_screen_get_tex_surface;
+   tr_scr->base.tex_surface_destroy = trace_screen_tex_surface_destroy;
+   tr_scr->base.user_buffer_create = trace_screen_user_buffer_create;
+   tr_scr->base.fence_reference = trace_screen_fence_reference;
+   tr_scr->base.fence_signalled = trace_screen_fence_signalled;
+   tr_scr->base.fence_finish = trace_screen_fence_finish;
+   tr_scr->base.flush_frontbuffer = trace_screen_flush_frontbuffer;
+   /* remember the driver screen that every wrapper above forwards to */
+   tr_scr->screen = screen;
+   /* the trace records the driver screen's pointer, not the wrapper's */
+   trace_dump_ret(ptr, screen);
+   trace_dump_call_end();
+
+   return &tr_scr->base;
+   /* allocation failed: close the open call and the whole trace, then fall through */
+error2:
+   trace_dump_ret(ptr, screen);
+   trace_dump_call_end();
+   trace_dump_trace_end();
+error1:
+   return screen;
+}
+
+/** Checked downcast from pipe_screen to trace_screen (the checks are asserts only). */
+struct trace_screen *
+trace_screen(struct pipe_screen *screen)
+{
+   assert(screen);
+   assert(screen->destroy == trace_screen_destroy); /* hook installed by trace_screen_create() */
+   return (struct trace_screen *)screen;
+}
diff --git a/src/gallium/drivers/trace/tr_screen.h b/src/gallium/drivers/trace/tr_screen.h
new file mode 100644
index 0000000000..3598ceaa20
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_screen.h
@@ -0,0 +1,74 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TR_SCREEN_H_
+#define TR_SCREEN_H_
+
+
+#include "pipe/p_screen.h"
+#include "os/os_thread.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** List link with next/prev pointers, embedded in trace_resource, trace_surface and trace_transfer. */
+struct tr_list {
+   struct tr_list *next;
+   struct tr_list *prev;
+};
+
+/**
+ * It often happens that new data is written directly to the user buffers
+ * without mapping/unmapping. This flag marks user buffers, so that their
+ * contents can be dumped before being used by the pipe context.
+ */
+#define TRACE_FLAG_USER_BUFFER  (1u << 31) /* unsigned: 1 << 31 overflows a signed int */
+
+
+struct trace_screen
+{
+   struct pipe_screen base; /* first member: trace_screen() casts a pipe_screen * to this struct */
+   /* the driver screen that the tracing wrappers forward to */
+   struct pipe_screen *screen;
+};
+
+
+/*
+ * tr_screen.c
+ */
+
+
+struct trace_screen *
+trace_screen(struct pipe_screen *screen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TR_SCREEN_H_ */
diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c
new file mode 100644
index 0000000000..9914b98b39
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_texture.c
@@ -0,0 +1,158 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_hash_table.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "tr_screen.h"
+#include "tr_context.h"
+#include "tr_texture.h"
+
+/** Wrap a driver resource; takes over the caller's reference (released here on failure). */
+struct pipe_resource *
+trace_resource_create(struct trace_screen *tr_scr,
+                     struct pipe_resource *texture)
+{
+   struct trace_resource *wrapper = NULL;
+
+   if (texture) {
+      assert(texture->screen == tr_scr->screen);
+      wrapper = CALLOC_STRUCT(trace_resource);
+   }
+
+   if (!wrapper) {
+      /* Nothing to wrap, or out of memory: drop the reference we were handed. */
+      pipe_resource_reference(&texture, NULL);
+      return NULL;
+   }
+
+   /* Start from a byte copy of the driver's resource description ... */
+   memcpy(&wrapper->base, texture, sizeof(struct pipe_resource));
+
+   /* ... then give the wrapper its own refcount and point it at the trace screen. */
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.screen = &tr_scr->base;
+   wrapper->resource = texture;
+
+   return &wrapper->base;
+}
+
+/** Drop the wrapper's reference on the driver resource and free the wrapper; tr_scr is unused. */
+void
+trace_resource_destroy(struct trace_screen *tr_scr,
+		       struct trace_resource *tr_tex)
+{
+   pipe_resource_reference(&tr_tex->resource, NULL);
+   FREE(tr_tex);
+}
+
+/** Wrap a driver surface; takes over the caller's reference (released here on failure). */
+struct pipe_surface *
+trace_surface_create(struct trace_resource *tr_tex,
+                     struct pipe_surface *surface)
+{
+   struct trace_surface *wrapper = NULL;
+
+   if (surface) {
+      assert(surface->texture == tr_tex->resource);
+      wrapper = CALLOC_STRUCT(trace_surface);
+   }
+
+   if (!wrapper) {
+      /* Nothing to wrap, or out of memory: drop the reference we were handed. */
+      pipe_surface_reference(&surface, NULL);
+      return NULL;
+   }
+
+   memcpy(&wrapper->base, surface, sizeof(struct pipe_surface));
+   pipe_reference_init(&wrapper->base.reference, 1);
+
+   /* The copied texture pointer names the driver resource: clear it without
+    * unreferencing, then take a counted reference on the trace wrapper instead. */
+   wrapper->base.texture = NULL;
+   pipe_resource_reference(&wrapper->base.texture, &tr_tex->base);
+   wrapper->surface = surface;
+
+   return &wrapper->base;
+}
+
+/** Drop the wrapper's references (trace resource, then driver surface) and free the wrapper. */
+void
+trace_surface_destroy(struct trace_surface *tr_surf)
+{
+   pipe_resource_reference(&tr_surf->base.texture, NULL);
+   pipe_surface_reference(&tr_surf->surface, NULL);
+   FREE(tr_surf);
+}
+
+/** Wrap a driver transfer; returns NULL for a NULL transfer or when the wrapper cannot be allocated. */
+struct pipe_transfer *
+trace_transfer_create(struct trace_context *tr_ctx,
+		      struct trace_resource *tr_tex,
+		      struct pipe_transfer *transfer)
+{
+   struct trace_transfer *tr_trans;
+   /* A NULL transfer has nothing to destroy: never pass NULL to the driver's transfer_destroy. */
+   if(!transfer)
+      return NULL;
+
+   assert(transfer->resource == tr_tex->resource);
+
+   tr_trans = CALLOC_STRUCT(trace_transfer);
+   if(!tr_trans)
+      goto error;
+
+   memcpy(&tr_trans->base, transfer, sizeof(struct pipe_transfer));
+   /* clear the copied driver-resource pointer without unreferencing it */
+   tr_trans->base.resource = NULL;
+   tr_trans->transfer = transfer;
+
+   pipe_resource_reference(&tr_trans->base.resource, &tr_tex->base);
+   assert(tr_trans->base.resource == &tr_tex->base);
+
+   return &tr_trans->base;
+   /* wrapper allocation failed: destroy the driver transfer that cannot be returned */
+error:
+   tr_ctx->pipe->transfer_destroy(tr_ctx->pipe, transfer);
+   return NULL;
+}
+
+/** Tear down a trace_transfer: drop its resource reference, destroy the driver transfer, free it. */
+void
+trace_transfer_destroy(struct trace_context *tr_context,
+                       struct trace_transfer *tr_trans)
+{
+   struct pipe_context *pipe = tr_context->pipe;
+
+   /* Release the reference on the trace_resource taken in trace_transfer_create(). */
+   pipe_resource_reference(&tr_trans->base.resource, NULL);
+   pipe->transfer_destroy(pipe, tr_trans->transfer);
+   FREE(tr_trans);
+}
+
diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h
new file mode 100644
index 0000000000..6513995d50
--- /dev/null
+++ b/src/gallium/drivers/trace/tr_texture.h
@@ -0,0 +1,144 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TR_TEXTURE_H_
+#define TR_TEXTURE_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "tr_screen.h"
+
+struct trace_context;
+
+struct trace_resource
+{
+   struct pipe_resource base; /* first member: trace_resource() casts a pipe_resource * to this struct */
+   /* the driver resource being wrapped; unreferenced in trace_resource_destroy() */
+   struct pipe_resource *resource;
+   /* list link; left zeroed by trace_resource_create() */
+   struct tr_list list;
+};
+
+
+struct trace_surface
+{
+   struct pipe_surface base; /* first member: trace_surface() casts a pipe_surface * to this struct */
+   /* the driver surface being wrapped; unreferenced in trace_surface_destroy() */
+   struct pipe_surface *surface;
+   /* list link; left zeroed by trace_surface_create() */
+   struct tr_list list;
+};
+
+
+struct trace_sampler_view
+{
+   struct pipe_sampler_view base; /* first member: trace_sampler_view() casts to this struct */
+   /* presumably the driver's sampler view being wrapped; it is set outside this header */
+   struct pipe_sampler_view *sampler_view;
+};
+
+
+struct trace_transfer
+{
+   struct pipe_transfer base; /* first member: trace_transfer() casts a pipe_transfer * to this struct */
+   /* the driver transfer being wrapped; destroyed in trace_transfer_destroy() */
+   struct pipe_transfer *transfer;
+   struct pipe_context *pipe; /* not assigned by trace_transfer_create(); stays NULL from CALLOC there */
+   /* list link; left zeroed by trace_transfer_create() */
+   struct tr_list list;
+   /* not assigned by trace_transfer_create(); NOTE(review): presumably the mapped pointer, confirm in tr_context.c */
+   void *map;
+};
+
+/** NULL-tolerant downcast; a non-NULL resource has its screen checked by trace_screen(). */
+static INLINE struct trace_resource *
+trace_resource(struct pipe_resource *texture)
+{
+   if (texture)
+      (void)trace_screen(texture->screen); /* asserts the owner is a trace screen */
+
+   return (struct trace_resource *)texture; /* NULL stays NULL */
+}
+
+/** NULL-tolerant downcast; a non-NULL surface has its texture checked by trace_resource(). */
+static INLINE struct trace_surface *
+trace_surface(struct pipe_surface *surface)
+{
+   if (surface)
+      (void)trace_resource(surface->texture); /* result unused; runs the screen assert */
+
+   return (struct trace_surface *)surface; /* NULL stays NULL */
+}
+
+/** NULL-tolerant downcast to the tracing wrapper type; performs no consistency check. */
+static INLINE struct trace_sampler_view *
+trace_sampler_view(struct pipe_sampler_view *sampler_view)
+{
+   struct trace_sampler_view *tr_view = (struct trace_sampler_view *)sampler_view;
+
+   return sampler_view ? tr_view : NULL;
+}
+
+/** NULL-tolerant downcast; a non-NULL transfer has its resource checked by trace_resource(). */
+static INLINE struct trace_transfer *
+trace_transfer(struct pipe_transfer *transfer)
+{
+   if (transfer)
+      (void)trace_resource(transfer->resource); /* result unused; runs the screen assert */
+
+   return (struct trace_transfer *)transfer; /* NULL stays NULL */
+}
+
+
+struct pipe_resource *
+trace_resource_create(struct trace_screen *tr_scr,
+                     struct pipe_resource *texture);
+
+void
+trace_resource_destroy(struct trace_screen *tr_scr,
+		       struct trace_resource *tr_tex);
+
+struct pipe_surface *
+trace_surface_create(struct trace_resource *tr_tex,
+                     struct pipe_surface *surface);
+
+void
+trace_surface_destroy(struct trace_surface *tr_surf);
+
+struct pipe_transfer *
+trace_transfer_create(struct trace_context *tr_ctx,
+		      struct trace_resource *tr_tex,
+		      struct pipe_transfer *transfer);
+
+void
+trace_transfer_destroy(struct trace_context *tr_ctx,
+                       struct trace_transfer *tr_trans);
+
+
+#endif /* TR_TEXTURE_H_ */
diff --git a/src/gallium/drivers/trace/trace.xsl b/src/gallium/drivers/trace/trace.xsl
new file mode 100644
index 0000000000..7be95e0e75
--- /dev/null
+++ b/src/gallium/drivers/trace/trace.xsl
@@ -0,0 +1,188 @@
+<?xml version="1.0"?>
+
+<!--
+
+Copyright 2008 Tungsten Graphics, Inc.
+
+This program is free software: you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+-->
+
+<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+	<xsl:output method="html" />
+	<!-- Drop whitespace-only text nodes from every input element; the position()/last() tests that place ", " separators rely on this. -->
+	<xsl:strip-space elements="*" />
+
+	<!-- Root template: wraps the rendered calls in an HTML page.  The style
+	     sheet is emitted inside head (with its type), as HTML 4 requires. -->
+	<xsl:template match="/trace">
+		<html>
+			<head>
+				<title>Gallium Trace</title>
+				<style type="text/css">
+					body {
+						font-family: verdana, sans-serif;
+						font-size: 11px;
+						font-weight: normal;
+						text-align : left;
+					}
+
+					.fun {
+						font-weight: bold;
+					}
+
+					.var {
+						font-style: italic;
+					}
+
+					.typ {
+						display: none;
+					}
+
+					.lit {
+						color: #0000ff;
+					}
+
+					.ptr {
+						color: #008000;
+					}
+				</style>
+			</head>
+			<body>
+				<ol class="calls"><xsl:apply-templates/></ol>
+			</body>
+		</html>
+	</xsl:template>
+
+	<!-- One traced call becomes one list item of the form
+	     "class::method(args) ret", where the argument list and the return
+	     value are produced by the arg and ret templates.
+	     The value attribute of the list item carries the call's own @no,
+	     written here as an attribute value template. -->
+	<xsl:template match="call">
+		<li value="{@no}">
+			<span class="fun">
+				<xsl:value-of select="concat(@class, '::', @method)"/>
+			</span>
+			<xsl:text>(</xsl:text>
+			<xsl:apply-templates select="arg"/>
+			<xsl:text>)</xsl:text>
+			<xsl:apply-templates select="ret"/>
+		</li>
+	</xsl:template>
+
+	<!-- Named value (call argument or struct member): "name = value", then
+	     ", " after every node except the last one of the current node list. -->
+	<xsl:template match="arg|member">
+		<xsl:apply-templates select="@name"/>
+		<xsl:text> = </xsl:text>
+		<xsl:apply-templates />
+		<xsl:if test="not(position() = last())"><xsl:text>, </xsl:text></xsl:if>
+	</xsl:template>
+
+	<!-- Return value: " = " followed by the rendered children. -->
+	<xsl:template match="ret">
+		<xsl:value-of select="' = '"/><xsl:apply-templates />
+	</xsl:template>
+
+	<!-- Scalar literals: the element's first text node, wrapped in a span
+	     of class "lit". -->
+	<xsl:template match="bool|int|uint|float|enum">
+		<span class="lit"><xsl:value-of select="text()"/></span>
+	</xsl:template>
+
+	<!-- Byte blobs are not expanded: a fixed "..." placeholder is shown in
+	     the "lit" style, whatever the element contains. -->
+	<xsl:template match="bytes">
+		<span class="lit">...</span>
+	</xsl:template>
+
+	<!-- String literal: the text between double quotes, passed through the
+	     named template "break" so that line feeds become HTML line breaks. -->
+	<xsl:template match="string">
+		<span class="lit">
+			<xsl:text>"</xsl:text>
+			<xsl:call-template name="break"><xsl:with-param name="text" select="text()"/></xsl:call-template>
+			<xsl:text>"</xsl:text>
+		</span>
+	</xsl:template>
+
+	<!-- Aggregates: the rendered children enclosed in curly braces; the
+	     separators come from the elem and member templates. -->
+	<xsl:template match="array|struct">
+		<xsl:text>{</xsl:text><xsl:apply-templates /><xsl:text>}</xsl:text>
+	</xsl:template>
+
+	<!-- Array element: its rendered content, then ", " unless it is the
+	     last node of the node list being processed. -->
+	<xsl:template match="elem">
+		<xsl:apply-templates />
+		<xsl:if test="not(position() = last())"><xsl:text>, </xsl:text></xsl:if>
+	</xsl:template>
+
+	<!-- Null pointer: the fixed word NULL, wrapped in a span of class
+	     "ptr". -->
+	<xsl:template match="null">
+		<span class="ptr">NULL</span>
+	</xsl:template>
+
+	<!-- Pointer value: the element's first text node, wrapped in a span of
+	     class "ptr". -->
+	<xsl:template match="ptr">
+		<span class="ptr"><xsl:value-of select="text()"/></span>
+	</xsl:template>
+
+	<!-- Name attribute of an argument or member: its value, wrapped in a
+	     span of class "var". -->
+	<xsl:template match="@name">
+		<span class="var"><xsl:value-of select="."/></span>
+	</xsl:template>
+	
+	<!-- Named template "break": emits $text (default: the context node) with
+	     each line feed replaced by a br element, recursing on the remainder. -->
+	<xsl:template name="break">
+		<xsl:param name="text" select="."/>
+		<xsl:choose>
+			<xsl:when test="not(contains($text, '&#xa;'))">
+				<xsl:value-of select="$text"/>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:value-of select="substring-before($text, '&#xa;')"/>
+				<br/>
+				<xsl:call-template name="break"><xsl:with-param name="text" select="substring-after($text, '&#xa;')"/></xsl:call-template>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+	<!-- Named template "replace": emits $text with each occurrence of $from
+	     replaced by $to.  An empty $from copies $text through unchanged, since
+	     contains($text, '') always holds and the recursion would never end. -->
+	<xsl:template name="replace">
+		<xsl:param name="text"/>
+		<xsl:param name="from"/>
+		<xsl:param name="to"/>
+		<xsl:choose>
+			<xsl:when test="string($from) != '' and contains($text,$from)">
+				<xsl:value-of select="concat(substring-before($text,$from),$to)"/>
+				<xsl:call-template name="replace">
+					<xsl:with-param name="text" select="substring-after($text,$from)"/>
+					<xsl:with-param name="from" select="$from"/><xsl:with-param name="to" select="$to"/>
+				</xsl:call-template>
+			</xsl:when>
+			<xsl:otherwise><xsl:value-of select="$text"/></xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+</xsl:transform>